// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package tcpip provides the interfaces and related types that users of the
// tcpip stack will use in order to create endpoints used to send and receive
// data over the network stack.
//
// The starting point is the creation and configuration of a stack. A stack can
// be created by calling the New() function of the tcpip/stack/stack package;
// configuring a stack involves creating NICs (via calls to Stack.CreateNIC()),
// adding network addresses (via calls to Stack.AddProtocolAddress()), and
// setting a route table (via a call to Stack.SetRouteTable()).
//
// Once a stack is configured, endpoints can be created by calling
// Stack.NewEndpoint(). Such endpoints can be used to send/receive data, connect
// to peers, listen for connections, accept connections, etc., depending on the
// transport protocol selected.
package tcpip

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"math"
	"math/bits"
	"net"
	"reflect"
	"strconv"
	"strings"
	"time"

	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/rand"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/waiter"
)

// Address sizes (in bytes) and network protocol numbers for IPv4 and IPv6.
//
// These are declared locally because using the header package here would
// cause an import cycle.
const (
	ipv4AddressSize    = 4
	ipv4ProtocolNumber = 0x0800
	ipv6AddressSize    = 16
	ipv6ProtocolNumber = 0x86dd
)

const (
	// LinkAddressSize is the size of a MAC address, in bytes.
	LinkAddressSize = 6
)

// Well-known IP addresses, expressed as raw byte slices.
//
// NOTE(review): these are package-level slices, so their contents are
// mutable; callers should presumably treat them as read-only.
var (
	// IPv4Zero is the all-zeros 4-byte IPv4 address.
	IPv4Zero = []byte{0, 0, 0, 0}
	// IPv6Zero is the all-zeros 16-byte IPv6 address.
	IPv6Zero = []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
)

// Errors returned by NewSubnet when the address and mask are inconsistent.
var (
	// errSubnetLengthMismatch is returned when the address and the mask have
	// different lengths.
	errSubnetLengthMismatch = errors.New("subnet length of address and mask differ")
	// errSubnetAddressMasked is returned when the address has bits set that
	// are not covered by the mask.
	errSubnetAddressMasked  = errors.New("subnet address has bits set outside the mask")
)

// ErrSaveRejection reports that a save was refused because the networking
// state is not supported. Errors of this type are only used by save logic.
type ErrSaveRejection struct {
	// Err is the underlying reason the save was rejected.
	Err error
}

// Error implements error.Error. The message is a fixed prefix followed by the
// text of the wrapped error, so Err must be non-nil.
func (e *ErrSaveRejection) Error() string {
	const prefix = "save rejected due to unsupported networking state: "
	cause := e.Err.Error()
	return prefix + cause
}

// MonotonicTime is a monotonic clock reading.
//
// +stateify savable
type MonotonicTime struct {
	// nanoseconds is the clock reading expressed in nanoseconds.
	nanoseconds int64
}

// String implements Stringer. The reading is rendered as a base-10 count of
// nanoseconds.
func (mt MonotonicTime) String() string {
	const base = 10
	return strconv.FormatInt(mt.nanoseconds, base)
}

// MonotonicTimeInfinite returns the largest representable monotonic
// timestamp, i.e. the one that is as far in the future as possible.
func MonotonicTimeInfinite() MonotonicTime {
	var farthest MonotonicTime
	farthest.nanoseconds = math.MaxInt64
	return farthest
}

// Before reports whether the monotonic clock reading mt is before u.
func (mt MonotonicTime) Before(u MonotonicTime) bool {
	return u.nanoseconds > mt.nanoseconds
}

// After reports whether the monotonic clock reading mt is after u.
func (mt MonotonicTime) After(u MonotonicTime) bool {
	return u.nanoseconds < mt.nanoseconds
}

// Add returns the monotonic clock reading mt+d.
func (mt MonotonicTime) Add(d time.Duration) MonotonicTime {
	// The arithmetic is routed through time.Time so the result is whatever
	// the time package computes for the same instant and offset.
	shifted := time.Unix(0, mt.nanoseconds).Add(d)
	epoch := time.Unix(0, 0)
	return MonotonicTime{nanoseconds: shifted.Sub(epoch).Nanoseconds()}
}

// Sub returns the duration mt-u. If the result exceeds the maximum (or minimum)
// value that can be stored in a Duration, the maximum (or minimum) duration
// will be returned. To compute t-d for a duration d, use t.Add(-d).
func (mt MonotonicTime) Sub(u MonotonicTime) time.Duration {
	end := time.Unix(0, mt.nanoseconds)
	start := time.Unix(0, u.nanoseconds)
	return end.Sub(start)
}

// Milliseconds returns the time in milliseconds, truncating any fractional
// millisecond toward zero.
func (mt MonotonicTime) Milliseconds() int64 {
	const nanosPerMilli = 1e6
	return mt.nanoseconds / nanosPerMilli
}

// A Clock provides the current time and schedules work for execution.
//
// Times returned by a Clock should always be used for application-visible
// time. Only monotonic times should be used for netstack internal timekeeping.
type Clock interface {
	// Now returns the current local time.
	Now() time.Time

	// NowMonotonic returns the current monotonic clock reading. Readings can
	// be compared and combined using the methods of MonotonicTime.
	NowMonotonic() MonotonicTime

	// AfterFunc waits for the duration d to elapse and then calls f in its own
	// goroutine. It returns a Timer that can be used to cancel the call using
	// its Stop method.
	AfterFunc(d time.Duration, f func()) Timer
}

// Timer represents a single event. A Timer must be created with
// Clock.AfterFunc.
type Timer interface {
	// Stop prevents the Timer from firing. It returns true if the call stops the
	// timer, false if the timer has already expired or been stopped.
	//
	// If Stop returns false because the timer has already expired, the function
	// f of Clock.AfterFunc(d, f) has been started in its own goroutine; Stop
	// does not wait for f to complete before returning. If the caller needs to
	// know whether f is completed, it must coordinate with f explicitly.
	Stop() bool

	// Reset changes the timer to expire after duration d.
	//
	// Reset should be invoked only on stopped or expired timers. If the timer is
	// known to have expired, Reset can be used directly. Otherwise, the caller
	// must coordinate with the function f of Clock.AfterFunc(d, f).
	Reset(d time.Duration)
}

// Address is the address of a network node. It is a comparable value type:
// the address bytes are stored in a fixed 16-byte array, of which only the
// first length bytes are in use.
//
// +stateify savable
type Address struct {
	// addr stores the address bytes, starting at index 0.
	addr [16]byte

	// length is the number of bytes of addr in use: 4 or 16 for addresses
	// built by the constructors below, and 0 for the zero value.
	length int
}

// AddrFrom4 converts addr to an Address.
func AddrFrom4(addr [4]byte) Address {
	var out Address
	out.length = len(addr)
	// The destination array is larger than the source, so all 4 bytes are
	// always copied.
	copy(out.addr[:], addr[:])
	return out
}

// AddrFrom4Slice converts addr to an Address. It panics if len(addr) != 4.
func AddrFrom4Slice(addr []byte) Address {
	if len(addr) != 4 {
		panic(fmt.Sprintf("bad address length for address %v", addr))
	}
	// The length was validated above, so the array conversion cannot panic.
	return AddrFrom4([4]byte(addr))
}

// AddrFrom16 converts addr to an Address.
func AddrFrom16(addr [16]byte) Address {
	// Assigning the array copies all 16 bytes.
	return Address{addr: addr, length: len(addr)}
}

// AddrFrom16Slice converts addr to an Address. It panics if len(addr) != 16.
func AddrFrom16Slice(addr []byte) Address {
	if len(addr) != 16 {
		panic(fmt.Sprintf("bad address length for address %v", addr))
	}
	// The length was validated above, so the array conversion cannot panic.
	return AddrFrom16([16]byte(addr))
}

// AddrFromSlice converts addr to an Address. It returns the Address zero value
// if len(addr) != 4 or 16.
func AddrFromSlice(addr []byte) Address {
	n := len(addr)
	if n == ipv4AddressSize {
		return AddrFrom4Slice(addr)
	}
	if n == ipv6AddressSize {
		return AddrFrom16Slice(addr)
	}
	return Address{}
}

// As4 returns a as a 4 byte array. It panics if the address length is not 4.
func (a Address) As4() [4]byte {
	if a.Len() != 4 {
		panic(fmt.Sprintf("bad address length for address %v", a.addr))
	}
	var out [4]byte
	// copy stops once out is full, i.e. after the first 4 bytes.
	copy(out[:], a.addr[:])
	return out
}

// As16 returns a as a 16 byte array. It panics if the address length is not 16.
func (a Address) As16() [16]byte {
	if a.Len() != 16 {
		panic(fmt.Sprintf("bad address length for address %v", a.addr))
	}
	// Returning the array yields a copy of all 16 bytes.
	return a.addr
}

// AsSlice returns a as a byte slice. The slice aliases the storage inside a
// rather than copying it, so callers should be careful: writes through the
// slice modify the Address.
//
// +checkescape
func (a *Address) AsSlice() []byte {
	used := a.length
	return a.addr[:used]
}

// BitLen returns the length in bits of a.
func (a Address) BitLen() int {
	const bitsPerByte = 8
	return bitsPerByte * a.Len()
}

// Len returns the length in bytes of a.
func (a Address) Len() int {
	return a.length
}

// WithPrefix returns a paired with a prefix length equal to its full bit
// length, i.e. a point subnet that contains only a itself.
func (a Address) WithPrefix() AddressWithPrefix {
	var awp AddressWithPrefix
	awp.Address = a
	awp.PrefixLen = a.BitLen()
	return awp
}

// Unspecified returns true if the address is unspecified, i.e. every byte of
// its backing storage is zero. All 16 bytes are inspected, not only the first
// Len() bytes, so the zero Address is also reported as unspecified.
func (a Address) Unspecified() bool {
	var zero [16]byte
	return a.addr == zero
}

// Equal returns whether a and other are equal, meaning that both the length
// and all stored bytes match. It exists for use by the cmp library.
func (a Address) Equal(other Address) bool {
	return a.length == other.length && a.addr == other.addr
}

// MatchingPrefix returns the number of leading bits, counted from the most
// significant bit of the first byte, that a and b have in common.
//
// Panics if b and a have different lengths.
func (a Address) MatchingPrefix(b Address) uint8 {
	const bitsInAByte = 8

	if a.Len() != b.Len() {
		panic(fmt.Sprintf("addresses %s and %s do not have the same length", a, b))
	}

	var prefix uint8
	for i := 0; i < a.length; i++ {
		// diff has a bit set wherever the two bytes disagree.
		diff := a.addr[i] ^ b.addr[i]
		if diff == 0 {
			prefix += bitsInAByte
			continue
		}

		// In the first differing byte, the bits that still match from MSbit
		// towards LSbit are exactly the leading zeros of diff.
		return prefix + uint8(bits.LeadingZeros8(diff))
	}

	return prefix
}

// AddressMask is a bitmask for an address. The mask bytes are stored in a
// fixed 16-byte array, of which only the first length bytes are in use.
//
// +stateify savable
type AddressMask struct {
	mask   [16]byte
	length int
}

// MaskFrom returns a Mask based on str.
//
// MaskFrom may allocate, and so should not be in hot paths.
//
// At most 16 bytes of str are stored, but the recorded length is always
// len(str); callers are expected to pass at most 16 bytes.
func MaskFrom(str string) AddressMask {
	mask := AddressMask{length: len(str)}
	copy(mask.mask[:], str)
	return mask
}

// MaskFromBytes returns a Mask based on bs.
//
// At most 16 bytes of bs are stored, but the recorded length is always
// len(bs); callers are expected to pass at most 16 bytes.
func MaskFromBytes(bs []byte) AddressMask {
	mask := AddressMask{length: len(bs)}
	copy(mask.mask[:], bs)
	return mask
}

// String implements Stringer. It renders the in-use bytes of the mask as
// lowercase hexadecimal, e.g. "ffffff00" for a 4-byte /24 mask.
func (m AddressMask) String() string {
	// Only format the bytes that belong to the mask, not the whole backing
	// array. The length is clamped so that String never panics, even for a
	// mask built from more than 16 bytes.
	n := m.length
	if n > len(m.mask) {
		n = len(m.mask)
	}
	return fmt.Sprintf("%x", m.mask[:n])
}

// AsSlice returns m as a byte slice. Callers should be careful as it can
// return a window into existing memory.
func (m *AddressMask) AsSlice() []byte {
	return m.mask[:m.length]
}

// BitLen returns the length of the mask in bits.
func (m AddressMask) BitLen() int {
	return m.length * 8
}

// Len returns the length of the mask in bytes.
func (m AddressMask) Len() int {
	return m.length
}

// Prefix returns the number of bits before the first host bit.
//
// It is computed by summing the number of leading one bits of every in-use
// byte, which equals the prefix length for a contiguous mask.
func (m AddressMask) Prefix() int {
	p := 0
	for _, b := range m.mask[:m.length] {
		// The leading zeros of the inverted byte are the leading ones of b.
		p += bits.LeadingZeros8(^b)
	}
	return p
}

// Equal returns whether m and other are equal. It exists for use by the cmp
// library.
func (m AddressMask) Equal(other AddressMask) bool {
	return m == other
}

// Subnet is a subnet defined by its address and mask.
//
// +stateify savable
type Subnet struct {
	// address is the subnet ID; NewSubnet guarantees that it has no bits set
	// outside of mask.
	address Address

	// mask is the subnet mask; NewSubnet guarantees that it has the same
	// length as address.
	mask AddressMask
}

// NewSubnet creates a new Subnet. It fails if the address and mask have
// different lengths, or if the address has any bit set that the mask does not
// cover.
func NewSubnet(a Address, m AddressMask) (Subnet, error) {
	n := a.Len()
	if n != m.Len() {
		return Subnet{}, errSubnetLengthMismatch
	}
	for i := 0; i < n; i++ {
		// Any bit that survives clearing the mask bits is a host bit.
		if hostBits := a.addr[i] &^ m.mask[i]; hostBits != 0 {
			return Subnet{}, errSubnetAddressMasked
		}
	}
	return Subnet{address: a, mask: m}, nil
}

// String implements Stringer. The result has the form "<subnet ID>/<prefix>".
func (s Subnet) String() string {
	id, prefix := s.ID(), s.Prefix()
	return fmt.Sprintf("%s/%d", id, prefix)
}

// Contains returns true iff the address is of the same length and matches the
// subnet address and mask.
func (s *Subnet) Contains(a Address) bool {
	n := a.Len()
	if n != s.address.Len() {
		return false
	}
	for i := 0; i < n; i++ {
		if masked := a.addr[i] & s.mask.mask[i]; masked != s.address.addr[i] {
			return false
		}
	}
	return true
}

// ID returns the subnet ID.
func (s *Subnet) ID() Address {
	return s.address
}

// Bits returns the number of ones (network bits) and zeros (host bits) in the
// subnet mask.
func (s *Subnet) Bits() (ones int, zeros int) {
	ones = s.mask.Prefix()
	zeros = s.mask.BitLen() - ones
	return ones, zeros
}

// Prefix returns the number of bits before the first host bit.
func (s *Subnet) Prefix() int {
	return s.mask.Prefix()
}

// Mask returns the subnet mask.
func (s *Subnet) Mask() AddressMask {
	return s.mask
}

// Broadcast returns the subnet's broadcast address, i.e. the subnet ID with
// every host bit set.
func (s *Subnet) Broadcast() Address {
	bcast := s.address
	for i, n := 0, bcast.Len(); i < n; i++ {
		bcast.addr[i] |= ^s.mask.mask[i]
	}
	return bcast
}

// IsBroadcast returns true if the address is considered a broadcast address.
func (s *Subnet) IsBroadcast(address Address) bool {
	// Only IPv4 supports the notion of a broadcast address.
	if address.Len() != ipv4AddressSize {
		return false
	}

	// Normally, we would just compare address with the subnet's broadcast
	// address but there is an exception where a simple comparison is not
	// correct. This exception is for /31 and /32 IPv4 subnets where all
	// addresses are considered valid host addresses.
	//
	// For /31 subnets, the case is easy. RFC 3021 Section 2.1 states that
	// both addresses in a /31 subnet "MUST be interpreted as host addresses."
	//
	// For /32, the case is a bit more vague. RFC 3021 makes no mention of /32
	// subnets. However, the same reasoning applies - if an exception is not
	// made, then there do not exist any host addresses in a /32 subnet. RFC
	// 4632 Section 3.1 also vaguely implies this interpretation by referring
	// to addresses in /32 subnets as "host routes."
	if s.Prefix() > 30 {
		return false
	}
	return s.Broadcast() == address
}

// Equal returns true if this Subnet is equal to the given Subnet.
func (s Subnet) Equal(o Subnet) bool {
	// If this changes, update Route.Equal accordingly.
	return s.address == o.address && s.mask == o.mask
}

// NICID is a number that uniquely identifies a NIC (network interface).
type NICID int32

// ShutdownFlags represents flags that can be passed to the Shutdown() method
// of the Endpoint interface.
type ShutdownFlags int

// Values of the flags that can be passed to the Shutdown() method. Each flag
// occupies its own bit, so they can be OR'ed together.
const (
	// ShutdownRead selects the read end of the endpoint connection.
	ShutdownRead ShutdownFlags = 1 << iota
	// ShutdownWrite selects the write end of the endpoint connection.
	ShutdownWrite
)

// PacketType is used to indicate the destination of the packet.
type PacketType uint8

// Values of PacketType, numbered consecutively starting at zero.
const (
	// PacketHost indicates a packet addressed to the local host.
	PacketHost PacketType = iota

	// PacketOtherHost indicates an outgoing packet addressed to
	// another host caught by a NIC in promiscuous mode.
	PacketOtherHost

	// PacketOutgoing for a packet originating from the local host
	// that is looped back to a packet socket.
	PacketOutgoing

	// PacketBroadcast indicates a link layer broadcast packet.
	PacketBroadcast

	// PacketMulticast indicates a link layer multicast packet.
	PacketMulticast
)

// FullAddress represents a full transport node address, as required by the
// Connect() and Bind() methods of Endpoint.
//
// +stateify savable
type FullAddress struct {
	// NIC is the ID of the NIC this address refers to.
	//
	// This may not be used by all endpoint types.
	NIC NICID

	// Addr is the network address.
	Addr Address

	// Port is the transport port.
	//
	// This may not be used by all endpoint types.
	Port uint16

	// LinkAddr is the link layer address.
	LinkAddr LinkAddress
}

// Payloader is an interface that provides data.
//
// This interface allows the endpoint to request the amount of data it needs
// based on internal buffers without exposing them.
type Payloader interface {
	io.Reader

	// Len returns the number of bytes of the unread portion of the
	// Reader.
	Len() int
}

// Compile-time checks that *bytes.Buffer and *bytes.Reader can be used as
// Payloaders.
var _ Payloader = (*bytes.Buffer)(nil)
var _ Payloader = (*bytes.Reader)(nil)

var _ io.Writer = (*SliceWriter)(nil)

// SliceWriter implements io.Writer for slices. Data is written into the
// slice's existing elements; the slice is never grown.
type SliceWriter []byte

// Write implements io.Writer.Write. It copies as much of b as fits into the
// remaining slice, advances the slice past the bytes written, and returns
// io.ErrShortWrite if b did not fit completely.
func (s *SliceWriter) Write(b []byte) (int, error) {
	written := copy(*s, b)
	*s = (*s)[written:]
	if written < len(b) {
		return written, io.ErrShortWrite
	}
	return written, nil
}

var _ io.Writer = (*LimitedWriter)(nil)

// A LimitedWriter writes to W but limits the amount of data copied to just N
// bytes. Each call to Write updates N to reflect the new amount remaining.
type LimitedWriter struct {
	// W is the underlying writer that receives the data.
	W io.Writer

	// N is the number of bytes that may still be written to W.
	N int64
}

// Write implements io.Writer.Write. At most N bytes of p are forwarded to W.
// If fewer than len(p) bytes end up written and W reported no error,
// io.ErrShortWrite is returned. N is reduced by the number of bytes W
// reported as written.
func (l *LimitedWriter) Write(p []byte) (int, error) {
	requested := int64(len(p))
	if l.N < requested {
		// Drop whatever does not fit within the remaining budget.
		p = p[:l.N]
	}
	written, err := l.W.Write(p)
	l.N -= int64(written)
	if err == nil && int64(written) != requested {
		err = io.ErrShortWrite
	}
	return written, err
}

// SendableControlMessages contains socket control messages that can be written.
// Each value field is paired with a Has* flag that says whether it is set.
//
// +stateify savable
type SendableControlMessages struct {
	// HasTTL indicates whether TTL is valid/set.
	HasTTL bool

	// TTL is the IPv4 Time To Live of the associated packet.
	TTL uint8

	// HasHopLimit indicates whether HopLimit is valid/set.
	HasHopLimit bool

	// HopLimit is the IPv6 Hop Limit of the associated packet.
	HopLimit uint8

	// HasIPv6PacketInfo indicates whether IPv6PacketInfo is set.
	HasIPv6PacketInfo bool

	// IPv6PacketInfo holds the IPv6 interface and address data associated with
	// the packet.
	IPv6PacketInfo IPv6PacketInfo
}

// ReceivableControlMessages contains socket control messages that can be
// received. Most value fields are paired with a Has* flag that says whether
// the value is valid/set.
//
// +stateify savable
type ReceivableControlMessages struct {
	// Timestamp is the time that the last packet used to create the read data
	// was received. It is only meaningful when HasTimestamp is true.
	Timestamp time.Time `state:".(int64)"`

	// HasInq indicates whether Inq is valid/set.
	HasInq bool

	// Inq is the number of bytes ready to be received.
	Inq int32

	// HasTOS indicates whether TOS is valid/set.
	HasTOS bool

	// TOS is the IPv4 type of service of the associated packet.
	TOS uint8

	// HasTTL indicates whether TTL is valid/set.
	HasTTL bool

	// TTL is the IPv4 Time To Live of the associated packet.
	TTL uint8

	// HasHopLimit indicates whether HopLimit is valid/set.
	HasHopLimit bool

	// HopLimit is the IPv6 Hop Limit of the associated packet.
	HopLimit uint8

	// HasTimestamp indicates whether Timestamp is valid/set.
	HasTimestamp bool

	// HasTClass indicates whether TClass is valid/set.
	HasTClass bool

	// TClass is the IPv6 traffic class of the associated packet.
	TClass uint32

	// HasIPPacketInfo indicates whether PacketInfo is set.
	HasIPPacketInfo bool

	// PacketInfo holds interface and address data on an incoming packet.
	PacketInfo IPPacketInfo

	// HasIPv6PacketInfo indicates whether IPv6PacketInfo is set.
	HasIPv6PacketInfo bool

	// IPv6PacketInfo holds interface and address data on an incoming packet.
	IPv6PacketInfo IPv6PacketInfo

	// HasOriginalDstAddress indicates whether OriginalDstAddress is
	// set.
	HasOriginalDstAddress bool

	// OriginalDstAddress holds the original destination address
	// and port of the incoming packet.
	OriginalDstAddress FullAddress

	// SockErr is the dequeued socket error on recvmsg(MSG_ERRQUEUE).
	SockErr *SockError
}

// PacketOwner is used to get the UID and GID of the packet.
type PacketOwner interface {
	// KUID returns the KUID of the packet.
	KUID() uint32

	// KGID returns the KGID of the packet.
	KGID() uint32
}

// ReadOptions contains options for Endpoint.Read.
type ReadOptions struct {
	// Peek indicates whether this read is a peek.
	Peek bool

	// NeedRemoteAddr indicates whether to return the remote address, if
	// supported. The address is reported in ReadResult.RemoteAddr.
	NeedRemoteAddr bool

	// NeedLinkPacketInfo indicates whether to return the link-layer information,
	// if supported. The information is reported in ReadResult.LinkPacketInfo.
	NeedLinkPacketInfo bool
}

// ReadResult represents the result of a successful Endpoint.Read.
type ReadResult struct {
	// Count is the number of bytes received and written to the buffer.
	Count int

	// Total is the number of bytes of the received packet. This can be used to
	// determine whether the read is truncated.
	Total int

	// ControlMessages is the control messages received.
	ControlMessages ReceivableControlMessages

	// RemoteAddr is the remote address if ReadOptions.NeedRemoteAddr is true.
	RemoteAddr FullAddress

	// LinkPacketInfo is the link-layer information of the received packet if
	// ReadOptions.NeedLinkPacketInfo is true.
	LinkPacketInfo LinkPacketInfo
}

// Endpoint is the interface implemented by transport protocols (e.g., tcp, udp)
// that exposes functionality like read, write, connect, etc. to users of the
// networking stack.
type Endpoint interface {
	// Close puts the endpoint in a closed state and frees all resources
	// associated with it. Close initiates the teardown process, the
	// Endpoint may not be fully closed when Close returns.
	Close()

	// Abort initiates an expedited endpoint teardown. As compared to
	// Close, Abort prioritizes closing the Endpoint quickly over cleanly.
	// Abort is best effort; implementing Abort with Close is acceptable.
	Abort()

	// Read reads data from the endpoint and optionally writes to dst.
	//
	// This method does not block if there is no data pending; in this case,
	// ErrWouldBlock is returned.
	//
	// If non-zero number of bytes are successfully read and written to dst, err
	// must be nil. Otherwise, if dst failed to write anything, ErrBadBuffer
	// should be returned.
	Read(io.Writer, ReadOptions) (ReadResult, Error)

	// Write writes data to the endpoint's peer. This method does not block if
	// the data cannot be written.
	//
	// Unlike io.Writer.Write, Endpoint.Write transfers ownership of any bytes
	// successfully written to the Endpoint. That is, if a call to
	// Write(SlicePayload{data}) returns (n, err), it may retain data[:n], and
	// the caller should not use data[:n] after Write returns.
	//
	// Note that unlike io.Writer.Write, it is not an error for Write to
	// perform a partial write (if n > 0, no error may be returned). Only
	// stream (TCP) Endpoints may return partial writes, and even then only
	// in the case where writing additional data would block. Other Endpoints
	// will either write the entire message or return an error.
	Write(Payloader, WriteOptions) (int64, Error)

	// Connect connects the endpoint to its peer. Specifying a NIC is
	// optional.
	//
	// There are three classes of return values:
	//	nil -- the attempt to connect succeeded.
	//	ErrConnectStarted/ErrAlreadyConnecting -- the connect attempt started
	//		but hasn't completed yet. In this case, the caller must call Connect
	//		or GetSockOpt(ErrorOption) when the endpoint becomes writable to
	//		get the actual result. The first call to Connect after the socket has
	//		connected returns nil. Calling connect again results in ErrAlreadyConnected.
	//	Anything else -- the attempt to connect failed.
	//
	// If address.Addr is empty, this means that Endpoint has to be
	// disconnected if this is supported, otherwise
	// ErrAddressFamilyNotSupported must be returned.
	Connect(address FullAddress) Error

	// Disconnect disconnects the endpoint from its peer.
	Disconnect() Error

	// Shutdown closes the read and/or write end of the endpoint connection
	// to its peer.
	Shutdown(flags ShutdownFlags) Error

	// Listen puts the endpoint in "listen" mode, which allows it to accept
	// new connections.
	Listen(backlog int) Error

	// Accept returns a new endpoint if a peer has established a connection
	// to an endpoint previously set to listen mode. This method does not
	// block if no new connections are available.
	//
	// The returned Queue is the wait queue for the newly created endpoint.
	//
	// If peerAddr is not nil then it is populated with the peer address of the
	// returned endpoint.
	Accept(peerAddr *FullAddress) (Endpoint, *waiter.Queue, Error)

	// Bind binds the endpoint to a specific local address and port.
	// Specifying a NIC is optional.
	Bind(address FullAddress) Error

	// GetLocalAddress returns the address to which the endpoint is bound.
	GetLocalAddress() (FullAddress, Error)

	// GetRemoteAddress returns the address to which the endpoint is
	// connected.
	GetRemoteAddress() (FullAddress, Error)

	// Readiness returns the current readiness of the endpoint. For example,
	// if waiter.EventIn is set, the endpoint is immediately readable.
	Readiness(mask waiter.EventMask) waiter.EventMask

	// SetSockOpt sets a socket option.
	SetSockOpt(opt SettableSocketOption) Error

	// SetSockOptInt sets a socket option, for simple cases where a value
	// has the int type.
	SetSockOptInt(opt SockOptInt, v int) Error

	// GetSockOpt gets a socket option.
	GetSockOpt(opt GettableSocketOption) Error

	// GetSockOptInt gets a socket option for simple cases where a return
	// value has the int type.
	GetSockOptInt(SockOptInt) (int, Error)

	// State returns a socket's lifecycle state. The returned value is
	// protocol-specific and is primarily used for diagnostics.
	State() uint32

	// ModerateRecvBuf should be called every time data is copied to the user
	// space. This allows for dynamic tuning of recv buffer space for a
	// given socket.
	//
	// NOTE: This method is a no-op for sockets other than TCP.
	ModerateRecvBuf(copied int)

	// Info returns a copy of the transport endpoint info.
	Info() EndpointInfo

	// Stats returns a reference to the endpoint stats.
	Stats() EndpointStats

	// SetOwner sets the task owner to the endpoint owner.
	SetOwner(owner PacketOwner)

	// LastError clears and returns the last error reported by the endpoint.
	LastError() Error

	// SocketOptions returns the structure which contains all the socket
	// level options.
	SocketOptions() *SocketOptions
}

// EndpointWithPreflight is the interface implemented by endpoints that need
// to expose the `Preflight` method for preparing the endpoint prior to
// calling `Write`.
type EndpointWithPreflight interface {
	// Preflight prepares the endpoint for writes using the provided
	// WriteOptions, returning an error if the options were incompatible with
	// the endpoint's current state.
	Preflight(WriteOptions) Error
}

// LinkPacketInfo holds link-layer information for a received packet.
//
// +stateify savable
type LinkPacketInfo struct {
	// Protocol is the NetworkProtocolNumber for the packet.
	Protocol NetworkProtocolNumber

	// PktType is used to indicate the destination of the packet.
	PktType PacketType
}

// EndpointInfo is the interface implemented by each endpoint info struct. It
// is the type returned by Endpoint.Info.
type EndpointInfo interface {
	// IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
	// marker interface.
	IsEndpointInfo()
}

// EndpointStats is the interface implemented by each endpoint stats struct.
// It is the type returned by Endpoint.Stats.
type EndpointStats interface {
	// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
	// marker interface.
	IsEndpointStats()
}

// WriteOptions contains options for Endpoint.Write.
type WriteOptions struct {
	// If To is not nil, write to the given address instead of the endpoint's
	// peer.
	To *FullAddress

	// More has the same semantics as Linux's MSG_MORE.
	More bool

	// EndOfRecord has the same semantics as Linux's MSG_EOR.
	EndOfRecord bool

	// Atomic means that all data fetched from Payloader must be written to the
	// endpoint. If Atomic is false, then data fetched from the Payloader may be
	// discarded if available endpoint buffer space is insufficient.
	Atomic bool

	// ControlMessages contains optional overrides used when writing a packet.
	ControlMessages SendableControlMessages
}

// SockOptInt represents socket options whose values have the int type. These
// are the options accepted by Endpoint.SetSockOptInt and
// Endpoint.GetSockOptInt.
type SockOptInt int

const (
	// KeepaliveCountOption is used by SetSockOptInt/GetSockOptInt to
	// specify the number of un-ACKed TCP keepalives that will be sent
	// before the connection is closed.
	KeepaliveCountOption SockOptInt = iota

	// IPv4TOSOption is used by SetSockOptInt/GetSockOptInt to specify TOS
	// for all subsequent outgoing IPv4 packets from the endpoint.
	IPv4TOSOption

	// IPv6TrafficClassOption is used by SetSockOptInt/GetSockOptInt to
	// specify the traffic class for all subsequent outgoing IPv6 packets
	// from the endpoint.
	IPv6TrafficClassOption

	// MaxSegOption is used by SetSockOptInt/GetSockOptInt to set/get the
	// current Maximum Segment Size (MSS) value as specified using the
	// TCP_MAXSEG option.
	MaxSegOption

	// MTUDiscoverOption is used to set/get the path MTU discovery setting.
	//
	// NOTE: Setting this option to any other value than PMTUDiscoveryDont
	// is not supported and will fail as such, and getting this option will
	// always return PMTUDiscoveryDont.
	MTUDiscoverOption

	// MulticastTTLOption is used by SetSockOptInt/GetSockOptInt to control
	// the default TTL value for multicast messages. The default is 1.
	MulticastTTLOption

	// ReceiveQueueSizeOption is used in GetSockOptInt to specify that the
	// number of unread bytes in the input buffer should be returned.
	ReceiveQueueSizeOption

	// SendQueueSizeOption is used in GetSockOptInt to specify that the
	// number of unread bytes in the output buffer should be returned.
	SendQueueSizeOption

	// IPv4TTLOption is used by SetSockOptInt/GetSockOptInt to control the default
	// TTL value for unicast messages.
	//
	// The default is configured by DefaultTTLOption. A UseDefaultIPv4TTL value
	// configures the endpoint to use the default.
	IPv4TTLOption

	// IPv6HopLimitOption is used by SetSockOptInt/GetSockOptInt to control the
	// default hop limit value for unicast messages.
	//
	// The default is configured by DefaultTTLOption. A UseDefaultIPv6HopLimit
	// value configures the endpoint to use the default.
	IPv6HopLimitOption

	// TCPSynCountOption is used by SetSockOptInt/GetSockOptInt to specify
	// the number of SYN retransmits that TCP should send before aborting
	// the attempt to connect. It cannot exceed 255.
	//
	// NOTE: This option is currently only stubbed out and is a no-op.
	TCPSynCountOption

	// TCPWindowClampOption is used by SetSockOptInt/GetSockOptInt to bound
	// the size of the advertised window to this value.
	//
	// NOTE: This option is currently only stubbed out and is a no-op.
	TCPWindowClampOption

	// IPv6Checksum is used to request the stack to populate and validate the IPv6
	// checksum for transport level headers.
	IPv6Checksum

	// PacketMMapVersionOption is used to set the packet mmap version.
	PacketMMapVersionOption

	// PacketMMapReserveOption is used to set the packet mmap reserved space
	// between the aligned header and the payload.
	PacketMMapReserveOption
)

// Sentinel option values that select the protocol-wide default instead of a
// per-endpoint value.
const (
	// UseDefaultIPv4TTL is the IPv4TTLOption value that configures an endpoint to
	// use the default ttl currently configured by the IPv4 protocol (see
	// DefaultTTLOption).
	UseDefaultIPv4TTL = 0

	// UseDefaultIPv6HopLimit is the IPv6HopLimitOption value that configures an
	// endpoint to use the default hop limit currently configured by the IPv6
	// protocol (see DefaultTTLOption).
	UseDefaultIPv6HopLimit = -1
)

// PMTUDStrategy is the kind of PMTUD (path MTU discovery) to perform. Its
// values are the settings of the MTUDiscoverOption socket option.
type PMTUDStrategy int

const (
	// PMTUDiscoveryWant is a setting of the MTUDiscoverOption to use
	// per-route settings.
	PMTUDiscoveryWant PMTUDStrategy = iota

	// PMTUDiscoveryDont is a setting of the MTUDiscoverOption to disable
	// path MTU discovery.
	PMTUDiscoveryDont

	// PMTUDiscoveryDo is a setting of the MTUDiscoverOption to always do
	// path MTU discovery.
	PMTUDiscoveryDo

	// PMTUDiscoveryProbe is a setting of the MTUDiscoverOption to set DF
	// but ignore path MTU.
	PMTUDiscoveryProbe
)

// GettableNetworkProtocolOption is a marker interface for network protocol
// options that may be queried.
type GettableNetworkProtocolOption interface {
	// isGettableNetworkProtocolOption is unexported, so only types declared in
	// this package can implement the interface.
	isGettableNetworkProtocolOption()
}

// SettableNetworkProtocolOption is a marker interface for network protocol
// options that may be set.
type SettableNetworkProtocolOption interface {
	// isSettableNetworkProtocolOption is unexported, so only types declared in
	// this package can implement the interface.
	isSettableNetworkProtocolOption()
}

// DefaultTTLOption is used by stack.(*Stack).NetworkProtocolOption to specify
// a default TTL.
type DefaultTTLOption uint8

// isGettableNetworkProtocolOption marks *DefaultTTLOption as a
// GettableNetworkProtocolOption.
func (*DefaultTTLOption) isGettableNetworkProtocolOption() {}

// isSettableNetworkProtocolOption marks *DefaultTTLOption as a
// SettableNetworkProtocolOption.
func (*DefaultTTLOption) isSettableNetworkProtocolOption() {}

// GettableTransportProtocolOption is a marker interface for transport protocol
// options that may be queried.
type GettableTransportProtocolOption interface {
	// isGettableTransportProtocolOption is unexported, so only types declared
	// in this package can implement the interface.
	isGettableTransportProtocolOption()
}

// SettableTransportProtocolOption is a marker interface for transport protocol
// options that may be set.
type SettableTransportProtocolOption interface {
	// isSettableTransportProtocolOption is unexported, so only types declared
	// in this package can implement the interface.
	isSettableTransportProtocolOption()
}

// TCPSACKEnabled indicates whether the SACK option is enabled for TCP.
//
// See: https://tools.ietf.org/html/rfc2018.
//
// *TCPSACKEnabled implements both GettableTransportProtocolOption and
// SettableTransportProtocolOption.
type TCPSACKEnabled bool

func (*TCPSACKEnabled) isGettableTransportProtocolOption() {}

func (*TCPSACKEnabled) isSettableTransportProtocolOption() {}

// TCPRecovery is the loss detection algorithm used by TCP. Its values are
// combinations of the TCPRACK* flags declared in this file.
//
// *TCPRecovery implements both GettableTransportProtocolOption and
// SettableTransportProtocolOption.
type TCPRecovery int32

func (*TCPRecovery) isGettableTransportProtocolOption() {}

func (*TCPRecovery) isSettableTransportProtocolOption() {}

// TCPAlwaysUseSynCookies indicates unconditional usage of syncookies.
//
// *TCPAlwaysUseSynCookies implements both GettableTransportProtocolOption and
// SettableTransportProtocolOption.
type TCPAlwaysUseSynCookies bool

func (*TCPAlwaysUseSynCookies) isGettableTransportProtocolOption() {}

func (*TCPAlwaysUseSynCookies) isSettableTransportProtocolOption() {}

// TCPRecovery flags. Each constant occupies its own bit (1 << iota), so the
// values are 1, 2 and 4 and may be OR-ed together.
const (
	// TCPRACKLossDetection indicates RACK is used for loss detection and
	// recovery.
	TCPRACKLossDetection TCPRecovery = 1 << iota

	// TCPRACKStaticReoWnd indicates the reordering window should not be
	// adjusted when DSACK is received.
	TCPRACKStaticReoWnd

	// TCPRACKNoDupTh indicates RACK should not consider the classic three
	// duplicate acknowledgements rule to mark the segments as lost. This
	// is used when reordering is not detected.
	TCPRACKNoDupTh
)

// TCPDelayEnabled enables/disables Nagle's algorithm in TCP.
//
// *TCPDelayEnabled implements both GettableTransportProtocolOption and
// SettableTransportProtocolOption.
type TCPDelayEnabled bool

func (*TCPDelayEnabled) isGettableTransportProtocolOption() {}

func (*TCPDelayEnabled) isSettableTransportProtocolOption() {}

// TCPSendBufferSizeRangeOption is the send buffer size range for TCP.
//
// *TCPSendBufferSizeRangeOption implements both
// GettableTransportProtocolOption and SettableTransportProtocolOption.
//
// +stateify savable
type TCPSendBufferSizeRangeOption struct {
	// Min is the lower bound of the send buffer size range.
	Min     int
	// Default is the default send buffer size.
	Default int
	// Max is the upper bound of the send buffer size range.
	Max     int
}

func (*TCPSendBufferSizeRangeOption) isGettableTransportProtocolOption() {}

func (*TCPSendBufferSizeRangeOption) isSettableTransportProtocolOption() {}

// TCPReceiveBufferSizeRangeOption is the receive buffer size range for TCP.
//
// *TCPReceiveBufferSizeRangeOption implements both
// GettableTransportProtocolOption and SettableTransportProtocolOption.
//
// +stateify savable
type TCPReceiveBufferSizeRangeOption struct {
	// Min is the lower bound of the receive buffer size range.
	Min     int
	// Default is the default receive buffer size.
	Default int
	// Max is the upper bound of the receive buffer size range.
	Max     int
}

func (*TCPReceiveBufferSizeRangeOption) isGettableTransportProtocolOption() {}

func (*TCPReceiveBufferSizeRangeOption) isSettableTransportProtocolOption() {}

// TCPAvailableCongestionControlOption is the supported congestion control
// algorithms for TCP.
//
// *TCPAvailableCongestionControlOption implements both
// GettableTransportProtocolOption and SettableTransportProtocolOption.
type TCPAvailableCongestionControlOption string

func (*TCPAvailableCongestionControlOption) isGettableTransportProtocolOption() {}

func (*TCPAvailableCongestionControlOption) isSettableTransportProtocolOption() {}

// TCPModerateReceiveBufferOption enables/disables receive buffer moderation
// for TCP.
//
// *TCPModerateReceiveBufferOption implements both
// GettableTransportProtocolOption and SettableTransportProtocolOption.
type TCPModerateReceiveBufferOption bool

func (*TCPModerateReceiveBufferOption) isGettableTransportProtocolOption() {}

func (*TCPModerateReceiveBufferOption) isSettableTransportProtocolOption() {}

// GettableSocketOption is a marker interface for socket options that may be
// queried.
type GettableSocketOption interface {
	// isGettableSocketOption carries no behavior; declaring it is what tags a
	// type as a queryable socket option.
	isGettableSocketOption()
}

// SettableSocketOption is a marker interface for socket options that may be
// configured.
type SettableSocketOption interface {
	// isSettableSocketOption carries no behavior; declaring it is what tags a
	// type as a configurable socket option.
	isSettableSocketOption()
}

// ICMPv6Filter specifies a filter for ICMPv6 types.
//
// *ICMPv6Filter implements both GettableSocketOption and SettableSocketOption.
//
// +stateify savable
type ICMPv6Filter struct {
	// DenyType indicates if an ICMP type should be blocked.
	//
	// The ICMPv6 type field is 8 bits so there are up to 256 different ICMPv6
	// types; the array holds one bit per type (8 words of 32 bits).
	DenyType [8]uint32
}

// ShouldDeny returns true iff the ICMPv6 Type should be denied.
//
// The bit for icmpType lives in word icmpType/32 of DenyType, at bit position
// icmpType%32 within that word.
func (f *ICMPv6Filter) ShouldDeny(icmpType uint8) bool {
	// The upper three bits of icmpType select the word and the lower five
	// select the bit inside it.
	word := f.DenyType[icmpType>>5]
	mask := uint32(1) << (icmpType & 31)
	return word&mask != 0
}

func (*ICMPv6Filter) isGettableSocketOption() {}

func (*ICMPv6Filter) isSettableSocketOption() {}

// TpacketReq is the tpacket_req structure as described in
// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
//
// NOTE(review): the fields appear to correspond one-to-one to the
// tp_block_size, tp_block_nr, tp_frame_size and tp_frame_nr members of
// tpacket_req; see the kernel document above for their semantics.
//
// *TpacketReq implements SettableSocketOption.
//
// +stateify savable
type TpacketReq struct {
	TpBlockSize uint32
	TpBlockNr   uint32
	TpFrameSize uint32
	TpFrameNr   uint32
}

func (*TpacketReq) isSettableSocketOption() {}

// TpacketStats is the statistics for a packet_mmap ring buffer from
// <linux/if_packet.h>.
//
// *TpacketStats implements GettableSocketOption.
//
// +stateify savable
type TpacketStats struct {
	// Packets is the packet counter.
	// NOTE(review): presumably mirrors tp_packets of struct tpacket_stats;
	// confirm against <linux/if_packet.h>.
	Packets uint32
	// Dropped is the dropped-packet counter.
	// NOTE(review): presumably mirrors tp_drops of struct tpacket_stats;
	// confirm against <linux/if_packet.h>.
	Dropped uint32
}

func (*TpacketStats) isGettableSocketOption() {}

// EndpointState represents the state of an endpoint. It is the type of
// TCPInfoOption.State, where it holds the endpoint's protocol state.
type EndpointState uint8

// CongestionControlState indicates the current congestion control state for
// a TCP sender. It is the type of TCPInfoOption.CcState.
type CongestionControlState int

const (
	// Open indicates that the sender is receiving acks in order and
	// no loss, dupACKs, etc. have been detected. It is the zero value of
	// CongestionControlState.
	Open CongestionControlState = iota
	// RTORecovery indicates that an RTO has occurred and the sender
	// has entered an RTO based recovery phase.
	RTORecovery
	// FastRecovery indicates that the sender has entered FastRecovery
	// based on receiving nDupAcks. This state is entered only when
	// SACK is not in use.
	FastRecovery
	// SACKRecovery indicates that the sender has entered SACK based
	// recovery.
	SACKRecovery
	// Disorder indicates the sender either received some SACK blocks
	// or dupACKs.
	Disorder
)

// TCPInfoOption is used by GetSockOpt to expose TCP statistics.
//
// *TCPInfoOption implements GettableSocketOption.
type TCPInfoOption struct {
	// RTT is the smoothed round trip time.
	RTT time.Duration

	// RTTVar is the round trip time variation.
	RTTVar time.Duration

	// RTO is the retransmission timeout for the endpoint.
	RTO time.Duration

	// State is the current endpoint protocol state.
	State EndpointState

	// CcState is the congestion control state.
	CcState CongestionControlState

	// SndCwnd is the congestion window, in packets.
	SndCwnd uint32

	// SndSsthresh is the threshold between slow start and congestion
	// avoidance.
	SndSsthresh uint32

	// ReorderSeen indicates if reordering is seen in the endpoint.
	ReorderSeen bool
}

func (*TCPInfoOption) isGettableSocketOption() {}

// KeepaliveIdleOption is used by SetSockOpt/GetSockOpt to specify the time a
// connection must remain idle before the first TCP keepalive packet is sent.
// Once this time is reached, KeepaliveIntervalOption is used instead.
//
// *KeepaliveIdleOption implements both GettableSocketOption and
// SettableSocketOption.
type KeepaliveIdleOption time.Duration

func (*KeepaliveIdleOption) isGettableSocketOption() {}

func (*KeepaliveIdleOption) isSettableSocketOption() {}

// KeepaliveIntervalOption is used by SetSockOpt/GetSockOpt to specify the
// interval between sending TCP keepalive packets.
//
// *KeepaliveIntervalOption implements both GettableSocketOption and
// SettableSocketOption.
type KeepaliveIntervalOption time.Duration

func (*KeepaliveIntervalOption) isGettableSocketOption() {}

func (*KeepaliveIntervalOption) isSettableSocketOption() {}

// TCPUserTimeoutOption is used by SetSockOpt/GetSockOpt to specify a
// user-specified timeout for a given TCP connection.
// See RFC 5482 for details.
//
// *TCPUserTimeoutOption implements both GettableSocketOption and
// SettableSocketOption.
type TCPUserTimeoutOption time.Duration

func (*TCPUserTimeoutOption) isGettableSocketOption() {}

func (*TCPUserTimeoutOption) isSettableSocketOption() {}

// CongestionControlOption is used by SetSockOpt/GetSockOpt to set/get
// the current congestion control algorithm.
//
// *CongestionControlOption is gettable and settable both as a socket option
// and as a transport protocol option.
type CongestionControlOption string

func (*CongestionControlOption) isGettableSocketOption() {}

func (*CongestionControlOption) isSettableSocketOption() {}

func (*CongestionControlOption) isGettableTransportProtocolOption() {}

func (*CongestionControlOption) isSettableTransportProtocolOption() {}

// TCPLingerTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
// maximum duration for which a socket lingers in the TCP_FIN_WAIT_2 state
// before being marked closed.
//
// *TCPLingerTimeoutOption is gettable and settable both as a socket option
// and as a transport protocol option.
type TCPLingerTimeoutOption time.Duration

func (*TCPLingerTimeoutOption) isGettableSocketOption() {}

func (*TCPLingerTimeoutOption) isSettableSocketOption() {}

func (*TCPLingerTimeoutOption) isGettableTransportProtocolOption() {}

func (*TCPLingerTimeoutOption) isSettableTransportProtocolOption() {}

// TCPTimeWaitTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
// maximum duration for which a socket lingers in the TIME_WAIT state
// before being marked closed.
//
// *TCPTimeWaitTimeoutOption is gettable and settable both as a socket option
// and as a transport protocol option.
type TCPTimeWaitTimeoutOption time.Duration

func (*TCPTimeWaitTimeoutOption) isGettableSocketOption() {}

func (*TCPTimeWaitTimeoutOption) isSettableSocketOption() {}

func (*TCPTimeWaitTimeoutOption) isGettableTransportProtocolOption() {}

func (*TCPTimeWaitTimeoutOption) isSettableTransportProtocolOption() {}

// TCPDeferAcceptOption is used by SetSockOpt/GetSockOpt to allow an
// accept to return a completed connection only when there is data to be
// read. This usually means the listening socket will drop the final ACK
// of a handshake, until the specified timeout expires, unless a segment
// with data arrives.
//
// *TCPDeferAcceptOption implements both GettableSocketOption and
// SettableSocketOption.
type TCPDeferAcceptOption time.Duration

func (*TCPDeferAcceptOption) isGettableSocketOption() {}

func (*TCPDeferAcceptOption) isSettableSocketOption() {}

// TCPMinRTOOption is used by SetSockOpt/GetSockOpt to allow overriding
// the default MinRTO used by the Stack.
//
// *TCPMinRTOOption is gettable and settable both as a socket option and as a
// transport protocol option.
type TCPMinRTOOption time.Duration

func (*TCPMinRTOOption) isGettableSocketOption() {}

func (*TCPMinRTOOption) isSettableSocketOption() {}

func (*TCPMinRTOOption) isGettableTransportProtocolOption() {}

func (*TCPMinRTOOption) isSettableTransportProtocolOption() {}

// TCPMaxRTOOption is used by SetSockOpt/GetSockOpt to allow overriding
// the default MaxRTO used by the Stack.
//
// *TCPMaxRTOOption is gettable and settable both as a socket option and as a
// transport protocol option.
type TCPMaxRTOOption time.Duration

func (*TCPMaxRTOOption) isGettableSocketOption() {}

func (*TCPMaxRTOOption) isSettableSocketOption() {}

func (*TCPMaxRTOOption) isGettableTransportProtocolOption() {}

func (*TCPMaxRTOOption) isSettableTransportProtocolOption() {}

// TCPMaxRetriesOption is used by SetSockOpt/GetSockOpt to set/get the
// maximum number of retransmits after which we time out the connection.
//
// *TCPMaxRetriesOption is gettable and settable both as a socket option and
// as a transport protocol option.
type TCPMaxRetriesOption uint64

func (*TCPMaxRetriesOption) isGettableSocketOption() {}

func (*TCPMaxRetriesOption) isSettableSocketOption() {}

func (*TCPMaxRetriesOption) isGettableTransportProtocolOption() {}

func (*TCPMaxRetriesOption) isSettableTransportProtocolOption() {}

// TCPSynRetriesOption is used by SetSockOpt/GetSockOpt to specify the
// stack-wide default for the number of times a SYN is retransmitted before
// aborting a connect.
//
// *TCPSynRetriesOption is gettable and settable both as a socket option and
// as a transport protocol option.
type TCPSynRetriesOption uint8

func (*TCPSynRetriesOption) isGettableSocketOption() {}

func (*TCPSynRetriesOption) isSettableSocketOption() {}

func (*TCPSynRetriesOption) isGettableTransportProtocolOption() {}

func (*TCPSynRetriesOption) isSettableTransportProtocolOption() {}

// MulticastInterfaceOption is used by SetSockOpt/GetSockOpt to specify a
// default interface for multicast.
//
// *MulticastInterfaceOption implements both GettableSocketOption and
// SettableSocketOption.
type MulticastInterfaceOption struct {
	// NIC identifies the interface by NIC ID.
	NIC           NICID
	// InterfaceAddr identifies the interface by address.
	// NOTE(review): how NIC and InterfaceAddr are combined when both are set
	// is decided by the consumer of this option; not visible here.
	InterfaceAddr Address
}

func (*MulticastInterfaceOption) isGettableSocketOption() {}

func (*MulticastInterfaceOption) isSettableSocketOption() {}

// MembershipOption is used to identify a multicast membership on an interface.
//
// It is the underlying type of AddMembershipOption and
// RemoveMembershipOption.
type MembershipOption struct {
	// NIC identifies the interface by NIC ID.
	NIC           NICID
	// InterfaceAddr identifies the interface by address.
	InterfaceAddr Address
	// MulticastAddr is the multicast group address.
	MulticastAddr Address
}

// AddMembershipOption identifies a multicast group to join on some interface.
// It has the same fields as MembershipOption.
//
// *AddMembershipOption implements SettableSocketOption.
type AddMembershipOption MembershipOption

func (*AddMembershipOption) isSettableSocketOption() {}

// RemoveMembershipOption identifies a multicast group to leave on some
// interface. It has the same fields as MembershipOption.
//
// *RemoveMembershipOption implements SettableSocketOption.
type RemoveMembershipOption MembershipOption

func (*RemoveMembershipOption) isSettableSocketOption() {}

// SocketDetachFilterOption is used by SetSockOpt to detach a previously attached
// classic BPF filter on a given endpoint.
//
// *SocketDetachFilterOption implements SettableSocketOption.
type SocketDetachFilterOption int

func (*SocketDetachFilterOption) isSettableSocketOption() {}

// OriginalDestinationOption is used to get the original destination address
// and port of a redirected packet. It has the same fields as FullAddress.
//
// *OriginalDestinationOption implements GettableSocketOption.
type OriginalDestinationOption FullAddress

func (*OriginalDestinationOption) isGettableSocketOption() {}

// TCPTimeWaitReuseOption is used by stack.(*Stack).TransportProtocolOption to
// specify if the stack can reuse the port bound by an endpoint in TIME-WAIT for
// new connections when it is safe from a protocol viewpoint.
//
// *TCPTimeWaitReuseOption is gettable and settable both as a socket option
// and as a transport protocol option.
type TCPTimeWaitReuseOption uint8

func (*TCPTimeWaitReuseOption) isGettableSocketOption() {}

func (*TCPTimeWaitReuseOption) isSettableSocketOption() {}

func (*TCPTimeWaitReuseOption) isGettableTransportProtocolOption() {}

func (*TCPTimeWaitReuseOption) isSettableTransportProtocolOption() {}

// Values of TCPTimeWaitReuseOption.
const (
	// TCPTimeWaitReuseDisabled indicates that a port bound by an endpoint in
	// TIME-WAIT cannot be reused for new connections. It is the zero value of
	// TCPTimeWaitReuseOption.
	TCPTimeWaitReuseDisabled TCPTimeWaitReuseOption = iota

	// TCPTimeWaitReuseGlobal indicates that a port bound by an endpoint in
	// TIME-WAIT can be reused for new connections irrespective of the src/dest
	// addresses.
	TCPTimeWaitReuseGlobal

	// TCPTimeWaitReuseLoopbackOnly indicates that a port bound by an endpoint in
	// TIME-WAIT can only be reused if the connection was a connection over
	// loopback, i.e. the src/dest addresses are loopback addresses.
	TCPTimeWaitReuseLoopbackOnly
)

// LingerOption is used by SetSockOpt/GetSockOpt to set/get the
// duration for which a socket lingers before returning from Close.
//
// +marshal
// +stateify savable
type LingerOption struct {
	// Enabled is the on/off switch for lingering.
	// NOTE(review): presumably Timeout is only meaningful when Enabled is
	// true; confirm against the code that consumes this option.
	Enabled bool
	// Timeout is the linger duration.
	Timeout time.Duration
}

// IPPacketInfo is the message structure for IP_PKTINFO. It is the IPv4
// counterpart of IPv6PacketInfo.
//
// +stateify savable
type IPPacketInfo struct {
	// NIC is the ID of the NIC to be used.
	NIC NICID

	// LocalAddr is the local address.
	LocalAddr Address

	// DestinationAddr is the destination address found in the IP header.
	DestinationAddr Address
}

// IPv6PacketInfo is the message structure for IPV6_PKTINFO. It is the IPv6
// counterpart of IPPacketInfo.
//
// +stateify savable
type IPv6PacketInfo struct {
	// Addr is the address carried in the message.
	// NOTE(review): whether this is the source or the destination address
	// depends on the direction of the control message; confirm with callers.
	Addr Address
	// NIC is the ID of the NIC.
	NIC  NICID
}

// SendBufferSizeOption is used by stack.(*Stack).Option/SetOption to
// get/set the default, min and max send buffer sizes.
//
// +stateify savable
type SendBufferSizeOption struct {
	// Min is the minimum size for send buffer.
	Min int

	// Default is the default size for send buffer.
	Default int

	// Max is the maximum size for send buffer.
	Max int
}

// ReceiveBufferSizeOption is used by stack.(*Stack).Option/SetOption to
// get/set the default, min and max receive buffer sizes.
//
// +stateify savable
type ReceiveBufferSizeOption struct {
	// Min is the minimum size for receive buffer.
	Min int

	// Default is the default size for receive buffer.
	Default int

	// Max is the maximum size for receive buffer.
	Max int
}

// GetSendBufferLimits is used to get the send buffer size limits.
// GetStackSendBufferLimits is a function with this signature.
type GetSendBufferLimits func(StackHandler) SendBufferSizeOption

// GetStackSendBufferLimits returns the default, min and max send buffer sizes
// obtained by querying so for a SendBufferSizeOption.
//
// It panics if so.Option reports an error.
func GetStackSendBufferLimits(so StackHandler) SendBufferSizeOption {
	var limits SendBufferSizeOption
	err := so.Option(&limits)
	if err == nil {
		return limits
	}
	// There is no error return in this signature, so a failed lookup is fatal.
	panic(fmt.Sprintf("s.Option(%#v) = %s", limits, err))
}

// GetReceiveBufferLimits is used to get the receive buffer size limits.
// GetStackReceiveBufferLimits is a function with this signature.
type GetReceiveBufferLimits func(StackHandler) ReceiveBufferSizeOption

// GetStackReceiveBufferLimits returns the default, min and max receive buffer
// sizes obtained by querying so for a ReceiveBufferSizeOption.
//
// It panics if so.Option reports an error.
func GetStackReceiveBufferLimits(so StackHandler) ReceiveBufferSizeOption {
	var limits ReceiveBufferSizeOption
	err := so.Option(&limits)
	if err == nil {
		return limits
	}
	// There is no error return in this signature, so a failed lookup is fatal.
	panic(fmt.Sprintf("s.Option(%#v) = %s", limits, err))
}

// Route is one row of the routing table: it names the NIC (and gateway)
// through which a set of packets should be routed. A row is considered viable
// if the masked target address matches the destination address in the row.
//
// +stateify savable
type Route struct {
	RouteEntry

	// Destination must contain the target address for this row to be viable.
	Destination Subnet

	// Gateway is the gateway to be used if this row is viable.
	Gateway Address

	// NIC is the id of the nic to be used if this row is viable.
	NIC NICID

	// SourceHint indicates a preferred source address to use when NICs
	// have multiple addresses.
	SourceHint Address

	// MTU is the maximum transmission unit to use for this route.
	// If MTU is 0, this field is ignored and the MTU of the NIC for which this route
	// is configured is used for egress packets.
	MTU uint32
}

// String implements the fmt.Stringer interface.
//
// The result has the form "<destination> via <gateway> nic <id>"; the
// " via <gateway>" part is left out when the gateway address is empty.
func (r Route) String() string {
	var sb strings.Builder
	dst, nic := r.Destination, r.NIC
	_, _ = fmt.Fprintf(&sb, "%s", dst)
	if gw := r.Gateway; gw.length > 0 {
		_, _ = fmt.Fprintf(&sb, " via %s", gw)
	}
	_, _ = fmt.Fprintf(&sb, " nic %d", nic)
	return sb.String()
}

// Equal returns true if the given Route is equal to this Route.
//
// Only Destination and NIC take part in the comparison; Gateway, SourceHint
// and MTU are not consulted.
func (r Route) Equal(to Route) bool {
	// Subnets are compared through Subnet.Equal rather than with ==.
	if !r.Destination.Equal(to.Destination) {
		return false
	}
	return r.NIC == to.NIC
}

// TransportProtocolNumber is the number of a transport protocol. It is a
// distinct type from NetworkProtocolNumber even though both are uint32.
type TransportProtocolNumber uint32

// NetworkProtocolNumber is the EtherType of a network protocol in an Ethernet
// frame, for example 0x0800 for IPv4 and 0x86dd for IPv6.
//
// See: https://www.iana.org/assignments/ieee-802-numbers/ieee-802-numbers.xhtml
type NetworkProtocolNumber uint32

// A StatCounter keeps track of a statistic as an unsigned 64-bit count that is
// read and updated through atomicbitops.Uint64.
//
// +stateify savable
type StatCounter struct {
	count atomicbitops.Uint64
}

// Increment adds one to the counter.
func (s *StatCounter) Increment() {
	s.count.Add(1)
}

// Decrement subtracts one from the counter.
func (s *StatCounter) Decrement() {
	// Adding the all-ones value wraps around, which is the same as
	// subtracting one in unsigned arithmetic.
	s.count.Add(^uint64(0))
}

// Value returns the current value of the counter.
func (s *StatCounter) Value() uint64 {
	return s.count.Load()
}

// IncrementBy increments the counter by v.
func (s *StatCounter) IncrementBy(v uint64) {
	s.count.Add(v)
}

// String implements the fmt.Stringer interface by formatting the current
// value in base 10.
func (s *StatCounter) String() string {
	current := s.count.Load()
	return strconv.FormatUint(current, 10)
}

// A MultiCounterStat keeps track of two counters at once: every update is
// applied to both underlying StatCounters.
//
// Init must be called before Increment or IncrementBy, since the zero value
// holds nil counters.
//
// +stateify savable
type MultiCounterStat struct {
	a *StatCounter
	b *StatCounter
}

// Init sets both internal counters to point to a and b.
func (m *MultiCounterStat) Init(a, b *StatCounter) {
	m.a, m.b = a, b
}

// Increment adds one to the counters.
func (m *MultiCounterStat) Increment() {
	m.IncrementBy(1)
}

// IncrementBy increments the counters by v.
func (m *MultiCounterStat) IncrementBy(v uint64) {
	for _, counter := range [...]*StatCounter{m.a, m.b} {
		counter.IncrementBy(v)
	}
}

// ICMPv4PacketStats enumerates counts for all ICMPv4 packet types.
//
// It is embedded in both ICMPv4SentPacketStats and ICMPv4ReceivedPacketStats,
// so the same set of per-type counters exists for each direction.
//
// +stateify savable
type ICMPv4PacketStats struct {
	// LINT.IfChange(ICMPv4PacketStats)

	// EchoRequest is the number of ICMPv4 echo request packets counted.
	EchoRequest *StatCounter

	// EchoReply is the number of ICMPv4 echo reply packets counted.
	EchoReply *StatCounter

	// DstUnreachable is the number of ICMPv4 destination unreachable packets
	// counted.
	DstUnreachable *StatCounter

	// SrcQuench is the number of ICMPv4 source quench packets counted.
	SrcQuench *StatCounter

	// Redirect is the number of ICMPv4 redirect packets counted.
	Redirect *StatCounter

	// TimeExceeded is the number of ICMPv4 time exceeded packets counted.
	TimeExceeded *StatCounter

	// ParamProblem is the number of ICMPv4 parameter problem packets counted.
	ParamProblem *StatCounter

	// Timestamp is the number of ICMPv4 timestamp packets counted.
	Timestamp *StatCounter

	// TimestampReply is the number of ICMPv4 timestamp reply packets counted.
	TimestampReply *StatCounter

	// InfoRequest is the number of ICMPv4 information request packets counted.
	InfoRequest *StatCounter

	// InfoReply is the number of ICMPv4 information reply packets counted.
	InfoReply *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4PacketStats)
}

// ICMPv4SentPacketStats collects outbound ICMPv4-specific stats.
//
// +stateify savable
type ICMPv4SentPacketStats struct {
	// LINT.IfChange(ICMPv4SentPacketStats)

	// ICMPv4PacketStats holds the per-packet-type counters for sent packets.
	ICMPv4PacketStats

	// Dropped is the number of ICMPv4 packets dropped due to link layer errors.
	Dropped *StatCounter

	// RateLimited is the number of ICMPv4 packets dropped due to rate limit being
	// exceeded.
	RateLimited *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4SentPacketStats)
}

// ICMPv4ReceivedPacketStats collects inbound ICMPv4-specific stats.
//
// +stateify savable
type ICMPv4ReceivedPacketStats struct {
	// LINT.IfChange(ICMPv4ReceivedPacketStats)

	// ICMPv4PacketStats holds the per-packet-type counters for received
	// packets.
	ICMPv4PacketStats

	// Invalid is the number of invalid ICMPv4 packets received.
	Invalid *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4ReceivedPacketStats)
}

// ICMPv4Stats collects ICMPv4-specific stats, split by direction. It is the
// type of ICMPStats.V4.
//
// +stateify savable
type ICMPv4Stats struct {
	// LINT.IfChange(ICMPv4Stats)

	// PacketsSent contains statistics about sent packets.
	PacketsSent ICMPv4SentPacketStats

	// PacketsReceived contains statistics about received packets.
	PacketsReceived ICMPv4ReceivedPacketStats

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4Stats)
}

// ICMPv6PacketStats enumerates counts for all ICMPv6 packet types.
//
// It is embedded in both ICMPv6SentPacketStats and ICMPv6ReceivedPacketStats,
// so the same set of per-type counters exists for each direction.
//
// +stateify savable
type ICMPv6PacketStats struct {
	// LINT.IfChange(ICMPv6PacketStats)

	// EchoRequest is the number of ICMPv6 echo request packets counted.
	EchoRequest *StatCounter

	// EchoReply is the number of ICMPv6 echo reply packets counted.
	EchoReply *StatCounter

	// DstUnreachable is the number of ICMPv6 destination unreachable packets
	// counted.
	DstUnreachable *StatCounter

	// PacketTooBig is the number of ICMPv6 packet too big packets counted.
	PacketTooBig *StatCounter

	// TimeExceeded is the number of ICMPv6 time exceeded packets counted.
	TimeExceeded *StatCounter

	// ParamProblem is the number of ICMPv6 parameter problem packets counted.
	ParamProblem *StatCounter

	// RouterSolicit is the number of ICMPv6 router solicit packets counted.
	RouterSolicit *StatCounter

	// RouterAdvert is the number of ICMPv6 router advert packets counted.
	RouterAdvert *StatCounter

	// NeighborSolicit is the number of ICMPv6 neighbor solicit packets counted.
	NeighborSolicit *StatCounter

	// NeighborAdvert is the number of ICMPv6 neighbor advert packets counted.
	NeighborAdvert *StatCounter

	// RedirectMsg is the number of ICMPv6 redirect message packets counted.
	RedirectMsg *StatCounter

	// MulticastListenerQuery is the number of Multicast Listener Query messages
	// counted.
	MulticastListenerQuery *StatCounter

	// MulticastListenerReport is the number of Multicast Listener Report messages
	// counted.
	MulticastListenerReport *StatCounter

	// MulticastListenerReportV2 is the number of Version 2 Multicast Listener
	// Report messages counted.
	MulticastListenerReportV2 *StatCounter

	// MulticastListenerDone is the number of Multicast Listener Done messages
	// counted.
	MulticastListenerDone *StatCounter

	// LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6PacketStats)
}

// ICMPv6SentPacketStats collects outbound ICMPv6-specific stats.
//
// +stateify savable
type ICMPv6SentPacketStats struct {
	// LINT.IfChange(ICMPv6SentPacketStats)

	// ICMPv6PacketStats holds the per-packet-type counters for sent packets.
	ICMPv6PacketStats

	// Dropped is the number of ICMPv6 packets dropped due to link layer errors.
	Dropped *StatCounter

	// RateLimited is the number of ICMPv6 packets dropped due to rate limit being
	// exceeded.
	RateLimited *StatCounter

	// LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6SentPacketStats)
}

// ICMPv6ReceivedPacketStats collects inbound ICMPv6-specific stats.
//
// +stateify savable
type ICMPv6ReceivedPacketStats struct {
	// LINT.IfChange(ICMPv6ReceivedPacketStats)

	// ICMPv6PacketStats holds the per-packet-type counters for received
	// packets.
	ICMPv6PacketStats

	// Unrecognized is the number of ICMPv6 packets received that the transport
	// layer does not know how to parse.
	Unrecognized *StatCounter

	// Invalid is the number of invalid ICMPv6 packets received.
	Invalid *StatCounter

	// RouterOnlyPacketsDroppedByHost is the number of ICMPv6 packets dropped due
	// to being router-specific packets.
	RouterOnlyPacketsDroppedByHost *StatCounter

	// LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6ReceivedPacketStats)
}

// ICMPv6Stats collects ICMPv6-specific stats, split by direction. It is the
// type of ICMPStats.V6.
//
// +stateify savable
type ICMPv6Stats struct {
	// LINT.IfChange(ICMPv6Stats)

	// PacketsSent contains statistics about sent packets.
	PacketsSent ICMPv6SentPacketStats

	// PacketsReceived contains statistics about received packets.
	PacketsReceived ICMPv6ReceivedPacketStats

	// LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6Stats)
}

// ICMPStats collects ICMP-specific stats (both v4 and v6).
//
// +stateify savable
type ICMPStats struct {
	// V4 contains the ICMPv4-specific stats.
	V4 ICMPv4Stats

	// V6 contains the ICMPv6-specific stats.
	V6 ICMPv6Stats
}

// IGMPPacketStats enumerates counts for all IGMP packet types.
//
// It is embedded in both IGMPSentPacketStats and IGMPReceivedPacketStats, so
// the same set of per-type counters exists for each direction.
//
// +stateify savable
type IGMPPacketStats struct {
	// LINT.IfChange(IGMPPacketStats)

	// MembershipQuery is the number of Membership Query messages counted.
	MembershipQuery *StatCounter

	// V1MembershipReport is the number of Version 1 Membership Report messages
	// counted.
	V1MembershipReport *StatCounter

	// V2MembershipReport is the number of Version 2 Membership Report messages
	// counted.
	V2MembershipReport *StatCounter

	// V3MembershipReport is the number of Version 3 Membership Report messages
	// counted.
	V3MembershipReport *StatCounter

	// LeaveGroup is the number of Leave Group messages counted.
	LeaveGroup *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPPacketStats)
}

// IGMPSentPacketStats collects outbound IGMP-specific stats.
//
// +stateify savable
type IGMPSentPacketStats struct {
	// LINT.IfChange(IGMPSentPacketStats)

	// IGMPPacketStats holds the per-packet-type counters for sent packets.
	IGMPPacketStats

	// Dropped is the number of IGMP packets dropped.
	Dropped *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPSentPacketStats)
}

// IGMPReceivedPacketStats collects inbound IGMP-specific stats.
//
// +stateify savable
type IGMPReceivedPacketStats struct {
	// LINT.IfChange(IGMPReceivedPacketStats)

	// IGMPPacketStats holds the per-packet-type counters for received packets.
	IGMPPacketStats

	// Invalid is the number of invalid IGMP packets received.
	Invalid *StatCounter

	// ChecksumErrors is the number of IGMP packets dropped due to bad checksums.
	ChecksumErrors *StatCounter

	// Unrecognized is the number of unrecognized messages counted; these are
	// silently ignored for forward-compatibility.
	Unrecognized *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPReceivedPacketStats)
}

// IGMPStats collects IGMP-specific stats, split by direction.
//
// +stateify savable
type IGMPStats struct {
	// LINT.IfChange(IGMPStats)

	// PacketsSent contains statistics about sent packets.
	PacketsSent IGMPSentPacketStats

	// PacketsReceived contains statistics about received packets.
	PacketsReceived IGMPReceivedPacketStats

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPStats)
}

// IPForwardingStats collects stats related to IP forwarding (both v4 and v6).
// It is the type of IPStats.Forwarding.
//
// +stateify savable
type IPForwardingStats struct {
	// LINT.IfChange(IPForwardingStats)

	// Unrouteable is the number of IP packets received which were dropped
	// because a route to their destination could not be constructed.
	Unrouteable *StatCounter

	// ExhaustedTTL is the number of IP packets received which were dropped
	// because their TTL was exhausted.
	ExhaustedTTL *StatCounter

	// InitializingSource is the number of IP packets which were dropped
	// because they contained a source address that may only be used on the local
	// network as part of initialization work.
	InitializingSource *StatCounter

	// LinkLocalSource is the number of IP packets which were dropped
	// because they contained a link-local source address.
	LinkLocalSource *StatCounter

	// LinkLocalDestination is the number of IP packets which were dropped
	// because they contained a link-local destination address.
	LinkLocalDestination *StatCounter

	// PacketTooBig is the number of IP packets which were dropped because they
	// were too big for the outgoing MTU.
	PacketTooBig *StatCounter

	// HostUnreachable is the number of IP packets received which could not be
	// successfully forwarded due to an unresolvable next hop.
	HostUnreachable *StatCounter

	// ExtensionHeaderProblem is the number of IP packets which were dropped
	// because of a problem encountered when processing an IPv6 extension
	// header.
	ExtensionHeaderProblem *StatCounter

	// UnexpectedMulticastInputInterface is the number of multicast packets that
	// were received on an interface that did not match the corresponding route's
	// expected input interface.
	UnexpectedMulticastInputInterface *StatCounter

	// UnknownOutputEndpoint is the number of packets that could not be forwarded
	// because the output endpoint could not be found.
	UnknownOutputEndpoint *StatCounter

	// NoMulticastPendingQueueBufferSpace is the number of multicast packets that
	// were dropped due to insufficient buffer space in the pending packet queue.
	NoMulticastPendingQueueBufferSpace *StatCounter

	// OutgoingDeviceNoBufferSpace is the number of packets that were dropped due
	// to insufficient space in the outgoing device.
	OutgoingDeviceNoBufferSpace *StatCounter

	// Errors is the number of IP packets received which could not be
	// successfully forwarded.
	// NOTE(review): presumably this is the aggregate over the more specific
	// failure counters in this struct; confirm against the forwarding code.
	Errors *StatCounter

	// OutgoingDeviceClosedForSend is the number of packets that were dropped due
	// to the outgoing device being closed for send.
	OutgoingDeviceClosedForSend *StatCounter

	// LINT.ThenChange(network/internal/ip/stats.go:MultiCounterIPForwardingStats)
}

// IPStats collects IP-specific stats (both v4 and v6).
//
// +stateify savable
type IPStats struct {
	// LINT.IfChange(IPStats)

	// PacketsReceived is the number of IP packets received from the link layer.
	PacketsReceived *StatCounter

	// ValidPacketsReceived is the number of valid IP packets that reached the IP
	// layer.
	ValidPacketsReceived *StatCounter

	// DisabledPacketsReceived is the number of IP packets received from the link
	// layer when the IP layer is disabled.
	DisabledPacketsReceived *StatCounter

	// InvalidDestinationAddressesReceived is the number of IP packets received
	// with an unknown or invalid destination address.
	InvalidDestinationAddressesReceived *StatCounter

	// InvalidSourceAddressesReceived is the number of IP packets received with a
	// source address that should never have been received on the wire.
	InvalidSourceAddressesReceived *StatCounter

	// PacketsDelivered is the number of incoming IP packets that are successfully
	// delivered to the transport layer.
	PacketsDelivered *StatCounter

	// PacketsSent is the number of IP packets sent via WritePacket.
	PacketsSent *StatCounter

	// OutgoingPacketErrors is the number of IP packets which failed to write to a
	// link-layer endpoint.
	OutgoingPacketErrors *StatCounter

	// MalformedPacketsReceived is the number of IP Packets that were dropped due
	// to the IP packet header failing validation checks.
	MalformedPacketsReceived *StatCounter

	// MalformedFragmentsReceived is the number of IP Fragments that were dropped
	// due to the fragment failing validation checks.
	MalformedFragmentsReceived *StatCounter

	// IPTablesPreroutingDropped is the number of IP packets dropped in the
	// Prerouting chain.
	IPTablesPreroutingDropped *StatCounter

	// IPTablesInputDropped is the number of IP packets dropped in the Input
	// chain.
	IPTablesInputDropped *StatCounter

	// IPTablesForwardDropped is the number of IP packets dropped in the Forward
	// chain.
	IPTablesForwardDropped *StatCounter

	// IPTablesOutputDropped is the number of IP packets dropped in the Output
	// chain.
	IPTablesOutputDropped *StatCounter

	// IPTablesPostroutingDropped is the number of IP packets dropped in the
	// Postrouting chain.
	IPTablesPostroutingDropped *StatCounter

	// TODO(https://gvisor.dev/issues/5529): Move the IPv4-only option stats out
	// of IPStats. This applies to the Option*Received counters below.

	// OptionTimestampReceived is the number of Timestamp options seen.
	OptionTimestampReceived *StatCounter

	// OptionRecordRouteReceived is the number of Record Route options seen.
	OptionRecordRouteReceived *StatCounter

	// OptionRouterAlertReceived is the number of Router Alert options seen.
	OptionRouterAlertReceived *StatCounter

	// OptionUnknownReceived is the number of unknown IP options seen.
	OptionUnknownReceived *StatCounter

	// Forwarding collects stats related to IP forwarding.
	Forwarding IPForwardingStats

	// LINT.ThenChange(network/internal/ip/stats.go:MultiCounterIPStats)
}

// ARPStats collects ARP-specific stats.
//
// +stateify savable
type ARPStats struct {
	// LINT.IfChange(ARPStats)

	// PacketsReceived is the number of ARP packets received from the link layer.
	PacketsReceived *StatCounter

	// DisabledPacketsReceived is the number of ARP packets received from the link
	// layer when the ARP layer is disabled.
	DisabledPacketsReceived *StatCounter

	// MalformedPacketsReceived is the number of ARP packets that were dropped due
	// to being malformed.
	MalformedPacketsReceived *StatCounter

	// RequestsReceived is the number of ARP requests received.
	RequestsReceived *StatCounter

	// RequestsReceivedUnknownTargetAddress is the number of ARP requests that
	// were targeted to an interface different from the one it was received on.
	RequestsReceivedUnknownTargetAddress *StatCounter

	// OutgoingRequestInterfaceHasNoLocalAddressErrors is the number of failures
	// to send an ARP request because the interface has no network address
	// assigned to it.
	OutgoingRequestInterfaceHasNoLocalAddressErrors *StatCounter

	// OutgoingRequestBadLocalAddressErrors is the number of failures to send an
	// ARP request with a bad local address.
	OutgoingRequestBadLocalAddressErrors *StatCounter

	// OutgoingRequestsDropped is the number of ARP requests which failed to write
	// to a link-layer endpoint.
	OutgoingRequestsDropped *StatCounter

	// OutgoingRequestsSent is the number of ARP requests successfully written to
	// a link-layer endpoint.
	OutgoingRequestsSent *StatCounter

	// RepliesReceived is the number of ARP replies received.
	RepliesReceived *StatCounter

	// OutgoingRepliesDropped is the number of ARP replies which failed to write
	// to a link-layer endpoint.
	OutgoingRepliesDropped *StatCounter

	// OutgoingRepliesSent is the number of ARP replies successfully written to a
	// link-layer endpoint.
	OutgoingRepliesSent *StatCounter

	// LINT.ThenChange(network/arp/stats.go:multiCounterARPStats)
}

// TCPStats collects TCP-specific stats.
//
// Every field is a pointer to a StatCounter.
//
// +stateify savable
type TCPStats struct {
	// ActiveConnectionOpenings is the number of connections opened
	// successfully via Connect.
	ActiveConnectionOpenings *StatCounter

	// PassiveConnectionOpenings is the number of connections opened
	// successfully via Listen.
	PassiveConnectionOpenings *StatCounter

	// CurrentEstablished is the number of TCP connections for which the
	// current state is ESTABLISHED.
	CurrentEstablished *StatCounter

	// CurrentConnected is the number of TCP connections that
	// are in connected state.
	CurrentConnected *StatCounter

	// EstablishedResets is the number of times TCP connections have made
	// a direct transition to the CLOSED state from either the
	// ESTABLISHED state or the CLOSE-WAIT state.
	EstablishedResets *StatCounter

	// EstablishedClosed is the number of times established TCP connections
	// made a transition to CLOSED state.
	EstablishedClosed *StatCounter

	// EstablishedTimedout is the number of times an established connection
	// was reset because of keep-alive time out.
	EstablishedTimedout *StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop *StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop *StatCounter

	// ListenOverflowSynCookieSent is the number of times a SYN cookie was sent.
	ListenOverflowSynCookieSent *StatCounter

	// ListenOverflowSynCookieRcvd is the number of times a valid SYN
	// cookie was received.
	ListenOverflowSynCookieRcvd *StatCounter

	// ListenOverflowInvalidSynCookieRcvd is the number of times an invalid SYN
	// cookie was received.
	ListenOverflowInvalidSynCookieRcvd *StatCounter

	// FailedConnectionAttempts is the number of calls to Connect or Listen
	// (active and passive openings, respectively) that end in an error.
	FailedConnectionAttempts *StatCounter

	// ValidSegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	ValidSegmentsReceived *StatCounter

	// InvalidSegmentsReceived is the number of TCP segments received that
	// the transport layer could not parse.
	InvalidSegmentsReceived *StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent *StatCounter

	// SegmentSendErrors is the number of TCP segments that failed to be sent.
	SegmentSendErrors *StatCounter

	// ResetsSent is the number of TCP resets sent.
	ResetsSent *StatCounter

	// ResetsReceived is the number of TCP resets received.
	ResetsReceived *StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits *StatCounter

	// FastRecovery is the number of times Fast Recovery was used to
	// recover from packet loss.
	FastRecovery *StatCounter

	// SACKRecovery is the number of times SACK Recovery was used to
	// recover from packet loss.
	SACKRecovery *StatCounter

	// TLPRecovery is the number of times recovery was accomplished by the tail
	// loss probe.
	TLPRecovery *StatCounter

	// SlowStartRetransmits is the number of segments retransmitted in slow
	// start.
	SlowStartRetransmits *StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit *StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts *StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors *StatCounter

	// FailedPortReservations is the number of times TCP failed to reserve
	// a port.
	FailedPortReservations *StatCounter

	// SegmentsAckedWithDSACK is the number of segments acknowledged with
	// DSACK.
	SegmentsAckedWithDSACK *StatCounter

	// SpuriousRecovery is the number of times the connection entered loss
	// recovery spuriously.
	SpuriousRecovery *StatCounter

	// SpuriousRTORecovery is the number of spurious RTOs.
	SpuriousRTORecovery *StatCounter

	// ForwardMaxInFlightDrop is the number of connection requests that are
	// dropped due to exceeding the maximum number of in-flight connection
	// requests.
	ForwardMaxInFlightDrop *StatCounter
}

// UDPStats collects UDP-specific stats.
//
// Every field is a pointer to a StatCounter.
//
// +stateify savable
type UDPStats struct {
	// PacketsReceived is the number of UDP datagrams received via
	// HandlePacket.
	PacketsReceived *StatCounter

	// UnknownPortErrors is the number of incoming UDP datagrams dropped
	// because they did not have a known destination port.
	UnknownPortErrors *StatCounter

	// ReceiveBufferErrors is the number of incoming UDP datagrams dropped
	// due to the receiving buffer being in an invalid state.
	ReceiveBufferErrors *StatCounter

	// MalformedPacketsReceived is the number of incoming UDP datagrams
	// dropped due to the UDP header being in a malformed state.
	MalformedPacketsReceived *StatCounter

	// PacketsSent is the number of UDP datagrams sent via sendUDP.
	PacketsSent *StatCounter

	// PacketSendErrors is the number of datagrams that failed to be sent.
	PacketSendErrors *StatCounter

	// ChecksumErrors is the number of datagrams dropped due to bad checksums.
	ChecksumErrors *StatCounter
}

// NICNeighborStats holds metrics for the neighbor table.
//
// Every field is a pointer to a StatCounter.
//
// +stateify savable
type NICNeighborStats struct {
	// LINT.IfChange(NICNeighborStats)

	// UnreachableEntryLookups counts the number of lookups performed on an
	// entry in Unreachable state.
	UnreachableEntryLookups *StatCounter

	// DroppedConfirmationForNoninitiatedNeighbor counts the number of neighbor
	// responses that were dropped because they didn't match an entry in the
	// cache.
	DroppedConfirmationForNoninitiatedNeighbor *StatCounter

	// DroppedInvalidLinkAddressConfirmations counts the number of neighbor
	// responses that were ignored because they had an invalid source link-layer
	// address.
	DroppedInvalidLinkAddressConfirmations *StatCounter

	// LINT.ThenChange(stack/nic_stats.go:multiCounterNICNeighborStats)
}

// NICPacketStats holds basic packet statistics: a packet count paired with a
// byte count.
//
// +stateify savable
type NICPacketStats struct {
	// LINT.IfChange(NICPacketStats)

	// Packets is the number of packets counted.
	Packets *StatCounter

	// Bytes is the number of bytes counted.
	Bytes *StatCounter

	// LINT.ThenChange(stack/nic_stats.go:multiCounterNICPacketStats)
}

// IntegralStatCounterMap holds a map associating integral keys with
// StatCounters.
//
// Init must be called before Increment, because Increment inserts into the
// underlying map and Init is what allocates it.
//
// +stateify savable
type IntegralStatCounterMap struct {
	// mu guards counterMap. It is tagged so that it is not saved.
	mu sync.RWMutex `state:"nosave"`
	// counterMap holds one counter per key that has been incremented.
	// +checklocks:mu
	counterMap map[uint64]*StatCounter
}

// Keys returns a snapshot of every key currently stored in the map, in no
// particular order. The result is nil when the map holds no keys.
func (m *IntegralStatCounterMap) Keys() []uint64 {
	var out []uint64

	m.mu.RLock()
	for key := range m.counterMap {
		out = append(out, key)
	}
	m.mu.RUnlock()

	return out
}

// Get looks up the counter stored under key. The boolean result reports
// whether an entry for key exists.
func (m *IntegralStatCounterMap) Get(key uint64) (*StatCounter, bool) {
	m.mu.RLock()
	c, present := m.counterMap[key]
	m.mu.RUnlock()
	return c, present
}

// Init replaces the underlying map with a new, empty one. Any counters that
// were previously stored are no longer reachable through m.
func (m *IntegralStatCounterMap) Init() {
	fresh := make(map[uint64]*StatCounter)

	m.mu.Lock()
	m.counterMap = fresh
	m.mu.Unlock()
}

// Increment bumps the counter stored under key, creating the counter first if
// the key has not been seen before.
func (m *IntegralStatCounterMap) Increment(key uint64) {
	// Common case: the counter already exists, so a read lock is enough to
	// find it.
	m.mu.RLock()
	c, found := m.counterMap[key]
	m.mu.RUnlock()
	if found {
		c.Increment()
		return
	}

	// The key was missing. Look again under the write lock, since another
	// caller may have inserted it after the read lock was dropped.
	m.mu.Lock()
	c, found = m.counterMap[key]
	if !found {
		c = new(StatCounter)
		m.counterMap[key] = c
	}
	m.mu.Unlock()

	c.Increment()
}

// A MultiIntegralStatCounterMap keeps track of two integral counter maps at
// once.
//
// Both maps are set by Init and are incremented together by Increment.
//
// +stateify savable
type MultiIntegralStatCounterMap struct {
	// a is the first map updated by Increment.
	a *IntegralStatCounterMap
	// b is the second map updated by Increment.
	b *IntegralStatCounterMap
}

// Init points the two internal integral counter maps at a and b. The maps are
// stored by reference; they are not copied.
func (m *MultiIntegralStatCounterMap) Init(a, b *IntegralStatCounterMap) {
	m.a, m.b = a, b
}

// Increment bumps the counter for key in both underlying maps, first in a and
// then in b.
func (m *MultiIntegralStatCounterMap) Increment(key uint64) {
	for _, counters := range [...]*IntegralStatCounterMap{m.a, m.b} {
		counters.Increment(key)
	}
}

// NICStats holds NIC statistics.
//
// +stateify savable
type NICStats struct {
	// LINT.IfChange(NICStats)

	// UnknownL3ProtocolRcvdPacketCounts records the number of packets received
	// for each unknown or unsupported network protocol number.
	UnknownL3ProtocolRcvdPacketCounts *IntegralStatCounterMap

	// UnknownL4ProtocolRcvdPacketCounts records the number of packets received
	// for each unknown or unsupported transport protocol number.
	UnknownL4ProtocolRcvdPacketCounts *IntegralStatCounterMap

	// MalformedL4RcvdPackets is the number of packets received by a NIC that
	// could not be delivered to a transport endpoint because the L4 header could
	// not be parsed.
	MalformedL4RcvdPackets *StatCounter

	// Tx contains statistics about transmitted packets.
	Tx NICPacketStats

	// TxPacketsDroppedNoBufferSpace is the number of packets dropped due to the
	// NIC not having enough buffer space to send the packet.
	//
	// Packets may be dropped with a no buffer space error when the device TX
	// queue is full.
	TxPacketsDroppedNoBufferSpace *StatCounter

	// Rx contains statistics about received packets.
	Rx NICPacketStats

	// DisabledRx contains statistics about received packets on disabled NICs.
	DisabledRx NICPacketStats

	// Neighbor contains statistics about neighbor entries.
	Neighbor NICNeighborStats

	// LINT.ThenChange(stack/nic_stats.go:multiCounterNICStats)
}

// FillIn returns a copy of s in which every nil counter field has been
// replaced by a newly allocated one. Because s is received by value, the
// caller's struct is not modified.
func (s NICStats) FillIn() NICStats {
	filled := &s
	InitStatCounters(reflect.ValueOf(filled).Elem())
	return *filled
}

// Stats holds statistics about the networking stack.
//
// Use FillIn to obtain a copy whose nil counter fields have been allocated.
//
// +stateify savable
type Stats struct {
	// TODO(https://gvisor.dev/issues/5986): Make the DroppedPackets stat less
	// ambiguous.

	// DroppedPackets is the number of packets dropped at the transport layer.
	DroppedPackets *StatCounter

	// NICs is an aggregation of every NIC's statistics. These should not be
	// incremented using this field, but using the relevant NIC multicounters.
	NICs NICStats

	// ICMP is an aggregation of every NetworkEndpoint's ICMP statistics (both v4
	// and v6). These should not be incremented using this field, but using the
	// relevant NetworkEndpoint ICMP multicounters.
	ICMP ICMPStats

	// IGMP is an aggregation of every NetworkEndpoint's IGMP statistics. These
	// should not be incremented using this field, but using the relevant
	// NetworkEndpoint IGMP multicounters.
	IGMP IGMPStats

	// IP is an aggregation of every NetworkEndpoint's IP statistics. These should
	// not be incremented using this field, but using the relevant NetworkEndpoint
	// IP multicounters.
	IP IPStats

	// ARP is an aggregation of every NetworkEndpoint's ARP statistics. These
	// should not be incremented using this field, but using the relevant
	// NetworkEndpoint ARP multicounters.
	ARP ARPStats

	// TCP holds TCP-specific stats.
	TCP TCPStats

	// UDP holds UDP-specific stats.
	UDP UDPStats
}

// ReceiveErrors collects packet receive errors within a transport endpoint.
//
// The counters are embedded by value rather than held by pointer.
//
// +stateify savable
type ReceiveErrors struct {
	// ReceiveBufferOverflow is the number of received packets dropped
	// due to the receive buffer being full.
	ReceiveBufferOverflow StatCounter

	// MalformedPacketsReceived is the number of incoming packets
	// dropped due to the packet header being in a malformed state.
	MalformedPacketsReceived StatCounter

	// ClosedReceiver is the number of received packets dropped because
	// the receiving endpoint's state was closed.
	ClosedReceiver StatCounter

	// ChecksumErrors is the number of packets dropped due to bad checksums.
	ChecksumErrors StatCounter
}

// SendErrors collects packet send errors within the transport layer for an
// endpoint.
//
// The counters are embedded by value rather than held by pointer.
//
// +stateify savable
type SendErrors struct {
	// SendToNetworkFailed is the number of packets that failed to be written
	// to the network endpoint.
	SendToNetworkFailed StatCounter

	// NoRoute is the number of times we failed to resolve an IP route.
	NoRoute StatCounter
}

// ReadErrors collects segment read errors from an endpoint read call.
//
// The counters are embedded by value rather than held by pointer.
//
// +stateify savable
type ReadErrors struct {
	// ReadClosed is the number of received packet drops because the endpoint
	// was shut down for read.
	ReadClosed StatCounter

	// InvalidEndpointState is the number of times we found the endpoint state
	// to be unexpected.
	InvalidEndpointState StatCounter

	// NotConnected is the number of times we tried to read but found that the
	// endpoint was not connected.
	NotConnected StatCounter
}

// WriteErrors collects packet write errors from an endpoint write call.
//
// The counters are embedded by value rather than held by pointer.
//
// +stateify savable
type WriteErrors struct {
	// WriteClosed is the number of packet drops because the endpoint
	// was shut down for write.
	WriteClosed StatCounter

	// InvalidEndpointState is the number of times we found the endpoint state
	// to be unexpected.
	InvalidEndpointState StatCounter

	// InvalidArgs is the number of times invalid input arguments were
	// provided for an endpoint Write call.
	InvalidArgs StatCounter
}

// TransportEndpointStats collects statistics about the endpoint.
//
// All counters, including those in the nested error structs, are stored by
// value. Use Clone to take a snapshot of them.
//
// +stateify savable
type TransportEndpointStats struct {
	// PacketsReceived is the number of successful packet receives.
	PacketsReceived StatCounter

	// PacketsSent is the number of successful packet sends.
	PacketsSent StatCounter

	// ReceiveErrors collects packet receive errors within transport layer.
	ReceiveErrors ReceiveErrors

	// ReadErrors collects packet read errors from an endpoint read call.
	ReadErrors ReadErrors

	// SendErrors collects packet send errors within the transport layer.
	SendErrors SendErrors

	// WriteErrors collects packet write errors from an endpoint write call.
	WriteErrors WriteErrors
}

// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
// marker interface. It does nothing when called.
func (*TransportEndpointStats) IsEndpointStats() {}

// InitStatCounters walks the fields of the struct value v and allocates every
// nil *StatCounter and nil *IntegralStatCounterMap it finds (the latter is
// also initialized via Init). Fields of any other type are recursed into, so
// they must themselves be structs. v must be addressable.
func InitStatCounters(v reflect.Value) {
	for i := 0; i < v.NumField(); i++ {
		field := v.Field(i)
		switch ptr := field.Addr().Interface().(type) {
		case **StatCounter:
			if *ptr == nil {
				*ptr = new(StatCounter)
			}
		case **IntegralStatCounterMap:
			if *ptr == nil {
				*ptr = new(IntegralStatCounterMap)
				(*ptr).Init()
			}
		default:
			// Not a counter field: treat it as a nested stats struct.
			InitStatCounters(field)
		}
	}
}

// FillIn returns a copy of s in which every nil counter field has been
// replaced by a newly allocated one. Because s is received by value, the
// caller's struct is not modified.
func (s Stats) FillIn() Stats {
	root := reflect.ValueOf(&s).Elem()
	InitStatCounters(root)
	return s
}

// Clone copies the TransportEndpointStats into dst by reading each counter of
// src individually. The values are added to dst's counters, so dst should be
// zeroed if an exact copy is wanted.
func (src *TransportEndpointStats) Clone(dst *TransportEndpointStats) {
	to := reflect.ValueOf(dst).Elem()
	from := reflect.ValueOf(src).Elem()
	clone(to, from)
}

// clone walks the fields of the struct values dst and src in parallel. For
// each StatCounter field it adds src's current value to the matching dst
// counter; every other field is recursed into as a nested struct. Both values
// must be addressable and of the same struct type.
func clone(dst reflect.Value, src reflect.Value) {
	for i, n := 0, dst.NumField(); i < n; i++ {
		dstField, srcField := dst.Field(i), src.Field(i)

		counter, isCounter := srcField.Addr().Interface().(*StatCounter)
		if !isCounter {
			clone(dstField, srcField)
			continue
		}
		dstField.Addr().Interface().(*StatCounter).IncrementBy(counter.Value())
	}
}

// String implements the fmt.Stringer interface.
//
// A 4-byte address is rendered in dotted-decimal form. A 16-byte address is
// rendered as colon-separated 16-bit hexadecimal groups without leading zeros,
// with the first longest run of two or more all-zero groups collapsed to "::".
// Any other length is rendered as plain hexadecimal.
func (a Address) String() string {
	size := a.Len()
	switch size {
	case 4:
		return fmt.Sprintf("%d.%d.%d.%d", int(a.addr[0]), int(a.addr[1]), int(a.addr[2]), int(a.addr[3]))
	case 16:
		// Locate the longest run of zero groups; [zeroStart, zeroEnd) are byte
		// offsets. Runs of a single group are not eligible for compression.
		zeroStart, zeroEnd := -1, -1
		for pos := 0; pos < size; pos += 2 {
			runEnd := pos
			for runEnd < size && a.addr[runEnd] == 0 && a.addr[runEnd+1] == 0 {
				runEnd += 2
			}
			if runEnd > pos+2 && runEnd-pos > zeroEnd-zeroStart {
				zeroStart, zeroEnd = pos, runEnd
			}
		}

		var sb strings.Builder
		for pos := 0; pos < size; pos += 2 {
			if pos == zeroStart {
				// Emit the compressed run and skip past it.
				sb.WriteString("::")
				pos = zeroEnd
				if zeroEnd >= size {
					break
				}
			} else if pos > 0 {
				sb.WriteByte(':')
			}

			group := uint16(a.addr[pos])<<8 | uint16(a.addr[pos+1])
			if group == 0 {
				sb.WriteByte('0')
				continue
			}

			// Print nibbles from most to least significant. A nibble is
			// emitted once the value shifted down to it is non-zero, which
			// suppresses leading zeros only.
			const hexDigits = "0123456789abcdef"
			for nibble := 3; nibble >= 0; nibble-- {
				if rest := group >> (uint(nibble) * 4); rest != 0 {
					sb.WriteByte(hexDigits[rest&0xf])
				}
			}
		}
		return sb.String()
	default:
		return fmt.Sprintf("%x", a.addr[:size])
	}
}

// To4 converts the IPv4 address to a 4-byte representation.
//
// A 4-byte address is returned unchanged. A 16-byte address whose first ten
// bytes are zero and whose next two bytes are 0xff is reduced to its last
// four bytes. Anything else yields the empty Address.
func (a Address) To4() Address {
	const (
		ipv4len = 4
		ipv6len = 16
	)
	switch a.Len() {
	case ipv4len:
		return a
	case ipv6len:
		mapped := isZeros(a.addr[:10]) && a.addr[10] == 0xff && a.addr[11] == 0xff
		if mapped {
			return AddrFrom4Slice(a.addr[12:16])
		}
	}
	return Address{}
}

// isZeros reports whether every byte of addr is zero. An empty or nil slice
// is considered all zeros.
func isZeros(addr []byte) bool {
	for i := 0; i < len(addr); i++ {
		if addr[i] != 0 {
			return false
		}
	}
	return true
}

// LinkAddress is a byte slice cast as a string that represents a link address.
// It is typically a 6-byte MAC address.
type LinkAddress string

// String implements the fmt.Stringer interface.
//
// A 6-byte address is rendered as colon-separated two-digit hexadecimal
// octets. An address of any other length is rendered as one unseparated
// hexadecimal string.
func (a LinkAddress) String() string {
	if len(a) != 6 {
		return fmt.Sprintf("%x", []byte(a))
	}
	return fmt.Sprintf("%02x:%02x:%02x:%02x:%02x:%02x", a[0], a[1], a[2], a[3], a[4], a[5])
}

// ParseMACAddress parses an IEEE 802 address.
//
// It must be in the format aa:bb:cc:dd:ee:ff or aa-bb-cc-dd-ee-ff.
//
// The input is split on ':' and '-' and must yield exactly LinkAddressSize
// parts, each of which must parse as a hexadecimal value that fits in 8 bits.
// An error is returned otherwise.
func ParseMACAddress(s string) (LinkAddress, error) {
	isSeparator := func(r rune) bool {
		return r == ':' || r == '-'
	}
	octets := strings.FieldsFunc(s, isSeparator)
	if len(octets) != LinkAddressSize {
		return "", fmt.Errorf("inconsistent parts: %s", s)
	}

	mac := make([]byte, len(octets))
	for i, octet := range octets {
		val, err := strconv.ParseUint(octet, 16, 8)
		if err != nil {
			return "", fmt.Errorf("invalid hex digits: %s", s)
		}
		mac[i] = byte(val)
	}
	return LinkAddress(mac), nil
}

// GetRandMacAddr returns a mac address that can be used for local virtual
// devices. The address is LinkAddressSize random bytes with the lowest bit of
// the first byte cleared and the second-lowest bit set.
func GetRandMacAddr() LinkAddress {
	hw := make(net.HardwareAddr, LinkAddressSize)
	rand.Read(hw) // Fill with random data.

	// Clear the multicast bit (0x1) and set the local assignment bit (0x2,
	// IEEE802) in a single step.
	hw[0] = (hw[0] &^ 0x1) | 0x2
	return LinkAddress(hw)
}

// AddressWithPrefix is an address with its subnet prefix length.
//
// +stateify savable
type AddressWithPrefix struct {
	// Address is a network address.
	Address Address

	// PrefixLen is the subnet prefix length, counted in bits. Subnet treats
	// values at or below zero as an empty prefix and values at or above the
	// address width as a full-length prefix.
	PrefixLen int
}

// String implements the fmt.Stringer interface. The result has the form
// "<address>/<prefix length>".
func (a AddressWithPrefix) String() string {
	addr, prefixLen := a.Address, a.PrefixLen
	return fmt.Sprintf("%s/%d", addr, prefixLen)
}

// Subnet converts the address and prefix into a Subnet value and returns it.
//
// A prefix length of zero or less yields an all-zero address and mask. A
// prefix length covering the whole address (or more) yields the address
// itself with an all-ones mask. Otherwise the address is truncated to the
// prefix and paired with the matching mask.
func (a AddressWithPrefix) Subnet() Subnet {
	size := a.Address.length

	switch {
	case a.PrefixLen <= 0:
		return Subnet{
			address: Address{length: size},
			mask:    AddressMask{length: size},
		}
	case a.PrefixLen >= size*8:
		full := Subnet{
			address: a.Address,
			mask:    AddressMask{length: size},
		}
		for i := 0; i < size; i++ {
			full.mask.mask[i] = 0xff
		}
		return full
	}

	netAddr := Address{length: size}
	netMask := AddressMask{length: size}
	// remaining is the number of prefix bits not yet consumed.
	remaining := uint(a.PrefixLen)
	for i := 0; i < size; i++ {
		if remaining < 8 {
			// Partial (or empty) byte: keep only the top `remaining` bits.
			netMask.mask[i] = ^byte(0xff >> remaining)
			netAddr.addr[i] = a.Address.addr[i] & netMask.mask[i]
			remaining = 0
			continue
		}
		// This byte lies entirely inside the prefix.
		netAddr.addr[i] = a.Address.addr[i]
		netMask.mask[i] = 0xff
		remaining -= 8
	}

	// For extra caution, go through NewSubnet instead of building the Subnet
	// value directly. A failure here indicates a serious bug in this code, so
	// panic is in order.
	subnet, err := NewSubnet(netAddr, netMask)
	if err != nil {
		panic("invalid subnet: " + err.Error())
	}
	return subnet
}

// ProtocolAddress is an address and the network protocol it is associated
// with.
//
// +stateify savable
type ProtocolAddress struct {
	// Protocol is the network protocol number of the address.
	Protocol NetworkProtocolNumber

	// AddressWithPrefix is a network address with its subnet prefix length.
	AddressWithPrefix AddressWithPrefix
}

var (
	// danglingEndpointsMu protects access to danglingEndpoints.
	danglingEndpointsMu sync.Mutex

	// danglingEndpoints tracks all dangling endpoints no longer owned by the app.
	// It is used as a set: only the keys carry information.
	danglingEndpoints = make(map[Endpoint]struct{})
)

// GetDanglingEndpoints returns a snapshot of all dangling endpoints, in no
// particular order. The returned slice is never nil.
func GetDanglingEndpoints() []Endpoint {
	danglingEndpointsMu.Lock()
	defer danglingEndpointsMu.Unlock()

	snapshot := make([]Endpoint, 0, len(danglingEndpoints))
	for ep := range danglingEndpoints {
		snapshot = append(snapshot, ep)
	}
	return snapshot
}

// ReleaseDanglingEndpoints clears out all reference counted objects held by
// dangling endpoints. It does so by calling Abort on every endpoint that is
// dangling at the time of the call.
func ReleaseDanglingEndpoints() {
	// Take a snapshot first so that danglingEndpointsMu is not held while the
	// endpoints are aborted; holding it could cause a lock inversion with
	// endpoint.mu.
	//
	// NOTE(review): an endpoint may be removed from the set between the
	// snapshot and its Abort call; this relies on Abort being a no-op for such
	// endpoints - confirm.
	for _, ep := range GetDanglingEndpoints() {
		ep.Abort()
	}
}

// AddDanglingEndpoint records e in the set of dangling endpoints. Adding an
// endpoint that is already present has no further effect.
func AddDanglingEndpoint(e Endpoint) {
	danglingEndpointsMu.Lock()
	defer danglingEndpointsMu.Unlock()

	danglingEndpoints[e] = struct{}{}
}

// DeleteDanglingEndpoint removes e from the set of dangling endpoints.
// Removing an endpoint that is not present is a no-op.
func DeleteDanglingEndpoint(e Endpoint) {
	danglingEndpointsMu.Lock()
	defer danglingEndpointsMu.Unlock()

	delete(danglingEndpoints, e)
}

// AsyncLoading is the global barrier for asynchronous endpoint loading
// activities.
//
// NOTE(review): presumably loaders register with Add/Done and the restore
// path blocks on Wait - confirm against callers.
var AsyncLoading sync.WaitGroup
