third_party/golibs/vendor/gvisor.dev/gvisor/pkg/tcpip/stack/conntrack.go - fuchsia - Git at Google

 // Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package stack

 import (
 	"encoding/binary"
 	"fmt"
 	"math"
 	"math/rand"
 	"sync"
 	"time"

 	"gvisor.dev/gvisor/pkg/atomicbitops"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack"
 )

 // Connection tracking is used to track and manipulate packets for NAT rules.
 // The connection is created for a packet if it does not exist. Every
 // connection contains two tuples (original and reply). The tuples are
 // manipulated if there is a matching NAT rule. The packet is modified by
 // looking at the tuples in each hook.
 //
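 // Connections are tracked for TCP, UDP and ICMP echo packets, as well as for
 // ICMP errors that embed a TCP or UDP packet. Protocol state (the TCB) is only
 // tracked for TCP.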

 // Our hash table has 16K buckets.
 //
 // numBuckets is the length of the bucket slice allocated by (*ConnTrack).init
 // (1 << 14 = 16384).
 const numBuckets = 1 << 14

 // Idle durations after which (*conn).timedOut reports a connection as timed
 // out.
 const (
 	// establishedTimeout is used when the connection's TCB state is
 	// tcpconntrack.ResultAlive.
 	establishedTimeout   time.Duration = 5 * 24 * time.Hour
 	// unestablishedTimeout is used for every other state.
 	unestablishedTimeout time.Duration = 120 * time.Second
 )

 // tuple holds a connection's identifying and manipulating data in one
 // direction.
 //
 // The tupleID of a connection's reply tuple is rewritten by
 // (*conn).performNAT, so tupleID is guarded by mu and should be read through
 // id().
 //
 // +stateify savable
 type tuple struct {
 	// tupleEntry is used to build an intrusive list of tuples.
 	tupleEntry

 	// conn is the connection tracking entry this tuple belongs to.
 	conn *conn

 	// reply is true iff the tuple's direction is opposite that of the first
 	// packet seen on the connection.
 	reply bool

 	// mu protects tupleID.
 	mu sync.RWMutex `state:"nosave"`
 	// tupleID identifies the packets that match this tuple.
 	//
 	// +checklocks:mu
 	tupleID tupleID
 }

 // id returns a copy of the tuple's ID, read while holding t.mu for reading.
 func (t *tuple) id() tupleID {
 	t.mu.RLock()
 	tid := t.tupleID
 	t.mu.RUnlock()
 	return tid
 }

 // tupleID uniquely identifies a trackable connection in one direction.
 //
 // All fields are comparable, so tupleIDs are compared with == when looking up
 // a connection (see (*bucket).connForTIDRLocked).
 //
 // +stateify savable
 type tupleID struct {
 	// srcAddr is the source network address.
 	srcAddr tcpip.Address
 	// The source port of a packet in the original direction is overloaded with
 	// the ident of an Echo Request packet.
 	//
 	// This also matches the behaviour of sending packets on Linux where the
 	// socket's source port value is used for the source port of outgoing packets
 	// for TCP/UDP and the ident field for outgoing Echo Requests on Ping sockets:
 	//
 	//   IPv4: https://github.com/torvalds/linux/blob/c5c17547b778975b3d83a73c8d84e8fb5ecf3ba5/net/ipv4/ping.c#L810
 	//   IPv6: https://github.com/torvalds/linux/blob/c5c17547b778975b3d83a73c8d84e8fb5ecf3ba5/net/ipv6/ping.c#L133
 	srcPortOrEchoRequestIdent uint16
 	// dstAddr is the destination network address.
 	dstAddr                   tcpip.Address
 	// The opposite of srcPortOrEchoRequestIdent; the destination port of a packet
 	// in the reply direction is overloaded with the ident of an Echo Reply.
 	dstPortOrEchoReplyIdent uint16
 	// transProto is the transport protocol number of the tracked packets.
 	transProto              tcpip.TransportProtocolNumber
 	// netProto is the network protocol number of the tracked packets.
 	netProto                tcpip.NetworkProtocolNumber
 }

 // reply returns the tupleID for the opposite direction: source and
 // destination addresses and ports/idents are swapped, protocol numbers are
 // kept as they are.
 func (ti tupleID) reply() tupleID {
 	flipped := ti
 	flipped.srcAddr, flipped.dstAddr = ti.dstAddr, ti.srcAddr
 	flipped.srcPortOrEchoRequestIdent, flipped.dstPortOrEchoReplyIdent = ti.dstPortOrEchoReplyIdent, ti.srcPortOrEchoRequestIdent
 	return flipped
 }

 // manipType records whether NAT has been set up for one side (source or
 // destination) of a connection. See conn.sourceManip and
 // conn.destinationManip.
 type manipType int

 const (
 	// manipNotPerformed indicates that NAT has not been performed.
 	//
 	// This is the zero value.
 	manipNotPerformed manipType = iota

 	// manipPerformed indicates that NAT was performed.
 	manipPerformed

 	// manipPerformedNoop indicates that NAT was performed but it was a no-op.
 	manipPerformedNoop
 )

 // finalizeResult is the outcome of (*ConnTrack).finalize. It is stored in
 // conn.finalizeResult as a uint32.
 type finalizeResult uint32

 const (
 	// A finalizeResult must be explicitly set so we don't make use of the zero
 	// value.
 	_ finalizeResult = iota

 	// finalizeResultSuccess indicates the connection's reply tuple is in the
 	// table.
 	finalizeResultSuccess
 	// finalizeResultConflict indicates another connection already holds the
 	// reply tuple; the connection was not finalized.
 	finalizeResultConflict
 )

 // conn is a tracked connection.
 //
 // +stateify savable
 type conn struct {
 	// ct is the ConnTrack this connection belongs to. Its clock, rand and
 	// bucket lookups are used by the connection's methods.
 	ct *ConnTrack

 	// original is the tuple in original direction. It is immutable.
 	original tuple

 	// reply is the tuple in reply direction.
 	//
 	// Its tupleID is rewritten by performNAT and it is only inserted into the
 	// table by (*ConnTrack).finalize.
 	reply tuple

 	// finalizeOnce makes sure (*ConnTrack).finalize runs at most once for
 	// this connection; see (*conn).finalize.
 	finalizeOnce sync.Once
 	// Holds a finalizeResult.
 	finalizeResult atomicbitops.Uint32

 	// mu protects sourceManip and destinationManip.
 	mu sync.RWMutex `state:"nosave"`
 	// sourceManip indicates the source manipulation type.
 	//
 	// +checklocks:mu
 	sourceManip manipType
 	// destinationManip indicates the destination's manipulation type.
 	//
 	// +checklocks:mu
 	destinationManip manipType

 	// stateMu protects tcb and lastUsed.
 	stateMu sync.RWMutex `state:"nosave"`
 	// tcb is TCB control block. It is used to keep track of states
 	// of tcp connection.
 	//
 	// +checklocks:stateMu
 	tcb tcpconntrack.TCB
 	// lastUsed is the last time the connection saw a relevant packet, and
 	// is updated by each packet on the connection.
 	//
 	// +checklocks:stateMu
 	lastUsed tcpip.MonotonicTime
 }

 // timedOut reports whether the connection has been idle (as of now) for
 // longer than the timeout that applies to its current state.
 func (cn *conn) timedOut(now tcpip.MonotonicTime) bool {
 	cn.stateMu.RLock()
 	defer cn.stateMu.RUnlock()

 	// Use the same defaults as Linux, which lets connections in most states
 	// other than established remain for <= 120 seconds...
 	limit := unestablishedTimeout
 	if cn.tcb.State() == tcpconntrack.ResultAlive {
 		// ...but doesn't delete established connections for 5(!) days.
 		limit = establishedTimeout
 	}
 	return now.Sub(cn.lastUsed) > limit
 }

 // update refreshes the connection's last-used time and, for TCP packets,
 // feeds the packet's TCP header into the connection's TCB.
 //
 // reply indicates whether pkt flows in the reply direction of the connection.
 func (cn *conn) update(pkt *PacketBuffer, reply bool) {
 	cn.stateMu.Lock()
 	defer cn.stateMu.Unlock()

 	// Mark the connection as having been used recently so it isn't reaped.
 	cn.lastUsed = cn.ct.clock.NowMonotonic()

 	// Only TCP has protocol state to track.
 	if pkt.TransportProtocolNumber != header.TCPProtocolNumber {
 		return
 	}

 	tcpHeader := header.TCP(pkt.TransportHeader().View())

 	// Update the state of tcb. tcb assumes it's always initialized on the
 	// client. However, we only need to know whether the connection is
 	// established or not, so the client/server distinction isn't important.
 	switch {
 	case cn.tcb.IsEmpty():
 		cn.tcb.Init(tcpHeader, pkt.Data().Size())
 	case reply:
 		cn.tcb.UpdateStateReply(tcpHeader, pkt.Data().Size())
 	default:
 		cn.tcb.UpdateStateOriginal(tcpHeader, pkt.Data().Size())
 	}
 }

 // ConnTrack tracks all connections created for NAT rules. Most users are
 // expected to only call handlePacket, insertRedirectConn, and maybeInsertNoop.
 //
 // ConnTrack keeps all connections in a slice of buckets, each of which holds a
 // linked list of tuples. This gives us some desirable properties:
 // - Each bucket has its own lock, lessening lock contention.
 // - The slice is large enough that lists stay short (<10 elements on average).
 //   Thus traversal is fast.
 // - During linked list traversal we reap expired connections. This amortizes
 //   the cost of reaping them and makes reapUnused faster.
 //
 // Locks are ordered by their location in the buckets slice. That is, a
 // goroutine that locks buckets[i] can only lock buckets[j] s.t. i < j.
 //
 // +stateify savable
 type ConnTrack struct {
 	// seed is a one-time random value initialized at stack startup
 	// and is used in the calculation of hash keys for the list of buckets.
 	// It is immutable.
 	seed uint32

 	// clock provides timing used to determine conntrack reapings.
 	clock tcpip.Clock
 	// rand supplies the random starting offset used by (*conn).performNAT
 	// when searching for an unused port/ident.
 	rand  *rand.Rand

 	mu sync.RWMutex `state:"nosave"`
 	// mu protects the buckets slice, but not buckets' contents. Only take
 	// the write lock if you are modifying the slice or saving for S/R.
 	//
 	// +checklocks:mu
 	buckets []bucket
 }

 // bucket is one slot of the conntrack hash table: a list of tuples guarded by
 // its own lock.
 //
 // +stateify savable
 type bucket struct {
 	// mu protects tuples.
 	mu sync.RWMutex `state:"nosave"`
 	// tuples is the list of tuples that hash to this bucket.
 	//
 	// +checklocks:mu
 	tuples tupleList
 }

 // A netAndTransHeadersFunc returns the network and transport headers found
 // in an ICMP payload. The transport layer's payload will not be returned.
 //
 // icmpPayload holds the packet embedded in an ICMP error, starting at its
 // network header. minTransHdrLen is the number of transport header bytes to
 // return in transHdrBytes.
 //
 // May panic if the packet does not hold the transport header.
 type netAndTransHeadersFunc func(icmpPayload []byte, minTransHdrLen int) (netHdr header.Network, transHdrBytes []byte)

 // v4NetAndTransHdr is the netAndTransHeadersFunc for IPv4: it splits
 // icmpPayload into the IPv4 header and the first minTransHdrLen bytes that
 // follow it.
 func v4NetAndTransHdr(icmpPayload []byte, minTransHdrLen int) (header.Network, []byte) {
 	ipHdr := header.IPv4(icmpPayload)
 	// netHdr.Payload() is deliberately avoided: the ICMP error might not hold
 	// the full packet and Payload() panics if the buffer is smaller than the
 	// total length specified in the IPv4 header.
 	return ipHdr, icmpPayload[ipHdr.HeaderLength():][:minTransHdrLen]
 }

 // v6NetAndTransHdr is the netAndTransHeadersFunc for IPv6: it splits
 // icmpPayload into the fixed IPv6 header and the first minTransHdrLen bytes
 // that follow it.
 func v6NetAndTransHdr(icmpPayload []byte, minTransHdrLen int) (header.Network, []byte) {
 	ipHdr := header.IPv6(icmpPayload)
 	// netHdr.Payload() is deliberately avoided: the ICMP error might not hold
 	// the full packet and Payload() panics if the IP payload is smaller than
 	// the payload length specified in the IPv6 header.
 	return ipHdr, icmpPayload[header.IPv6MinimumSize:][:minTransHdrLen]
 }

 // getEmbeddedNetAndTransHeaders returns the network and transport headers of
 // the packet embedded in pkt's data (the payload of an ICMP error).
 //
 // netHdrLength is the length of the embedded network header and
 // getNetAndTransHdr splits the pulled-up bytes into the two headers. Only TCP
 // and UDP are supported for transProto.
 //
 // The returned bool is false if transProto is not supported or pkt's data
 // does not hold the network header plus the minimum transport header.
 func getEmbeddedNetAndTransHeaders(pkt *PacketBuffer, netHdrLength int, getNetAndTransHdr netAndTransHeadersFunc, transProto tcpip.TransportProtocolNumber) (header.Network, header.ChecksummableTransport, bool) {
 	var minTransHdrLen int
 	switch transProto {
 	case header.TCPProtocolNumber:
 		minTransHdrLen = header.TCPMinimumSize
 	case header.UDPProtocolNumber:
 		minTransHdrLen = header.UDPMinimumSize
 	default:
 		return nil, nil, false
 	}

 	hdrBytes, ok := pkt.Data().PullUp(netHdrLength + minTransHdrLen)
 	if !ok {
 		return nil, nil, false
 	}

 	netHdr, transHdrBytes := getNetAndTransHdr(hdrBytes, minTransHdrLen)
 	if transProto == header.TCPProtocolNumber {
 		return netHdr, header.TCP(transHdrBytes), true
 	}
 	return netHdr, header.UDP(transHdrBytes), true
 }

 // getHeaders returns the network and transport headers that NAT should
 // rewrite for pkt.
 //
 // For TCP, UDP and ICMP echo packets these are pkt's own headers and
 // isICMPError is false. For the supported ICMP error types they are the
 // headers of the packet embedded in the ICMP error and isICMPError is true.
 //
 // ok is false if a header is shorter than its minimum size or the embedded
 // headers cannot be obtained.
 //
 // Panics on ICMP types other than those handled below, on ICMP errors whose
 // data does not hold a minimum-size IP header, on embedded IPv4 headers with
 // options, on embedded IPv6 headers whose next header does not match the
 // tracked transport protocol, and on any other transport protocol.
 //
 // NOTE(review): the ICMP error paths read pkt.tuple, so they presumably
 // require that pkt was already matched to a connection - confirm with callers.
 func getHeaders(pkt *PacketBuffer) (netHdr header.Network, transHdr header.Transport, isICMPError bool, ok bool) {
 	switch pkt.TransportProtocolNumber {
 	case header.TCPProtocolNumber:
 		if tcpHeader := header.TCP(pkt.TransportHeader().View()); len(tcpHeader) >= header.TCPMinimumSize {
 			return pkt.Network(), tcpHeader, false, true
 		}
 		return nil, nil, false, false
 	case header.UDPProtocolNumber:
 		if udpHeader := header.UDP(pkt.TransportHeader().View()); len(udpHeader) >= header.UDPMinimumSize {
 			return pkt.Network(), udpHeader, false, true
 		}
 		return nil, nil, false, false
 	case header.ICMPv4ProtocolNumber:
 		icmpHeader := header.ICMPv4(pkt.TransportHeader().View())
 		if len(icmpHeader) < header.ICMPv4MinimumSize {
 			return nil, nil, false, false
 		}

 		switch icmpType := icmpHeader.Type(); icmpType {
 		case header.ICMPv4Echo, header.ICMPv4EchoReply:
 			return pkt.Network(), icmpHeader, false, true
 		case header.ICMPv4DstUnreachable, header.ICMPv4TimeExceeded, header.ICMPv4ParamProblem:
 			// ICMP errors: continue below to extract the embedded packet's
 			// headers.
 		default:
 			panic(fmt.Sprintf("unexpected ICMPv4 type = %d", icmpType))
 		}

 		h, ok := pkt.Data().PullUp(header.IPv4MinimumSize)
 		if !ok {
 			panic(fmt.Sprintf("should have a valid IPv4 packet; only have %d bytes, want at least %d bytes", pkt.Data().Size(), header.IPv4MinimumSize))
 		}

 		if header.IPv4(h).HeaderLength() > header.IPv4MinimumSize {
 			// TODO(https://gvisor.dev/issue/6765): Handle IPv4 options.
 			panic("should have dropped packets with IPv4 options")
 		}

 		if netHdr, transHdr, ok := getEmbeddedNetAndTransHeaders(pkt, header.IPv4MinimumSize, v4NetAndTransHdr, pkt.tuple.id().transProto); ok {
 			return netHdr, transHdr, true, true
 		}
 		return nil, nil, false, false
 	case header.ICMPv6ProtocolNumber:
 		icmpHeader := header.ICMPv6(pkt.TransportHeader().View())
 		if len(icmpHeader) < header.ICMPv6MinimumSize {
 			return nil, nil, false, false
 		}

 		switch icmpType := icmpHeader.Type(); icmpType {
 		case header.ICMPv6EchoRequest, header.ICMPv6EchoReply:
 			return pkt.Network(), icmpHeader, false, true
 		case header.ICMPv6DstUnreachable, header.ICMPv6PacketTooBig, header.ICMPv6TimeExceeded, header.ICMPv6ParamProblem:
 			// ICMP errors: continue below to extract the embedded packet's
 			// headers.
 		default:
 			panic(fmt.Sprintf("unexpected ICMPv6 type = %d", icmpType))
 		}

 		h, ok := pkt.Data().PullUp(header.IPv6MinimumSize)
 		if !ok {
 			panic(fmt.Sprintf("should have a valid IPv6 packet; only have %d bytes, want at least %d bytes", pkt.Data().Size(), header.IPv6MinimumSize))
 		}

 		// We do not support extension headers in ICMP errors so the next header
 		// in the IPv6 packet should be a tracked protocol if we reach this point.
 		//
 		// TODO(https://gvisor.dev/issue/6789): Support extension headers.
 		transProto := pkt.tuple.id().transProto
 		if got := header.IPv6(h).TransportProtocol(); got != transProto {
 			panic(fmt.Sprintf("got TransportProtocol() = %d, want = %d", got, transProto))
 		}

 		if netHdr, transHdr, ok := getEmbeddedNetAndTransHeaders(pkt, header.IPv6MinimumSize, v6NetAndTransHdr, transProto); ok {
 			return netHdr, transHdr, true, true
 		}
 		return nil, nil, false, false
 	default:
 		panic(fmt.Sprintf("unexpected transport protocol = %d", pkt.TransportProtocolNumber))
 	}
 }

 // getTupleIDForRegularPacket builds the tupleID for a packet from its network
 // and transport headers, taking the addresses and ports as they appear in the
 // headers.
 func getTupleIDForRegularPacket(netHdr header.Network, netProto tcpip.NetworkProtocolNumber, transHdr header.Transport, transProto tcpip.TransportProtocolNumber) tupleID {
 	var tid tupleID
 	tid.srcAddr = netHdr.SourceAddress()
 	tid.srcPortOrEchoRequestIdent = transHdr.SourcePort()
 	tid.dstAddr = netHdr.DestinationAddress()
 	tid.dstPortOrEchoReplyIdent = transHdr.DestinationPort()
 	tid.transProto = transProto
 	tid.netProto = netProto
 	return tid
 }

 // getTupleIDForPacketInICMPError builds a tupleID from the packet embedded in
 // an ICMP error held in pkt's data.
 //
 // The embedded packet's source and destination are swapped in the returned
 // tupleID: its destination address/port become the tuple's source and vice
 // versa.
 //
 // Returns false if the embedded headers could not be obtained.
 func getTupleIDForPacketInICMPError(pkt *PacketBuffer, getNetAndTransHdr netAndTransHeadersFunc, netProto tcpip.NetworkProtocolNumber, netLen int, transProto tcpip.TransportProtocolNumber) (tupleID, bool) {
 	netHdr, transHdr, ok := getEmbeddedNetAndTransHeaders(pkt, netLen, getNetAndTransHdr, transProto)
 	if !ok {
 		return tupleID{}, false
 	}

 	var tid tupleID
 	tid.srcAddr = netHdr.DestinationAddress()
 	tid.srcPortOrEchoRequestIdent = transHdr.DestinationPort()
 	tid.dstAddr = netHdr.SourceAddress()
 	tid.dstPortOrEchoReplyIdent = transHdr.SourcePort()
 	tid.transProto = transProto
 	tid.netProto = netProto
 	return tid, true
 }

 // getTupleIDDisposition is the second result of getTupleID. It tells the
 // caller whether a tupleID was obtained and, if so, whether the packet may
 // create a new connection.
 type getTupleIDDisposition int

 const (
 	// getTupleIDNotOK indicates that no tupleID could be obtained for the
 	// packet.
 	getTupleIDNotOK getTupleIDDisposition = iota
 	// getTupleIDOKAndAllowNewConn indicates that a tupleID was obtained and a
 	// new connection may be created for the packet.
 	getTupleIDOKAndAllowNewConn
 	// getTupleIDOKAndDontAllowNewConn indicates that a tupleID was obtained
 	// but the packet must not create a new connection.
 	getTupleIDOKAndDontAllowNewConn
 )

 // getTupleIDForEchoPacket builds the tupleID for an ICMP echo packet.
 //
 // ident is stored as the source port/ident for a request and as the
 // destination port/ident for a reply; the other port/ident is left as zero.
 func getTupleIDForEchoPacket(pkt *PacketBuffer, ident uint16, request bool) tupleID {
 	var srcIdent, dstIdent uint16
 	if request {
 		srcIdent = ident
 	} else {
 		dstIdent = ident
 	}

 	netHdr := pkt.Network()
 	return tupleID{
 		srcAddr:                   netHdr.SourceAddress(),
 		srcPortOrEchoRequestIdent: srcIdent,
 		dstAddr:                   netHdr.DestinationAddress(),
 		dstPortOrEchoReplyIdent:   dstIdent,
 		transProto:                pkt.TransportProtocolNumber,
 		netProto:                  pkt.NetworkProtocolNumber,
 	}
 }

 // getTupleID returns the tupleID used to look up pkt's connection, together
 // with a disposition telling the caller whether the ID is usable and whether
 // pkt may create a new connection.
 //
 //   - TCP, UDP and ICMP echo requests: the ID is taken from pkt's own headers
 //     and a new connection may be created.
 //   - ICMP echo replies: the ID is taken from pkt's own headers but no new
 //     connection may be created.
 //   - Supported ICMP errors: the ID is derived from the packet embedded in the
 //     error and no new connection may be created.
 //
 // getTupleIDNotOK is returned for any other protocol or ICMP type, for
 // headers shorter than their minimum size, for embedded IPv4 headers with
 // options, and when the embedded headers cannot be obtained.
 func getTupleID(pkt *PacketBuffer) (tupleID, getTupleIDDisposition) {
 	switch pkt.TransportProtocolNumber {
 	case header.TCPProtocolNumber:
 		if transHeader := header.TCP(pkt.TransportHeader().View()); len(transHeader) >= header.TCPMinimumSize {
 			return getTupleIDForRegularPacket(pkt.Network(), pkt.NetworkProtocolNumber, transHeader, pkt.TransportProtocolNumber), getTupleIDOKAndAllowNewConn
 		}
 	case header.UDPProtocolNumber:
 		if transHeader := header.UDP(pkt.TransportHeader().View()); len(transHeader) >= header.UDPMinimumSize {
 			return getTupleIDForRegularPacket(pkt.Network(), pkt.NetworkProtocolNumber, transHeader, pkt.TransportProtocolNumber), getTupleIDOKAndAllowNewConn
 		}
 	case header.ICMPv4ProtocolNumber:
 		icmp := header.ICMPv4(pkt.TransportHeader().View())
 		if len(icmp) < header.ICMPv4MinimumSize {
 			return tupleID{}, getTupleIDNotOK
 		}

 		switch icmp.Type() {
 		case header.ICMPv4Echo:
 			return getTupleIDForEchoPacket(pkt, icmp.Ident(), true /* request */), getTupleIDOKAndAllowNewConn
 		case header.ICMPv4EchoReply:
 			// Do not create a new connection in response to a reply packet as only
 			// the first packet of a connection should create a conntrack entry but
 			// a reply is never the first packet sent for a connection.
 			return getTupleIDForEchoPacket(pkt, icmp.Ident(), false /* request */), getTupleIDOKAndDontAllowNewConn
 		case header.ICMPv4DstUnreachable, header.ICMPv4TimeExceeded, header.ICMPv4ParamProblem:
 			// ICMP errors: continue below to inspect the embedded packet.
 		default:
 			// Unsupported ICMP type for NAT-ing.
 			return tupleID{}, getTupleIDNotOK
 		}

 		h, ok := pkt.Data().PullUp(header.IPv4MinimumSize)
 		if !ok {
 			return tupleID{}, getTupleIDNotOK
 		}

 		ipv4 := header.IPv4(h)
 		if ipv4.HeaderLength() > header.IPv4MinimumSize {
 			// TODO(https://gvisor.dev/issue/6765): Handle IPv4 options.
 			return tupleID{}, getTupleIDNotOK
 		}

 		if tid, ok := getTupleIDForPacketInICMPError(pkt, v4NetAndTransHdr, header.IPv4ProtocolNumber, header.IPv4MinimumSize, ipv4.TransportProtocol()); ok {
 			// Do not create a new connection in response to an ICMP error.
 			return tid, getTupleIDOKAndDontAllowNewConn
 		}
 	case header.ICMPv6ProtocolNumber:
 		icmp := header.ICMPv6(pkt.TransportHeader().View())
 		if len(icmp) < header.ICMPv6MinimumSize {
 			return tupleID{}, getTupleIDNotOK
 		}

 		switch icmp.Type() {
 		case header.ICMPv6EchoRequest:
 			return getTupleIDForEchoPacket(pkt, icmp.Ident(), true /* request */), getTupleIDOKAndAllowNewConn
 		case header.ICMPv6EchoReply:
 			// Do not create a new connection in response to a reply packet as only
 			// the first packet of a connection should create a conntrack entry but
 			// a reply is never the first packet sent for a connection.
 			return getTupleIDForEchoPacket(pkt, icmp.Ident(), false /* request */), getTupleIDOKAndDontAllowNewConn
 		case header.ICMPv6DstUnreachable, header.ICMPv6PacketTooBig, header.ICMPv6TimeExceeded, header.ICMPv6ParamProblem:
 			// ICMP errors: continue below to inspect the embedded packet.
 		default:
 			// Unsupported ICMP type for NAT-ing.
 			return tupleID{}, getTupleIDNotOK
 		}

 		h, ok := pkt.Data().PullUp(header.IPv6MinimumSize)
 		if !ok {
 			return tupleID{}, getTupleIDNotOK
 		}

 		// TODO(https://gvisor.dev/issue/6789): Handle extension headers.
 		if tid, ok := getTupleIDForPacketInICMPError(pkt, v6NetAndTransHdr, header.IPv6ProtocolNumber, header.IPv6MinimumSize, header.IPv6(h).TransportProtocol()); ok {
 			// Do not create a new connection in response to an ICMP error.
 			return tid, getTupleIDOKAndDontAllowNewConn
 		}
 	}

 	// Reached for unsupported protocols and for any case above that did not
 	// return a tupleID.
 	return tupleID{}, getTupleIDNotOK
 }

 // init allocates the conntrack hash table with numBuckets empty buckets,
 // replacing any existing table.
 func (ct *ConnTrack) init() {
 	fresh := make([]bucket, numBuckets)

 	ct.mu.Lock()
 	ct.buckets = fresh
 	ct.mu.Unlock()
 }

 // getConnAndUpdate attempts to get a connection or creates one if no
 // connection exists for the packet and packet's protocol is trackable.
 //
 // If the packet's protocol is trackable, the connection's state is updated to
 // match the contents of the packet.
 //
 // Returns the tuple matching pkt's direction, or nil if no tupleID could be
 // obtained for pkt or if no connection exists and pkt is not allowed to
 // create one.
 func (ct *ConnTrack) getConnAndUpdate(pkt *PacketBuffer) *tuple {
 	// Get or (maybe) create a connection.
 	//
 	// The lookup is wrapped in a closure so that the bucket lock taken with
 	// defer below is released before the connection is updated.
 	t := func() *tuple {
 		var allowNewConn bool
 		tid, res := getTupleID(pkt)
 		switch res {
 		case getTupleIDNotOK:
 			return nil
 		case getTupleIDOKAndAllowNewConn:
 			allowNewConn = true
 		case getTupleIDOKAndDontAllowNewConn:
 			allowNewConn = false
 		default:
 			panic(fmt.Sprintf("unhandled %[1]T = %[1]d", res))
 		}

 		bktID := ct.bucket(tid)

 		ct.mu.RLock()
 		bkt := &ct.buckets[bktID]
 		ct.mu.RUnlock()

 		// First look the connection up while only holding the bucket's read
 		// lock.
 		now := ct.clock.NowMonotonic()
 		if t := bkt.connForTID(tid, now); t != nil {
 			return t
 		}

 		if !allowNewConn {
 			return nil
 		}

 		bkt.mu.Lock()
 		defer bkt.mu.Unlock()

 		// Make sure a connection wasn't added between when we last checked the
 		// bucket and acquired the bucket's write lock.
 		if t := bkt.connForTIDRLocked(tid, now); t != nil {
 			return t
 		}

 		// This is the first packet we're seeing for the connection. Create an entry
 		// for this new connection.
 		conn := &conn{
 			ct:       ct,
 			original: tuple{tupleID: tid},
 			reply:    tuple{tupleID: tid.reply(), reply: true},
 			lastUsed: now,
 		}
 		conn.original.conn = conn
 		conn.reply.conn = conn

 		// For now, we only map an entry for the packet's original tuple as NAT may be
 		// performed on this connection. Until the packet goes through all the hooks
 		// and its final address/port is known, we cannot know what the response
 		// packet's addresses/ports will look like.
 		//
 		// This is okay because the destination cannot send its response until it
 		// receives the packet; the packet will only be received once all the hooks
 		// have been performed.
 		//
 		// See (*conn).finalize.
 		bkt.tuples.PushFront(&conn.original)
 		return &conn.original
 	}()
 	if t != nil {
 		t.conn.update(pkt, t.reply)
 	}
 	return t
 }

 // connForTID returns the tuple in the table that matches tid and whose
 // connection has not timed out, or nil if there is none.
 func (ct *ConnTrack) connForTID(tid tupleID) *tuple {
 	idx := ct.bucket(tid)

 	bkt := func() *bucket {
 		ct.mu.RLock()
 		defer ct.mu.RUnlock()
 		return &ct.buckets[idx]
 	}()

 	now := ct.clock.NowMonotonic()
 	return bkt.connForTID(tid, now)
 }

 // connForTID is connForTIDRLocked wrapped in the bucket's read lock.
 func (bkt *bucket) connForTID(tid tupleID, now tcpip.MonotonicTime) *tuple {
 	bkt.mu.RLock()
 	defer bkt.mu.RUnlock()

 	match := bkt.connForTIDRLocked(tid, now)
 	return match
 }

 // connForTIDRLocked returns the first tuple in the bucket whose ID equals tid
 // and whose connection has not timed out as of now, or nil if there is none.
 //
 // +checklocksread:bkt.mu
 func (bkt *bucket) connForTIDRLocked(tid tupleID, now tcpip.MonotonicTime) *tuple {
 	for cur := bkt.tuples.Front(); cur != nil; cur = cur.Next() {
 		if tid != cur.id() {
 			continue
 		}
 		if cur.conn.timedOut(now) {
 			continue
 		}
 		return cur
 	}
 	return nil
 }

 // finalize inserts cn's reply tuple into the table.
 //
 // Returns finalizeResultSuccess if the reply tuple was inserted, or if the
 // table already holds a matching tuple that belongs to cn. Otherwise another
 // connection owns the reply tuple: cn's original tuple is removed from its
 // bucket and finalizeResultConflict is returned.
 func (ct *ConnTrack) finalize(cn *conn) finalizeResult {
 	// Grab the slice under ct.mu; the buckets themselves are protected by
 	// their own locks.
 	ct.mu.RLock()
 	buckets := ct.buckets
 	ct.mu.RUnlock()

 	{
 		tid := cn.reply.id()
 		id := ct.bucket(tid)

 		bkt := &buckets[id]
 		bkt.mu.Lock()
 		t := bkt.connForTIDRLocked(tid, ct.clock.NowMonotonic())
 		if t == nil {
 			bkt.tuples.PushFront(&cn.reply)
 			bkt.mu.Unlock()
 			return finalizeResultSuccess
 		}
 		// The reply bucket's lock is released before the original tuple's
 		// bucket is locked below, so the two are never held together.
 		bkt.mu.Unlock()

 		if t.conn == cn {
 			// We already have an entry for the reply tuple.
 			//
 			// This can occur when the source address/port is the same as the
 			// destination address/port. In this scenario, tid == tid.reply().
 			return finalizeResultSuccess
 		}
 	}

 	// Another connection for the reply already exists. Remove the original and
 	// let the caller know we failed.
 	//
 	// TODO(https://gvisor.dev/issue/6850): Investigate handling this clash
 	// better.

 	tid := cn.original.id()
 	id := ct.bucket(tid)
 	bkt := &buckets[id]
 	bkt.mu.Lock()
 	defer bkt.mu.Unlock()
 	bkt.tuples.Remove(&cn.original)
 	return finalizeResultConflict
 }

 // getFinalizeResult atomically loads the stored result of finalizing the
 // connection.
 func (cn *conn) getFinalizeResult() finalizeResult {
 	raw := cn.finalizeResult.Load()
 	return finalizeResult(raw)
 }

 // finalize attempts to finalize the connection and reports whether it was
 // finalized successfully.
 //
 // On failure the caller should drop the packet associated with the
 // connection.
 //
 // The finalizing work is done at most once per connection. Goroutines that
 // call finalize concurrently all block until the one doing the work is
 // finished, and every call observes the same stored result.
 //
 // Panics if the stored result is neither success nor conflict.
 func (cn *conn) finalize() bool {
 	cn.finalizeOnce.Do(func() {
 		outcome := cn.ct.finalize(cn)
 		cn.finalizeResult.Store(uint32(outcome))
 	})

 	res := cn.getFinalizeResult()
 	if res == finalizeResultSuccess {
 		return true
 	}
 	if res == finalizeResultConflict {
 		return false
 	}
 	panic(fmt.Sprintf("unhandled result = %d", res))
 }

 // maybePerformNoopNAT marks the destination (dnat == true) or source
 // (dnat == false) manipulation of the connection as a performed no-op, unless
 // a manipulation was already recorded for that side.
 func (cn *conn) maybePerformNoopNAT(dnat bool) {
 	cn.mu.Lock()
 	defer cn.mu.Unlock()

 	target := &cn.sourceManip
 	if dnat {
 		target = &cn.destinationManip
 	}

 	if *target != manipNotPerformed {
 		return
 	}
 	*target = manipPerformedNoop
 }

 // portOrIdentRange is the range of ports (or echo idents) that
 // (*conn).performNAT may pick from: the size values starting at start.
 type portOrIdentRange struct {
 	// start is the first port/ident in the range.
 	start uint16
 	// size is the number of ports/idents in the range. performNAT panics if
 	// start+size-1 exceeds math.MaxUint16.
 	size  uint32
 }

 // performNAT sets up the connection for the specified NAT and rewrites the
 // packet.
 //
 // If NAT has already been performed on the connection, then the packet will
 // be rewritten with the NAT performed on the connection, ignoring the passed
 // address and port range.
 //
 // Generally, only the first packet of a connection reaches this method; other
 // packets will be manipulated without needing to modify the connection.
 //
 // dnat selects destination NAT (true) or source NAT (false). natAddress is
 // the address to translate to and portsOrIdents is the range the translated
 // port/ident must come from. hook and r are passed on to handlePacket, which
 // rewrites pkt.
 //
 // Panics if the last port/ident of portsOrIdents exceeds math.MaxUint16.
 func (cn *conn) performNAT(pkt *PacketBuffer, hook Hook, r *Route, portsOrIdents portOrIdentRange, natAddress tcpip.Address, dnat bool) {
 	lastPortOrIdent := func() uint16 {
 		lastPortOrIdent := uint32(portsOrIdents.start) + portsOrIdents.size - 1
 		if lastPortOrIdent > math.MaxUint16 {
 			panic(fmt.Sprintf("got lastPortOrIdent = %d, want <= MaxUint16(=%d); portsOrIdents=%#v", lastPortOrIdent, math.MaxUint16, portsOrIdents))
 		}
 		return uint16(lastPortOrIdent)
 	}()

 	// Make sure the packet is re-written after performing NAT.
 	//
 	// This is deferred first, so it runs last: after the locks taken below
 	// have been released by their own deferred unlocks.
 	defer func() {
 		// handlePacket returns true if the packet may skip the NAT table as the
 		// connection is already NATed, but if we reach this point we must be in the
 		// NAT table, so the return value is useless for us.
 		_ = cn.handlePacket(pkt, hook, r)
 	}()

 	cn.mu.Lock()
 	defer cn.mu.Unlock()

 	cn.reply.mu.Lock()
 	defer cn.reply.mu.Unlock()

 	// NAT is recorded on the reply tuple: DNAT changes where replies come
 	// from (the reply's source) and SNAT changes where replies are sent to
 	// (the reply's destination).
 	var manip *manipType
 	var address *tcpip.Address
 	var portOrIdent *uint16
 	if dnat {
 		manip = &cn.destinationManip
 		address = &cn.reply.tupleID.srcAddr
 		portOrIdent = &cn.reply.tupleID.srcPortOrEchoRequestIdent
 	} else {
 		manip = &cn.sourceManip
 		address = &cn.reply.tupleID.dstAddr
 		portOrIdent = &cn.reply.tupleID.dstPortOrEchoReplyIdent
 	}

 	// NAT was already set up for this side of the connection; only the
 	// deferred packet rewrite is left to do.
 	if *manip != manipNotPerformed {
 		return
 	}
 	*manip = manipPerformed
 	*address = natAddress

 	// Does the current port/ident fit in the range?
 	if portsOrIdents.start <= *portOrIdent && *portOrIdent <= lastPortOrIdent {
 		// Yes, is the current reply tuple unique?
 		if other := cn.ct.connForTID(cn.reply.tupleID); other == nil {
 			// Yes! No need to change the port.
 			return
 		}
 	}

 	// Try our best to find a port/ident that results in a unique reply tuple.
 	//
 	// We limit the number of attempts to find a unique tuple to not waste a lot
 	// of time looking for a unique tuple.
 	//
 	// Matches linux behaviour introduced in
 	// https://github.com/torvalds/linux/commit/a504b703bb1da526a01593da0e4be2af9d9f5fa8.
 	const maxAttemptsForInitialRound uint32 = 128
 	const minAttemptsToContinue = 16

 	allowedInitialAttempts := maxAttemptsForInitialRound
 	if allowedInitialAttempts > portsOrIdents.size {
 		allowedInitialAttempts = portsOrIdents.size
 	}

 	// Each round makes half as many attempts as the previous one. The loop
 	// has no condition and is only left through one of the returns below.
 	for maxAttempts := allowedInitialAttempts; ; maxAttempts /= 2 {
 		// Start each round with a random initial port/ident offset.
 		randOffset := cn.ct.rand.Uint32()

 		for i := uint32(0); i < maxAttempts; i++ {
 			newPortOrIdentU32 := uint32(portsOrIdents.start) + (randOffset+i)%portsOrIdents.size
 			if newPortOrIdentU32 > math.MaxUint16 {
 				panic(fmt.Sprintf("got newPortOrIdentU32 = %d, want <= MaxUint16(=%d); portsOrIdents=%#v, randOffset=%d", newPortOrIdentU32, math.MaxUint16, portsOrIdents, randOffset))
 			}

 			*portOrIdent = uint16(newPortOrIdentU32)

 			if other := cn.ct.connForTID(cn.reply.tupleID); other == nil {
 				// We found a unique tuple!
 				return
 			}
 		}

 		if maxAttempts == portsOrIdents.size {
 			// We already tried all the ports/idents in the range so no need to keep
 			// trying.
 			return
 		}

 		if maxAttempts < minAttemptsToContinue {
 			return
 		}
 	}

 	// We did not find a unique tuple, use the last used port anyways.
 	// TODO(https://gvisor.dev/issue/6850): Handle not finding a unique tuple
 	// better (e.g. remove the connection and drop the packet).
 	//
 	// NOTE: control never reaches this point; the comment above describes the
 	// state at the last two returns inside the loop.
 }

 // handlePacket attempts to handle a packet and perform NAT if the connection
 // has had NAT performed on it.
 //
 // hook is the hook the packet is traversing; it selects whether destination
 // or source NAT is applied and how checksums are updated. rt is consulted in
 // the Output and Postrouting hooks to decide whether a full transport
 // checksum is required.
 //
 // Returns true if the packet can skip the NAT table. Returns false if the
 // packet's headers could not be obtained or if the relevant NAT has not been
 // performed on the connection.
 //
 // Panics if called in the Forward hook, with an unrecognized hook, or if the
 // packet already had the relevant NAT applied.
 func (cn *conn) handlePacket(pkt *PacketBuffer, hook Hook, rt *Route) bool {
 	netHdr, transHdr, isICMPError, ok := getHeaders(pkt)
 	if !ok {
 		return false
 	}

 	// Defaults are for source NAT (Input and Postrouting hooks); the
 	// Prerouting and Output hooks switch to destination NAT below.
 	fullChecksum := false
 	updatePseudoHeader := false
 	natDone := &pkt.snatDone
 	dnat := false
 	switch hook {
 	case Prerouting:
 		// Packet came from outside the stack so it must have a checksum set
 		// already.
 		fullChecksum = true
 		updatePseudoHeader = true

 		natDone = &pkt.dnatDone
 		dnat = true
 	case Input:
 	case Forward:
 		panic("should not handle packet in the forwarding hook")
 	case Output:
 		natDone = &pkt.dnatDone
 		dnat = true
 		// Output shares the checksum handling of Postrouting.
 		fallthrough
 	case Postrouting:
 		if pkt.TransportProtocolNumber == header.TCPProtocolNumber && pkt.GSOOptions.Type != GSONone && pkt.GSOOptions.NeedsCsum {
 			updatePseudoHeader = true
 		} else if rt.RequiresTXTransportChecksum() {
 			fullChecksum = true
 			updatePseudoHeader = true
 		}
 	default:
 		panic(fmt.Sprintf("unrecognized hook = %d", hook))
 	}

 	if *natDone {
 		panic(fmt.Sprintf("packet already had NAT(dnat=%t) performed at hook=%s; pkt=%#v", dnat, hook, pkt))
 	}

 	// TODO(gvisor.dev/issue/5748): TCP checksums on inbound packets should be
 	// validated if checksum offloading is off. It may require IP defrag if the
 	// packets are fragmented.

 	reply := pkt.tuple.reply

 	// Pick the tuple that holds the addresses/ports to rewrite the packet
 	// with (the tuple of the opposite direction) and the manipulation that
 	// applies to this packet. For reply packets the roles of source and
 	// destination manipulation are swapped.
 	tid, manip := func() (tupleID, manipType) {
 		cn.mu.RLock()
 		defer cn.mu.RUnlock()

 		if reply {
 			tid := cn.original.id()

 			if dnat {
 				return tid, cn.sourceManip
 			}
 			return tid, cn.destinationManip
 		}

 		tid := cn.reply.id()
 		if dnat {
 			return tid, cn.destinationManip
 		}
 		return tid, cn.sourceManip
 	}()
 	switch manip {
 	case manipNotPerformed:
 		return false
 	case manipPerformedNoop:
 		*natDone = true
 		return true
 	case manipPerformed:
 	default:
 		panic(fmt.Sprintf("unhandled manip = %d", manip))
 	}

 	newPort := tid.dstPortOrEchoReplyIdent
 	newAddr := tid.dstAddr
 	if dnat {
 		newPort = tid.srcPortOrEchoRequestIdent
 		newAddr = tid.srcAddr
 	}

 	// The third argument is !dnat for regular packets and dnat for ICMP
 	// errors, where netHdr and transHdr belong to the embedded packet.
 	rewritePacket(
 		netHdr,
 		transHdr,
 		!dnat != isICMPError,
 		fullChecksum,
 		updatePseudoHeader,
 		newPort,
 		newAddr,
 	)

 	*natDone = true

 	if !isICMPError {
 		return true
 	}

 	// We performed NAT on (erroneous) packet that triggered an ICMP response, but
 	// not the ICMP packet itself.
 	switch pkt.TransportProtocolNumber {
 	case header.ICMPv4ProtocolNumber:
 		icmp := header.ICMPv4(pkt.TransportHeader().View())
 		// TODO(https://gvisor.dev/issue/6788): Incrementally update ICMP checksum.
 		icmp.SetChecksum(0)
 		icmp.SetChecksum(header.ICMPv4Checksum(icmp, pkt.Data().AsRange().Checksum()))

 		network := header.IPv4(pkt.NetworkHeader().View())
 		if dnat {
 			network.SetDestinationAddressWithChecksumUpdate(tid.srcAddr)
 		} else {
 			network.SetSourceAddressWithChecksumUpdate(tid.dstAddr)
 		}
 	case header.ICMPv6ProtocolNumber:
 		network := header.IPv6(pkt.NetworkHeader().View())
 		srcAddr := network.SourceAddress()
 		dstAddr := network.DestinationAddress()
 		if dnat {
 			dstAddr = tid.srcAddr
 		} else {
 			srcAddr = tid.dstAddr
 		}

 		// The new addresses are passed to the ICMPv6 checksum computation,
 		// so the checksum is recomputed with them before the network header
 		// is updated.
 		icmp := header.ICMPv6(pkt.TransportHeader().View())
 		// TODO(https://gvisor.dev/issue/6788): Incrementally update ICMP checksum.
 		icmp.SetChecksum(0)
 		payload := pkt.Data()
 		icmp.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{
 			Header:      icmp,
 			Src:         srcAddr,
 			Dst:         dstAddr,
 			PayloadCsum: payload.AsRange().Checksum(),
 			PayloadLen:  payload.Size(),
 		}))

 		if dnat {
 			network.SetDestinationAddress(dstAddr)
 		} else {
 			network.SetSourceAddress(srcAddr)
 		}
 	}

 	return true
 }

 // bucket gets the conntrack bucket for a tupleID.
 //
 // The returned index is a seeded Jenkins hash of id's addresses,
 // ports/idents and protocol numbers, reduced modulo the number of buckets. It
 // is always in [0, len(ct.buckets)).
 func (ct *ConnTrack) bucket(id tupleID) int {
 	h := jenkins.Sum32(ct.seed)
 	h.Write([]byte(id.srcAddr))
 	h.Write([]byte(id.dstAddr))

 	// The 16-bit fields are hashed in little-endian byte order, in a fixed
 	// sequence, through a reusable scratch buffer.
 	var shortBuf [2]byte
 	for _, v := range [...]uint16{
 		id.srcPortOrEchoRequestIdent,
 		id.dstPortOrEchoReplyIdent,
 		uint16(id.transProto),
 		uint16(id.netProto),
 	} {
 		binary.LittleEndian.PutUint16(shortBuf[:], v)
 		h.Write(shortBuf[:])
 	}

 	ct.mu.RLock()
 	defer ct.mu.RUnlock()
 	// Reduce in uint32 before converting to int: on platforms where int is 32
 	// bits wide, converting a hash with the top bit set to int first would
 	// give a negative value and therefore a negative (out of range) index.
 	return int(h.Sum32() % uint32(len(ct.buckets)))
 }

 // reapUnused deletes timed out entries from the conntrack map. The rules for
 // reaping are:
 // - Each call to reapUnused traverses a fraction of the conntrack table.
 //   Specifically, it traverses len(ct.buckets)/fractionPerReaping.
 // - After reaping, reapUnused decides when it should next run based on the
 //   ratio of expired connections to examined connections. If the ratio is
 //   greater than maxExpiredPct, it schedules the next run quickly. Otherwise it
 //   slightly increases the interval between runs.
 // - maxFullTraversal caps the time it takes to traverse the entire table.
 //
 // reapUnused returns the next bucket that should be checked and the time after
 // which it should be called again.
 func (ct *ConnTrack) reapUnused(start int, prevInterval time.Duration) (int, time.Duration) {
 	const fractionPerReaping = 128
 	const maxExpiredPct = 50
 	const maxFullTraversal = 60 * time.Second
 	const minInterval = 10 * time.Millisecond
 	const maxInterval = maxFullTraversal / fractionPerReaping

 	now := ct.clock.NowMonotonic()
 	checked := 0
 	expired := 0
 	var idx int
 	ct.mu.RLock()
 	defer ct.mu.RUnlock()
 	for i := 0; i < len(ct.buckets)/fractionPerReaping; i++ {
 		idx = (i + start) % len(ct.buckets)
 		bkt := &ct.buckets[idx]
 		bkt.mu.Lock()
 		for tuple := bkt.tuples.Front(); tuple != nil; {
 			// reapTupleLocked updates tuple's next pointer so we grab it here.
 			nextTuple := tuple.Next()

 			checked++
 			if ct.reapTupleLocked(tuple, idx, bkt, now) {
 				expired++
 			}

 			tuple = nextTuple
 		}
 		bkt.mu.Unlock()
 	}
 	// We already checked buckets[idx].
 	idx++

 	// If half or more of the connections are expired, the table has gotten
 	// stale. Reschedule quickly.
 	expiredPct := 0
 	if checked != 0 {
 		expiredPct = expired * 100 / checked
 	}
 	if expiredPct > maxExpiredPct {
 		return idx, minInterval
 	}
 	if interval := prevInterval + minInterval; interval <= maxInterval {
 		// Increment the interval between runs.
 		return idx, interval
 	}
 	// We've hit the maximum interval.
 	return idx, maxInterval
 }

 // reapTupleLocked tries to remove tuple and its reply from the table. It
 // returns whether the tuple's connection has timed out.
 //
 // Precondition: ct.mu is read locked and bkt.mu is write locked.
 // +checklocksread:ct.mu
 // +checklocks:bkt.mu
 func (ct *ConnTrack) reapTupleLocked(reapingTuple *tuple, bktID int, bkt *bucket, now tcpip.MonotonicTime) bool {
 	if !reapingTuple.conn.timedOut(now) {
 		return false
 	}

 	var otherTuple *tuple
 	if reapingTuple.reply {
 		otherTuple = &reapingTuple.conn.original
 	} else {
 		otherTuple = &reapingTuple.conn.reply
 	}

 	otherTupleBktID := ct.bucket(otherTuple.id())
 	replyTupleInserted := reapingTuple.conn.getFinalizeResult() == finalizeResultSuccess

 	// To maintain lock order, we can only reap both tuples if the tuple for the
 	// other direction appears later in the table.
 	if bktID > otherTupleBktID && replyTupleInserted {
 		return true
 	}

 	bkt.tuples.Remove(reapingTuple)

 	if !replyTupleInserted {
 		// The other tuple is the reply which has not yet been inserted.
 		return true
 	}

 	// Reap the other connection.
 	if bktID == otherTupleBktID {
 		// Don't re-lock if both tuples are in the same bucket.
 		bkt.tuples.Remove(otherTuple)
 	} else {
 		otherTupleBkt := &ct.buckets[otherTupleBktID]
 		otherTupleBkt.mu.Lock()
 		otherTupleBkt.tuples.Remove(otherTuple)
 		otherTupleBkt.mu.Unlock()
 	}

 	return true
 }

 // originalDst reports the destination address and port recorded in the
 // original-direction tuple of the tracked connection that matches epID.
 //
 // The lookup tuple uses epID's local address and port as its source and epID's
 // remote address and port as its destination.
 //
 // It returns *tcpip.ErrNotConnected when no tracked connection matches, and
 // *tcpip.ErrInvalidOptionValue when the connection's destination has not been
 // manipulated.
 func (ct *ConnTrack) originalDst(epID TransportEndpointID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber) (tcpip.Address, uint16, tcpip.Error) {
 	// Find the tracked tuple for this endpoint. The destination of its
 	// connection's original tuple is the address we are after.
 	match := ct.connForTID(tupleID{
 		srcAddr:                   epID.LocalAddress,
 		srcPortOrEchoRequestIdent: epID.LocalPort,
 		dstAddr:                   epID.RemoteAddress,
 		dstPortOrEchoReplyIdent:   epID.RemotePort,
 		transProto:                transProto,
 		netProto:                  netProto,
 	})
 	if match == nil {
 		// No connection is being tracked for this endpoint.
 		return "", 0, &tcpip.ErrNotConnected{}
 	}

 	cn := match.conn
 	cn.mu.RLock()
 	defer cn.mu.RUnlock()

 	if cn.destinationManip == manipNotPerformed {
 		// The destination was never rewritten, so there is nothing to report.
 		return "", 0, &tcpip.ErrInvalidOptionValue{}
 	}

 	origID := cn.original.id()
 	return origID.dstAddr, origID.dstPortOrEchoReplyIdent, nil
 }