pkg/tcpip/network/ipv4/ipv4.go - third_party/gvisor.dev/gvisor/netstack - Git at Google

 // Copyright 2021 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Package ipv4 contains the implementation of the ipv4 network protocol.
 package ipv4

 import (
 	"fmt"
 	"math"
 	"reflect"
 	"sync/atomic"
 	"time"

 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/header"
 	"gvisor.dev/gvisor/pkg/tcpip/header/parse"
 	"gvisor.dev/gvisor/pkg/tcpip/network/hash"
 	"gvisor.dev/gvisor/pkg/tcpip/network/internal/fragmentation"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
 )

 const (
 	// ReassembleTimeout is the time a packet stays in the reassembly
 	// system before being evicted.
 	// As per RFC 791 section 3.2:
 	//   The current recommendation for the initial timer setting is 15 seconds.
 	//   This may be changed as experience with this protocol accumulates.
 	//
 	// Considering that it is an old recommendation, we use the same reassembly
 	// timeout that linux defines, which is 30 seconds:
 	// https://github.com/torvalds/linux/blob/47ec5303d73ea344e84f46660fff693c57641386/include/net/ip.h#L138
 	ReassembleTimeout = 30 * time.Second

 	// ProtocolNumber is the ipv4 protocol number.
 	ProtocolNumber = header.IPv4ProtocolNumber

 	// MaxTotalSize is maximum size that can be encoded in the 16-bit
 	// TotalLength field of the ipv4 header.
 	MaxTotalSize = 0xffff

 	// DefaultTTL is the default time-to-live value for this endpoint.
 	DefaultTTL = 64

 	// buckets is the number of identifier buckets.
 	buckets = 2048

 	// The size of a fragment block, in bytes, as per RFC 791 section 3.1,
 	// page 14.
 	fragmentblockSize = 8
 )

 var ipv4BroadcastAddr = header.IPv4Broadcast.WithPrefix()

 var _ stack.LinkResolvableNetworkEndpoint = (*endpoint)(nil)
 var _ stack.GroupAddressableEndpoint = (*endpoint)(nil)
 var _ stack.AddressableEndpoint = (*endpoint)(nil)
 var _ stack.NetworkEndpoint = (*endpoint)(nil)

 type endpoint struct {
 	nic        stack.NetworkInterface
 	dispatcher stack.TransportDispatcher
 	protocol   *protocol
 	stats      sharedStats

 	// enabled is set to 1 when the enpoint is enabled and 0 when it is
 	// disabled.
 	//
 	// Must be accessed using atomic operations.
 	enabled uint32

 	mu struct {
 		sync.RWMutex

 		addressableEndpointState stack.AddressableEndpointState
 		igmp                     igmpState
 	}
 }

 // HandleLinkResolutionFailure implements stack.LinkResolvableNetworkEndpoint.
 func (e *endpoint) HandleLinkResolutionFailure(pkt *stack.PacketBuffer) {
 	// handleControl expects the entire offending packet to be in the packet
 	// buffer's data field.
 	pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{
 		Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()),
 	})
 	pkt.NICID = e.nic.ID()
 	pkt.NetworkProtocolNumber = ProtocolNumber
 	// Use the same control type as an ICMPv4 destination host unreachable error
 	// since the host is considered unreachable if we cannot resolve the link
 	// address to the next hop.
 	e.handleControl(&icmpv4DestinationHostUnreachableSockError{}, pkt)
 }

 // NewEndpoint creates a new ipv4 endpoint.
 func (p *protocol) NewEndpoint(nic stack.NetworkInterface, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint {
 	e := &endpoint{
 		nic:        nic,
 		dispatcher: dispatcher,
 		protocol:   p,
 	}
 	e.mu.Lock()
 	e.mu.addressableEndpointState.Init(e)
 	e.mu.igmp.init(e)
 	e.mu.Unlock()

 	tcpip.InitStatCounters(reflect.ValueOf(&e.stats.localStats).Elem())

 	stackStats := p.stack.Stats()
 	e.stats.ip.Init(&e.stats.localStats.IP, &stackStats.IP)
 	e.stats.icmp.init(&e.stats.localStats.ICMP, &stackStats.ICMP.V4)
 	e.stats.igmp.init(&e.stats.localStats.IGMP, &stackStats.IGMP)

 	p.mu.Lock()
 	p.mu.eps[nic.ID()] = e
 	p.mu.Unlock()

 	return e
 }

 func (p *protocol) findEndpointWithAddress(addr tcpip.Address) *endpoint {
 	p.mu.RLock()
 	defer p.mu.RUnlock()

 	for _, e := range p.mu.eps {
 		if addressEndpoint := e.AcquireAssignedAddress(addr, false /* allowTemp */, stack.NeverPrimaryEndpoint); addressEndpoint != nil {
 			addressEndpoint.DecRef()
 			return e
 		}
 	}

 	return nil
 }

 func (p *protocol) forgetEndpoint(nicID tcpip.NICID) {
 	p.mu.Lock()
 	defer p.mu.Unlock()
 	delete(p.mu.eps, nicID)
 }

 // Enable implements stack.NetworkEndpoint.
 func (e *endpoint) Enable() tcpip.Error {
 	e.mu.Lock()
 	defer e.mu.Unlock()

 	// If the NIC is not enabled, the endpoint can't do anything meaningful so
 	// don't enable the endpoint.
 	if !e.nic.Enabled() {
 		return &tcpip.ErrNotPermitted{}
 	}

 	// If the endpoint is already enabled, there is nothing for it to do.
 	if !e.setEnabled(true) {
 		return nil
 	}

 	// Create an endpoint to receive broadcast packets on this interface.
 	ep, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(ipv4BroadcastAddr, stack.NeverPrimaryEndpoint, stack.AddressConfigStatic, false /* deprecated */)
 	if err != nil {
 		return err
 	}
 	// We have no need for the address endpoint.
 	ep.DecRef()

 	// Groups may have been joined while the endpoint was disabled, or the
 	// endpoint may have left groups from the perspective of IGMP when the
 	// endpoint was disabled. Either way, we need to let routers know to
 	// send us multicast traffic.
 	e.mu.igmp.initializeAll()

 	// As per RFC 1122 section 3.3.7, all hosts should join the all-hosts
 	// multicast group. Note, the IANA calls the all-hosts multicast group the
 	// all-systems multicast group.
 	if err := e.joinGroupLocked(header.IPv4AllSystems); err != nil {
 		// joinGroupLocked only returns an error if the group address is not a valid
 		// IPv4 multicast address.
 		panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv4AllSystems, err))
 	}

 	return nil
 }

 // Enabled implements stack.NetworkEndpoint.
 func (e *endpoint) Enabled() bool {
 	return e.nic.Enabled() && e.isEnabled()
 }

 // isEnabled returns true if the endpoint is enabled, regardless of the
 // enabled status of the NIC.
 func (e *endpoint) isEnabled() bool {
 	return atomic.LoadUint32(&e.enabled) == 1
 }

 // setEnabled sets the enabled status for the endpoint.
 //
 // Returns true if the enabled status was updated.
 func (e *endpoint) setEnabled(v bool) bool {
 	if v {
 		return atomic.SwapUint32(&e.enabled, 1) == 0
 	}
 	return atomic.SwapUint32(&e.enabled, 0) == 1
 }

 // Disable implements stack.NetworkEndpoint.
 func (e *endpoint) Disable() {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 	e.disableLocked()
 }

 func (e *endpoint) disableLocked() {
 	if !e.isEnabled() {
 		return
 	}

 	// The endpoint may have already left the multicast group.
 	switch err := e.leaveGroupLocked(header.IPv4AllSystems); err.(type) {
 	case nil, *tcpip.ErrBadLocalAddress:
 	default:
 		panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv4AllSystems, err))
 	}

 	// Leave groups from the perspective of IGMP so that routers know that
 	// we are no longer interested in the group.
 	e.mu.igmp.softLeaveAll()

 	// The address may have already been removed.
 	switch err := e.mu.addressableEndpointState.RemovePermanentAddress(ipv4BroadcastAddr.Address); err.(type) {
 	case nil, *tcpip.ErrBadLocalAddress:
 	default:
 		panic(fmt.Sprintf("unexpected error when removing address = %s: %s", ipv4BroadcastAddr.Address, err))
 	}

 	// Reset the IGMP V1 present flag.
 	//
 	// If the node comes back up on the same network, it will re-learn that it
 	// needs to perform IGMPv1.
 	e.mu.igmp.resetV1Present()

 	if !e.setEnabled(false) {
 		panic("should have only done work to disable the endpoint if it was enabled")
 	}
 }

 // DefaultTTL is the default time-to-live value for this endpoint.
 func (e *endpoint) DefaultTTL() uint8 {
 	return e.protocol.DefaultTTL()
 }

 // MTU implements stack.NetworkEndpoint.MTU. It returns the link-layer MTU minus
 // the network layer max header length.
 func (e *endpoint) MTU() uint32 {
 	networkMTU, err := calculateNetworkMTU(e.nic.MTU(), header.IPv4MinimumSize)
 	if err != nil {
 		return 0
 	}
 	return networkMTU
 }

 // MaxHeaderLength returns the maximum length needed by ipv4 headers (and
 // underlying protocols).
 func (e *endpoint) MaxHeaderLength() uint16 {
 	return e.nic.MaxHeaderLength() + header.IPv4MaximumHeaderSize
 }

 // NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber.
 func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
 	return e.protocol.Number()
 }

 func (e *endpoint) addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams, options header.IPv4OptionsSerializer) tcpip.Error {
 	hdrLen := header.IPv4MinimumSize
 	var optLen int
 	if options != nil {
 		optLen = int(options.Length())
 	}
 	hdrLen += optLen
 	if hdrLen > header.IPv4MaximumHeaderSize {
 		return &tcpip.ErrMessageTooLong{}
 	}
 	ip := header.IPv4(pkt.NetworkHeader().Push(hdrLen))
 	length := pkt.Size()
 	if length > math.MaxUint16 {
 		return &tcpip.ErrMessageTooLong{}
 	}
 	// RFC 6864 section 4.3 mandates uniqueness of ID values for non-atomic
 	// datagrams. Since the DF bit is never being set here, all datagrams
 	// are non-atomic and need an ID.
 	id := atomic.AddUint32(&e.protocol.ids[hashRoute(srcAddr, dstAddr, params.Protocol, e.protocol.hashIV)%buckets], 1)
 	ip.Encode(&header.IPv4Fields{
 		TotalLength: uint16(length),
 		ID:          uint16(id),
 		TTL:         params.TTL,
 		TOS:         params.TOS,
 		Protocol:    uint8(params.Protocol),
 		SrcAddr:     srcAddr,
 		DstAddr:     dstAddr,
 		Options:     options,
 	})
 	ip.SetChecksum(^ip.CalculateChecksum())
 	pkt.NetworkProtocolNumber = ProtocolNumber
 	return nil
 }

 // handleFragments fragments pkt and calls the handler function on each
 // fragment. It returns the number of fragments handled and the number of
 // fragments left to be processed. The IP header must already be present in the
 // original packet.
 func (e *endpoint) handleFragments(r *stack.Route, gso *stack.GSO, networkMTU uint32, pkt *stack.PacketBuffer, handler func(*stack.PacketBuffer) tcpip.Error) (int, int, tcpip.Error) {
 	// Round the MTU down to align to 8 bytes.
 	fragmentPayloadSize := networkMTU &^ 7
 	networkHeader := header.IPv4(pkt.NetworkHeader().View())
 	pf := fragmentation.MakePacketFragmenter(pkt, fragmentPayloadSize, pkt.AvailableHeaderBytes()+len(networkHeader))

 	var n int
 	for {
 		fragPkt, more := buildNextFragment(&pf, networkHeader)
 		if err := handler(fragPkt); err != nil {
 			return n, pf.RemainingFragmentCount() + 1, err
 		}
 		n++
 		if !more {
 			return n, pf.RemainingFragmentCount(), nil
 		}
 	}
 }

 // WritePacket writes a packet to the given destination address and protocol.
 func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) tcpip.Error {
 	if err := e.addIPHeader(r.LocalAddress, r.RemoteAddress, pkt, params, nil /* options */); err != nil {
 		return err
 	}

 	// iptables filtering. All packets that reach here are locally
 	// generated.
 	outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
 	if ok := e.protocol.stack.IPTables().Check(stack.Output, pkt, gso, r, "" /* preroutingAddr */, "" /* inNicName */, outNicName); !ok {
 		// iptables is telling us to drop the packet.
 		e.stats.ip.IPTablesOutputDropped.Increment()
 		return nil
 	}

 	// If the packet is manipulated as per NAT Output rules, handle packet
 	// based on destination address and do not send the packet to link
 	// layer.
 	//
 	// TODO(gvisor.dev/issue/170): We should do this for every
 	// packet, rather than only NATted packets, but removing this check
 	// short circuits broadcasts before they are sent out to other hosts.
 	if pkt.NatDone {
 		netHeader := header.IPv4(pkt.NetworkHeader().View())
 		if ep := e.protocol.findEndpointWithAddress(netHeader.DestinationAddress()); ep != nil {
 			// Since we rewrote the packet but it is being routed back to us, we
 			// can safely assume the checksum is valid.
 			ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */)
 			return nil
 		}
 	}

 	return e.writePacket(r, gso, pkt, false /* headerIncluded */)
 }

 func (e *endpoint) writePacket(r *stack.Route, gso *stack.GSO, pkt *stack.PacketBuffer, headerIncluded bool) tcpip.Error {
 	if r.Loop&stack.PacketLoop != 0 {
 		// If the packet was generated by the stack (not a raw/packet endpoint
 		// where a packet may be written with the header included), then we can
 		// safely assume the checksum is valid.
 		e.handleLocalPacket(pkt, !headerIncluded /* canSkipRXChecksum */)
 	}
 	if r.Loop&stack.PacketOut == 0 {
 		return nil
 	}

 	stats := e.stats.ip

 	networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size()))
 	if err != nil {
 		stats.OutgoingPacketErrors.Increment()
 		return err
 	}

 	if packetMustBeFragmented(pkt, networkMTU, gso) {
 		sent, remain, err := e.handleFragments(r, gso, networkMTU, pkt, func(fragPkt *stack.PacketBuffer) tcpip.Error {
 			// TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each
 			// fragment one by one using WritePacket() (current strategy) or if we
 			// want to create a PacketBufferList from the fragments and feed it to
 			// WritePackets(). It'll be faster but cost more memory.
 			return e.nic.WritePacket(r, gso, ProtocolNumber, fragPkt)
 		})
 		stats.PacketsSent.IncrementBy(uint64(sent))
 		stats.OutgoingPacketErrors.IncrementBy(uint64(remain))
 		return err
 	}

 	if err := e.nic.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
 		stats.OutgoingPacketErrors.Increment()
 		return err
 	}
 	stats.PacketsSent.Increment()
 	return nil
 }

 // WritePackets implements stack.NetworkEndpoint.WritePackets.
 func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, tcpip.Error) {
 	if r.Loop&stack.PacketLoop != 0 {
 		panic("multiple packets in local loop")
 	}
 	if r.Loop&stack.PacketOut == 0 {
 		return pkts.Len(), nil
 	}

 	stats := e.stats.ip

 	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
 		if err := e.addIPHeader(r.LocalAddress, r.RemoteAddress, pkt, params, nil /* options */); err != nil {
 			return 0, err
 		}

 		networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size()))
 		if err != nil {
 			stats.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len()))
 			return 0, err
 		}

 		if packetMustBeFragmented(pkt, networkMTU, gso) {
 			// Keep track of the packet that is about to be fragmented so it can be
 			// removed once the fragmentation is done.
 			originalPkt := pkt
 			if _, _, err := e.handleFragments(r, gso, networkMTU, pkt, func(fragPkt *stack.PacketBuffer) tcpip.Error {
 				// Modify the packet list in place with the new fragments.
 				pkts.InsertAfter(pkt, fragPkt)
 				pkt = fragPkt
 				return nil
 			}); err != nil {
 				panic(fmt.Sprintf("e.handleFragments(_, _, %d, _, _) = %s", networkMTU, err))
 			}
 			// Remove the packet that was just fragmented and process the rest.
 			pkts.Remove(originalPkt)
 		}
 	}

 	outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
 	// iptables filtering. All packets that reach here are locally
 	// generated.
 	dropped, natPkts := e.protocol.stack.IPTables().CheckPackets(stack.Output, pkts, gso, r, "", outNicName)
 	stats.IPTablesOutputDropped.IncrementBy(uint64(len(dropped)))
 	for pkt := range dropped {
 		pkts.Remove(pkt)
 	}

 	// The NAT-ed packets may now be destined for us.
 	locallyDelivered := 0
 	for pkt := range natPkts {
 		ep := e.protocol.findEndpointWithAddress(header.IPv4(pkt.NetworkHeader().View()).DestinationAddress())
 		if ep == nil {
 			// The NAT-ed packet is still destined for some remote node.
 			continue
 		}

 		// Do not send the locally destined packet out the NIC.
 		pkts.Remove(pkt)

 		// Deliver the packet locally.
 		ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */)
 		locallyDelivered++

 	}

 	// The rest of the packets can be delivered to the NIC as a batch.
 	pktsLen := pkts.Len()
 	written, err := e.nic.WritePackets(r, gso, pkts, ProtocolNumber)
 	stats.PacketsSent.IncrementBy(uint64(written))
 	stats.OutgoingPacketErrors.IncrementBy(uint64(pktsLen - written))

 	// Dropped packets aren't errors, so include them in the return value.
 	return locallyDelivered + written + len(dropped), err
 }

 // WriteHeaderIncludedPacket implements stack.NetworkEndpoint.
 func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) tcpip.Error {
 	// The packet already has an IP header, but there are a few required
 	// checks.
 	h, ok := pkt.Data().PullUp(header.IPv4MinimumSize)
 	if !ok {
 		return &tcpip.ErrMalformedHeader{}
 	}

 	hdrLen := header.IPv4(h).HeaderLength()
 	if hdrLen < header.IPv4MinimumSize {
 		return &tcpip.ErrMalformedHeader{}
 	}

 	h, ok = pkt.Data().PullUp(int(hdrLen))
 	if !ok {
 		return &tcpip.ErrMalformedHeader{}
 	}
 	ip := header.IPv4(h)

 	// Always set the total length.
 	pktSize := pkt.Data().Size()
 	ip.SetTotalLength(uint16(pktSize))

 	// Set the source address when zero.
 	if ip.SourceAddress() == header.IPv4Any {
 		ip.SetSourceAddress(r.LocalAddress)
 	}

 	// Set the destination. If the packet already included a destination, it will
 	// be part of the route anyways.
 	ip.SetDestinationAddress(r.RemoteAddress)

 	// Set the packet ID when zero.
 	if ip.ID() == 0 {
 		// RFC 6864 section 4.3 mandates uniqueness of ID values for
 		// non-atomic datagrams, so assign an ID to all such datagrams
 		// according to the definition given in RFC 6864 section 4.
 		if ip.Flags()&header.IPv4FlagDontFragment == 0 || ip.Flags()&header.IPv4FlagMoreFragments != 0 || ip.FragmentOffset() > 0 {
 			ip.SetID(uint16(atomic.AddUint32(&e.protocol.ids[hashRoute(r.LocalAddress, r.RemoteAddress, 0 /* protocol */, e.protocol.hashIV)%buckets], 1)))
 		}
 	}

 	// Always set the checksum.
 	ip.SetChecksum(0)
 	ip.SetChecksum(^ip.CalculateChecksum())

 	// Populate the packet buffer's network header and don't allow an invalid
 	// packet to be sent.
 	//
 	// Note that parsing only makes sure that the packet is well formed as per the
 	// wire format. We also want to check if the header's fields are valid before
 	// sending the packet.
 	if !parse.IPv4(pkt) || !header.IPv4(pkt.NetworkHeader().View()).IsValid(pktSize) {
 		return &tcpip.ErrMalformedHeader{}
 	}

 	return e.writePacket(r, nil /* gso */, pkt, true /* headerIncluded */)
 }

 // forwardPacket attempts to forward a packet to its final destination.
 func (e *endpoint) forwardPacket(pkt *stack.PacketBuffer) tcpip.Error {
 	h := header.IPv4(pkt.NetworkHeader().View())
 	ttl := h.TTL()
 	if ttl == 0 {
 		// As per RFC 792 page 6, Time Exceeded Message,
 		//
 		//  If the gateway processing a datagram finds the time to live field
 		//  is zero it must discard the datagram.  The gateway may also notify
 		//  the source host via the time exceeded message.
 		return e.protocol.returnError(&icmpReasonTTLExceeded{}, pkt)
 	}

 	if opts := h.Options(); len(opts) != 0 {
 		newOpts, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageForward{})
 		if optProblem != nil {
 			if optProblem.NeedICMP {
 				_ = e.protocol.returnError(&icmpReasonParamProblem{
 					pointer:    optProblem.Pointer,
 					forwarding: true,
 				}, pkt)
 				e.protocol.stack.Stats().MalformedRcvdPackets.Increment()
 				e.stats.ip.MalformedPacketsReceived.Increment()
 			}
 			return nil // option problems are not reported locally.
 		}
 		copied := copy(opts, newOpts)
 		if copied != len(newOpts) {
 			panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOpts)))
 		}
 		// Since in forwarding we handle all options, including copying those we
 		// do not recognise, the options region should remain the same size which
 		// simplifies processing. As we MAY receive a packet with a lot of padded
 		// bytes after the "end of options list" byte, make sure we copy
 		// them as the legal padding value (0).
 		for i := copied; i < len(opts); i++ {
 			// Pad with 0 (EOL). RFC 791 page 23 says "The padding is zero".
 			opts[i] = byte(header.IPv4OptionListEndType)
 		}
 	}

 	dstAddr := h.DestinationAddress()

 	// Check if the destination is owned by the stack.
 	if ep := e.protocol.findEndpointWithAddress(dstAddr); ep != nil {
 		ep.handleValidatedPacket(h, pkt)
 		return nil
 	}

 	r, err := e.protocol.stack.FindRoute(0, "", dstAddr, ProtocolNumber, false /* multicastLoop */)
 	if err != nil {
 		return err
 	}
 	defer r.Release()

 	// We need to do a deep copy of the IP packet because
 	// WriteHeaderIncludedPacket takes ownership of the packet buffer, but we do
 	// not own it.
 	newHdr := header.IPv4(stack.PayloadSince(pkt.NetworkHeader()))

 	// As per RFC 791 page 30, Time to Live,
 	//
 	//   This field must be decreased at each point that the internet header
 	//   is processed to reflect the time spent processing the datagram.
 	//   Even if no local information is available on the time actually
 	//   spent, the field must be decremented by 1.
 	newHdr.SetTTL(ttl - 1)

 	return r.WriteHeaderIncludedPacket(stack.NewPacketBuffer(stack.PacketBufferOptions{
 		ReserveHeaderBytes: int(r.MaxHeaderLength()),
 		Data:               buffer.View(newHdr).ToVectorisedView(),
 	}))
 }

 // HandlePacket is called by the link layer when new ipv4 packets arrive for
 // this endpoint.
 func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) {
 	stats := e.stats.ip

 	stats.PacketsReceived.Increment()

 	if !e.isEnabled() {
 		stats.DisabledPacketsReceived.Increment()
 		return
 	}

 	h, ok := e.protocol.parseAndValidate(pkt)
 	if !ok {
 		stats.MalformedPacketsReceived.Increment()
 		return
 	}

 	if !e.nic.IsLoopback() {
 		if !e.protocol.options.AllowExternalLoopbackTraffic {
 			if header.IsV4LoopbackAddress(h.SourceAddress()) {
 				stats.InvalidSourceAddressesReceived.Increment()
 				return
 			}

 			if header.IsV4LoopbackAddress(h.DestinationAddress()) {
 				stats.InvalidDestinationAddressesReceived.Increment()
 				return
 			}
 		}

 		if e.protocol.stack.HandleLocal() {
 			addressEndpoint := e.AcquireAssignedAddress(header.IPv4(pkt.NetworkHeader().View()).SourceAddress(), e.nic.Promiscuous(), stack.CanBePrimaryEndpoint)
 			if addressEndpoint != nil {
 				addressEndpoint.DecRef()

 				// The source address is one of our own, so we never should have gotten
 				// a packet like this unless HandleLocal is false or our NIC is the
 				// loopback interface.
 				stats.InvalidSourceAddressesReceived.Increment()
 				return
 			}
 		}

 		// Loopback traffic skips the prerouting chain.
 		inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
 		if ok := e.protocol.stack.IPTables().Check(stack.Prerouting, pkt, nil, nil, e.MainAddress().Address, inNicName, "" /* outNicName */); !ok {
 			// iptables is telling us to drop the packet.
 			stats.IPTablesPreroutingDropped.Increment()
 			return
 		}
 	}

 	e.handleValidatedPacket(h, pkt)
 }

 // handleLocalPacket is like HandlePacket except it does not perform the
 // prerouting iptables hook or check for loopback traffic that originated from
 // outside of the netstack (i.e. martian loopback packets).
 func (e *endpoint) handleLocalPacket(pkt *stack.PacketBuffer, canSkipRXChecksum bool) {
 	stats := e.stats.ip
 	stats.PacketsReceived.Increment()

 	pkt = pkt.CloneToInbound()
 	pkt.RXTransportChecksumValidated = canSkipRXChecksum

 	h, ok := e.protocol.parseAndValidate(pkt)
 	if !ok {
 		stats.MalformedPacketsReceived.Increment()
 		return
 	}

 	e.handleValidatedPacket(h, pkt)
 }

 func (e *endpoint) handleValidatedPacket(h header.IPv4, pkt *stack.PacketBuffer) {
 	pkt.NICID = e.nic.ID()
 	stats := e.stats

 	srcAddr := h.SourceAddress()
 	dstAddr := h.DestinationAddress()

 	// As per RFC 1122 section 3.2.1.3:
 	//   When a host sends any datagram, the IP source address MUST
 	//   be one of its own IP addresses (but not a broadcast or
 	//   multicast address).
 	if srcAddr == header.IPv4Broadcast || header.IsV4MulticastAddress(srcAddr) {
 		stats.ip.InvalidSourceAddressesReceived.Increment()
 		return
 	}
 	// Make sure the source address is not a subnet-local broadcast address.
 	if addressEndpoint := e.AcquireAssignedAddress(srcAddr, false /* createTemp */, stack.NeverPrimaryEndpoint); addressEndpoint != nil {
 		subnet := addressEndpoint.Subnet()
 		addressEndpoint.DecRef()
 		if subnet.IsBroadcast(srcAddr) {
 			stats.ip.InvalidSourceAddressesReceived.Increment()
 			return
 		}
 	}

 	// Before we do any processing, note if the packet was received as some
 	// sort of broadcast. The destination address should be an address we own
 	// or a group we joined.
 	if addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint); addressEndpoint != nil {
 		subnet := addressEndpoint.AddressWithPrefix().Subnet()
 		addressEndpoint.DecRef()
 		pkt.NetworkPacketInfo.LocalAddressBroadcast = subnet.IsBroadcast(dstAddr) || dstAddr == header.IPv4Broadcast
 	} else if !e.IsInGroup(dstAddr) {
 		if !e.protocol.Forwarding() {
 			stats.ip.InvalidDestinationAddressesReceived.Increment()
 			return
 		}
 		_ = e.forwardPacket(pkt)
 		return
 	}

 	// iptables filtering. All packets that reach here are intended for
 	// this machine and will not be forwarded.
 	inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
 	if ok := e.protocol.stack.IPTables().Check(stack.Input, pkt, nil, nil, "" /* preroutingAddr */, inNicName, "" /* outNicName */); !ok {
 		// iptables is telling us to drop the packet.
 		stats.ip.IPTablesInputDropped.Increment()
 		return
 	}

 	if h.More() || h.FragmentOffset() != 0 {
 		if pkt.Data().Size()+pkt.TransportHeader().View().Size() == 0 {
 			// Drop the packet as it's marked as a fragment but has
 			// no payload.
 			stats.ip.MalformedPacketsReceived.Increment()
 			stats.ip.MalformedFragmentsReceived.Increment()
 			return
 		}
 		if opts := h.Options(); len(opts) != 0 {
 			// If there are options we need to check them before we do assembly
 			// or we could be assembling errant packets. However we do not change the
 			// options as that could lead to double processing later.
 			if _, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageVerify{}); optProblem != nil {
 				if optProblem.NeedICMP {
 					_ = e.protocol.returnError(&icmpReasonParamProblem{
 						pointer: optProblem.Pointer,
 					}, pkt)
 					e.protocol.stack.Stats().MalformedRcvdPackets.Increment()
 					e.stats.ip.MalformedPacketsReceived.Increment()
 				}
 				return
 			}
 		}
 		// The packet is a fragment, let's try to reassemble it.
 		start := h.FragmentOffset()
 		// Drop the fragment if the size of the reassembled payload would exceed the
 		// maximum payload size.
 		//
 		// Note that this addition doesn't overflow even on 32bit architecture
 		// because pkt.Data().Size() should not exceed 65535 (the max IP datagram
 		// size). Otherwise the packet would've been rejected as invalid before
 		// reaching here.
 		if int(start)+pkt.Data().Size() > header.IPv4MaximumPayloadSize {
 			stats.ip.MalformedPacketsReceived.Increment()
 			stats.ip.MalformedFragmentsReceived.Increment()
 			return
 		}

 		proto := h.Protocol()
 		resPkt, _, ready, err := e.protocol.fragmentation.Process(
 			// As per RFC 791 section 2.3, the identification value is unique
 			// for a source-destination pair and protocol.
 			fragmentation.FragmentID{
 				Source:      h.SourceAddress(),
 				Destination: h.DestinationAddress(),
 				ID:          uint32(h.ID()),
 				Protocol:    proto,
 			},
 			start,
 			start+uint16(pkt.Data().Size())-1,
 			h.More(),
 			proto,
 			pkt,
 		)
 		if err != nil {
 			stats.ip.MalformedPacketsReceived.Increment()
 			stats.ip.MalformedFragmentsReceived.Increment()
 			return
 		}
 		if !ready {
 			return
 		}
 		pkt = resPkt
 		h = header.IPv4(pkt.NetworkHeader().View())

 		// The reassembler doesn't take care of fixing up the header, so we need
 		// to do it here.
 		h.SetTotalLength(uint16(pkt.Data().Size() + len((h))))
 		h.SetFlagsFragmentOffset(0, 0)
 	}
 	stats.ip.PacketsDelivered.Increment()

 	p := h.TransportProtocol()
 	if p == header.ICMPv4ProtocolNumber {
 		// TODO(gvisor.dev/issues/3810): when we sort out ICMP and transport
 		// headers, the setting of the transport number here should be
 		// unnecessary and removed.
 		pkt.TransportProtocolNumber = p
 		e.handleICMP(pkt)
 		return
 	}
 	// ICMP handles options itself but do it here for all remaining destinations.
 	var hasRouterAlertOption bool
 	if opts := h.Options(); len(opts) != 0 {
 		newOpts, processedOpts, optProblem := e.processIPOptions(pkt, opts, &optionUsageReceive{})
 		if optProblem != nil {
 			if optProblem.NeedICMP {
 				_ = e.protocol.returnError(&icmpReasonParamProblem{
 					pointer: optProblem.Pointer,
 				}, pkt)
 				e.protocol.stack.Stats().MalformedRcvdPackets.Increment()
 				stats.ip.MalformedPacketsReceived.Increment()
 			}
 			return
 		}
 		hasRouterAlertOption = processedOpts.routerAlert
 		copied := copy(opts, newOpts)
 		if copied != len(newOpts) {
 			panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOpts)))
 		}
 		for i := copied; i < len(opts); i++ {
 			// Pad with 0 (EOL). RFC 791 page 23 says "The padding is zero".
 			opts[i] = byte(header.IPv4OptionListEndType)
 		}
 	}
 	if p == header.IGMPProtocolNumber {
 		e.mu.Lock()
 		e.mu.igmp.handleIGMP(pkt, hasRouterAlertOption)
 		e.mu.Unlock()
 		return
 	}

 	switch res := e.dispatcher.DeliverTransportPacket(p, pkt); res {
 	case stack.TransportPacketHandled:
 	case stack.TransportPacketDestinationPortUnreachable:
 		// As per RFC: 1122 Section 3.2.2.1 A host SHOULD generate Destination
 		//   Unreachable messages with code:
 		//     3 (Port Unreachable), when the designated transport protocol
 		//     (e.g., UDP) is unable to demultiplex the datagram but has no
 		//     protocol mechanism to inform the sender.
 		_ = e.protocol.returnError(&icmpReasonPortUnreachable{}, pkt)
 	case stack.TransportPacketProtocolUnreachable:
 		// As per RFC: 1122 Section 3.2.2.1
 		//   A host SHOULD generate Destination Unreachable messages with code:
 		//     2 (Protocol Unreachable), when the designated transport protocol
 		//     is not supported
 		_ = e.protocol.returnError(&icmpReasonProtoUnreachable{}, pkt)
 	default:
 		panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res))
 	}
 }

 // Close cleans up resources associated with the endpoint.
 func (e *endpoint) Close() {
 	e.mu.Lock()
 	e.disableLocked()
 	e.mu.addressableEndpointState.Cleanup()
 	e.mu.Unlock()

 	e.protocol.forgetEndpoint(e.nic.ID())
 }

 // AddAndAcquirePermanentAddress implements stack.AddressableEndpoint.
 func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, tcpip.Error) {
 	e.mu.Lock()
 	defer e.mu.Unlock()

 	ep, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, peb, configType, deprecated)
 	if err == nil {
 		e.mu.igmp.sendQueuedReports()
 	}
 	return ep, err
 }

 // RemovePermanentAddress implements stack.AddressableEndpoint.
 func (e *endpoint) RemovePermanentAddress(addr tcpip.Address) tcpip.Error {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 	return e.mu.addressableEndpointState.RemovePermanentAddress(addr)
 }

 // MainAddress implements stack.AddressableEndpoint.
 func (e *endpoint) MainAddress() tcpip.AddressWithPrefix {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 	return e.mu.addressableEndpointState.MainAddress()
 }

 // AcquireAssignedAddress implements stack.AddressableEndpoint.
 func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior) stack.AddressEndpoint {
 	e.mu.Lock()
 	defer e.mu.Unlock()

 	loopback := e.nic.IsLoopback()
 	return e.mu.addressableEndpointState.AcquireAssignedAddressOrMatching(localAddr, func(addressEndpoint stack.AddressEndpoint) bool {
 		subnet := addressEndpoint.Subnet()
 		// IPv4 has a notion of a subnet broadcast address and considers the
 		// loopback interface bound to an address's whole subnet (on linux).
 		return subnet.IsBroadcast(localAddr) || (loopback && subnet.Contains(localAddr))
 	}, allowTemp, tempPEB)
 }

 // AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint.
 func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 	return e.acquireOutgoingPrimaryAddressRLocked(remoteAddr, allowExpired)
 }

 // acquireOutgoingPrimaryAddressRLocked is like AcquireOutgoingPrimaryAddress
 // but with locking requirements
 //
 // Precondition: igmp.ep.mu must be read locked.
 func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint {
 	return e.mu.addressableEndpointState.AcquireOutgoingPrimaryAddress(remoteAddr, allowExpired)
 }

 // PrimaryAddresses implements stack.AddressableEndpoint.
 func (e *endpoint) PrimaryAddresses() []tcpip.AddressWithPrefix {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 	return e.mu.addressableEndpointState.PrimaryAddresses()
 }

 // PermanentAddresses implements stack.AddressableEndpoint.
 func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 	return e.mu.addressableEndpointState.PermanentAddresses()
 }

 // JoinGroup implements stack.GroupAddressableEndpoint.
 func (e *endpoint) JoinGroup(addr tcpip.Address) tcpip.Error {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 	return e.joinGroupLocked(addr)
 }

 // joinGroupLocked is like JoinGroup but with locking requirements.
 //
 // Precondition: e.mu must be locked.
 func (e *endpoint) joinGroupLocked(addr tcpip.Address) tcpip.Error {
 	if !header.IsV4MulticastAddress(addr) {
 		return &tcpip.ErrBadAddress{}
 	}

 	e.mu.igmp.joinGroup(addr)
 	return nil
 }

 // LeaveGroup implements stack.GroupAddressableEndpoint.
 func (e *endpoint) LeaveGroup(addr tcpip.Address) tcpip.Error {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 	return e.leaveGroupLocked(addr)
 }

 // leaveGroupLocked is like LeaveGroup but with locking requirements.
 //
 // Precondition: e.mu must be locked.
 func (e *endpoint) leaveGroupLocked(addr tcpip.Address) tcpip.Error {
 	return e.mu.igmp.leaveGroup(addr)
 }

 // IsInGroup implements stack.GroupAddressableEndpoint.
 func (e *endpoint) IsInGroup(addr tcpip.Address) bool {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 	return e.mu.igmp.isInGroup(addr)
 }

 // Stats implements stack.NetworkEndpoint.
 func (e *endpoint) Stats() stack.NetworkEndpointStats {
 	return &e.stats.localStats
 }

 var _ stack.ForwardingNetworkProtocol = (*protocol)(nil)
 var _ stack.NetworkProtocol = (*protocol)(nil)
 var _ fragmentation.TimeoutHandler = (*protocol)(nil)

 type protocol struct {
 	stack *stack.Stack

 	mu struct {
 		sync.RWMutex

 		// eps is keyed by NICID to allow protocol methods to retrieve an endpoint
 		// when handling a packet, by looking at which NIC handled the packet.
 		eps map[tcpip.NICID]*endpoint
 	}

 	// defaultTTL is the current default TTL for the protocol. Only the
 	// uint8 portion of it is meaningful.
 	//
 	// Must be accessed using atomic operations.
 	defaultTTL uint32

 	// forwarding is set to 1 when the protocol has forwarding enabled and 0
 	// when it is disabled.
 	//
 	// Must be accessed using atomic operations.
 	forwarding uint32

 	ids    []uint32
 	hashIV uint32

 	fragmentation *fragmentation.Fragmentation

 	options Options
 }

 // Number returns the ipv4 protocol number.
 func (p *protocol) Number() tcpip.NetworkProtocolNumber {
 	return ProtocolNumber
 }

 // MinimumPacketSize returns the minimum valid ipv4 packet size.
 func (p *protocol) MinimumPacketSize() int {
 	return header.IPv4MinimumSize
 }

 // DefaultPrefixLen returns the IPv4 default prefix length.
 func (p *protocol) DefaultPrefixLen() int {
 	return header.IPv4AddressSize * 8
 }

 // ParseAddresses implements NetworkProtocol.ParseAddresses.
 func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
 	h := header.IPv4(v)
 	return h.SourceAddress(), h.DestinationAddress()
 }

 // SetOption implements NetworkProtocol.SetOption.
 func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) tcpip.Error {
 	switch v := option.(type) {
 	case *tcpip.DefaultTTLOption:
 		p.SetDefaultTTL(uint8(*v))
 		return nil
 	default:
 		return &tcpip.ErrUnknownProtocolOption{}
 	}
 }

 // Option implements NetworkProtocol.Option.
 func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) tcpip.Error {
 	switch v := option.(type) {
 	case *tcpip.DefaultTTLOption:
 		*v = tcpip.DefaultTTLOption(p.DefaultTTL())
 		return nil
 	default:
 		return &tcpip.ErrUnknownProtocolOption{}
 	}
 }

 // SetDefaultTTL sets the default TTL for endpoints created with this protocol.
 func (p *protocol) SetDefaultTTL(ttl uint8) {
 	atomic.StoreUint32(&p.defaultTTL, uint32(ttl))
 }

 // DefaultTTL returns the default TTL for endpoints created with this protocol.
 func (p *protocol) DefaultTTL() uint8 {
 	return uint8(atomic.LoadUint32(&p.defaultTTL))
 }

 // Close implements stack.TransportProtocol.Close.
 func (*protocol) Close() {}

 // Wait implements stack.TransportProtocol.Wait.
 func (*protocol) Wait() {}

 // parseAndValidate parses the packet (including its transport layer header) and
 // returns the parsed IP header.
 //
 // Returns true if the IP header was successfully parsed.
 func (p *protocol) parseAndValidate(pkt *stack.PacketBuffer) (header.IPv4, bool) {
 	transProtoNum, hasTransportHdr, ok := p.Parse(pkt)
 	if !ok {
 		return nil, false
 	}

 	h := header.IPv4(pkt.NetworkHeader().View())
 	// Do not include the link header's size when calculating the size of the IP
 	// packet.
 	if !h.IsValid(pkt.Size() - pkt.LinkHeader().View().Size()) {
 		return nil, false
 	}

 	// There has been some confusion regarding verifying checksums. We need
 	// just look for negative 0 (0xffff) as the checksum, as it's not possible to
 	// get positive 0 (0) for the checksum. Some bad implementations could get it
 	// when doing entry replacement in the early days of the Internet,
 	// however the lore that one needs to check for both persists.
 	//
 	// RFC 1624 section 1 describes the source of this confusion as:
 	//     [the partial recalculation method described in RFC 1071] computes a
 	//     result for certain cases that differs from the one obtained from
 	//     scratch (one's complement of one's complement sum of the original
 	//     fields).
 	//
 	// However RFC 1624 section 5 clarifies that if using the verification method
 	// "recommended by RFC 1071, it does not matter if an intermediate system
 	// generated a -0 instead of +0".
 	//
 	// RFC1071 page 1 specifies the verification method as:
 	//	  (3)  To check a checksum, the 1's complement sum is computed over the
 	//        same set of octets, including the checksum field.  If the result
 	//        is all 1 bits (-0 in 1's complement arithmetic), the check
 	//        succeeds.
 	if h.CalculateChecksum() != 0xffff {
 		return nil, false
 	}

 	if hasTransportHdr {
 		switch err := p.stack.ParsePacketBufferTransport(transProtoNum, pkt); err {
 		case stack.ParsedOK:
 		case stack.UnknownTransportProtocol, stack.TransportLayerParseError:
 			// The transport layer will handle unknown protocols and transport layer
 			// parsing errors.
 		default:
 			panic(fmt.Sprintf("unexpected error parsing transport header = %d", err))
 		}
 	}

 	return h, true
 }

 // Parse implements stack.NetworkProtocol.Parse.
 func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
 	if ok := parse.IPv4(pkt); !ok {
 		return 0, false, false
 	}

 	ipHdr := header.IPv4(pkt.NetworkHeader().View())
 	return ipHdr.TransportProtocol(), !ipHdr.More() && ipHdr.FragmentOffset() == 0, true
 }

 // Forwarding implements stack.ForwardingNetworkProtocol.
 func (p *protocol) Forwarding() bool {
 	return uint8(atomic.LoadUint32(&p.forwarding)) == 1
 }

 // SetForwarding implements stack.ForwardingNetworkProtocol.
 func (p *protocol) SetForwarding(v bool) {
 	if v {
 		atomic.StoreUint32(&p.forwarding, 1)
 	} else {
 		atomic.StoreUint32(&p.forwarding, 0)
 	}
 }

 // calculateNetworkMTU calculates the network-layer payload MTU based on the
 // link-layer payload mtu.
 func calculateNetworkMTU(linkMTU, networkHeaderSize uint32) (uint32, tcpip.Error) {
 	if linkMTU < header.IPv4MinimumMTU {
 		return 0, &tcpip.ErrInvalidEndpointState{}
 	}

 	// As per RFC 791 section 3.1, an IPv4 header cannot exceed 60 bytes in
 	// length:
 	//   The maximal internet header is 60 octets, and a typical internet header
 	//   is 20 octets, allowing a margin for headers of higher level protocols.
 	if networkHeaderSize > header.IPv4MaximumHeaderSize {
 		return 0, &tcpip.ErrMalformedHeader{}
 	}

 	networkMTU := linkMTU
 	if networkMTU > MaxTotalSize {
 		networkMTU = MaxTotalSize
 	}

 	return networkMTU - uint32(networkHeaderSize), nil
 }

 func packetMustBeFragmented(pkt *stack.PacketBuffer, networkMTU uint32, gso *stack.GSO) bool {
 	payload := pkt.TransportHeader().View().Size() + pkt.Data().Size()
 	return (gso == nil || gso.Type == stack.GSONone) && uint32(payload) > networkMTU
 }

 // addressToUint32 translates an IPv4 address into its little endian uint32
 // representation.
 //
 // This function does the same thing as binary.LittleEndian.Uint32 but operates
 // on a tcpip.Address (a string) without the need to convert it to a byte slice,
 // which would cause an allocation.
 func addressToUint32(addr tcpip.Address) uint32 {
 	_ = addr[3] // bounds check hint to compiler
 	return uint32(addr[0]) | uint32(addr[1])<<8 | uint32(addr[2])<<16 | uint32(addr[3])<<24
 }

 // hashRoute calculates a hash value for the given source/destination pair using
 // the addresses, transport protocol number and a 32-bit number to generate the
 // hash.
 func hashRoute(srcAddr, dstAddr tcpip.Address, protocol tcpip.TransportProtocolNumber, hashIV uint32) uint32 {
 	a := addressToUint32(srcAddr)
 	b := addressToUint32(dstAddr)
 	return hash.Hash3Words(a, b, uint32(protocol), hashIV)
 }

 // Options holds options to configure a new protocol.
 type Options struct {
 	// IGMP holds options for IGMP.
 	IGMP IGMPOptions

 	// AllowExternalLoopbackTraffic indicates that inbound loopback packets (i.e.
 	// martian loopback packets) should be accepted.
 	AllowExternalLoopbackTraffic bool
 }

 // NewProtocolWithOptions returns an IPv4 network protocol.
 func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory {
 	ids := make([]uint32, buckets)

 	// Randomly initialize hashIV and the ids.
 	r := hash.RandN32(1 + buckets)
 	for i := range ids {
 		ids[i] = r[i]
 	}
 	hashIV := r[buckets]

 	return func(s *stack.Stack) stack.NetworkProtocol {
 		p := &protocol{
 			stack:      s,
 			ids:        ids,
 			hashIV:     hashIV,
 			defaultTTL: DefaultTTL,
 			options:    opts,
 		}
 		p.fragmentation = fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), p)
 		p.mu.eps = make(map[tcpip.NICID]*endpoint)
 		return p
 	}
 }

 // NewProtocol is equivalent to NewProtocolWithOptions with an empty Options.
 func NewProtocol(s *stack.Stack) stack.NetworkProtocol {
 	return NewProtocolWithOptions(Options{})(s)
 }

 func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeader header.IPv4) (*stack.PacketBuffer, bool) {
 	fragPkt, offset, copied, more := pf.BuildNextFragment()
 	fragPkt.NetworkProtocolNumber = ProtocolNumber

 	originalIPHeaderLength := len(originalIPHeader)
 	nextFragIPHeader := header.IPv4(fragPkt.NetworkHeader().Push(originalIPHeaderLength))
 	fragPkt.NetworkProtocolNumber = ProtocolNumber

 	if copied := copy(nextFragIPHeader, originalIPHeader); copied != len(originalIPHeader) {
 		panic(fmt.Sprintf("wrong number of bytes copied into fragmentIPHeaders: got = %d, want = %d", copied, originalIPHeaderLength))
 	}

 	flags := originalIPHeader.Flags()
 	if more {
 		flags |= header.IPv4FlagMoreFragments
 	}
 	nextFragIPHeader.SetFlagsFragmentOffset(flags, uint16(offset))
 	nextFragIPHeader.SetTotalLength(uint16(nextFragIPHeader.HeaderLength()) + uint16(copied))
 	nextFragIPHeader.SetChecksum(0)
 	nextFragIPHeader.SetChecksum(^nextFragIPHeader.CalculateChecksum())

 	return fragPkt, more
 }

 // optionAction describes possible actions that may be taken on an option
 // while processing it.
 type optionAction uint8

 const (
 	// optionRemove says that the option should not be in the output option set.
 	optionRemove optionAction = iota

 	// optionProcess says that the option should be fully processed.
 	optionProcess

 	// optionVerify says the option should be checked and passed unchanged.
 	optionVerify

 	// optionPass says to pass the output set without checking.
 	optionPass
 )

 // optionActions list what to do for each option in a given scenario.
 type optionActions struct {
 	// timestamp controls what to do with a Timestamp option.
 	timestamp optionAction

 	// recordRoute controls what to do with a Record Route option.
 	recordRoute optionAction

 	// routerAlert controls what to do with a Router Alert option.
 	routerAlert optionAction

 	// unknown controls what to do with an unknown option.
 	unknown optionAction
 }

 // optionsUsage specifies the ways options may be operated upon for a given
 // scenario during packet processing.
 type optionsUsage interface {
 	actions() optionActions
 }

 // optionUsageVerify implements optionsUsage for when we just want to check
 // fragments. Don't change anything, just check and reject if bad. No
 // replacement options are generated.
 type optionUsageVerify struct{}

 // actions implements optionsUsage.
 func (*optionUsageVerify) actions() optionActions {
 	return optionActions{
 		timestamp:   optionVerify,
 		recordRoute: optionVerify,
 		routerAlert: optionVerify,
 		unknown:     optionRemove,
 	}
 }

 // optionUsageReceive implements optionsUsage for packets we will pass
 // to the transport layer (with the exception of Echo requests).
 type optionUsageReceive struct{}

 // actions implements optionsUsage.
 func (*optionUsageReceive) actions() optionActions {
 	return optionActions{
 		timestamp:   optionProcess,
 		recordRoute: optionProcess,
 		routerAlert: optionVerify,
 		unknown:     optionPass,
 	}
 }

 // optionUsageForward implements optionsUsage for packets about to be forwarded.
 // All options are passed on regardless of whether we recognise them, however
 // we do process the Timestamp and Record Route options.
 type optionUsageForward struct{}

 // actions implements optionsUsage.
 func (*optionUsageForward) actions() optionActions {
 	return optionActions{
 		timestamp:   optionProcess,
 		recordRoute: optionProcess,
 		routerAlert: optionVerify,
 		unknown:     optionPass,
 	}
 }

 // optionUsageEcho implements optionsUsage for echo packet processing.
 // Only Timestamp and RecordRoute are processed and sent back.
 type optionUsageEcho struct{}

 // actions implements optionsUsage.
 func (*optionUsageEcho) actions() optionActions {
 	return optionActions{
 		timestamp:   optionProcess,
 		recordRoute: optionProcess,
 		routerAlert: optionVerify,
 		unknown:     optionRemove,
 	}
 }

 // handleTimestamp does any required processing on a Timestamp option
 // in place.
 func handleTimestamp(tsOpt header.IPv4OptionTimestamp, localAddress tcpip.Address, clock tcpip.Clock, usage optionsUsage) *header.IPv4OptParameterProblem {
 	flags := tsOpt.Flags()
 	var entrySize uint8
 	switch flags {
 	case header.IPv4OptionTimestampOnlyFlag:
 		entrySize = header.IPv4OptionTimestampSize
 	case
 		header.IPv4OptionTimestampWithIPFlag,
 		header.IPv4OptionTimestampWithPredefinedIPFlag:
 		entrySize = header.IPv4OptionTimestampWithAddrSize
 	default:
 		return &header.IPv4OptParameterProblem{
 			Pointer:  header.IPv4OptTSOFLWAndFLGOffset,
 			NeedICMP: true,
 		}
 	}

 	pointer := tsOpt.Pointer()
 	// RFC 791 page 22 states: "The smallest legal value is 5."
 	// Since the pointer is 1 based, and the header is 4 bytes long the
 	// pointer must point beyond the header therefore 4 or less is bad.
 	if pointer <= header.IPv4OptionTimestampHdrLength {
 		return &header.IPv4OptParameterProblem{
 			Pointer:  header.IPv4OptTSPointerOffset,
 			NeedICMP: true,
 		}
 	}
 	// To simplify processing below, base further work on the array of timestamps
 	// beyond the header, rather than on the whole option. Also to aid
 	// calculations set 'nextSlot' to be 0 based as in the packet it is 1 based.
 	nextSlot := pointer - (header.IPv4OptionTimestampHdrLength + 1)
 	optLen := tsOpt.Size()
 	dataLength := optLen - header.IPv4OptionTimestampHdrLength

 	// In the section below, we verify the pointer, length and overflow counter
 	// fields of the option. The distinction is in which byte you return as being
 	// in error in the ICMP packet. Offsets 1 (length), 2 pointer)
 	// or 3 (overflowed counter).
 	//
 	// The following RFC sections cover this section:
 	//
 	// RFC 791 (page 22):
 	//    If there is some room but not enough room for a full timestamp
 	//    to be inserted, or the overflow count itself overflows, the
 	//    original datagram is considered to be in error and is discarded.
 	//    In either case an ICMP parameter problem message may be sent to
 	//    the source host [3].
 	//
 	// You can get this situation in two ways. Firstly if the data area is not
 	// a multiple of the entry size or secondly, if the pointer is not at a
 	// multiple of the entry size. The wording of the RFC suggests that
 	// this is not an error until you actually run out of space.
 	if pointer > optLen {
 		// RFC 791 (page 22) says we should switch to using the overflow count.
 		//    If the timestamp data area is already full (the pointer exceeds
 		//    the length) the datagram is forwarded without inserting the
 		//    timestamp, but the overflow count is incremented by one.
 		if flags == header.IPv4OptionTimestampWithPredefinedIPFlag {
 			// By definition we have nothing to do.
 			return nil
 		}

 		if tsOpt.IncOverflow() != 0 {
 			return nil
 		}
 		// The overflow count is also full.
 		return &header.IPv4OptParameterProblem{
 			Pointer:  header.IPv4OptTSOFLWAndFLGOffset,
 			NeedICMP: true,
 		}
 	}
 	if nextSlot+entrySize > dataLength {
 		// The data area isn't full but there isn't room for a new entry.
 		// Either Length or Pointer could be bad.
 		if false {
 			// We must select Pointer for Linux compatibility, even if
 			// only the length is bad.
 			// The Linux code is at (in October 2020)
 			// https://github.com/torvalds/linux/blob/bbf5c979011a099af5dc76498918ed7df445635b/net/ipv4/ip_options.c#L367-L370
 			//		if (optptr[2]+3 > optlen) {
 			//			pp_ptr = optptr + 2;
 			//			goto error;
 			//		}
 			// which doesn't distinguish between which of optptr[2] or optlen
 			// is wrong, but just arbitrarily decides on optptr+2.
 			if dataLength%entrySize != 0 {
 				// The Data section size should be a multiple of the expected
 				// timestamp entry size.
 				return &header.IPv4OptParameterProblem{
 					Pointer:  header.IPv4OptionLengthOffset,
 					NeedICMP: false,
 				}
 			}
 			// If the size is OK, the pointer must be corrupted.
 		}
 		return &header.IPv4OptParameterProblem{
 			Pointer:  header.IPv4OptTSPointerOffset,
 			NeedICMP: true,
 		}
 	}

 	if usage.actions().timestamp == optionProcess {
 		tsOpt.UpdateTimestamp(localAddress, clock)
 	}
 	return nil
 }

 // handleRecordRoute checks and processes a Record route option. It is much
 // like the timestamp type 1 option, but without timestamps. The passed in
 // address is stored in the option in the correct spot if possible.
 func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Address, usage optionsUsage) *header.IPv4OptParameterProblem {
 	optlen := rrOpt.Size()

 	if optlen < header.IPv4AddressSize+header.IPv4OptionRecordRouteHdrLength {
 		return &header.IPv4OptParameterProblem{
 			Pointer:  header.IPv4OptionLengthOffset,
 			NeedICMP: true,
 		}
 	}

 	pointer := rrOpt.Pointer()
 	// RFC 791 page 20 states:
 	//      The pointer is relative to this option, and the
 	//      smallest legal value for the pointer is 4.
 	// Since the pointer is 1 based, and the header is 3 bytes long the
 	// pointer must point beyond the header therefore 3 or less is bad.
 	if pointer <= header.IPv4OptionRecordRouteHdrLength {
 		return &header.IPv4OptParameterProblem{
 			Pointer:  header.IPv4OptRRPointerOffset,
 			NeedICMP: true,
 		}
 	}

 	// RFC 791 page 21 says
 	//       If the route data area is already full (the pointer exceeds the
 	//       length) the datagram is forwarded without inserting the address
 	//       into the recorded route. If there is some room but not enough
 	//       room for a full address to be inserted, the original datagram is
 	//       considered to be in error and is discarded.  In either case an
 	//       ICMP parameter problem message may be sent to the source
 	//       host.
 	// The use of the words "In either case" suggests that a 'full' RR option
 	// could generate an ICMP at every hop after it fills up. We chose to not
 	// do this (as do most implementations). It is probable that the inclusion
 	// of these words is a copy/paste error from the timestamp option where
 	// there are two failure reasons given.
 	if pointer > optlen {
 		return nil
 	}

 	// The data area isn't full but there isn't room for a new entry.
 	// Either Length or Pointer could be bad. We must select Pointer for Linux
 	// compatibility, even if only the length is bad. NB. pointer is 1 based.
 	if pointer+header.IPv4AddressSize > optlen+1 {
 		if false {
 			// This is what we would do if we were not being Linux compatible.
 			// Check for bad pointer or length value. Must be a multiple of 4 after
 			// accounting for the 3 byte header and not within that header.
 			// RFC 791, page 20 says:
 			//       The pointer is relative to this option, and the
 			//       smallest legal value for the pointer is 4.
 			//
 			//       A recorded route is composed of a series of internet addresses.
 			//       Each internet address is 32 bits or 4 octets.
 			// Linux skips this test so we must too.  See Linux code at:
 			// https://github.com/torvalds/linux/blob/bbf5c979011a099af5dc76498918ed7df445635b/net/ipv4/ip_options.c#L338-L341
 			//    if (optptr[2]+3 > optlen) {
 			//      pp_ptr = optptr + 2;
 			//      goto error;
 			//    }
 			if (optlen-header.IPv4OptionRecordRouteHdrLength)%header.IPv4AddressSize != 0 {
 				// Length is bad, not on integral number of slots.
 				return &header.IPv4OptParameterProblem{
 					Pointer:  header.IPv4OptionLengthOffset,
 					NeedICMP: true,
 				}
 			}
 			// If not length, the fault must be with the pointer.
 		}
 		return &header.IPv4OptParameterProblem{
 			Pointer:  header.IPv4OptRRPointerOffset,
 			NeedICMP: true,
 		}
 	}
 	if usage.actions().recordRoute == optionVerify {
 		return nil
 	}
 	rrOpt.StoreAddress(localAddress)
 	return nil
 }

 // handleRouterAlert performs sanity checks on a Router Alert option.
 func handleRouterAlert(raOpt header.IPv4OptionRouterAlert) *header.IPv4OptParameterProblem {
 	// Only the zero value is acceptable, as per RFC 2113, section 2.1:
 	//   Value:  A two octet code with the following values:
 	//     0 - Router shall examine packet
 	//     1-65535 - Reserved
 	if raOpt.Value() != header.IPv4OptionRouterAlertValue {
 		return &header.IPv4OptParameterProblem{
 			Pointer:  header.IPv4OptionRouterAlertValueOffset,
 			NeedICMP: true,
 		}
 	}
 	return nil
 }

 type optionTracker struct {
 	timestamp   bool
 	recordRoute bool
 	routerAlert bool
 }

 // processIPOptions parses the IPv4 options and produces a new set of options
 // suitable for use in the next step of packet processing as informed by usage.
 // The original will not be touched.
 //
 // If there were no errors during parsing, the new set of options is returned as
 // a new buffer.
 func (e *endpoint) processIPOptions(pkt *stack.PacketBuffer, orig header.IPv4Options, usage optionsUsage) (header.IPv4Options, optionTracker, *header.IPv4OptParameterProblem) {
 	stats := e.stats.ip
 	opts := header.IPv4Options(orig)
 	optIter := opts.MakeIterator()

 	// Except NOP, each option must only appear at most once (RFC 791 section 3.1,
 	// at the definition of every type).
 	// Keep track of each option we find to enable duplicate option detection.
 	var seenOptions [math.MaxUint8 + 1]bool

 	// TODO(https://gvisor.dev/issue/4586): This will need tweaking when we start
 	// really forwarding packets as we may need to get two addresses, for rx and
 	// tx interfaces. We will also have to take usage into account.
 	localAddress := e.MainAddress().Address
 	if len(localAddress) == 0 {
 		h := header.IPv4(pkt.NetworkHeader().View())
 		dstAddr := h.DestinationAddress()
 		if pkt.NetworkPacketInfo.LocalAddressBroadcast || header.IsV4MulticastAddress(dstAddr) {
 			return nil, optionTracker{}, &header.IPv4OptParameterProblem{
 				NeedICMP: false,
 			}
 		}
 		localAddress = dstAddr
 	}

 	var optionsProcessed optionTracker
 	for {
 		option, done, optProblem := optIter.Next()
 		if done || optProblem != nil {
 			return optIter.Finalize(), optionsProcessed, optProblem
 		}
 		optType := option.Type()
 		if optType == header.IPv4OptionNOPType {
 			optIter.PushNOPOrEnd(optType)
 			continue
 		}
 		if optType == header.IPv4OptionListEndType {
 			optIter.PushNOPOrEnd(optType)
 			return optIter.Finalize(), optionsProcessed, nil
 		}

 		// check for repeating options (multiple NOPs are OK)
 		if seenOptions[optType] {
 			return nil, optionTracker{}, &header.IPv4OptParameterProblem{
 				Pointer:  optIter.ErrCursor,
 				NeedICMP: true,
 			}
 		}
 		seenOptions[optType] = true

 		optLen, optProblem := func() (int, *header.IPv4OptParameterProblem) {
 			switch option := option.(type) {
 			case *header.IPv4OptionTimestamp:
 				stats.OptionTimestampReceived.Increment()
 				optionsProcessed.timestamp = true
 				if usage.actions().timestamp != optionRemove {
 					clock := e.protocol.stack.Clock()
 					newBuffer := optIter.InitReplacement(option)
 					optProblem := handleTimestamp(header.IPv4OptionTimestamp(newBuffer), localAddress, clock, usage)
 					return len(newBuffer), optProblem
 				}

 			case *header.IPv4OptionRecordRoute:
 				stats.OptionRecordRouteReceived.Increment()
 				optionsProcessed.recordRoute = true
 				if usage.actions().recordRoute != optionRemove {
 					newBuffer := optIter.InitReplacement(option)
 					optProblem := handleRecordRoute(header.IPv4OptionRecordRoute(newBuffer), localAddress, usage)
 					return len(newBuffer), optProblem
 				}

 			case *header.IPv4OptionRouterAlert:
 				stats.OptionRouterAlertReceived.Increment()
 				optionsProcessed.routerAlert = true
 				if usage.actions().routerAlert != optionRemove {
 					newBuffer := optIter.InitReplacement(option)
 					optProblem := handleRouterAlert(header.IPv4OptionRouterAlert(newBuffer))
 					return len(newBuffer), optProblem
 				}

 			default:
 				stats.OptionUnknownReceived.Increment()
 				if usage.actions().unknown == optionPass {
 					return len(optIter.InitReplacement(option)), nil
 				}
 			}
 			return 0, nil
 		}()

 		if optProblem != nil {
 			optProblem.Pointer += optIter.ErrCursor
 			return nil, optionTracker{}, optProblem
 		}
 		optIter.ConsumeBuffer(optLen)
 	}
 }