blob: a1660e9a36f7f899129a48e5d16302002ade0db3 [file] [log] [blame]
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package ipv4 contains the implementation of the ipv4 network protocol.
package ipv4
import (
"fmt"
"math"
"reflect"
"sync/atomic"
"time"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/header/parse"
"gvisor.dev/gvisor/pkg/tcpip/network/hash"
"gvisor.dev/gvisor/pkg/tcpip/network/internal/fragmentation"
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
const (
	// ReassembleTimeout is the time a packet stays in the reassembly
	// system before being evicted.
	// As per RFC 791 section 3.2:
	//   The current recommendation for the initial timer setting is 15 seconds.
	//   This may be changed as experience with this protocol accumulates.
	//
	// Considering that it is an old recommendation, we use the same reassembly
	// timeout that linux defines, which is 30 seconds:
	// https://github.com/torvalds/linux/blob/47ec5303d73ea344e84f46660fff693c57641386/include/net/ip.h#L138
	ReassembleTimeout = 30 * time.Second

	// ProtocolNumber is the ipv4 protocol number.
	ProtocolNumber = header.IPv4ProtocolNumber

	// MaxTotalSize is the maximum size that can be encoded in the 16-bit
	// TotalLength field of the ipv4 header.
	MaxTotalSize = 0xffff

	// DefaultTTL is the default time-to-live value for this endpoint.
	DefaultTTL = 64

	// buckets is the number of identifier buckets. Route hashes are reduced
	// modulo this value to pick the counter used for a packet's IPv4 ID.
	buckets = 2048

	// fragmentblockSize is the size of a fragment block, in bytes, as per RFC
	// 791 section 3.1, page 14.
	fragmentblockSize = 8
)
// ipv4BroadcastAddr is header.IPv4Broadcast together with the prefix produced
// by WithPrefix. It is added to an endpoint when the endpoint is enabled and
// removed again when the endpoint is disabled.
var ipv4BroadcastAddr = header.IPv4Broadcast.WithPrefix()

// Compile-time assertions that *endpoint satisfies the stack interfaces it is
// expected to implement.
var _ stack.LinkResolvableNetworkEndpoint = (*endpoint)(nil)
var _ stack.GroupAddressableEndpoint = (*endpoint)(nil)
var _ stack.AddressableEndpoint = (*endpoint)(nil)
var _ stack.NetworkEndpoint = (*endpoint)(nil)
// endpoint is the IPv4 network endpoint created by protocol.NewEndpoint for a
// single network interface.
type endpoint struct {
	// nic is the network interface this endpoint was created for.
	nic stack.NetworkInterface
	// dispatcher receives transport-layer packets that were accepted by this
	// endpoint.
	dispatcher stack.TransportDispatcher
	// protocol is the IPv4 protocol instance that created this endpoint.
	protocol *protocol
	// stats holds the IP, ICMP and IGMP counters for this endpoint.
	stats sharedStats

	// enabled is set to 1 when the endpoint is enabled and 0 when it is
	// disabled.
	//
	// Must be accessed using atomic operations.
	enabled uint32

	// mu embeds the lock alongside the state that is accessed while holding it.
	mu struct {
		sync.RWMutex

		addressableEndpointState stack.AddressableEndpointState
		igmp                     igmpState
	}
}
// HandleLinkResolutionFailure implements stack.LinkResolvableNetworkEndpoint.
//
// The packet that could not be sent is repackaged and reported through
// handleControl as a destination-host-unreachable error.
func (e *endpoint) HandleLinkResolutionFailure(pkt *stack.PacketBuffer) {
	// handleControl needs the complete offending packet inside the data
	// portion of the buffer, so gather all of pkt's views into a new buffer.
	offending := stack.NewPacketBuffer(stack.PacketBufferOptions{
		Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()),
	})
	offending.NICID = e.nic.ID()
	offending.NetworkProtocolNumber = ProtocolNumber

	// Failing to resolve the next hop's link address means the host cannot be
	// reached, so report it the same way an ICMPv4 destination host
	// unreachable message would be reported.
	e.handleControl(&icmpv4DestinationHostUnreachableSockError{}, offending)
}
// NewEndpoint creates a new ipv4 endpoint for nic and registers it with the
// protocol under the NIC's ID.
func (p *protocol) NewEndpoint(nic stack.NetworkInterface, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint {
	ep := &endpoint{
		protocol:   p,
		dispatcher: dispatcher,
		nic:        nic,
	}

	// Initialise the lock-protected state while holding the lock.
	ep.mu.Lock()
	ep.mu.addressableEndpointState.Init(ep)
	ep.mu.igmp.init(ep)
	ep.mu.Unlock()

	// Set up the endpoint-local counters and pair each group with its
	// stack-wide counterpart.
	local := &ep.stats.localStats
	tcpip.InitStatCounters(reflect.ValueOf(local).Elem())

	global := p.stack.Stats()
	ep.stats.ip.Init(&local.IP, &global.IP)
	ep.stats.icmp.init(&local.ICMP, &global.ICMP.V4)
	ep.stats.igmp.init(&local.IGMP, &global.IGMP)

	// Make the endpoint discoverable by NIC ID.
	p.mu.Lock()
	p.mu.eps[nic.ID()] = ep
	p.mu.Unlock()

	return ep
}
// findEndpointWithAddress returns one of the protocol's endpoints that has addr
// assigned, or nil if no endpoint does.
func (p *protocol) findEndpointWithAddress(addr tcpip.Address) *endpoint {
	p.mu.RLock()
	defer p.mu.RUnlock()

	for _, candidate := range p.mu.eps {
		assigned := candidate.AcquireAssignedAddress(addr, false /* allowTemp */, stack.NeverPrimaryEndpoint)
		if assigned == nil {
			continue
		}

		// Only the existence of the address matters; drop the reference
		// straight away.
		assigned.DecRef()
		return candidate
	}

	return nil
}
// forgetEndpoint removes the endpoint registered for nicID, if any, from the
// protocol's endpoint table.
func (p *protocol) forgetEndpoint(nicID tcpip.NICID) {
	p.mu.Lock()
	delete(p.mu.eps, nicID)
	p.mu.Unlock()
}
// Enable implements stack.NetworkEndpoint.
//
// It returns *tcpip.ErrNotPermitted when the NIC is disabled and nil when the
// endpoint was already enabled.
func (e *endpoint) Enable() tcpip.Error {
	e.mu.Lock()
	defer e.mu.Unlock()

	switch {
	case !e.nic.Enabled():
		// Without an enabled NIC the endpoint cannot do anything useful, so
		// refuse to enable it.
		return &tcpip.ErrNotPermitted{}
	case !e.setEnabled(true):
		// The endpoint was already enabled; nothing left to do.
		return nil
	}

	// Add the broadcast address so that broadcast packets arriving on this
	// interface are accepted.
	broadcastEP, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(ipv4BroadcastAddr, stack.NeverPrimaryEndpoint, stack.AddressConfigStatic, false /* deprecated */)
	if err != nil {
		return err
	}
	// The returned address endpoint itself is of no use here.
	broadcastEP.DecRef()

	// Groups may have been joined while the endpoint was disabled, or IGMP may
	// consider groups left because the endpoint was disabled. In both cases
	// routers have to be told to send us multicast traffic.
	e.mu.igmp.initializeAll()

	// RFC 1122 section 3.3.7 requires all hosts to join the all-hosts multicast
	// group, which the IANA calls the all-systems multicast group.
	joinErr := e.joinGroupLocked(header.IPv4AllSystems)
	if joinErr == nil {
		return nil
	}

	// joinGroupLocked fails only when the group address is not a valid IPv4
	// multicast address.
	panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv4AllSystems, joinErr))
}
// Enabled implements stack.NetworkEndpoint.
//
// It reports true only when both the NIC and the endpoint are enabled.
func (e *endpoint) Enabled() bool {
	if !e.nic.Enabled() {
		return false
	}
	return e.isEnabled()
}
// isEnabled reports whether the endpoint's own enabled flag is set, without
// consulting the enabled status of the NIC.
func (e *endpoint) isEnabled() bool {
	flag := atomic.LoadUint32(&e.enabled)
	return flag == 1
}
// setEnabled atomically stores the requested enabled status.
//
// It returns true if the stored status differed from v, i.e. the call changed
// the endpoint's state.
func (e *endpoint) setEnabled(v bool) bool {
	// next is the value to store; changedFrom is the previous value that
	// indicates a real transition.
	next, changedFrom := uint32(0), uint32(1)
	if v {
		next, changedFrom = 1, 0
	}
	return atomic.SwapUint32(&e.enabled, next) == changedFrom
}
// Disable implements stack.NetworkEndpoint.
//
// It takes the endpoint lock and delegates to disableLocked.
func (e *endpoint) Disable() {
	e.mu.Lock()
	// Deferred so the lock is released even if disableLocked panics.
	defer e.mu.Unlock()

	e.disableLocked()
}
// disableLocked undoes the work done by Enable. It is a no-op when the endpoint
// is not enabled.
//
// Precondition: e.mu must be locked.
func (e *endpoint) disableLocked() {
	if !e.isEnabled() {
		return
	}

	// Leaving the all-systems group may fail with ErrBadLocalAddress if the
	// endpoint already left it; any other failure is unexpected.
	if err := e.leaveGroupLocked(header.IPv4AllSystems); err != nil {
		if _, alreadyLeft := err.(*tcpip.ErrBadLocalAddress); !alreadyLeft {
			panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv4AllSystems, err))
		}
	}

	// Have IGMP leave every group so routers learn that we are no longer
	// interested in them.
	e.mu.igmp.softLeaveAll()

	// Likewise the broadcast address may already be gone.
	if err := e.mu.addressableEndpointState.RemovePermanentAddress(ipv4BroadcastAddr.Address); err != nil {
		if _, alreadyRemoved := err.(*tcpip.ErrBadLocalAddress); !alreadyRemoved {
			panic(fmt.Sprintf("unexpected error when removing address = %s: %s", ipv4BroadcastAddr.Address, err))
		}
	}

	// Clear the IGMP V1 present flag. Should the node come back up on the same
	// network it will learn again that IGMPv1 is required.
	e.mu.igmp.resetV1Present()

	if changed := e.setEnabled(false); !changed {
		panic("should have only done work to disable the endpoint if it was enabled")
	}
}
// DefaultTTL returns the default time-to-live value for this endpoint, which is
// whatever the owning protocol currently reports.
func (e *endpoint) DefaultTTL() uint8 {
	ttl := e.protocol.DefaultTTL()
	return ttl
}
// MTU implements stack.NetworkEndpoint.MTU. It derives the network MTU from the
// link-layer MTU and the minimum IPv4 header size, and returns 0 when that
// calculation reports an error.
func (e *endpoint) MTU() uint32 {
	if mtu, err := calculateNetworkMTU(e.nic.MTU(), header.IPv4MinimumSize); err == nil {
		return mtu
	}
	return 0
}
// MaxHeaderLength returns the maximum number of header bytes needed by ipv4 and
// the protocols underneath it: the NIC's maximum header length plus the largest
// possible IPv4 header.
func (e *endpoint) MaxHeaderLength() uint16 {
	lowerLayers := e.nic.MaxHeaderLength()
	return lowerLayers + header.IPv4MaximumHeaderSize
}
// NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber
// by reporting the number of the owning protocol.
func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
	number := e.protocol.Number()
	return number
}
// addIPHeader pushes an IPv4 header, including any options, onto pkt and fills
// it in from the given addresses and params.
//
// It returns *tcpip.ErrMessageTooLong if the header would exceed the maximum
// IPv4 header size or the packet would not fit the 16-bit total length field.
func (e *endpoint) addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams, options header.IPv4OptionsSerializer) tcpip.Error {
	optLen := 0
	if options != nil {
		optLen = int(options.Length())
	}

	hdrLen := header.IPv4MinimumSize + optLen
	if hdrLen > header.IPv4MaximumHeaderSize {
		return &tcpip.ErrMessageTooLong{}
	}

	ip := header.IPv4(pkt.NetworkHeader().Push(hdrLen))

	// The size is measured after the header has been pushed so that it covers
	// the header as well.
	totalLen := pkt.Size()
	if totalLen > math.MaxUint16 {
		return &tcpip.ErrMessageTooLong{}
	}

	// RFC 6864 section 4.3 requires unique ID values for non-atomic datagrams.
	// The DF bit is never set here, so every datagram is non-atomic and needs
	// an ID.
	bucket := hashRoute(srcAddr, dstAddr, params.Protocol, e.protocol.hashIV) % buckets
	id := atomic.AddUint32(&e.protocol.ids[bucket], 1)

	fields := header.IPv4Fields{
		TotalLength: uint16(totalLen),
		ID:          uint16(id),
		TTL:         params.TTL,
		TOS:         params.TOS,
		Protocol:    uint8(params.Protocol),
		SrcAddr:     srcAddr,
		DstAddr:     dstAddr,
		Options:     options,
	}
	ip.Encode(&fields)
	ip.SetChecksum(^ip.CalculateChecksum())

	pkt.NetworkProtocolNumber = ProtocolNumber
	return nil
}
// handleFragments splits pkt into fragments and passes each one to handler in
// order. The IP header must already be present in pkt.
//
// It returns the number of fragments handler accepted and the number that were
// not processed. When handler fails, iteration stops, the failing fragment is
// counted among the unprocessed ones, and handler's error is returned.
func (e *endpoint) handleFragments(r *stack.Route, gso *stack.GSO, networkMTU uint32, pkt *stack.PacketBuffer, handler func(*stack.PacketBuffer) tcpip.Error) (int, int, tcpip.Error) {
	// Fragment payloads must be a multiple of 8 bytes, so round the MTU down.
	payloadSize := networkMTU &^ 7
	ipHdr := header.IPv4(pkt.NetworkHeader().View())
	reserve := pkt.AvailableHeaderBytes() + len(ipHdr)
	fragmenter := fragmentation.MakePacketFragmenter(pkt, payloadSize, reserve)

	handled := 0
	for {
		frag, hasMore := buildNextFragment(&fragmenter, ipHdr)
		if err := handler(frag); err != nil {
			return handled, fragmenter.RemainingFragmentCount() + 1, err
		}
		handled++

		if hasMore {
			continue
		}
		return handled, fragmenter.RemainingFragmentCount(), nil
	}
}
// WritePacket adds an IPv4 header to pkt, runs it through the iptables Output
// hook and then writes it to the route's destination. A packet dropped by
// iptables is not reported as an error.
func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) tcpip.Error {
	if err := e.addIPHeader(r.LocalAddress, r.RemoteAddress, pkt, params, nil /* options */); err != nil {
		return err
	}

	// Every packet that gets here was generated locally, so apply the Output
	// iptables hook.
	s := e.protocol.stack
	outNicName := s.FindNICNameFromID(e.nic.ID())
	allowed := s.IPTables().Check(stack.Output, pkt, gso, r, "" /* preroutingAddr */, "" /* inNicName */, outNicName)
	if !allowed {
		// iptables asked for the packet to be dropped.
		e.stats.ip.IPTablesOutputDropped.Increment()
		return nil
	}

	// A packet rewritten by NAT Output rules is handled according to its new
	// destination address and, if that address is local, never reaches the
	// link layer.
	//
	// TODO(gvisor.dev/issue/170): We should do this for every
	// packet, rather than only NATted packets, but removing this check
	// short circuits broadcasts before they are sent out to other hosts.
	if pkt.NatDone {
		dst := header.IPv4(pkt.NetworkHeader().View()).DestinationAddress()
		if local := e.protocol.findEndpointWithAddress(dst); local != nil {
			// We rewrote the packet ourselves and it is coming straight back
			// to us, so the checksum can be assumed valid.
			local.handleLocalPacket(pkt, true /* canSkipRXChecksum */)
			return nil
		}
	}

	return e.writePacket(r, gso, pkt, false /* headerIncluded */)
}
// writePacket delivers pkt, which must already carry its IPv4 header, as
// directed by r.Loop: looped back to this endpoint, sent out of the NIC, or
// both. Packets larger than the network MTU are fragmented before being sent.
//
// headerIncluded reports whether the IP header was supplied by the caller
// rather than generated by the stack.
func (e *endpoint) writePacket(r *stack.Route, gso *stack.GSO, pkt *stack.PacketBuffer, headerIncluded bool) tcpip.Error {
	if r.Loop&stack.PacketLoop != 0 {
		// A packet built by the stack itself (as opposed to one written with
		// its header included by a raw/packet endpoint) can be assumed to have
		// a valid checksum.
		e.handleLocalPacket(pkt, !headerIncluded /* canSkipRXChecksum */)
	}
	if r.Loop&stack.PacketOut == 0 {
		return nil
	}

	ipStats := e.stats.ip

	linkMTU := e.nic.MTU()
	networkMTU, err := calculateNetworkMTU(linkMTU, uint32(pkt.NetworkHeader().View().Size()))
	if err != nil {
		ipStats.OutgoingPacketErrors.Increment()
		return err
	}

	if !packetMustBeFragmented(pkt, networkMTU, gso) {
		if err := e.nic.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
			ipStats.OutgoingPacketErrors.Increment()
			return err
		}
		ipStats.PacketsSent.Increment()
		return nil
	}

	// TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each
	// fragment one by one using WritePacket() (current strategy) or if we
	// want to create a PacketBufferList from the fragments and feed it to
	// WritePackets(). It'll be faster but cost more memory.
	sendFragment := func(fragPkt *stack.PacketBuffer) tcpip.Error {
		return e.nic.WritePacket(r, gso, ProtocolNumber, fragPkt)
	}
	sent, remain, err := e.handleFragments(r, gso, networkMTU, pkt, sendFragment)
	ipStats.PacketsSent.IncrementBy(uint64(sent))
	ipStats.OutgoingPacketErrors.IncrementBy(uint64(remain))
	return err
}
// WritePackets implements stack.NetworkEndpoint.WritePackets.
//
// Each packet in pkts gets an IPv4 header and is fragmented in place within the
// list when it exceeds the network MTU. The list is then run through the
// iptables Output hook, NAT-ed packets now destined for a local address are
// delivered locally, and whatever remains is written to the NIC as a batch.
//
// The returned count includes packets written to the NIC, packets delivered
// locally and packets dropped by iptables. It panics if the route requests
// looping packets back.
func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, tcpip.Error) {
	if r.Loop&stack.PacketLoop != 0 {
		panic("multiple packets in local loop")
	}
	if r.Loop&stack.PacketOut == 0 {
		return pkts.Len(), nil
	}

	stats := e.stats.ip

	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
		if err := e.addIPHeader(r.LocalAddress, r.RemoteAddress, pkt, params, nil /* options */); err != nil {
			return 0, err
		}

		networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size()))
		if err != nil {
			// Nothing is sent in this case, so count every packet as an error.
			stats.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len()))
			return 0, err
		}

		if packetMustBeFragmented(pkt, networkMTU, gso) {
			// Keep track of the packet that is about to be fragmented so it can be
			// removed once the fragmentation is done.
			originalPkt := pkt
			if _, _, err := e.handleFragments(r, gso, networkMTU, pkt, func(fragPkt *stack.PacketBuffer) tcpip.Error {
				// Modify the packet list in place with the new fragments.
				//
				// pkt is advanced to the fragment just inserted so that the
				// next fragment is placed after it and the enclosing loop
				// resumes after the last fragment.
				pkts.InsertAfter(pkt, fragPkt)
				pkt = fragPkt
				return nil
			}); err != nil {
				// The handler above never returns an error, so any error here is
				// unexpected.
				panic(fmt.Sprintf("e.handleFragments(_, _, %d, _, _) = %s", networkMTU, err))
			}
			// Remove the packet that was just fragmented and process the rest.
			pkts.Remove(originalPkt)
		}
	}

	outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
	// iptables filtering. All packets that reach here are locally
	// generated.
	dropped, natPkts := e.protocol.stack.IPTables().CheckPackets(stack.Output, pkts, gso, r, "", outNicName)
	stats.IPTablesOutputDropped.IncrementBy(uint64(len(dropped)))
	for pkt := range dropped {
		pkts.Remove(pkt)
	}

	// The NAT-ed packets may now be destined for us.
	locallyDelivered := 0
	for pkt := range natPkts {
		ep := e.protocol.findEndpointWithAddress(header.IPv4(pkt.NetworkHeader().View()).DestinationAddress())
		if ep == nil {
			// The NAT-ed packet is still destined for some remote node.
			continue
		}

		// Do not send the locally destined packet out the NIC.
		pkts.Remove(pkt)

		// Deliver the packet locally.
		ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */)
		locallyDelivered++
	}

	// The rest of the packets can be delivered to the NIC as a batch.
	//
	// The length is captured before the write so that the packets that were
	// not written can be counted as errors.
	pktsLen := pkts.Len()
	written, err := e.nic.WritePackets(r, gso, pkts, ProtocolNumber)
	stats.PacketsSent.IncrementBy(uint64(written))
	stats.OutgoingPacketErrors.IncrementBy(uint64(pktsLen - written))

	// Dropped packets aren't errors, so include them in the return value.
	return locallyDelivered + written + len(dropped), err
}
// WriteHeaderIncludedPacket implements stack.NetworkEndpoint.
//
// pkt's data must start with a caller-supplied IPv4 header. The total length,
// destination address and checksum are always overwritten; the source address
// and the ID are filled in only when the caller left them zero.
//
// It returns *tcpip.ErrMalformedHeader when the header is truncated or fails
// validation, and *tcpip.ErrMessageTooLong when the packet is too large to be
// described by the 16-bit total length field.
func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) tcpip.Error {
	// The packet already has an IP header, but there are a few required
	// checks.
	h, ok := pkt.Data().PullUp(header.IPv4MinimumSize)
	if !ok {
		return &tcpip.ErrMalformedHeader{}
	}

	hdrLen := header.IPv4(h).HeaderLength()
	if hdrLen < header.IPv4MinimumSize {
		return &tcpip.ErrMalformedHeader{}
	}

	// Pull up the whole header now that its real length (including options) is
	// known.
	h, ok = pkt.Data().PullUp(int(hdrLen))
	if !ok {
		return &tcpip.ErrMalformedHeader{}
	}
	ip := header.IPv4(h)

	// Always set the total length.
	//
	// The field is only 16 bits wide, so a larger packet cannot be described
	// by it. Reject such packets instead of letting the uint16 conversion wrap
	// silently and write a total length that does not match the packet. This
	// mirrors the check addIPHeader performs for stack-generated headers.
	pktSize := pkt.Data().Size()
	if pktSize > math.MaxUint16 {
		return &tcpip.ErrMessageTooLong{}
	}
	ip.SetTotalLength(uint16(pktSize))

	// Set the source address when zero.
	if ip.SourceAddress() == header.IPv4Any {
		ip.SetSourceAddress(r.LocalAddress)
	}

	// Set the destination. If the packet already included a destination, it will
	// be part of the route anyways.
	ip.SetDestinationAddress(r.RemoteAddress)

	// Set the packet ID when zero.
	if ip.ID() == 0 {
		// RFC 6864 section 4.3 mandates uniqueness of ID values for
		// non-atomic datagrams, so assign an ID to all such datagrams
		// according to the definition given in RFC 6864 section 4.
		if ip.Flags()&header.IPv4FlagDontFragment == 0 || ip.Flags()&header.IPv4FlagMoreFragments != 0 || ip.FragmentOffset() > 0 {
			ip.SetID(uint16(atomic.AddUint32(&e.protocol.ids[hashRoute(r.LocalAddress, r.RemoteAddress, 0 /* protocol */, e.protocol.hashIV)%buckets], 1)))
		}
	}

	// Always set the checksum. It is zeroed first so that the stale value does
	// not contribute to the newly calculated one.
	ip.SetChecksum(0)
	ip.SetChecksum(^ip.CalculateChecksum())

	// Populate the packet buffer's network header and don't allow an invalid
	// packet to be sent.
	//
	// Note that parsing only makes sure that the packet is well formed as per the
	// wire format. We also want to check if the header's fields are valid before
	// sending the packet.
	if !parse.IPv4(pkt) || !header.IPv4(pkt.NetworkHeader().View()).IsValid(pktSize) {
		return &tcpip.ErrMalformedHeader{}
	}

	return e.writePacket(r, nil /* gso */, pkt, true /* headerIncluded */)
}
// forwardPacket attempts to forward a packet to its final destination.
//
// Packets whose destination belongs to another endpoint of this stack are
// handed to that endpoint; all others are re-sent on the route to the
// destination with the TTL decremented by one.
func (e *endpoint) forwardPacket(pkt *stack.PacketBuffer) tcpip.Error {
	ipHdr := header.IPv4(pkt.NetworkHeader().View())

	ttl := ipHdr.TTL()
	if ttl == 0 {
		// As per RFC 792 page 6, Time Exceeded Message,
		//
		//  If the gateway processing a datagram finds the time to live field
		//  is zero it must discard the datagram. The gateway may also notify
		//  the source host via the time exceeded message.
		return e.protocol.returnError(&icmpReasonTTLExceeded{}, pkt)
	}

	if opts := ipHdr.Options(); len(opts) != 0 {
		newOpts, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageForward{})
		if optProblem != nil {
			// Option problems are never reported locally; at most an ICMP
			// error goes back to the sender.
			if !optProblem.NeedICMP {
				return nil
			}
			_ = e.protocol.returnError(&icmpReasonParamProblem{
				pointer:    optProblem.Pointer,
				forwarding: true,
			}, pkt)
			e.protocol.stack.Stats().MalformedRcvdPackets.Increment()
			e.stats.ip.MalformedPacketsReceived.Increment()
			return nil
		}

		copied := copy(opts, newOpts)
		if copied != len(newOpts) {
			panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOpts)))
		}

		// When forwarding, every option is handled, unrecognised ones being
		// copied as they are, so the options region keeps its size. A packet
		// MAY arrive with many padding bytes after the "end of options list"
		// byte; rewrite everything past the copied options with the legal
		// padding value 0 (EOL). RFC 791 page 23 says "The padding is zero".
		padding := opts[copied:]
		for i := range padding {
			padding[i] = byte(header.IPv4OptionListEndType)
		}
	}

	dst := ipHdr.DestinationAddress()

	// Hand the packet over directly if the stack owns the destination.
	if local := e.protocol.findEndpointWithAddress(dst); local != nil {
		local.handleValidatedPacket(ipHdr, pkt)
		return nil
	}

	route, err := e.protocol.stack.FindRoute(0, "", dst, ProtocolNumber, false /* multicastLoop */)
	if err != nil {
		return err
	}
	defer route.Release()

	// WriteHeaderIncludedPacket takes ownership of the buffer it is given, but
	// pkt is not ours to give away, so work on a deep copy of the IP packet.
	fwdHdr := header.IPv4(stack.PayloadSince(pkt.NetworkHeader()))

	// As per RFC 791 page 30, Time to Live,
	//
	//   This field must be decreased at each point that the internet header
	//   is processed to reflect the time spent processing the datagram.
	//   Even if no local information is available on the time actually
	//   spent, the field must be decremented by 1.
	fwdHdr.SetTTL(ttl - 1)

	fwdPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
		ReserveHeaderBytes: int(route.MaxHeaderLength()),
		Data:               buffer.View(fwdHdr).ToVectorisedView(),
	})
	return route.WriteHeaderIncludedPacket(fwdPkt)
}
// HandlePacket is called by the link layer when new ipv4 packets arrive for
// this endpoint.
//
// Packets arriving on a non-loopback NIC are additionally checked for martian
// loopback addresses, for a source address owned by this endpoint, and against
// the iptables Prerouting hook before being processed further.
func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) {
	ipStats := e.stats.ip
	ipStats.PacketsReceived.Increment()

	if !e.isEnabled() {
		ipStats.DisabledPacketsReceived.Increment()
		return
	}

	h, ok := e.protocol.parseAndValidate(pkt)
	if !ok {
		ipStats.MalformedPacketsReceived.Increment()
		return
	}

	// Loopback traffic skips all of the checks below, including the
	// prerouting chain.
	if e.nic.IsLoopback() {
		e.handleValidatedPacket(h, pkt)
		return
	}

	if !e.protocol.options.AllowExternalLoopbackTraffic {
		switch {
		case header.IsV4LoopbackAddress(h.SourceAddress()):
			ipStats.InvalidSourceAddressesReceived.Increment()
			return
		case header.IsV4LoopbackAddress(h.DestinationAddress()):
			ipStats.InvalidDestinationAddressesReceived.Increment()
			return
		}
	}

	if e.protocol.stack.HandleLocal() {
		src := header.IPv4(pkt.NetworkHeader().View()).SourceAddress()
		if own := e.AcquireAssignedAddress(src, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint); own != nil {
			own.DecRef()

			// The source address is one of our own, so we never should have
			// gotten a packet like this unless HandleLocal is false or our NIC
			// is the loopback interface.
			ipStats.InvalidSourceAddressesReceived.Increment()
			return
		}
	}

	inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
	allowed := e.protocol.stack.IPTables().Check(stack.Prerouting, pkt, nil, nil, e.MainAddress().Address, inNicName, "" /* outNicName */)
	if !allowed {
		// iptables asked for the packet to be dropped.
		ipStats.IPTablesPreroutingDropped.Increment()
		return
	}

	e.handleValidatedPacket(h, pkt)
}
// handleLocalPacket is like HandlePacket except that it works on an inbound
// clone of pkt and performs neither the prerouting iptables hook nor the check
// for loopback traffic that originated outside of the netstack (i.e. martian
// loopback packets).
//
// canSkipRXChecksum is recorded on the clone as its
// RXTransportChecksumValidated flag.
func (e *endpoint) handleLocalPacket(pkt *stack.PacketBuffer, canSkipRXChecksum bool) {
	ipStats := e.stats.ip
	ipStats.PacketsReceived.Increment()

	inbound := pkt.CloneToInbound()
	inbound.RXTransportChecksumValidated = canSkipRXChecksum

	if h, ok := e.protocol.parseAndValidate(inbound); ok {
		e.handleValidatedPacket(h, inbound)
		return
	}
	ipStats.MalformedPacketsReceived.Increment()
}
// handleValidatedPacket processes a packet whose IPv4 header h has already been
// parsed and validated.
//
// It rejects packets with an invalid source address, forwards (when forwarding
// is enabled) or drops packets that are not addressed to this endpoint, applies
// the iptables Input hook, reassembles fragments, processes IP options and
// finally hands the packet to ICMP, IGMP or the transport dispatcher.
func (e *endpoint) handleValidatedPacket(h header.IPv4, pkt *stack.PacketBuffer) {
	pkt.NICID = e.nic.ID()
	stats := e.stats

	srcAddr := h.SourceAddress()
	dstAddr := h.DestinationAddress()

	// As per RFC 1122 section 3.2.1.3:
	//   When a host sends any datagram, the IP source address MUST
	//   be one of its own IP addresses (but not a broadcast or
	//   multicast address).
	if srcAddr == header.IPv4Broadcast || header.IsV4MulticastAddress(srcAddr) {
		stats.ip.InvalidSourceAddressesReceived.Increment()
		return
	}
	// Make sure the source address is not a subnet-local broadcast address.
	if addressEndpoint := e.AcquireAssignedAddress(srcAddr, false /* createTemp */, stack.NeverPrimaryEndpoint); addressEndpoint != nil {
		subnet := addressEndpoint.Subnet()
		addressEndpoint.DecRef()
		if subnet.IsBroadcast(srcAddr) {
			stats.ip.InvalidSourceAddressesReceived.Increment()
			return
		}
	}

	// Before we do any processing, note if the packet was received as some
	// sort of broadcast. The destination address should be an address we own
	// or a group we joined.
	if addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint); addressEndpoint != nil {
		subnet := addressEndpoint.AddressWithPrefix().Subnet()
		addressEndpoint.DecRef()
		pkt.NetworkPacketInfo.LocalAddressBroadcast = subnet.IsBroadcast(dstAddr) || dstAddr == header.IPv4Broadcast
	} else if !e.IsInGroup(dstAddr) {
		// The packet is neither for an address we own nor for a group we
		// joined: forward it if forwarding is enabled, otherwise drop it.
		if !e.protocol.Forwarding() {
			stats.ip.InvalidDestinationAddressesReceived.Increment()
			return
		}

		// The error from forwarding is deliberately discarded.
		_ = e.forwardPacket(pkt)
		return
	}

	// iptables filtering. All packets that reach here are intended for
	// this machine and will not be forwarded.
	inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
	if ok := e.protocol.stack.IPTables().Check(stack.Input, pkt, nil, nil, "" /* preroutingAddr */, inNicName, "" /* outNicName */); !ok {
		// iptables is telling us to drop the packet.
		stats.ip.IPTablesInputDropped.Increment()
		return
	}

	if h.More() || h.FragmentOffset() != 0 {
		if pkt.Data().Size()+pkt.TransportHeader().View().Size() == 0 {
			// Drop the packet as it's marked as a fragment but has
			// no payload.
			stats.ip.MalformedPacketsReceived.Increment()
			stats.ip.MalformedFragmentsReceived.Increment()
			return
		}
		if opts := h.Options(); len(opts) != 0 {
			// If there are options we need to check them before we do assembly
			// or we could be assembling errant packets. However we do not change the
			// options as that could lead to double processing later.
			if _, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageVerify{}); optProblem != nil {
				if optProblem.NeedICMP {
					_ = e.protocol.returnError(&icmpReasonParamProblem{
						pointer: optProblem.Pointer,
					}, pkt)
					e.protocol.stack.Stats().MalformedRcvdPackets.Increment()
					e.stats.ip.MalformedPacketsReceived.Increment()
				}
				return
			}
		}
		// The packet is a fragment, let's try to reassemble it.
		start := h.FragmentOffset()
		// Drop the fragment if the size of the reassembled payload would exceed the
		// maximum payload size.
		//
		// Note that this addition doesn't overflow even on 32bit architecture
		// because pkt.Data().Size() should not exceed 65535 (the max IP datagram
		// size). Otherwise the packet would've been rejected as invalid before
		// reaching here.
		if int(start)+pkt.Data().Size() > header.IPv4MaximumPayloadSize {
			stats.ip.MalformedPacketsReceived.Increment()
			stats.ip.MalformedFragmentsReceived.Increment()
			return
		}

		proto := h.Protocol()
		resPkt, _, ready, err := e.protocol.fragmentation.Process(
			// As per RFC 791 section 2.3, the identification value is unique
			// for a source-destination pair and protocol.
			fragmentation.FragmentID{
				Source:      h.SourceAddress(),
				Destination: h.DestinationAddress(),
				ID:          uint32(h.ID()),
				Protocol:    proto,
			},
			start,
			// Offset of the last byte covered by this fragment.
			start+uint16(pkt.Data().Size())-1,
			h.More(),
			proto,
			pkt,
		)
		if err != nil {
			stats.ip.MalformedPacketsReceived.Increment()
			stats.ip.MalformedFragmentsReceived.Increment()
			return
		}
		if !ready {
			// More fragments are needed before the packet can be delivered.
			return
		}
		// Continue with the reassembled packet and its header.
		pkt = resPkt
		h = header.IPv4(pkt.NetworkHeader().View())

		// The reassembler doesn't take care of fixing up the header, so we need
		// to do it here.
		h.SetTotalLength(uint16(pkt.Data().Size() + len((h))))
		h.SetFlagsFragmentOffset(0, 0)
	}
	stats.ip.PacketsDelivered.Increment()

	p := h.TransportProtocol()
	if p == header.ICMPv4ProtocolNumber {
		// TODO(gvisor.dev/issues/3810): when we sort out ICMP and transport
		// headers, the setting of the transport number here should be
		// unnecessary and removed.
		pkt.TransportProtocolNumber = p
		e.handleICMP(pkt)
		return
	}
	// ICMP handles options itself but do it here for all remaining destinations.
	var hasRouterAlertOption bool
	if opts := h.Options(); len(opts) != 0 {
		newOpts, processedOpts, optProblem := e.processIPOptions(pkt, opts, &optionUsageReceive{})
		if optProblem != nil {
			if optProblem.NeedICMP {
				_ = e.protocol.returnError(&icmpReasonParamProblem{
					pointer: optProblem.Pointer,
				}, pkt)
				e.protocol.stack.Stats().MalformedRcvdPackets.Increment()
				stats.ip.MalformedPacketsReceived.Increment()
			}
			return
		}
		hasRouterAlertOption = processedOpts.routerAlert
		// Overwrite the options in the header with the processed ones.
		copied := copy(opts, newOpts)
		if copied != len(newOpts) {
			panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOpts)))
		}
		for i := copied; i < len(opts); i++ {
			// Pad with 0 (EOL). RFC 791 page 23 says "The padding is zero".
			opts[i] = byte(header.IPv4OptionListEndType)
		}
	}
	if p == header.IGMPProtocolNumber {
		// The IGMP state lives under e.mu, so hold the lock while it handles
		// the packet.
		e.mu.Lock()
		e.mu.igmp.handleIGMP(pkt, hasRouterAlertOption)
		e.mu.Unlock()
		return
	}

	switch res := e.dispatcher.DeliverTransportPacket(p, pkt); res {
	case stack.TransportPacketHandled:
	case stack.TransportPacketDestinationPortUnreachable:
		// As per RFC: 1122 Section 3.2.2.1 A host SHOULD generate Destination
		//   Unreachable messages with code:
		//     3 (Port Unreachable), when the designated transport protocol
		//     (e.g., UDP) is unable to demultiplex the datagram but has no
		//     protocol mechanism to inform the sender.
		_ = e.protocol.returnError(&icmpReasonPortUnreachable{}, pkt)
	case stack.TransportPacketProtocolUnreachable:
		// As per RFC: 1122 Section 3.2.2.1
		//   A host SHOULD generate Destination Unreachable messages with code:
		//     2 (Protocol Unreachable), when the designated transport protocol
		//     is not supported
		_ = e.protocol.returnError(&icmpReasonProtoUnreachable{}, pkt)
	default:
		panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res))
	}
}
// Close disables the endpoint, cleans up its address state and removes it from
// the protocol's endpoint table.
func (e *endpoint) Close() {
	// Tear down the lock-protected state first.
	e.mu.Lock()
	e.disableLocked()
	e.mu.addressableEndpointState.Cleanup()
	e.mu.Unlock()

	// Deregistration from the protocol happens after e.mu has been released.
	nicID := e.nic.ID()
	e.protocol.forgetEndpoint(nicID)
}
// AddAndAcquirePermanentAddress implements stack.AddressableEndpoint.
//
// When the address is added successfully, IGMP is asked to send its queued
// reports.
func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, tcpip.Error) {
	e.mu.Lock()
	defer e.mu.Unlock()

	addressEndpoint, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, peb, configType, deprecated)
	if err != nil {
		return addressEndpoint, err
	}

	e.mu.igmp.sendQueuedReports()
	return addressEndpoint, nil
}
// RemovePermanentAddress implements stack.AddressableEndpoint by removing addr
// from the endpoint's address state while holding the endpoint lock.
func (e *endpoint) RemovePermanentAddress(addr tcpip.Address) tcpip.Error {
	e.mu.Lock()
	defer e.mu.Unlock()

	err := e.mu.addressableEndpointState.RemovePermanentAddress(addr)
	return err
}
// MainAddress implements stack.AddressableEndpoint by returning the main
// address recorded in the endpoint's address state.
func (e *endpoint) MainAddress() tcpip.AddressWithPrefix {
	e.mu.RLock()
	defer e.mu.RUnlock()

	mainAddr := e.mu.addressableEndpointState.MainAddress()
	return mainAddr
}
// AcquireAssignedAddress implements stack.AddressableEndpoint.
//
// Besides an exact match, localAddr also matches an assigned address when it is
// the broadcast address of that address's subnet or, on a loopback NIC, when it
// lies anywhere inside that subnet.
func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior) stack.AddressEndpoint {
	e.mu.Lock()
	defer e.mu.Unlock()

	onLoopback := e.nic.IsLoopback()

	// IPv4 has a notion of a subnet broadcast address and considers the
	// loopback interface bound to an address's whole subnet (on linux).
	matches := func(addressEndpoint stack.AddressEndpoint) bool {
		subnet := addressEndpoint.Subnet()
		if subnet.IsBroadcast(localAddr) {
			return true
		}
		return onLoopback && subnet.Contains(localAddr)
	}

	return e.mu.addressableEndpointState.AcquireAssignedAddressOrMatching(localAddr, matches, allowTemp, tempPEB)
}
// AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint. It takes
// the endpoint's read lock and delegates to
// acquireOutgoingPrimaryAddressRLocked.
func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint {
	e.mu.RLock()
	defer e.mu.RUnlock()

	addressEndpoint := e.acquireOutgoingPrimaryAddressRLocked(remoteAddr, allowExpired)
	return addressEndpoint
}
// acquireOutgoingPrimaryAddressRLocked is like AcquireOutgoingPrimaryAddress
// but leaves locking to the caller.
//
// Precondition: e.mu must be read locked.
func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint {
	addressEndpoint := e.mu.addressableEndpointState.AcquireOutgoingPrimaryAddress(remoteAddr, allowExpired)
	return addressEndpoint
}
// PrimaryAddresses implements stack.AddressableEndpoint by returning the
// primary addresses held in the endpoint's address state.
func (e *endpoint) PrimaryAddresses() []tcpip.AddressWithPrefix {
	e.mu.RLock()
	defer e.mu.RUnlock()

	addrs := e.mu.addressableEndpointState.PrimaryAddresses()
	return addrs
}
// PermanentAddresses implements stack.AddressableEndpoint by returning the
// permanent addresses held in the endpoint's address state.
func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix {
	e.mu.RLock()
	defer e.mu.RUnlock()

	addrs := e.mu.addressableEndpointState.PermanentAddresses()
	return addrs
}
// JoinGroup implements stack.GroupAddressableEndpoint. It takes the endpoint
// lock and delegates to joinGroupLocked.
func (e *endpoint) JoinGroup(addr tcpip.Address) tcpip.Error {
	e.mu.Lock()
	defer e.mu.Unlock()

	err := e.joinGroupLocked(addr)
	return err
}
// joinGroupLocked is like JoinGroup but with locking requirements.
//
// It returns tcpip.ErrBadAddress when addr is not an IPv4 multicast address;
// otherwise the group is joined through IGMP and nil is returned.
//
// Precondition: e.mu must be locked.
func (e *endpoint) joinGroupLocked(addr tcpip.Address) tcpip.Error {
	if header.IsV4MulticastAddress(addr) {
		e.mu.igmp.joinGroup(addr)
		return nil
	}

	// Only multicast addresses name a group that can be joined.
	return &tcpip.ErrBadAddress{}
}
// LeaveGroup implements stack.GroupAddressableEndpoint.
//
// It takes the endpoint's write lock and delegates to leaveGroupLocked.
func (e *endpoint) LeaveGroup(addr tcpip.Address) tcpip.Error {
	e.mu.Lock()
	defer e.mu.Unlock()

	err := e.leaveGroupLocked(addr)
	return err
}
// leaveGroupLocked is like LeaveGroup but with locking requirements.
//
// The result of the IGMP state's leaveGroup is returned unchanged.
//
// Precondition: e.mu must be locked.
func (e *endpoint) leaveGroupLocked(addr tcpip.Address) tcpip.Error {
	igmpState := &e.mu.igmp
	return igmpState.leaveGroup(addr)
}
// IsInGroup implements stack.GroupAddressableEndpoint.
//
// The membership is queried from the IGMP state under the endpoint's read
// lock.
func (e *endpoint) IsInGroup(addr tcpip.Address) bool {
	e.mu.RLock()
	defer e.mu.RUnlock()

	joined := e.mu.igmp.isInGroup(addr)
	return joined
}
// Stats implements stack.NetworkEndpoint.
//
// The returned value points at the endpoint's own localStats; no copy is made.
func (e *endpoint) Stats() stack.NetworkEndpointStats {
	local := &e.stats.localStats
	return local
}
// Compile-time assertions that *protocol satisfies the interfaces it is used
// as.
var (
	_ stack.ForwardingNetworkProtocol = (*protocol)(nil)
	_ stack.NetworkProtocol           = (*protocol)(nil)
	_ fragmentation.TimeoutHandler    = (*protocol)(nil)
)
// protocol holds the per-stack state of the IPv4 network protocol. Instances
// are created by the factory returned from NewProtocolWithOptions.
type protocol struct {
// stack is the stack this protocol instance was created for.
stack *stack.Stack
// mu groups the mutex with the state it protects.
mu struct {
sync.RWMutex
// eps is keyed by NICID to allow protocol methods to retrieve an endpoint
// when handling a packet, by looking at which NIC handled the packet.
eps map[tcpip.NICID]*endpoint
}
// defaultTTL is the current default TTL for the protocol. Only the
// uint8 portion of it is meaningful.
//
// Must be accessed using atomic operations.
defaultTTL uint32
// forwarding is set to 1 when the protocol has forwarding enabled and 0
// when it is disabled.
//
// Must be accessed using atomic operations.
forwarding uint32
// ids holds one randomly initialized value per identifier bucket (see
// NewProtocolWithOptions).
ids []uint32
// hashIV is a random value chosen in NewProtocolWithOptions; presumably it
// is the hashIV argument passed to hashRoute — confirm against callers.
hashIV uint32
// fragmentation is the reassembly state created in NewProtocolWithOptions
// with this protocol registered as its timeout handler.
fragmentation *fragmentation.Fragmentation
// options are the Options the protocol was created with.
options Options
}
// Number returns the ipv4 protocol number, i.e. the package-level
// ProtocolNumber constant.
func (*protocol) Number() tcpip.NetworkProtocolNumber {
	return ProtocolNumber
}
// MinimumPacketSize returns the minimum valid ipv4 packet size, which is the
// size of the smallest IPv4 header (header.IPv4MinimumSize).
func (*protocol) MinimumPacketSize() int {
	return header.IPv4MinimumSize
}
// DefaultPrefixLen returns the IPv4 default prefix length: the number of bits
// in a full IPv4 address.
func (*protocol) DefaultPrefixLen() int {
	const bitsPerByte = 8
	return header.IPv4AddressSize * bitsPerByte
}
// ParseAddresses implements NetworkProtocol.ParseAddresses.
//
// v is interpreted as an IPv4 header and its source and destination address
// fields are returned.
func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
	ipHdr := header.IPv4(v)
	src = ipHdr.SourceAddress()
	dst = ipHdr.DestinationAddress()
	return src, dst
}
// SetOption implements NetworkProtocol.SetOption.
//
// Only *tcpip.DefaultTTLOption is supported; any other option type yields
// tcpip.ErrUnknownProtocolOption.
func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) tcpip.Error {
	ttlOpt, isTTL := option.(*tcpip.DefaultTTLOption)
	if !isTTL {
		return &tcpip.ErrUnknownProtocolOption{}
	}

	p.SetDefaultTTL(uint8(*ttlOpt))
	return nil
}
// Option implements NetworkProtocol.Option.
//
// Only *tcpip.DefaultTTLOption is supported, in which case the pointed-to
// value is overwritten with the current default TTL; any other option type
// yields tcpip.ErrUnknownProtocolOption.
func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) tcpip.Error {
	ttlOpt, isTTL := option.(*tcpip.DefaultTTLOption)
	if !isTTL {
		return &tcpip.ErrUnknownProtocolOption{}
	}

	*ttlOpt = tcpip.DefaultTTLOption(p.DefaultTTL())
	return nil
}
// SetDefaultTTL sets the default TTL for endpoints created with this protocol.
//
// The value is widened to 32 bits and stored atomically.
func (p *protocol) SetDefaultTTL(ttl uint8) {
	widened := uint32(ttl)
	atomic.StoreUint32(&p.defaultTTL, widened)
}
// DefaultTTL returns the default TTL for endpoints created with this protocol.
//
// The value is loaded atomically; only its low 8 bits are returned.
func (p *protocol) DefaultTTL() uint8 {
	raw := atomic.LoadUint32(&p.defaultTTL)
	return uint8(raw)
}
// Close implements stack.NetworkProtocol.Close.
//
// It is a no-op.
func (*protocol) Close() {}
// Wait implements stack.NetworkProtocol.Wait.
//
// It is a no-op.
func (*protocol) Wait() {}
// parseAndValidate parses the packet (including its transport layer header) and
// returns the parsed IP header.
//
// The boolean result is true only if the IP header was parsed, passed the
// header validity check against the packet size and carries a correct
// checksum. Problems with the transport header do not cause a failure here.
func (p *protocol) parseAndValidate(pkt *stack.PacketBuffer) (header.IPv4, bool) {
	transProto, parseTransport, parsed := p.Parse(pkt)
	if !parsed {
		return nil, false
	}

	ipHdr := header.IPv4(pkt.NetworkHeader().View())

	// The link header is not part of the IP packet, so its size is excluded
	// when validating the IP header against the packet size.
	ipPktSize := pkt.Size() - pkt.LinkHeader().View().Size()
	if !ipHdr.IsValid(ipPktSize) {
		return nil, false
	}

	// There has been some confusion regarding verifying checksums. We need
	// just look for negative 0 (0xffff) as the checksum, as it's not possible to
	// get positive 0 (0) for the checksum. Some bad implementations could get it
	// when doing entry replacement in the early days of the Internet,
	// however the lore that one needs to check for both persists.
	//
	// RFC 1624 section 1 describes the source of this confusion as:
	//     [the partial recalculation method described in RFC 1071] computes a
	//     result for certain cases that differs from the one obtained from
	//     scratch (one's complement of one's complement sum of the original
	//     fields).
	//
	// However RFC 1624 section 5 clarifies that if using the verification method
	// "recommended by RFC 1071, it does not matter if an intermediate system
	// generated a -0 instead of +0".
	//
	// RFC1071 page 1 specifies the verification method as:
	//     (3)  To check a checksum, the 1's complement sum is computed over the
	//          same set of octets, including the checksum field.  If the result
	//          is all 1 bits (-0 in 1's complement arithmetic), the check
	//          succeeds.
	if sum := ipHdr.CalculateChecksum(); sum != 0xffff {
		return nil, false
	}

	if !parseTransport {
		return ipHdr, true
	}

	switch res := p.stack.ParsePacketBufferTransport(transProto, pkt); res {
	case stack.ParsedOK, stack.UnknownTransportProtocol, stack.TransportLayerParseError:
		// Either parsing succeeded, or the transport layer will handle unknown
		// protocols and transport layer parsing errors.
	default:
		panic(fmt.Sprintf("unexpected error parsing transport header = %d", res))
	}
	return ipHdr, true
}
// Parse implements stack.NetworkProtocol.Parse.
//
// On success it returns the transport protocol number from the IPv4 header and
// reports a transport header as present only for unfragmented packets, i.e.
// when the More Fragments flag is clear and the fragment offset is zero.
func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
	if !parse.IPv4(pkt) {
		return 0, false, false
	}

	hdr := header.IPv4(pkt.NetworkHeader().View())
	transProto := hdr.TransportProtocol()
	unfragmented := !hdr.More() && hdr.FragmentOffset() == 0
	return transProto, unfragmented, true
}
// Forwarding implements stack.ForwardingNetworkProtocol.
//
// It reports whether forwarding is enabled, i.e. whether the atomically
// loaded forwarding flag equals 1. The full 32-bit value is compared; it is
// not truncated first, so only the exact value stored by SetForwarding(true)
// is treated as enabled.
func (p *protocol) Forwarding() bool {
	return atomic.LoadUint32(&p.forwarding) == 1
}
// SetForwarding implements stack.ForwardingNetworkProtocol.
//
// The flag is stored atomically as 1 when v is true and 0 otherwise.
func (p *protocol) SetForwarding(v bool) {
	var flag uint32
	if v {
		flag = 1
	}
	atomic.StoreUint32(&p.forwarding, flag)
}
// calculateNetworkMTU calculates the network-layer payload MTU based on the
// link-layer payload mtu.
//
// It fails with tcpip.ErrInvalidEndpointState if linkMTU is below the IPv4
// minimum MTU, and with tcpip.ErrMalformedHeader if networkHeaderSize exceeds
// the maximum IPv4 header size. Otherwise the link MTU is capped at
// MaxTotalSize and the header size is subtracted from it.
func calculateNetworkMTU(linkMTU, networkHeaderSize uint32) (uint32, tcpip.Error) {
	switch {
	case linkMTU < header.IPv4MinimumMTU:
		return 0, &tcpip.ErrInvalidEndpointState{}
	case networkHeaderSize > header.IPv4MaximumHeaderSize:
		// As per RFC 791 section 3.1, an IPv4 header cannot exceed 60 bytes in
		// length:
		//   The maximal internet header is 60 octets, and a typical internet header
		//   is 20 octets, allowing a margin for headers of higher level protocols.
		return 0, &tcpip.ErrMalformedHeader{}
	}

	// The total length field cannot describe anything larger than
	// MaxTotalSize, so a bigger link MTU gives no benefit.
	cappedMTU := linkMTU
	if cappedMTU > MaxTotalSize {
		cappedMTU = MaxTotalSize
	}
	return cappedMTU - networkHeaderSize, nil
}
// packetMustBeFragmented reports whether pkt needs to be fragmented: its
// transport header plus data must exceed networkMTU, and no GSO may be in use
// (gso is nil or of type stack.GSONone).
func packetMustBeFragmented(pkt *stack.PacketBuffer, networkMTU uint32, gso *stack.GSO) bool {
	payloadSize := pkt.TransportHeader().View().Size() + pkt.Data().Size()
	if gso != nil && gso.Type != stack.GSONone {
		return false
	}
	return uint32(payloadSize) > networkMTU
}
// addressToUint32 translates an IPv4 address into its little endian uint32
// representation.
//
// This function does the same thing as binary.LittleEndian.Uint32 but operates
// on a tcpip.Address (a string) without the need to convert it to a byte slice,
// which would cause an allocation.
//
// It panics if addr is shorter than 4 bytes.
func addressToUint32(addr tcpip.Address) uint32 {
	_ = addr[3] // bounds check hint to compiler
	var (
		b0 = uint32(addr[0])
		b1 = uint32(addr[1])
		b2 = uint32(addr[2])
		b3 = uint32(addr[3])
	)
	return b3<<24 | b2<<16 | b1<<8 | b0
}
// hashRoute calculates a hash value for the given source/destination pair using
// the addresses, transport protocol number and a 32-bit number to generate the
// hash.
//
// Both addresses are converted with addressToUint32 before being hashed.
func hashRoute(srcAddr, dstAddr tcpip.Address, protocol tcpip.TransportProtocolNumber, hashIV uint32) uint32 {
	srcWord := addressToUint32(srcAddr)
	dstWord := addressToUint32(dstAddr)
	protoWord := uint32(protocol)
	return hash.Hash3Words(srcWord, dstWord, protoWord, hashIV)
}
// Options holds options to configure a new protocol.
//
// The zero value is valid and is what NewProtocol uses.
type Options struct {
// IGMP holds options for IGMP.
IGMP IGMPOptions
// AllowExternalLoopbackTraffic indicates that inbound loopback packets (i.e.
// martian loopback packets) should be accepted.
AllowExternalLoopbackTraffic bool
}
// NewProtocolWithOptions returns an IPv4 network protocol.
//
// The random ids and hash IV are generated once, when this function is
// called; every protocol built by the returned factory uses those same
// values.
func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory {
	ids := make([]uint32, buckets)

	// Randomly initialize hashIV and the ids: the first `buckets` random
	// values seed the ids and the one after them is the hash IV.
	random := hash.RandN32(1 + buckets)
	copy(ids, random)
	hashIV := random[buckets]

	return func(s *stack.Stack) stack.NetworkProtocol {
		proto := &protocol{
			stack:      s,
			ids:        ids,
			hashIV:     hashIV,
			defaultTTL: DefaultTTL,
			options:    opts,
		}
		// The protocol itself is registered as the reassembly timeout handler.
		proto.fragmentation = fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), proto)
		proto.mu.eps = make(map[tcpip.NICID]*endpoint)
		return proto
	}
}
// NewProtocol is equivalent to NewProtocolWithOptions with an empty Options.
func NewProtocol(s *stack.Stack) stack.NetworkProtocol {
	factory := NewProtocolWithOptions(Options{})
	return factory(s)
}
// buildNextFragment obtains the next fragment from pf and prepends a copy of
// originalIPHeader to it, adjusted for the fragment: the More Fragments flag is
// added when further fragments follow, the fragment offset and total length
// are set, and the header checksum is recomputed.
//
// It returns the fragment and whether more fragments remain. It panics if the
// original header could not be copied in full into the new fragment.
func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeader header.IPv4) (*stack.PacketBuffer, bool) {
	fragPkt, offset, copied, more := pf.BuildNextFragment()
	fragPkt.NetworkProtocolNumber = ProtocolNumber

	originalIPHeaderLength := len(originalIPHeader)
	nextFragIPHeader := header.IPv4(fragPkt.NetworkHeader().Push(originalIPHeaderLength))

	// headerCopied is deliberately distinct from copied (the fragment's
	// payload size, used for the total length below) to avoid shadowing it.
	if headerCopied := copy(nextFragIPHeader, originalIPHeader); headerCopied != originalIPHeaderLength {
		panic(fmt.Sprintf("wrong number of bytes copied into fragmentIPHeaders: got = %d, want = %d", headerCopied, originalIPHeaderLength))
	}

	flags := originalIPHeader.Flags()
	if more {
		flags |= header.IPv4FlagMoreFragments
	}
	nextFragIPHeader.SetFlagsFragmentOffset(flags, uint16(offset))
	nextFragIPHeader.SetTotalLength(uint16(nextFragIPHeader.HeaderLength()) + uint16(copied))

	// The checksum field must be zero while the checksum is being computed.
	nextFragIPHeader.SetChecksum(0)
	nextFragIPHeader.SetChecksum(^nextFragIPHeader.CalculateChecksum())

	return fragPkt, more
}
// optionAction describes possible actions that may be taken on an option
// while processing it.
type optionAction uint8
const (
// optionRemove says that the option should not be in the output option set.
//
// It is the zero value of optionAction, so an action left unset defaults to
// removal.
optionRemove optionAction = iota
// optionProcess says that the option should be fully processed.
optionProcess
// optionVerify says the option should be checked and passed unchanged.
optionVerify
// optionPass says to pass the output set without checking.
optionPass
)
// optionActions lists what to do for each option in a given scenario. There is
// one action per recognised option type plus one for all unknown options.
type optionActions struct {
// timestamp controls what to do with a Timestamp option.
timestamp optionAction
// recordRoute controls what to do with a Record Route option.
recordRoute optionAction
// routerAlert controls what to do with a Router Alert option.
routerAlert optionAction
// unknown controls what to do with an unknown option.
unknown optionAction
}
// optionsUsage specifies the ways options may be operated upon for a given
// scenario during packet processing.
//
// The implementations in this file are optionUsageVerify, optionUsageReceive,
// optionUsageForward and optionUsageEcho.
type optionsUsage interface {
// actions returns the action to take for each kind of option in this
// scenario.
actions() optionActions
}
// optionUsageVerify implements optionsUsage for when we just want to check
// fragments. Don't change anything, just check and reject if bad. No
// replacement options are generated.
type optionUsageVerify struct{}

// actions implements optionsUsage.
//
// Timestamp, Record Route and Router Alert are verified; unknown options are
// removed.
func (*optionUsageVerify) actions() optionActions {
	var acts optionActions
	acts.timestamp = optionVerify
	acts.recordRoute = optionVerify
	acts.routerAlert = optionVerify
	acts.unknown = optionRemove
	return acts
}
// optionUsageReceive implements optionsUsage for packets we will pass
// to the transport layer (with the exception of Echo requests).
type optionUsageReceive struct{}

// actions implements optionsUsage.
//
// Timestamp and Record Route are processed, Router Alert is verified and
// unknown options are passed through.
func (*optionUsageReceive) actions() optionActions {
	var acts optionActions
	acts.timestamp = optionProcess
	acts.recordRoute = optionProcess
	acts.routerAlert = optionVerify
	acts.unknown = optionPass
	return acts
}
// optionUsageForward implements optionsUsage for packets about to be forwarded.
// All options are passed on regardless of whether we recognise them, however
// we do process the Timestamp and Record Route options.
type optionUsageForward struct{}

// actions implements optionsUsage.
//
// Timestamp and Record Route are processed, Router Alert is verified and
// unknown options are passed through.
func (*optionUsageForward) actions() optionActions {
	var acts optionActions
	acts.timestamp = optionProcess
	acts.recordRoute = optionProcess
	acts.routerAlert = optionVerify
	acts.unknown = optionPass
	return acts
}
// optionUsageEcho implements optionsUsage for echo packet processing.
// Only Timestamp and RecordRoute are processed and sent back.
type optionUsageEcho struct{}

// actions implements optionsUsage.
//
// Timestamp and Record Route are processed, Router Alert is verified and
// unknown options are removed.
func (*optionUsageEcho) actions() optionActions {
	var acts optionActions
	acts.timestamp = optionProcess
	acts.recordRoute = optionProcess
	acts.routerAlert = optionVerify
	acts.unknown = optionRemove
	return acts
}
// handleTimestamp does any required processing on a Timestamp option
// in place.
//
// The option's flags, pointer and length are validated first. If the data area
// is already full, the overflow counter is incremented instead (except for the
// predefined-IP flavour, where nothing is done). Otherwise, when the usage's
// timestamp action is optionProcess, the option is updated with localAddress
// and the time taken from clock.
//
// It returns nil on success, or a parameter problem whose Pointer is the
// offset of the offending byte within the option.
func handleTimestamp(tsOpt header.IPv4OptionTimestamp, localAddress tcpip.Address, clock tcpip.Clock, usage optionsUsage) *header.IPv4OptParameterProblem {
flags := tsOpt.Flags()
// entrySize is the size of one slot in the data area; it depends on whether
// the entries carry an address alongside the timestamp.
var entrySize uint8
switch flags {
case header.IPv4OptionTimestampOnlyFlag:
entrySize = header.IPv4OptionTimestampSize
case
header.IPv4OptionTimestampWithIPFlag,
header.IPv4OptionTimestampWithPredefinedIPFlag:
entrySize = header.IPv4OptionTimestampWithAddrSize
default:
// Any other flag value is invalid.
return &header.IPv4OptParameterProblem{
Pointer: header.IPv4OptTSOFLWAndFLGOffset,
NeedICMP: true,
}
}
pointer := tsOpt.Pointer()
// RFC 791 page 22 states: "The smallest legal value is 5."
// Since the pointer is 1 based, and the header is 4 bytes long the
// pointer must point beyond the header therefore 4 or less is bad.
if pointer <= header.IPv4OptionTimestampHdrLength {
return &header.IPv4OptParameterProblem{
Pointer: header.IPv4OptTSPointerOffset,
NeedICMP: true,
}
}
// To simplify processing below, base further work on the array of timestamps
// beyond the header, rather than on the whole option. Also to aid
// calculations set 'nextSlot' to be 0 based as in the packet it is 1 based.
nextSlot := pointer - (header.IPv4OptionTimestampHdrLength + 1)
optLen := tsOpt.Size()
dataLength := optLen - header.IPv4OptionTimestampHdrLength
// In the section below, we verify the pointer, length and overflow counter
// fields of the option. The distinction is in which byte you return as being
// in error in the ICMP packet. Offsets 1 (length), 2 (pointer)
// or 3 (overflowed counter).
//
// The following RFC sections cover this section:
//
// RFC 791 (page 22):
// If there is some room but not enough room for a full timestamp
// to be inserted, or the overflow count itself overflows, the
// original datagram is considered to be in error and is discarded.
// In either case an ICMP parameter problem message may be sent to
// the source host [3].
//
// You can get this situation in two ways. Firstly if the data area is not
// a multiple of the entry size or secondly, if the pointer is not at a
// multiple of the entry size. The wording of the RFC suggests that
// this is not an error until you actually run out of space.
if pointer > optLen {
// RFC 791 (page 22) says we should switch to using the overflow count.
// If the timestamp data area is already full (the pointer exceeds
// the length) the datagram is forwarded without inserting the
// timestamp, but the overflow count is incremented by one.
if flags == header.IPv4OptionTimestampWithPredefinedIPFlag {
// By definition we have nothing to do.
return nil
}
// A non-zero result means the counter was incremented without wrapping.
if tsOpt.IncOverflow() != 0 {
return nil
}
// The overflow count is also full.
return &header.IPv4OptParameterProblem{
Pointer: header.IPv4OptTSOFLWAndFLGOffset,
NeedICMP: true,
}
}
if nextSlot+entrySize > dataLength {
// The data area isn't full but there isn't room for a new entry.
// Either Length or Pointer could be bad.
//
// NOTE: the 'if false' branch below is intentionally dead code. It records
// the stricter check that would distinguish a bad length from a bad
// pointer, which is not performed for the reasons given inside it.
if false {
// We must select Pointer for Linux compatibility, even if
// only the length is bad.
// The Linux code is at (in October 2020)
// https://github.com/torvalds/linux/blob/bbf5c979011a099af5dc76498918ed7df445635b/net/ipv4/ip_options.c#L367-L370
// if (optptr[2]+3 > optlen) {
// pp_ptr = optptr + 2;
// goto error;
// }
// which doesn't distinguish between which of optptr[2] or optlen
// is wrong, but just arbitrarily decides on optptr+2.
if dataLength%entrySize != 0 {
// The Data section size should be a multiple of the expected
// timestamp entry size.
return &header.IPv4OptParameterProblem{
Pointer: header.IPv4OptionLengthOffset,
NeedICMP: false,
}
}
// If the size is OK, the pointer must be corrupted.
}
return &header.IPv4OptParameterProblem{
Pointer: header.IPv4OptTSPointerOffset,
NeedICMP: true,
}
}
// The option is well formed and has room; only modify it when the usage
// asks for full processing.
if usage.actions().timestamp == optionProcess {
tsOpt.UpdateTimestamp(localAddress, clock)
}
return nil
}
// handleRecordRoute checks and processes a Record route option. It is much
// like the timestamp type 1 option, but without timestamps. The passed in
// address is stored in the option in the correct spot if possible.
//
// The option is modified in place. The address is stored only when there is
// room for it and the usage's recordRoute action is anything other than
// optionVerify; a full option is left untouched and is not an error.
//
// It returns nil on success, or a parameter problem whose Pointer is the
// offset of the offending byte within the option.
func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Address, usage optionsUsage) *header.IPv4OptParameterProblem {
optlen := rrOpt.Size()
// The option must be able to hold the header and at least one address.
if optlen < header.IPv4AddressSize+header.IPv4OptionRecordRouteHdrLength {
return &header.IPv4OptParameterProblem{
Pointer: header.IPv4OptionLengthOffset,
NeedICMP: true,
}
}
pointer := rrOpt.Pointer()
// RFC 791 page 20 states:
// The pointer is relative to this option, and the
// smallest legal value for the pointer is 4.
// Since the pointer is 1 based, and the header is 3 bytes long the
// pointer must point beyond the header therefore 3 or less is bad.
if pointer <= header.IPv4OptionRecordRouteHdrLength {
return &header.IPv4OptParameterProblem{
Pointer: header.IPv4OptRRPointerOffset,
NeedICMP: true,
}
}
// RFC 791 page 21 says
// If the route data area is already full (the pointer exceeds the
// length) the datagram is forwarded without inserting the address
// into the recorded route. If there is some room but not enough
// room for a full address to be inserted, the original datagram is
// considered to be in error and is discarded. In either case an
// ICMP parameter problem message may be sent to the source
// host.
// The use of the words "In either case" suggests that a 'full' RR option
// could generate an ICMP at every hop after it fills up. We chose to not
// do this (as do most implementations). It is probable that the inclusion
// of these words is a copy/paste error from the timestamp option where
// there are two failure reasons given.
if pointer > optlen {
return nil
}
// The data area isn't full but there isn't room for a new entry.
// Either Length or Pointer could be bad. We must select Pointer for Linux
// compatibility, even if only the length is bad. NB. pointer is 1 based.
if pointer+header.IPv4AddressSize > optlen+1 {
// NOTE: the 'if false' branch below is intentionally dead code. It records
// the stricter check that is skipped for Linux compatibility.
if false {
// This is what we would do if we were not being Linux compatible.
// Check for bad pointer or length value. Must be a multiple of 4 after
// accounting for the 3 byte header and not within that header.
// RFC 791, page 20 says:
// The pointer is relative to this option, and the
// smallest legal value for the pointer is 4.
//
// A recorded route is composed of a series of internet addresses.
// Each internet address is 32 bits or 4 octets.
// Linux skips this test so we must too. See Linux code at:
// https://github.com/torvalds/linux/blob/bbf5c979011a099af5dc76498918ed7df445635b/net/ipv4/ip_options.c#L338-L341
// if (optptr[2]+3 > optlen) {
// pp_ptr = optptr + 2;
// goto error;
// }
if (optlen-header.IPv4OptionRecordRouteHdrLength)%header.IPv4AddressSize != 0 {
// Length is bad, not on integral number of slots.
return &header.IPv4OptParameterProblem{
Pointer: header.IPv4OptionLengthOffset,
NeedICMP: true,
}
}
// If not length, the fault must be with the pointer.
}
return &header.IPv4OptParameterProblem{
Pointer: header.IPv4OptRRPointerOffset,
NeedICMP: true,
}
}
// When only verifying, the option has now been checked and must be left
// unchanged.
if usage.actions().recordRoute == optionVerify {
return nil
}
rrOpt.StoreAddress(localAddress)
return nil
}
// handleRouterAlert performs sanity checks on a Router Alert option.
//
// It returns nil when the option's value is the only permitted one, and
// otherwise a parameter problem pointing at the value field.
func handleRouterAlert(raOpt header.IPv4OptionRouterAlert) *header.IPv4OptParameterProblem {
	// Only the zero value is acceptable, as per RFC 2113, section 2.1:
	//   Value:  A two octet code with the following values:
	//     0 - Router shall examine packet
	//     1-65535 - Reserved
	if raOpt.Value() == header.IPv4OptionRouterAlertValue {
		return nil
	}

	problem := header.IPv4OptParameterProblem{
		Pointer:  header.IPv4OptionRouterAlertValueOffset,
		NeedICMP: true,
	}
	return &problem
}
// optionTracker records which of the recognised IPv4 options were encountered
// while processing a packet's options (see processIPOptions). A field is set
// when the option is seen, even if the usage then removes it from the output.
type optionTracker struct {
// timestamp is true if a Timestamp option was encountered.
timestamp bool
// recordRoute is true if a Record Route option was encountered.
recordRoute bool
// routerAlert is true if a Router Alert option was encountered.
routerAlert bool
}
// processIPOptions parses the IPv4 options and produces a new set of options
// suitable for use in the next step of packet processing as informed by usage.
// The original will not be touched.
//
// If there were no errors during parsing, the new set of options is returned as
// a new buffer.
//
// The returned optionTracker reports which recognised options were seen. A
// non-nil parameter problem describes the first error found; its Pointer is
// adjusted by the iterator's error cursor, and NeedICMP says whether an ICMP
// error should be generated for it.
func (e *endpoint) processIPOptions(pkt *stack.PacketBuffer, orig header.IPv4Options, usage optionsUsage) (header.IPv4Options, optionTracker, *header.IPv4OptParameterProblem) {
stats := e.stats.ip
opts := header.IPv4Options(orig)
optIter := opts.MakeIterator()
// Except NOP, each option must only appear at most once (RFC 791 section 3.1,
// at the definition of every type).
// Keep track of each option we find to enable duplicate option detection.
var seenOptions [math.MaxUint8 + 1]bool
// TODO(https://gvisor.dev/issue/4586): This will need tweaking when we start
// really forwarding packets as we may need to get two addresses, for rx and
// tx interfaces. We will also have to take usage into account.
localAddress := e.MainAddress().Address
if len(localAddress) == 0 {
// The endpoint has no main address; fall back to the packet's destination
// address, unless that is a broadcast or multicast address, in which case
// the options cannot be processed and no ICMP error is requested.
h := header.IPv4(pkt.NetworkHeader().View())
dstAddr := h.DestinationAddress()
if pkt.NetworkPacketInfo.LocalAddressBroadcast || header.IsV4MulticastAddress(dstAddr) {
return nil, optionTracker{}, &header.IPv4OptParameterProblem{
NeedICMP: false,
}
}
localAddress = dstAddr
}
var optionsProcessed optionTracker
for {
option, done, optProblem := optIter.Next()
if done || optProblem != nil {
// Either all options were consumed or the iterator failed to parse one;
// the iterator's problem (nil when done) is returned as is.
return optIter.Finalize(), optionsProcessed, optProblem
}
optType := option.Type()
if optType == header.IPv4OptionNOPType {
optIter.PushNOPOrEnd(optType)
continue
}
if optType == header.IPv4OptionListEndType {
// Nothing after the end-of-list marker is processed.
optIter.PushNOPOrEnd(optType)
return optIter.Finalize(), optionsProcessed, nil
}
// check for repeating options (multiple NOPs are OK)
if seenOptions[optType] {
return nil, optionTracker{}, &header.IPv4OptParameterProblem{
Pointer: optIter.ErrCursor,
NeedICMP: true,
}
}
seenOptions[optType] = true
// Handle the option according to its type. The closure returns the number
// of bytes written to the output for this option (0 when the option is
// removed) and any problem found, with Pointer relative to the option.
optLen, optProblem := func() (int, *header.IPv4OptParameterProblem) {
switch option := option.(type) {
case *header.IPv4OptionTimestamp:
stats.OptionTimestampReceived.Increment()
optionsProcessed.timestamp = true
if usage.actions().timestamp != optionRemove {
clock := e.protocol.stack.Clock()
// The handler operates on the replacement buffer obtained from the
// iterator rather than on the option just read.
newBuffer := optIter.InitReplacement(option)
optProblem := handleTimestamp(header.IPv4OptionTimestamp(newBuffer), localAddress, clock, usage)
return len(newBuffer), optProblem
}
case *header.IPv4OptionRecordRoute:
stats.OptionRecordRouteReceived.Increment()
optionsProcessed.recordRoute = true
if usage.actions().recordRoute != optionRemove {
newBuffer := optIter.InitReplacement(option)
optProblem := handleRecordRoute(header.IPv4OptionRecordRoute(newBuffer), localAddress, usage)
return len(newBuffer), optProblem
}
case *header.IPv4OptionRouterAlert:
stats.OptionRouterAlertReceived.Increment()
optionsProcessed.routerAlert = true
if usage.actions().routerAlert != optionRemove {
newBuffer := optIter.InitReplacement(option)
optProblem := handleRouterAlert(header.IPv4OptionRouterAlert(newBuffer))
return len(newBuffer), optProblem
}
default:
stats.OptionUnknownReceived.Increment()
if usage.actions().unknown == optionPass {
return len(optIter.InitReplacement(option)), nil
}
}
// The option is being removed: nothing is added to the output.
return 0, nil
}()
if optProblem != nil {
// Convert the option-relative pointer by adding the iterator's error
// cursor.
optProblem.Pointer += optIter.ErrCursor
return nil, optionTracker{}, optProblem
}
optIter.ConsumeBuffer(optLen)
}
}