blob: 6d95e16642ddba3613a8cbf68ed3c7b5d6828f06 [file] [log] [blame]
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package stack
import (
"fmt"
"sync"
"time"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/header"
)
const (
// immediateDuration is a duration of zero for scheduling work that needs to
// be done immediately but asynchronously to avoid deadlock.
immediateDuration time.Duration = 0
)
// NeighborEntry describes a neighboring device in the local network.
type NeighborEntry struct {
Addr tcpip.Address
LinkAddr tcpip.LinkAddress
State NeighborState
UpdatedAtNanos int64
}
// NeighborState defines the state of a NeighborEntry within the Neighbor
// Unreachability Detection state machine, as per RFC 4861 section 7.3.2 and
// RFC 7048.
type NeighborState uint8
const (
// Unknown means reachability has not been verified yet. This is the initial
// state of entries that have been created automatically by the Neighbor
// Unreachability Detection state machine.
Unknown NeighborState = iota
// Incomplete means that there is an outstanding request to resolve the
// address.
Incomplete
// Reachable means the path to the neighbor is functioning properly for both
// receive and transmit paths.
Reachable
// Stale means reachability to the neighbor is unknown, but packets are still
// able to be transmitted to the possibly stale link address.
Stale
// Delay means reachability to the neighbor is unknown and pending
// confirmation from an upper-level protocol like TCP, but packets are still
// able to be transmitted to the possibly stale link address.
Delay
// Probe means a reachability confirmation is actively being sought by
// periodically retransmitting reachability probes until a reachability
// confirmation is received, or until the maximum number of probes has been
// sent.
Probe
// Static describes entries that have been explicitly added by the user. They
// do not expire and are not deleted until explicitly removed.
Static
// Unreachable means reachability confirmation failed; the maximum number of
// reachability probes has been sent and no replies have been received.
//
// TODO(gvisor.dev/issue/5472): Add the following sentence when we implement
// RFC 7048: "Packets continue to be sent to the neighbor while
// re-attempting to resolve the address."
Unreachable
)
type timer struct {
// done indicates to the timer that the timer was stopped.
done *bool
timer tcpip.Timer
}
// neighborEntry implements a neighbor entry's individual node behavior, as per
// RFC 4861 section 7.3.3. Neighbor Unreachability Detection operates in
// parallel with the sending of packets to a neighbor, necessitating the
// entry's lock to be acquired for all operations.
type neighborEntry struct {
neighborEntryEntry
cache *neighborCache
// nudState points to the Neighbor Unreachability Detection configuration.
nudState *NUDState
mu struct {
sync.RWMutex
neigh NeighborEntry
// done is closed when address resolution is complete. It is nil iff s is
// incomplete and resolution is not yet in progress.
done chan struct{}
// onResolve is called with the result of address resolution.
onResolve []func(LinkResolutionResult)
isRouter bool
timer timer
}
}
// newNeighborEntry creates a neighbor cache entry starting at the default
// state, Unknown. Transition out of Unknown by calling either
// `handlePacketQueuedLocked` or `handleProbeLocked` on the newly created
// neighborEntry.
func newNeighborEntry(cache *neighborCache, remoteAddr tcpip.Address, nudState *NUDState) *neighborEntry {
n := &neighborEntry{
cache: cache,
nudState: nudState,
}
n.mu.Lock()
n.mu.neigh = NeighborEntry{
Addr: remoteAddr,
State: Unknown,
}
n.mu.Unlock()
return n
}
// newStaticNeighborEntry creates a neighbor cache entry starting at the
// Static state. The entry can only transition out of Static by directly
// calling `setStateLocked`.
func newStaticNeighborEntry(cache *neighborCache, addr tcpip.Address, linkAddr tcpip.LinkAddress, state *NUDState) *neighborEntry {
entry := NeighborEntry{
Addr: addr,
LinkAddr: linkAddr,
State: Static,
UpdatedAtNanos: cache.nic.stack.clock.NowNanoseconds(),
}
n := &neighborEntry{
cache: cache,
nudState: state,
}
n.mu.Lock()
n.mu.neigh = entry
n.mu.Unlock()
return n
}
// notifyCompletionLocked notifies those waiting for address resolution, with
// the link address if resolution completed successfully.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) notifyCompletionLocked(err tcpip.Error) {
res := LinkResolutionResult{LinkAddress: e.mu.neigh.LinkAddr, Err: err}
for _, callback := range e.mu.onResolve {
callback(res)
}
e.mu.onResolve = nil
if ch := e.mu.done; ch != nil {
close(ch)
e.mu.done = nil
// Dequeue the pending packets in a new goroutine to not hold up the current
// goroutine as writing packets may be a costly operation.
//
// At the time of writing, when writing packets, a neighbor's link address
// is resolved (which ends up obtaining the entry's lock) while holding the
// link resolution queue's lock. Dequeuing packets in a new goroutine avoids
// a lock ordering violation.
go e.cache.nic.linkResQueue.dequeue(ch, e.mu.neigh.LinkAddr, err)
}
}
// dispatchAddEventLocked signals to stack's NUD Dispatcher that the entry has
// been added.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) dispatchAddEventLocked() {
if nudDisp := e.cache.nic.stack.nudDisp; nudDisp != nil {
nudDisp.OnNeighborAdded(e.cache.nic.id, e.mu.neigh)
}
}
// dispatchChangeEventLocked signals to stack's NUD Dispatcher that the entry
// has changed state or link-layer address.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) dispatchChangeEventLocked() {
if nudDisp := e.cache.nic.stack.nudDisp; nudDisp != nil {
nudDisp.OnNeighborChanged(e.cache.nic.id, e.mu.neigh)
}
}
// dispatchRemoveEventLocked signals to stack's NUD Dispatcher that the entry
// has been removed.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) dispatchRemoveEventLocked() {
if nudDisp := e.cache.nic.stack.nudDisp; nudDisp != nil {
nudDisp.OnNeighborRemoved(e.cache.nic.id, e.mu.neigh)
}
}
// cancelTimerLocked cancels the currently scheduled action, if there is one.
// Entries in Unknown, Stale, or Static state do not have a scheduled action.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) cancelTimerLocked() {
if e.mu.timer.timer != nil {
e.mu.timer.timer.Stop()
*e.mu.timer.done = true
e.mu.timer = timer{}
}
}
// removeLocked prepares the entry for removal.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) removeLocked() {
e.mu.neigh.UpdatedAtNanos = e.cache.nic.stack.clock.NowNanoseconds()
e.dispatchRemoveEventLocked()
e.cancelTimerLocked()
// TODO(https://gvisor.dev/issues/5583): test the case where this function is
// called during resolution; that can happen in at least these scenarios:
//
// - manual address removal during resolution
//
// - neighbor cache eviction during resolution
e.notifyCompletionLocked(&tcpip.ErrAborted{})
}
// setStateLocked transitions the entry to the specified state immediately.
//
// Follows the logic defined in RFC 4861 section 7.3.3.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) setStateLocked(next NeighborState) {
e.cancelTimerLocked()
prev := e.mu.neigh.State
e.mu.neigh.State = next
e.mu.neigh.UpdatedAtNanos = e.cache.nic.stack.clock.NowNanoseconds()
config := e.nudState.Config()
switch next {
case Incomplete:
panic(fmt.Sprintf("should never transition to Incomplete with setStateLocked; neigh = %#v, prev state = %s", e.mu.neigh, prev))
case Reachable:
// Protected by e.mu.
done := false
e.mu.timer = timer{
done: &done,
timer: e.cache.nic.stack.Clock().AfterFunc(e.nudState.ReachableTime(), func() {
e.mu.Lock()
defer e.mu.Unlock()
if done {
// The timer was stopped because the entry changed state.
return
}
e.setStateLocked(Stale)
e.dispatchChangeEventLocked()
}),
}
case Delay:
// Protected by e.mu.
done := false
e.mu.timer = timer{
done: &done,
timer: e.cache.nic.stack.Clock().AfterFunc(config.DelayFirstProbeTime, func() {
e.mu.Lock()
defer e.mu.Unlock()
if done {
// The timer was stopped because the entry changed state.
return
}
e.setStateLocked(Probe)
e.dispatchChangeEventLocked()
}),
}
case Probe:
// Protected by e.mu.
done := false
remaining := config.MaxUnicastProbes
addr := e.mu.neigh.Addr
linkAddr := e.mu.neigh.LinkAddr
// Send a probe in another gorountine to free this thread of execution
// for finishing the state transition. This is necessary to escape the
// currently held lock so we can send the probe message without holding
// a shared lock.
e.mu.timer = timer{
done: &done,
timer: e.cache.nic.stack.Clock().AfterFunc(0, func() {
var err tcpip.Error = &tcpip.ErrTimeout{}
if remaining != 0 {
err = e.cache.linkRes.LinkAddressRequest(addr, "" /* localAddr */, linkAddr)
}
e.mu.Lock()
defer e.mu.Unlock()
if done {
// The timer was stopped because the entry changed state.
return
}
if err != nil {
e.setStateLocked(Unreachable)
e.notifyCompletionLocked(err)
e.dispatchChangeEventLocked()
return
}
remaining--
e.mu.timer.timer.Reset(config.RetransmitTimer)
}),
}
case Unreachable:
case Unknown, Stale, Static:
// Do nothing
default:
panic(fmt.Sprintf("Invalid state transition from %q to %q", prev, next))
}
}
// handlePacketQueuedLocked advances the state machine according to a packet
// being queued for outgoing transmission.
//
// Follows the logic defined in RFC 4861 section 7.3.3.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) handlePacketQueuedLocked(localAddr tcpip.Address) {
switch e.mu.neigh.State {
case Unknown, Unreachable:
prev := e.mu.neigh.State
e.mu.neigh.State = Incomplete
e.mu.neigh.UpdatedAtNanos = e.cache.nic.stack.clock.NowNanoseconds()
switch prev {
case Unknown:
e.dispatchAddEventLocked()
case Unreachable:
e.dispatchChangeEventLocked()
e.cache.nic.stats.Neighbor.UnreachableEntryLookups.Increment()
}
config := e.nudState.Config()
// Protected by e.mu.
done := false
remaining := config.MaxMulticastProbes
addr := e.mu.neigh.Addr
// Send a probe in another gorountine to free this thread of execution
// for finishing the state transition. This is necessary to escape the
// currently held lock so we can send the probe message without holding
// a shared lock.
e.mu.timer = timer{
done: &done,
timer: e.cache.nic.stack.Clock().AfterFunc(0, func() {
var err tcpip.Error = &tcpip.ErrTimeout{}
if remaining != 0 {
// As per RFC 4861 section 7.2.2:
//
// If the source address of the packet prompting the solicitation is
// the same as one of the addresses assigned to the outgoing interface,
// that address SHOULD be placed in the IP Source Address of the
// outgoing solicitation.
//
err = e.cache.linkRes.LinkAddressRequest(addr, localAddr, "" /* linkAddr */)
}
e.mu.Lock()
defer e.mu.Unlock()
if done {
// The timer was stopped because the entry changed state.
return
}
if err != nil {
e.setStateLocked(Unreachable)
e.notifyCompletionLocked(err)
e.dispatchChangeEventLocked()
return
}
remaining--
e.mu.timer.timer.Reset(config.RetransmitTimer)
}),
}
case Stale:
e.setStateLocked(Delay)
e.dispatchChangeEventLocked()
case Incomplete, Reachable, Delay, Probe, Static:
// Do nothing
default:
panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State))
}
}
// handleProbeLocked processes an incoming neighbor probe (e.g. ARP request or
// Neighbor Solicitation for ARP or NDP, respectively).
//
// Follows the logic defined in RFC 4861 section 7.2.3.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) handleProbeLocked(remoteLinkAddr tcpip.LinkAddress) {
// Probes MUST be silently discarded if the target address is tentative, does
// not exist, or not bound to the NIC as per RFC 4861 section 7.2.3. These
// checks MUST be done by the NetworkEndpoint.
switch e.mu.neigh.State {
case Unknown:
e.mu.neigh.LinkAddr = remoteLinkAddr
e.setStateLocked(Stale)
e.dispatchAddEventLocked()
case Incomplete:
// "If an entry already exists, and the cached link-layer address
// differs from the one in the received Source Link-Layer option, the
// cached address should be replaced by the received address, and the
// entry's reachability state MUST be set to STALE."
// - RFC 4861 section 7.2.3
e.mu.neigh.LinkAddr = remoteLinkAddr
e.setStateLocked(Stale)
e.notifyCompletionLocked(nil)
e.dispatchChangeEventLocked()
case Reachable, Delay, Probe:
if e.mu.neigh.LinkAddr != remoteLinkAddr {
e.mu.neigh.LinkAddr = remoteLinkAddr
e.setStateLocked(Stale)
e.dispatchChangeEventLocked()
}
case Stale:
if e.mu.neigh.LinkAddr != remoteLinkAddr {
e.mu.neigh.LinkAddr = remoteLinkAddr
e.dispatchChangeEventLocked()
}
case Unreachable:
// TODO(gvisor.dev/issue/5472): Do not change the entry if the link
// address is the same, as per RFC 7048.
e.mu.neigh.LinkAddr = remoteLinkAddr
e.setStateLocked(Stale)
e.dispatchChangeEventLocked()
case Static:
// Do nothing
default:
panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State))
}
}
// handleConfirmationLocked processes an incoming neighbor confirmation
// (e.g. ARP reply or Neighbor Advertisement for ARP or NDP, respectively).
//
// Follows the state machine defined by RFC 4861 section 7.2.5.
//
// TODO(gvisor.dev/issue/2277): To protect against ARP poisoning and other
// attacks against NDP functions, Secure Neighbor Discovery (SEND) Protocol
// should be deployed where preventing access to the broadcast segment might
// not be possible. SEND uses RSA key pairs to produce Cryptographically
// Generated Addresses (CGA), as defined in RFC 3972. This ensures that the
// claimed source of an NDP message is the owner of the claimed address.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, flags ReachabilityConfirmationFlags) {
switch e.mu.neigh.State {
case Incomplete:
if len(linkAddr) == 0 {
// "If the link layer has addresses and no Target Link-Layer Address
// option is included, the receiving node SHOULD silently discard the
// received advertisement." - RFC 4861 section 7.2.5
break
}
e.mu.neigh.LinkAddr = linkAddr
if flags.Solicited {
e.setStateLocked(Reachable)
} else {
e.setStateLocked(Stale)
}
e.dispatchChangeEventLocked()
e.mu.isRouter = flags.IsRouter
e.notifyCompletionLocked(nil)
// "Note that the Override flag is ignored if the entry is in the
// INCOMPLETE state." - RFC 4861 section 7.2.5
case Reachable, Stale, Delay, Probe:
isLinkAddrDifferent := len(linkAddr) != 0 && e.mu.neigh.LinkAddr != linkAddr
if isLinkAddrDifferent {
if !flags.Override {
if e.mu.neigh.State == Reachable {
e.setStateLocked(Stale)
e.dispatchChangeEventLocked()
}
break
}
e.mu.neigh.LinkAddr = linkAddr
if !flags.Solicited {
if e.mu.neigh.State != Stale {
e.setStateLocked(Stale)
e.dispatchChangeEventLocked()
} else {
// Notify the LinkAddr change, even though NUD state hasn't changed.
e.dispatchChangeEventLocked()
}
break
}
}
if flags.Solicited && (flags.Override || !isLinkAddrDifferent) {
wasReachable := e.mu.neigh.State == Reachable
// Set state to Reachable again to refresh timers.
e.setStateLocked(Reachable)
e.notifyCompletionLocked(nil)
if !wasReachable {
e.dispatchChangeEventLocked()
}
}
if e.mu.isRouter && !flags.IsRouter && header.IsV6UnicastAddress(e.mu.neigh.Addr) {
// "In those cases where the IsRouter flag changes from TRUE to FALSE as
// a result of this update, the node MUST remove that router from the
// Default Router List and update the Destination Cache entries for all
// destinations using that neighbor as a router as specified in Section
// 7.3.3. This is needed to detect when a node that is used as a router
// stops forwarding packets due to being configured as a host."
// - RFC 4861 section 7.2.5
//
// TODO(gvisor.dev/issue/4085): Remove the special casing we do for IPv6
// here.
ep, ok := e.cache.nic.networkEndpoints[header.IPv6ProtocolNumber]
if !ok {
panic(fmt.Sprintf("have a neighbor entry for an IPv6 router but no IPv6 network endpoint"))
}
if ndpEP, ok := ep.(NDPEndpoint); ok {
ndpEP.InvalidateDefaultRouter(e.mu.neigh.Addr)
}
}
e.mu.isRouter = flags.IsRouter
case Unknown, Unreachable, Static:
// Do nothing
default:
panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State))
}
}
// handleUpperLevelConfirmationLocked processes an incoming upper-level protocol
// (e.g. TCP acknowledgements) reachability confirmation.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) handleUpperLevelConfirmationLocked() {
switch e.mu.neigh.State {
case Reachable, Stale, Delay, Probe:
wasReachable := e.mu.neigh.State == Reachable
// Set state to Reachable again to refresh timers.
e.setStateLocked(Reachable)
if !wasReachable {
e.dispatchChangeEventLocked()
}
case Unknown, Incomplete, Unreachable, Static:
// Do nothing
default:
panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State))
}
}