| // Copyright 2018 The gVisor Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // +build linux |
| |
| // Package fdbased provides the implementation of data-link layer endpoints |
| // backed by boundary-preserving file descriptors (e.g., TUN devices, |
| // seqpacket/datagram sockets). |
| // |
| // FD based endpoints can be used in the networking stack by calling New() to |
| // create a new endpoint, and then passing it as an argument to |
| // Stack.CreateNIC(). |
| package fdbased |
| |
| import ( |
| "fmt" |
| "syscall" |
| |
| "github.com/google/netstack/tcpip" |
| "github.com/google/netstack/tcpip/buffer" |
| "github.com/google/netstack/tcpip/header" |
| "github.com/google/netstack/tcpip/link/rawfile" |
| "github.com/google/netstack/tcpip/stack" |
| ) |
| |
| // linkDispatcher reads packets from the link FD and dispatches them to the |
| // NetworkDispatcher. |
| type linkDispatcher interface { |
| dispatch() (bool, *tcpip.Error) |
| } |
| |
// PacketDispatchMode selects which of the supported mechanisms is used to
// receive packets from the underlying FD and dispatch them.
type PacketDispatchMode int
| |
| const ( |
| // Readv is the default dispatch mode and is the least performant of the |
| // dispatch options but the one that is supported by all underlying FD |
| // types. |
| Readv PacketDispatchMode = iota |
| // RecvMMsg enables use of recvmmsg() syscall instead of readv() to |
| // read inbound packets. This reduces # of syscalls needed to process |
| // packets. |
| // |
| // NOTE: recvmmsg() is only supported for sockets, so if the underlying |
| // FD is not a socket then the code will still fall back to the readv() |
| // path. |
| RecvMMsg |
| // PacketMMap enables use of PACKET_RX_RING to receive packets from the |
| // NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The |
| // primary use-case for this is runsc which uses an AF_PACKET FD to |
| // receive packets from the veth device. |
| PacketMMap |
| ) |
| |
| type endpoint struct { |
| // fd is the file descriptor used to send and receive packets. |
| fd int |
| |
| // mtu (maximum transmission unit) is the maximum size of a packet. |
| mtu uint32 |
| |
| // hdrSize specifies the link-layer header size. If set to 0, no header |
| // is added/removed; otherwise an ethernet header is used. |
| hdrSize int |
| |
| // addr is the address of the endpoint. |
| addr tcpip.LinkAddress |
| |
| // caps holds the endpoint capabilities. |
| caps stack.LinkEndpointCapabilities |
| |
| // closed is a function to be called when the FD's peer (if any) closes |
| // its end of the communication pipe. |
| closed func(*tcpip.Error) |
| |
| inboundDispatcher linkDispatcher |
| dispatcher stack.NetworkDispatcher |
| |
| // packetDispatchMode controls the packet dispatcher used by this |
| // endpoint. |
| packetDispatchMode PacketDispatchMode |
| |
| // gsoMaxSize is the maximum GSO packet size. It is zero if GSO is |
| // disabled. |
| gsoMaxSize uint32 |
| } |
| |
| // Options specify the details about the fd-based endpoint to be created. |
| type Options struct { |
| FD int |
| MTU uint32 |
| EthernetHeader bool |
| ClosedFunc func(*tcpip.Error) |
| Address tcpip.LinkAddress |
| SaveRestore bool |
| DisconnectOk bool |
| GSOMaxSize uint32 |
| PacketDispatchMode PacketDispatchMode |
| TXChecksumOffload bool |
| RXChecksumOffload bool |
| } |
| |
| // New creates a new fd-based endpoint. |
| // |
| // Makes fd non-blocking, but does not take ownership of fd, which must remain |
| // open for the lifetime of the returned endpoint. |
| func New(opts *Options) (tcpip.LinkEndpointID, error) { |
| if err := syscall.SetNonblock(opts.FD, true); err != nil { |
| return 0, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", opts.FD, err) |
| } |
| |
| caps := stack.LinkEndpointCapabilities(0) |
| if opts.RXChecksumOffload { |
| caps |= stack.CapabilityRXChecksumOffload |
| } |
| |
| if opts.TXChecksumOffload { |
| caps |= stack.CapabilityTXChecksumOffload |
| } |
| |
| hdrSize := 0 |
| if opts.EthernetHeader { |
| hdrSize = header.EthernetMinimumSize |
| caps |= stack.CapabilityResolutionRequired |
| } |
| |
| if opts.SaveRestore { |
| caps |= stack.CapabilitySaveRestore |
| } |
| |
| if opts.DisconnectOk { |
| caps |= stack.CapabilityDisconnectOk |
| } |
| |
| e := &endpoint{ |
| fd: opts.FD, |
| mtu: opts.MTU, |
| caps: caps, |
| closed: opts.ClosedFunc, |
| addr: opts.Address, |
| hdrSize: hdrSize, |
| packetDispatchMode: opts.PacketDispatchMode, |
| } |
| |
| isSocket, err := isSocketFD(e.fd) |
| if err != nil { |
| return 0, err |
| } |
| if isSocket { |
| if opts.GSOMaxSize != 0 { |
| e.caps |= stack.CapabilityGSO |
| e.gsoMaxSize = opts.GSOMaxSize |
| } |
| } |
| e.inboundDispatcher, err = createInboundDispatcher(e, isSocket) |
| if err != nil { |
| return 0, fmt.Errorf("createInboundDispatcher(...) = %v", err) |
| } |
| |
| return stack.RegisterLinkEndpoint(e), nil |
| } |
| |
| func createInboundDispatcher(e *endpoint, isSocket bool) (linkDispatcher, error) { |
| // By default use the readv() dispatcher as it works with all kinds of |
| // FDs (tap/tun/unix domain sockets and af_packet). |
| inboundDispatcher, err := newReadVDispatcher(e.fd, e) |
| if err != nil { |
| return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", e.fd, e, err) |
| } |
| |
| if isSocket { |
| switch e.packetDispatchMode { |
| case PacketMMap: |
| inboundDispatcher, err = newPacketMMapDispatcher(e.fd, e) |
| if err != nil { |
| return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", e.fd, e, err) |
| } |
| case RecvMMsg: |
| // If the provided FD is a socket then we optimize |
| // packet reads by using recvmmsg() instead of read() to |
| // read packets in a batch. |
| inboundDispatcher, err = newRecvMMsgDispatcher(e.fd, e) |
| if err != nil { |
| return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", e.fd, e, err) |
| } |
| } |
| } |
| return inboundDispatcher, nil |
| } |
| |
// isSocketFD reports whether fd refers to a socket.
//
// It calls fstat() on the descriptor and compares the file-type bits of the
// returned mode with S_IFSOCK. A non-nil error is returned, together with
// false, when fstat() itself fails (for example because fd is not open).
func isSocketFD(fd int) (bool, error) {
	var stat syscall.Stat_t
	if err := syscall.Fstat(fd, &stat); err != nil {
		return false, fmt.Errorf("syscall.Fstat(%v,...) failed: %v", fd, err)
	}
	// Isolate the file-type field with S_IFMT before comparing. The S_IF*
	// values are an enumeration inside that field, not independent flag
	// bits, so testing "mode & S_IFSOCK == S_IFSOCK" would match any type
	// whose encoding happens to contain both of S_IFSOCK's bits.
	return stat.Mode&syscall.S_IFMT == syscall.S_IFSOCK, nil
}
| |
| // Attach launches the goroutine that reads packets from the file descriptor and |
| // dispatches them via the provided dispatcher. |
| func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { |
| e.dispatcher = dispatcher |
| // Link endpoints are not savable. When transportation endpoints are |
| // saved, they stop sending outgoing packets and all incoming packets |
| // are rejected. |
| go e.dispatchLoop() |
| } |
| |
| // IsAttached implements stack.LinkEndpoint.IsAttached. |
| func (e *endpoint) IsAttached() bool { |
| return e.dispatcher != nil |
| } |
| |
| // MTU implements stack.LinkEndpoint.MTU. It returns the value initialized |
| // during construction. |
| func (e *endpoint) MTU() uint32 { |
| return e.mtu |
| } |
| |
| // Capabilities implements stack.LinkEndpoint.Capabilities. |
| func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { |
| return e.caps |
| } |
| |
| // MaxHeaderLength returns the maximum size of the link-layer header. |
| func (e *endpoint) MaxHeaderLength() uint16 { |
| return uint16(e.hdrSize) |
| } |
| |
| // LinkAddress returns the link address of this endpoint. |
| func (e *endpoint) LinkAddress() tcpip.LinkAddress { |
| return e.addr |
| } |
| |
// virtioNetHdr mirrors the header declared in linux/virtio_net.h. WritePacket
// fills one in and writes it ahead of every packet when the endpoint has the
// GSO capability. The field order is significant because the struct is turned
// into a byte slice for writing.
type virtioNetHdr struct {
	// flags is set to _VIRTIO_NET_HDR_F_NEEDS_CSUM when a checksum is still
	// needed for the packet.
	flags uint8
	// gsoType is one of the _VIRTIO_NET_HDR_GSO_* values, or zero when the
	// packet is not segmented.
	gsoType uint8
	// hdrLen is set to the number of used header bytes of the packet.
	hdrLen uint16
	// gsoSize is set to the MSS when a GSO type is selected.
	gsoSize uint16
	// csumStart and csumOffset are filled in together with the NEEDS_CSUM
	// flag.
	csumStart  uint16
	csumOffset uint16
}
| |
// Flag and GSO-type values for virtioNetHdr, as declared in
// linux/virtio_net.h.
const (
	// _VIRTIO_NET_HDR_F_NEEDS_CSUM is stored in virtioNetHdr.flags when the
	// packet's GSO descriptor has NeedsCsum set.
	_VIRTIO_NET_HDR_F_NEEDS_CSUM = 1

	// Values stored in virtioNetHdr.gsoType for stack.GSOTCPv4 and
	// stack.GSOTCPv6 respectively.
	_VIRTIO_NET_HDR_GSO_TCPV4 = 1
	_VIRTIO_NET_HDR_GSO_TCPV6 = 4
)
| |
| // WritePacket writes outbound packets to the file descriptor. If it is not |
| // currently writable, the packet is dropped. |
| func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error { |
| if e.hdrSize > 0 { |
| // Add ethernet header if needed. |
| eth := header.Ethernet(hdr.Prepend(header.EthernetMinimumSize)) |
| ethHdr := &header.EthernetFields{ |
| DstAddr: r.RemoteLinkAddress, |
| Type: protocol, |
| } |
| |
| // Preserve the src address if it's set in the route. |
| if r.LocalLinkAddress != "" { |
| ethHdr.SrcAddr = r.LocalLinkAddress |
| } else { |
| ethHdr.SrcAddr = e.addr |
| } |
| eth.Encode(ethHdr) |
| } |
| |
| if e.Capabilities()&stack.CapabilityGSO != 0 { |
| vnetHdr := virtioNetHdr{} |
| vnetHdrBuf := vnetHdrToByteSlice(&vnetHdr) |
| if gso != nil { |
| vnetHdr.hdrLen = uint16(hdr.UsedLength()) |
| if gso.NeedsCsum { |
| vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM |
| vnetHdr.csumStart = header.EthernetMinimumSize + gso.L3HdrLen |
| vnetHdr.csumOffset = gso.CsumOffset |
| } |
| if gso.Type != stack.GSONone && uint16(payload.Size()) > gso.MSS { |
| switch gso.Type { |
| case stack.GSOTCPv4: |
| vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4 |
| case stack.GSOTCPv6: |
| vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6 |
| default: |
| panic(fmt.Sprintf("Unknown gso type: %v", gso.Type)) |
| } |
| vnetHdr.gsoSize = gso.MSS |
| } |
| } |
| |
| return rawfile.NonBlockingWrite3(e.fd, vnetHdrBuf, hdr.View(), payload.ToView()) |
| } |
| |
| if payload.Size() == 0 { |
| return rawfile.NonBlockingWrite(e.fd, hdr.View()) |
| } |
| |
| return rawfile.NonBlockingWrite3(e.fd, hdr.View(), payload.ToView(), nil) |
| } |
| |
| // WriteRawPacket writes a raw packet directly to the file descriptor. |
| func (e *endpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error { |
| return rawfile.NonBlockingWrite(e.fd, packet) |
| } |
| |
| // dispatchLoop reads packets from the file descriptor in a loop and dispatches |
| // them to the network stack. |
| func (e *endpoint) dispatchLoop() *tcpip.Error { |
| for { |
| cont, err := e.inboundDispatcher.dispatch() |
| if err != nil || !cont { |
| if e.closed != nil { |
| e.closed(err) |
| } |
| return err |
| } |
| } |
| } |
| |
| // GSOMaxSize returns the maximum GSO packet size. |
| func (e *endpoint) GSOMaxSize() uint32 { |
| return e.gsoMaxSize |
| } |
| |
| // InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes |
| // to the FD, but does not read from it. All reads come from injected packets. |
| type InjectableEndpoint struct { |
| endpoint |
| |
| dispatcher stack.NetworkDispatcher |
| } |
| |
| // Attach saves the stack network-layer dispatcher for use later when packets |
| // are injected. |
| func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) { |
| e.dispatcher = dispatcher |
| } |
| |
| // Inject injects an inbound packet. |
| func (e *InjectableEndpoint) Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) { |
| e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, vv) |
| } |
| |
| // NewInjectable creates a new fd-based InjectableEndpoint. |
| func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) (tcpip.LinkEndpointID, *InjectableEndpoint) { |
| syscall.SetNonblock(fd, true) |
| |
| e := &InjectableEndpoint{endpoint: endpoint{ |
| fd: fd, |
| mtu: mtu, |
| caps: capabilities, |
| }} |
| |
| return stack.RegisterLinkEndpoint(e), e |
| } |