| // Copyright 2019 The gVisor Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // Package packet provides the implementation of packet sockets (see |
| // packet(7)). Packet sockets allow applications to: |
| // |
| // * manually write and inspect link, network, and transport headers |
| // * receive all traffic of a given network protocol, or all protocols |
| // |
| // Packet sockets are similar to raw sockets, but provide even more power to |
| // users, letting them effectively talk directly to the network device. |
| // |
| // Packet sockets skip the input and output iptables chains. |
| package packet |
| |
| import ( |
| "fmt" |
| "io" |
| |
| "gvisor.dev/gvisor/pkg/sync" |
| "gvisor.dev/gvisor/pkg/tcpip" |
| "gvisor.dev/gvisor/pkg/tcpip/buffer" |
| "gvisor.dev/gvisor/pkg/tcpip/header" |
| "gvisor.dev/gvisor/pkg/tcpip/stack" |
| "gvisor.dev/gvisor/pkg/waiter" |
| ) |
| |
| // +stateify savable |
| type packet struct { |
| packetEntry |
| // data holds the actual packet data, including any headers and |
| // payload. |
| data buffer.VectorisedView `state:".(buffer.VectorisedView)"` |
| // timestampNS is the unix time at which the packet was received. |
| timestampNS int64 |
| // senderAddr is the network address of the sender. |
| senderAddr tcpip.FullAddress |
| // packetInfo holds additional information like the protocol |
| // of the packet etc. |
| packetInfo tcpip.LinkPacketInfo |
| } |
| |
| // endpoint is the packet socket implementation of tcpip.Endpoint. It is legal |
| // to have goroutines make concurrent calls into the endpoint. |
| // |
| // Lock order: |
| // endpoint.mu |
| // endpoint.rcvMu |
| // |
| // +stateify savable |
| type endpoint struct { |
| stack.TransportEndpointInfo |
| tcpip.DefaultSocketOptionsHandler |
| |
| // The following fields are initialized at creation time and are |
| // immutable. |
| stack *stack.Stack `state:"manual"` |
| netProto tcpip.NetworkProtocolNumber |
| waiterQueue *waiter.Queue |
| cooked bool |
| |
| // The following fields are used to manage the receive queue and are |
| // protected by rcvMu. |
| rcvMu sync.Mutex `state:"nosave"` |
| rcvList packetList |
| rcvBufSizeMax int `state:".(int)"` |
| rcvBufSize int |
| rcvClosed bool |
| |
| // The following fields are protected by mu. |
| mu sync.RWMutex `state:"nosave"` |
| closed bool |
| stats tcpip.TransportEndpointStats `state:"nosave"` |
| bound bool |
| boundNIC tcpip.NICID |
| |
| // lastErrorMu protects lastError. |
| lastErrorMu sync.Mutex `state:"nosave"` |
| lastError tcpip.Error |
| |
| // ops is used to get socket level options. |
| ops tcpip.SocketOptions |
| } |
| |
| // NewEndpoint returns a new packet endpoint. |
| func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { |
| ep := &endpoint{ |
| stack: s, |
| TransportEndpointInfo: stack.TransportEndpointInfo{ |
| NetProto: netProto, |
| }, |
| cooked: cooked, |
| netProto: netProto, |
| waiterQueue: waiterQueue, |
| rcvBufSizeMax: 32 * 1024, |
| } |
| ep.ops.InitHandler(ep, ep.stack, tcpip.GetStackSendBufferLimits) |
| |
| // Override with stack defaults. |
| var ss tcpip.SendBufferSizeOption |
| if err := s.Option(&ss); err == nil { |
| ep.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) |
| } |
| |
| var rs stack.ReceiveBufferSizeOption |
| if err := s.Option(&rs); err == nil { |
| ep.rcvBufSizeMax = rs.Default |
| } |
| |
| if err := s.RegisterPacketEndpoint(0, netProto, ep); err != nil { |
| return nil, err |
| } |
| return ep, nil |
| } |
| |
| // Abort implements stack.TransportEndpoint.Abort. |
| func (ep *endpoint) Abort() { |
| ep.Close() |
| } |
| |
| // Close implements tcpip.Endpoint.Close. |
| func (ep *endpoint) Close() { |
| ep.mu.Lock() |
| defer ep.mu.Unlock() |
| |
| if ep.closed { |
| return |
| } |
| |
| ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep) |
| |
| ep.rcvMu.Lock() |
| defer ep.rcvMu.Unlock() |
| |
| // Clear the receive list. |
| ep.rcvClosed = true |
| ep.rcvBufSize = 0 |
| for !ep.rcvList.Empty() { |
| ep.rcvList.Remove(ep.rcvList.Front()) |
| } |
| |
| ep.closed = true |
| ep.bound = false |
| ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) |
| } |
| |
| // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. |
| func (ep *endpoint) ModerateRecvBuf(copied int) {} |
| |
| // Read implements tcpip.Endpoint.Read. |
| func (ep *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { |
| ep.rcvMu.Lock() |
| |
| // If there's no data to read, return that read would block or that the |
| // endpoint is closed. |
| if ep.rcvList.Empty() { |
| var err tcpip.Error = &tcpip.ErrWouldBlock{} |
| if ep.rcvClosed { |
| ep.stats.ReadErrors.ReadClosed.Increment() |
| err = &tcpip.ErrClosedForReceive{} |
| } |
| ep.rcvMu.Unlock() |
| return tcpip.ReadResult{}, err |
| } |
| |
| packet := ep.rcvList.Front() |
| if !opts.Peek { |
| ep.rcvList.Remove(packet) |
| ep.rcvBufSize -= packet.data.Size() |
| } |
| |
| ep.rcvMu.Unlock() |
| |
| res := tcpip.ReadResult{ |
| Total: packet.data.Size(), |
| ControlMessages: tcpip.ControlMessages{ |
| HasTimestamp: true, |
| Timestamp: packet.timestampNS, |
| }, |
| } |
| if opts.NeedRemoteAddr { |
| res.RemoteAddr = packet.senderAddr |
| } |
| if opts.NeedLinkPacketInfo { |
| res.LinkPacketInfo = packet.packetInfo |
| } |
| |
| n, err := packet.data.ReadTo(dst, opts.Peek) |
| if n == 0 && err != nil { |
| return res, &tcpip.ErrBadBuffer{} |
| } |
| res.Count = n |
| return res, nil |
| } |
| |
| func (*endpoint) Write(tcpip.Payloader, tcpip.WriteOptions) (int64, tcpip.Error) { |
| // TODO(gvisor.dev/issue/173): Implement. |
| return 0, &tcpip.ErrInvalidOptionValue{} |
| } |
| |
| // Disconnect implements tcpip.Endpoint.Disconnect. Packet sockets cannot be |
| // disconnected, and this function always returns tpcip.ErrNotSupported. |
| func (*endpoint) Disconnect() tcpip.Error { |
| return &tcpip.ErrNotSupported{} |
| } |
| |
| // Connect implements tcpip.Endpoint.Connect. Packet sockets cannot be |
| // connected, and this function always returnes *tcpip.ErrNotSupported. |
| func (*endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { |
| return &tcpip.ErrNotSupported{} |
| } |
| |
| // Shutdown implements tcpip.Endpoint.Shutdown. Packet sockets cannot be used |
| // with Shutdown, and this function always returns *tcpip.ErrNotSupported. |
| func (*endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error { |
| return &tcpip.ErrNotSupported{} |
| } |
| |
| // Listen implements tcpip.Endpoint.Listen. Packet sockets cannot be used with |
| // Listen, and this function always returns *tcpip.ErrNotSupported. |
| func (*endpoint) Listen(backlog int) tcpip.Error { |
| return &tcpip.ErrNotSupported{} |
| } |
| |
| // Accept implements tcpip.Endpoint.Accept. Packet sockets cannot be used with |
| // Accept, and this function always returns *tcpip.ErrNotSupported. |
| func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { |
| return nil, nil, &tcpip.ErrNotSupported{} |
| } |
| |
| // Bind implements tcpip.Endpoint.Bind. |
| func (ep *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { |
| // TODO(gvisor.dev/issue/173): Add Bind support. |
| |
| // "By default, all packets of the specified protocol type are passed |
| // to a packet socket. To get packets only from a specific interface |
| // use bind(2) specifying an address in a struct sockaddr_ll to bind |
| // the packet socket to an interface. Fields used for binding are |
| // sll_family (should be AF_PACKET), sll_protocol, and sll_ifindex." |
| // - packet(7). |
| |
| ep.mu.Lock() |
| defer ep.mu.Unlock() |
| |
| if ep.bound && ep.boundNIC == addr.NIC { |
| // If the NIC being bound is the same then just return success. |
| return nil |
| } |
| |
| // Unregister endpoint with all the nics. |
| ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep) |
| ep.bound = false |
| |
| // Bind endpoint to receive packets from specific interface. |
| if err := ep.stack.RegisterPacketEndpoint(addr.NIC, ep.netProto, ep); err != nil { |
| return err |
| } |
| |
| ep.bound = true |
| ep.boundNIC = addr.NIC |
| |
| return nil |
| } |
| |
| // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. |
| func (*endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { |
| return tcpip.FullAddress{}, &tcpip.ErrNotSupported{} |
| } |
| |
| // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. |
| func (*endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { |
| // Even a connected socket doesn't return a remote address. |
| return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} |
| } |
| |
| // Readiness implements tcpip.Endpoint.Readiness. |
| func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { |
| // The endpoint is always writable. |
| result := waiter.EventOut & mask |
| |
| // Determine whether the endpoint is readable. |
| if (mask & waiter.EventIn) != 0 { |
| ep.rcvMu.Lock() |
| if !ep.rcvList.Empty() || ep.rcvClosed { |
| result |= waiter.EventIn |
| } |
| ep.rcvMu.Unlock() |
| } |
| |
| return result |
| } |
| |
| // SetSockOpt implements tcpip.Endpoint.SetSockOpt. Packet sockets cannot be |
| // used with SetSockOpt, and this function always returns |
| // *tcpip.ErrNotSupported. |
| func (ep *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { |
| switch opt.(type) { |
| case *tcpip.SocketDetachFilterOption: |
| return nil |
| |
| default: |
| return &tcpip.ErrUnknownProtocolOption{} |
| } |
| } |
| |
| // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. |
| func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { |
| switch opt { |
| case tcpip.ReceiveBufferSizeOption: |
| // Make sure the receive buffer size is within the min and max |
| // allowed. |
| var rs stack.ReceiveBufferSizeOption |
| if err := ep.stack.Option(&rs); err != nil { |
| panic(fmt.Sprintf("s.Option(%#v) = %s", rs, err)) |
| } |
| if v > rs.Max { |
| v = rs.Max |
| } |
| if v < rs.Min { |
| v = rs.Min |
| } |
| ep.rcvMu.Lock() |
| ep.rcvBufSizeMax = v |
| ep.rcvMu.Unlock() |
| return nil |
| |
| default: |
| return &tcpip.ErrUnknownProtocolOption{} |
| } |
| } |
| |
| func (ep *endpoint) LastError() tcpip.Error { |
| ep.lastErrorMu.Lock() |
| defer ep.lastErrorMu.Unlock() |
| |
| err := ep.lastError |
| ep.lastError = nil |
| return err |
| } |
| |
| // UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError. |
| func (ep *endpoint) UpdateLastError(err tcpip.Error) { |
| ep.lastErrorMu.Lock() |
| ep.lastError = err |
| ep.lastErrorMu.Unlock() |
| } |
| |
| // GetSockOpt implements tcpip.Endpoint.GetSockOpt. |
| func (ep *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { |
| return &tcpip.ErrNotSupported{} |
| } |
| |
| // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. |
| func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { |
| switch opt { |
| case tcpip.ReceiveQueueSizeOption: |
| v := 0 |
| ep.rcvMu.Lock() |
| if !ep.rcvList.Empty() { |
| p := ep.rcvList.Front() |
| v = p.data.Size() |
| } |
| ep.rcvMu.Unlock() |
| return v, nil |
| |
| case tcpip.ReceiveBufferSizeOption: |
| ep.rcvMu.Lock() |
| v := ep.rcvBufSizeMax |
| ep.rcvMu.Unlock() |
| return v, nil |
| |
| default: |
| return -1, &tcpip.ErrUnknownProtocolOption{} |
| } |
| } |
| |
| // HandlePacket implements stack.PacketEndpoint.HandlePacket. |
| func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { |
| ep.rcvMu.Lock() |
| |
| // Drop the packet if our buffer is currently full. |
| if ep.rcvClosed { |
| ep.rcvMu.Unlock() |
| ep.stack.Stats().DroppedPackets.Increment() |
| ep.stats.ReceiveErrors.ClosedReceiver.Increment() |
| return |
| } |
| |
| if ep.rcvBufSize >= ep.rcvBufSizeMax { |
| ep.rcvMu.Unlock() |
| ep.stack.Stats().DroppedPackets.Increment() |
| ep.stats.ReceiveErrors.ReceiveBufferOverflow.Increment() |
| return |
| } |
| |
| wasEmpty := ep.rcvBufSize == 0 |
| |
| // Push new packet into receive list and increment the buffer size. |
| var packet packet |
| // TODO(gvisor.dev/issue/173): Return network protocol. |
| if !pkt.LinkHeader().View().IsEmpty() { |
| // Get info directly from the ethernet header. |
| hdr := header.Ethernet(pkt.LinkHeader().View()) |
| packet.senderAddr = tcpip.FullAddress{ |
| NIC: nicID, |
| Addr: tcpip.Address(hdr.SourceAddress()), |
| } |
| packet.packetInfo.Protocol = netProto |
| packet.packetInfo.PktType = pkt.PktType |
| } else { |
| // Guess the would-be ethernet header. |
| packet.senderAddr = tcpip.FullAddress{ |
| NIC: nicID, |
| Addr: tcpip.Address(localAddr), |
| } |
| packet.packetInfo.Protocol = netProto |
| packet.packetInfo.PktType = pkt.PktType |
| } |
| |
| if ep.cooked { |
| // Cooked packets can simply be queued. |
| switch pkt.PktType { |
| case tcpip.PacketHost: |
| packet.data = pkt.Data().ExtractVV() |
| case tcpip.PacketOutgoing: |
| // Strip Link Header. |
| var combinedVV buffer.VectorisedView |
| if v := pkt.NetworkHeader().View(); !v.IsEmpty() { |
| combinedVV.AppendView(v) |
| } |
| if v := pkt.TransportHeader().View(); !v.IsEmpty() { |
| combinedVV.AppendView(v) |
| } |
| combinedVV.Append(pkt.Data().ExtractVV()) |
| packet.data = combinedVV |
| default: |
| panic(fmt.Sprintf("unexpected PktType in pkt: %+v", pkt)) |
| } |
| |
| } else { |
| // Raw packets need their ethernet headers prepended before |
| // queueing. |
| var linkHeader buffer.View |
| if pkt.PktType != tcpip.PacketOutgoing { |
| if pkt.LinkHeader().View().IsEmpty() { |
| // We weren't provided with an actual ethernet header, |
| // so fake one. |
| ethFields := header.EthernetFields{ |
| SrcAddr: tcpip.LinkAddress([]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), |
| DstAddr: localAddr, |
| Type: netProto, |
| } |
| fakeHeader := make(header.Ethernet, header.EthernetMinimumSize) |
| fakeHeader.Encode(ðFields) |
| linkHeader = buffer.View(fakeHeader) |
| } else { |
| linkHeader = append(buffer.View(nil), pkt.LinkHeader().View()...) |
| } |
| combinedVV := linkHeader.ToVectorisedView() |
| combinedVV.Append(pkt.Data().ExtractVV()) |
| packet.data = combinedVV |
| } else { |
| packet.data = buffer.NewVectorisedView(pkt.Size(), pkt.Views()) |
| } |
| } |
| packet.timestampNS = ep.stack.Clock().NowNanoseconds() |
| |
| ep.rcvList.PushBack(&packet) |
| ep.rcvBufSize += packet.data.Size() |
| |
| ep.rcvMu.Unlock() |
| ep.stats.PacketsReceived.Increment() |
| // Notify waiters that there's data to be read. |
| if wasEmpty { |
| ep.waiterQueue.Notify(waiter.EventIn) |
| } |
| } |
| |
| // State implements socket.Socket.State. |
| func (*endpoint) State() uint32 { |
| return 0 |
| } |
| |
| // Info returns a copy of the endpoint info. |
| func (ep *endpoint) Info() tcpip.EndpointInfo { |
| ep.mu.RLock() |
| // Make a copy of the endpoint info. |
| ret := ep.TransportEndpointInfo |
| ep.mu.RUnlock() |
| return &ret |
| } |
| |
| // Stats returns a pointer to the endpoint stats. |
| func (ep *endpoint) Stats() tcpip.EndpointStats { |
| return &ep.stats |
| } |
| |
| // SetOwner implements tcpip.Endpoint.SetOwner. |
| func (ep *endpoint) SetOwner(owner tcpip.PacketOwner) {} |
| |
| // SocketOptions implements tcpip.Endpoint.SocketOptions. |
| func (ep *endpoint) SocketOptions() *tcpip.SocketOptions { |
| return &ep.ops |
| } |