| // Copyright 2020 The gVisor Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package tcp |
| |
| import ( |
| "time" |
| |
| "gvisor.dev/gvisor/pkg/tcpip" |
| "gvisor.dev/gvisor/pkg/tcpip/seqnum" |
| ) |
| |
| const ( |
| // wcDelayedACKTimeout is the recommended maximum delayed ACK timer |
| // value as defined in the RFC. It stands for worst case delayed ACK |
| // timer (WCDelAckT). When FlightSize is 1, PTO is inflated by |
| // WCDelAckT time to compensate for a potential long delayed ACK timer |
| // at the receiver. |
| // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5. |
| wcDelayedACKTimeout = 200 * time.Millisecond |
| |
| // tcpRACKRecoveryThreshold is the number of loss recoveries for which |
| // the reorder window is inflated and after that the reorder window is |
| // reset to its initial value of minRTT/4. |
| // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2. |
| tcpRACKRecoveryThreshold = 16 |
| ) |
| |
| // RACK is a loss detection algorithm used in TCP to detect packet loss and |
| // reordering using transmission timestamp of the packets instead of packet or |
| // sequence counts. To use RACK, SACK should be enabled on the connection. |
| |
| // rackControl stores the rack related fields. |
| // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-6.1 |
| // |
| // +stateify savable |
| type rackControl struct { |
| // dsackSeen indicates if the connection has seen a DSACK. |
| dsackSeen bool |
| |
| // endSequence is the ending TCP sequence number of the most recent |
| // acknowledged segment. |
| endSequence seqnum.Value |
| |
| // exitedRecovery indicates if the connection is exiting loss recovery. |
| // This flag is set if the sender is leaving the recovery after |
| // receiving an ACK and is reset during updating of reorder window. |
| exitedRecovery bool |
| |
| // fack is the highest selectively or cumulatively acknowledged |
| // sequence. |
| fack seqnum.Value |
| |
| // minRTT is the estimated minimum RTT of the connection. |
| minRTT time.Duration |
| |
| // reorderSeen indicates if reordering has been detected on this |
| // connection. |
| reorderSeen bool |
| |
| // reoWnd is the reordering window time used for recording packet |
| // transmission times. It is used to defer the moment at which RACK |
| // marks a packet lost. |
| reoWnd time.Duration |
| |
| // reoWndIncr is the multiplier applied to adjust reorder window. |
| reoWndIncr uint8 |
| |
| // reoWndPersist is the number of loss recoveries before resetting |
| // reorder window. |
| reoWndPersist int8 |
| |
| // rtt is the RTT of the most recently delivered packet on the |
| // connection (either cumulatively acknowledged or selectively |
| // acknowledged) that was not marked invalid as a possible spurious |
| // retransmission. |
| rtt time.Duration |
| |
| // rttSeq is the SND.NXT when rtt is updated. |
| rttSeq seqnum.Value |
| |
| // xmitTime is the latest transmission timestamp of the most recent |
| // acknowledged segment. |
| xmitTime time.Time `state:".(unixTime)"` |
| |
| // tlpRxtOut indicates whether there is an unacknowledged |
| // TLP retransmission. |
| tlpRxtOut bool |
| |
| // tlpHighRxt the value of sender.sndNxt at the time of sending |
| // a TLP retransmission. |
| tlpHighRxt seqnum.Value |
| |
| // snd is a reference to the sender. |
| snd *sender |
| } |
| |
| // init initializes RACK specific fields. |
| func (rc *rackControl) init(snd *sender, iss seqnum.Value) { |
| rc.fack = iss |
| rc.reoWndIncr = 1 |
| rc.snd = snd |
| } |
| |
| // update will update the RACK related fields when an ACK has been received. |
| // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-09#section-6.2 |
| func (rc *rackControl) update(seg *segment, ackSeg *segment) { |
| rtt := time.Now().Sub(seg.xmitTime) |
| tsOffset := rc.snd.ep.tsOffset |
| |
| // If the ACK is for a retransmitted packet, do not update if it is a |
| // spurious inference which is determined by below checks: |
| // 1. When Timestamping option is available, if the TSVal is less than |
| // the transmit time of the most recent retransmitted packet. |
| // 2. When RTT calculated for the packet is less than the smoothed RTT |
| // for the connection. |
| // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 |
| // step 2 |
| if seg.xmitCount > 1 { |
| if ackSeg.parsedOptions.TS && ackSeg.parsedOptions.TSEcr != 0 { |
| if ackSeg.parsedOptions.TSEcr < tcpTimeStamp(seg.xmitTime, tsOffset) { |
| return |
| } |
| } |
| if rtt < rc.minRTT { |
| return |
| } |
| } |
| |
| rc.rtt = rtt |
| |
| // The sender can either track a simple global minimum of all RTT |
| // measurements from the connection, or a windowed min-filtered value |
| // of recent RTT measurements. This implementation keeps track of the |
| // simple global minimum of all RTTs for the connection. |
| if rtt < rc.minRTT || rc.minRTT == 0 { |
| rc.minRTT = rtt |
| } |
| |
| // Update rc.xmitTime and rc.endSequence to the transmit time and |
| // ending sequence number of the packet which has been acknowledged |
| // most recently. |
| endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) |
| if rc.xmitTime.Before(seg.xmitTime) || (seg.xmitTime.Equal(rc.xmitTime) && rc.endSequence.LessThan(endSeq)) { |
| rc.xmitTime = seg.xmitTime |
| rc.endSequence = endSeq |
| } |
| } |
| |
| // detectReorder detects if packet reordering has been observed. |
| // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 |
| // * Step 3: Detect data segment reordering. |
| // To detect reordering, the sender looks for original data segments being |
| // delivered out of order. To detect such cases, the sender tracks the |
| // highest sequence selectively or cumulatively acknowledged in the RACK.fack |
| // variable. The name "fack" stands for the most "Forward ACK" (this term is |
| // adopted from [FACK]). If a never retransmitted segment that's below |
| // RACK.fack is (selectively or cumulatively) acknowledged, it has been |
| // delivered out of order. The sender sets RACK.reord to TRUE if such segment |
| // is identified. |
| func (rc *rackControl) detectReorder(seg *segment) { |
| endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) |
| if rc.fack.LessThan(endSeq) { |
| rc.fack = endSeq |
| return |
| } |
| |
| if endSeq.LessThan(rc.fack) && seg.xmitCount == 1 { |
| rc.reorderSeen = true |
| } |
| } |
| |
| func (rc *rackControl) setDSACKSeen(dsackSeen bool) { |
| rc.dsackSeen = dsackSeen |
| } |
| |
| // shouldSchedulePTO dictates whether we should schedule a PTO or not. |
| // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. |
| func (s *sender) shouldSchedulePTO() bool { |
| // Schedule PTO only if RACK loss detection is enabled. |
| return s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 && |
| // The connection supports SACK. |
| s.ep.sackPermitted && |
| // The connection is not in loss recovery. |
| (s.state != tcpip.RTORecovery && s.state != tcpip.SACKRecovery) && |
| // The connection has no SACKed sequences in the SACK scoreboard. |
| s.ep.scoreboard.Sacked() == 0 |
| } |
| |
| // schedulePTO schedules the probe timeout as defined in |
| // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. |
| func (s *sender) schedulePTO() { |
| pto := time.Second |
| s.rtt.Lock() |
| if s.rtt.srttInited && s.rtt.srtt > 0 { |
| pto = s.rtt.srtt * 2 |
| if s.outstanding == 1 { |
| pto += wcDelayedACKTimeout |
| } |
| } |
| s.rtt.Unlock() |
| |
| now := time.Now() |
| if s.resendTimer.enabled() { |
| if now.Add(pto).After(s.resendTimer.target) { |
| pto = s.resendTimer.target.Sub(now) |
| } |
| s.resendTimer.disable() |
| } |
| |
| s.probeTimer.enable(pto) |
| } |
| |
| // probeTimerExpired is the same as TLP_send_probe() as defined in |
| // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.2. |
| func (s *sender) probeTimerExpired() tcpip.Error { |
| if !s.probeTimer.checkExpiration() { |
| return nil |
| } |
| |
| var dataSent bool |
| if s.writeNext != nil && s.writeNext.xmitCount == 0 && s.outstanding < s.sndCwnd { |
| dataSent = s.maybeSendSegment(s.writeNext, int(s.ep.scoreboard.SMSS()), s.sndUna.Add(s.sndWnd)) |
| if dataSent { |
| s.outstanding += s.pCount(s.writeNext, s.maxPayloadSize) |
| s.writeNext = s.writeNext.Next() |
| } |
| } |
| |
| if !dataSent && !s.rc.tlpRxtOut { |
| var highestSeqXmit *segment |
| for highestSeqXmit = s.writeList.Front(); highestSeqXmit != nil; highestSeqXmit = highestSeqXmit.Next() { |
| if highestSeqXmit.xmitCount == 0 { |
| // Nothing in writeList is transmitted, no need to send a probe. |
| highestSeqXmit = nil |
| break |
| } |
| if highestSeqXmit.Next() == nil || highestSeqXmit.Next().xmitCount == 0 { |
| // Either everything in writeList has been transmitted or the next |
| // sequence has not been transmitted. Either way this is the highest |
| // sequence segment that was transmitted. |
| break |
| } |
| } |
| |
| if highestSeqXmit != nil { |
| dataSent = s.maybeSendSegment(highestSeqXmit, int(s.ep.scoreboard.SMSS()), s.sndUna.Add(s.sndWnd)) |
| if dataSent { |
| s.rc.tlpRxtOut = true |
| s.rc.tlpHighRxt = s.sndNxt |
| } |
| } |
| } |
| |
| // Whether or not the probe was sent, the sender must arm the resend timer, |
| // not the probe timer. This ensures that the sender does not send repeated, |
| // back-to-back tail loss probes. |
| s.postXmit(dataSent, false /* shouldScheduleProbe */) |
| return nil |
| } |
| |
| // detectTLPRecovery detects if recovery was accomplished by the loss probes |
| // and updates TLP state accordingly. |
| // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.3. |
| func (s *sender) detectTLPRecovery(ack seqnum.Value, rcvdSeg *segment) { |
| if !(s.ep.sackPermitted && s.rc.tlpRxtOut) { |
| return |
| } |
| |
| // Step 1. |
| if s.isDupAck(rcvdSeg) && ack == s.rc.tlpHighRxt { |
| var sbAboveTLPHighRxt bool |
| for _, sb := range rcvdSeg.parsedOptions.SACKBlocks { |
| if s.rc.tlpHighRxt.LessThan(sb.End) { |
| sbAboveTLPHighRxt = true |
| break |
| } |
| } |
| if !sbAboveTLPHighRxt { |
| // TLP episode is complete. |
| s.rc.tlpRxtOut = false |
| } |
| } |
| |
| if s.rc.tlpRxtOut && s.rc.tlpHighRxt.LessThanEq(ack) { |
| // TLP episode is complete. |
| s.rc.tlpRxtOut = false |
| if !checkDSACK(rcvdSeg) { |
| // Step 2. Either the original packet or the retransmission (in the |
| // form of a probe) was lost. Invoke a congestion control response |
| // equivalent to fast recovery. |
| s.cc.HandleLossDetected() |
| s.enterRecovery() |
| s.leaveRecovery() |
| } |
| } |
| } |
| |
| // updateRACKReorderWindow updates the reorder window. |
| // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 |
| // * Step 4: Update RACK reordering window |
| // To handle the prevalent small degree of reordering, RACK.reo_wnd serves as |
| // an allowance for settling time before marking a packet lost. RACK starts |
| // initially with a conservative window of min_RTT/4. If no reordering has |
| // been observed RACK uses reo_wnd of zero during loss recovery, in order to |
| // retransmit quickly, or when the number of DUPACKs exceeds the classic |
| // DUPACKthreshold. |
| func (rc *rackControl) updateRACKReorderWindow(ackSeg *segment) { |
| dsackSeen := rc.dsackSeen |
| snd := rc.snd |
| |
| // React to DSACK once per round trip. |
| // If SND.UNA < RACK.rtt_seq: |
| // RACK.dsack = false |
| if snd.sndUna.LessThan(rc.rttSeq) { |
| dsackSeen = false |
| } |
| |
| // If RACK.dsack: |
| // RACK.reo_wnd_incr += 1 |
| // RACK.dsack = false |
| // RACK.rtt_seq = SND.NXT |
| // RACK.reo_wnd_persist = 16 |
| if dsackSeen { |
| rc.reoWndIncr++ |
| dsackSeen = false |
| rc.rttSeq = snd.sndNxt |
| rc.reoWndPersist = tcpRACKRecoveryThreshold |
| } else if rc.exitedRecovery { |
| // Else if exiting loss recovery: |
| // RACK.reo_wnd_persist -= 1 |
| // If RACK.reo_wnd_persist <= 0: |
| // RACK.reo_wnd_incr = 1 |
| rc.reoWndPersist-- |
| if rc.reoWndPersist <= 0 { |
| rc.reoWndIncr = 1 |
| } |
| rc.exitedRecovery = false |
| } |
| |
| // Reorder window is zero during loss recovery, or when the number of |
| // DUPACKs exceeds the classic DUPACKthreshold. |
| // If RACK.reord is FALSE: |
| // If in loss recovery: (If in fast or timeout recovery) |
| // RACK.reo_wnd = 0 |
| // Return |
| // Else if RACK.pkts_sacked >= RACK.dupthresh: |
| // RACK.reo_wnd = 0 |
| // return |
| if !rc.reorderSeen { |
| if snd.state == tcpip.RTORecovery || snd.state == tcpip.SACKRecovery { |
| rc.reoWnd = 0 |
| return |
| } |
| |
| if snd.sackedOut >= nDupAckThreshold { |
| rc.reoWnd = 0 |
| return |
| } |
| } |
| |
| // Calculate reorder window. |
| // RACK.reo_wnd = RACK.min_RTT / 4 * RACK.reo_wnd_incr |
| // RACK.reo_wnd = min(RACK.reo_wnd, SRTT) |
| snd.rtt.Lock() |
| srtt := snd.rtt.srtt |
| snd.rtt.Unlock() |
| rc.reoWnd = time.Duration((int64(rc.minRTT) / 4) * int64(rc.reoWndIncr)) |
| if srtt < rc.reoWnd { |
| rc.reoWnd = srtt |
| } |
| } |
| |
| func (rc *rackControl) exitRecovery() { |
| rc.exitedRecovery = true |
| } |
| |
| // detectLoss marks the segment as lost if the reordering window has elapsed |
| // and the ACK is not received. It will also arm the reorder timer. |
| // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 Step 5. |
| func (rc *rackControl) detectLoss(rcvTime time.Time) int { |
| var timeout time.Duration |
| numLost := 0 |
| for seg := rc.snd.writeList.Front(); seg != nil && seg.xmitCount != 0; seg = seg.Next() { |
| if rc.snd.ep.scoreboard.IsSACKED(seg.sackBlock()) { |
| continue |
| } |
| |
| if seg.lost && seg.xmitCount == 1 { |
| numLost++ |
| continue |
| } |
| |
| endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) |
| if seg.xmitTime.Before(rc.xmitTime) || (seg.xmitTime.Equal(rc.xmitTime) && rc.endSequence.LessThan(endSeq)) { |
| timeRemaining := seg.xmitTime.Sub(rcvTime) + rc.rtt + rc.reoWnd |
| if timeRemaining <= 0 { |
| seg.lost = true |
| numLost++ |
| } else if timeRemaining > timeout { |
| timeout = timeRemaining |
| } |
| } |
| } |
| |
| if timeout != 0 && !rc.snd.reorderTimer.enabled() { |
| rc.snd.reorderTimer.enable(timeout) |
| } |
| return numLost |
| } |
| |
| // reorderTimerExpired will retransmit the segments which have not been acked |
| // before the reorder timer expired. |
| func (rc *rackControl) reorderTimerExpired() tcpip.Error { |
| // Check if the timer actually expired or if it's a spurious wake due |
| // to a previously orphaned runtime timer. |
| if !rc.snd.reorderTimer.checkExpiration() { |
| return nil |
| } |
| |
| numLost := rc.detectLoss(time.Now()) |
| if numLost == 0 { |
| return nil |
| } |
| |
| fastRetransmit := false |
| if !rc.snd.fr.active { |
| rc.snd.cc.HandleLossDetected() |
| rc.snd.enterRecovery() |
| fastRetransmit = true |
| } |
| |
| rc.DoRecovery(nil, fastRetransmit) |
| return nil |
| } |
| |
| // DoRecovery implements lossRecovery.DoRecovery. |
| func (rc *rackControl) DoRecovery(_ *segment, fastRetransmit bool) { |
| snd := rc.snd |
| if fastRetransmit { |
| snd.resendSegment() |
| } |
| |
| var dataSent bool |
| // Iterate the writeList and retransmit the segments which are marked |
| // as lost by RACK. |
| for seg := snd.writeList.Front(); seg != nil && seg.xmitCount > 0; seg = seg.Next() { |
| if seg == snd.writeNext { |
| break |
| } |
| |
| if !seg.lost { |
| continue |
| } |
| |
| // Reset seg.lost as it is already SACKed. |
| if snd.ep.scoreboard.IsSACKED(seg.sackBlock()) { |
| seg.lost = false |
| continue |
| } |
| |
| // Check the congestion window after entering recovery. |
| if snd.outstanding >= snd.sndCwnd { |
| break |
| } |
| |
| if sent := snd.maybeSendSegment(seg, int(snd.ep.scoreboard.SMSS()), snd.sndUna.Add(snd.sndWnd)); !sent { |
| break |
| } |
| dataSent = true |
| snd.outstanding += snd.pCount(seg, snd.maxPayloadSize) |
| } |
| |
| snd.postXmit(dataSent, true /* shouldScheduleProbe */) |
| } |