2  * Copyright (c) 2004-2016 Apple Inc. All rights reserved. 
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 
   6  * This file contains Original Code and/or Modifications of Original Code 
   7  * as defined in and that are subject to the Apple Public Source License 
   8  * Version 2.0 (the 'License'). You may not use this file except in 
   9  * compliance with the License. The rights granted to you under the License 
  10  * may not be used to create, or enable the creation or redistribution of, 
  11  * unlawful or unlicensed copies of an Apple operating system, or to 
  12  * circumvent, violate, or enable the circumvention or violation of, any 
  13  * terms of an Apple operating system software license agreement. 
  15  * Please obtain a copy of the License at 
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file. 
  18  * The Original Code and all software distributed under the License are 
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
  23  * Please see the License for the specific language governing rights and 
  24  * limitations under the License. 
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 
  30  *      The Regents of the University of California.  All rights reserved. 
  32  * Redistribution and use in source and binary forms, with or without 
  33  * modification, are permitted provided that the following conditions 
  35  * 1. Redistributions of source code must retain the above copyright 
  36  *    notice, this list of conditions and the following disclaimer. 
  37  * 2. Redistributions in binary form must reproduce the above copyright 
  38  *    notice, this list of conditions and the following disclaimer in the 
  39  *    documentation and/or other materials provided with the distribution. 
  40  * 3. All advertising materials mentioning features or use of this software 
  41  *    must display the following acknowledgement: 
  42  *      This product includes software developed by the University of 
  43  *      California, Berkeley and its contributors. 
  44  * 4. Neither the name of the University nor the names of its contributors 
  45  *    may be used to endorse or promote products derived from this software 
  46  *    without specific prior written permission. 
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
  65 #include <sys/param.h> 
  66 #include <sys/systm.h> 
  67 #include <sys/kernel.h> 
  68 #include <sys/sysctl.h> 
  70 #include <sys/domain.h> 
  71 #include <sys/protosw.h> 
  72 #include <sys/socket.h> 
  73 #include <sys/socketvar.h> 
  75 #include <kern/zalloc.h> 
  77 #include <net/route.h> 
  79 #include <netinet/in.h> 
  80 #include <netinet/in_systm.h> 
  81 #include <netinet/ip.h> 
  82 #include <netinet/in_pcb.h> 
  83 #include <netinet/ip_var.h> 
  85 #include <netinet6/in6_pcb.h> 
  86 #include <netinet/ip6.h> 
  87 #include <netinet6/ip6_var.h> 
  89 #include <netinet/tcp.h> 
  91 #include <netinet/tcp_fsm.h> 
  92 #include <netinet/tcp_seq.h> 
  93 #include <netinet/tcp_timer.h> 
  94 #include <netinet/tcp_var.h> 
  95 #include <netinet/tcpip.h> 
  96 #include <netinet/tcp_cache.h> 
  98 #include <netinet/tcp_debug.h> 
 100 #include <sys/kdebug.h> 
 103 #include <netinet6/ipsec.h> 
 106 #include <libkern/OSAtomic.h> 
/* sysctl net.inet.tcp.sack: enable/disable TCP SACK (RFC 2018) support, default on. */
SYSCTL_SKMEM_TCP_INT(OID_AUTO, sack, CTLFLAG_RW | CTLFLAG_LOCKED,
    int, tcp_do_sack, 1, "Enable/Disable TCP SACK support");

/* Per-connection cap on scoreboard holes; enforced in tcp_sackhole_alloc(). */
SYSCTL_SKMEM_TCP_INT(OID_AUTO, sack_maxholes, CTLFLAG_RW | CTLFLAG_LOCKED,
    static int, tcp_sack_maxholes, 128,
    "Maximum number of TCP SACK holes allowed per connection");

/* System-wide cap on scoreboard holes across all connections. */
SYSCTL_SKMEM_TCP_INT(OID_AUTO, sack_globalmaxholes,
    CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_sack_globalmaxholes, 65536,
    "Global maximum number of TCP SACK holes");

/*
 * Current global hole count; adjusted with OSIncrementAtomic /
 * OSDecrementAtomic in tcp_sackhole_alloc() / tcp_sackhole_free().
 * Exported read-only via sysctl.
 */
static SInt32 tcp_sack_globalholes = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_globalholes, CTLFLAG_RD | CTLFLAG_LOCKED,
    &tcp_sack_globalholes, 0,
    "Global number of TCP SACK holes currently allocated");

/* When 1, tcp_sack_detect_reordering() may set TF_PKTS_REORDERED on the tcpcb. */
static int tcp_detect_reordering = 1;
/*
 * NOTE(review): name suggests this makes DSACK processing ignore duplicates
 * reported by (hardware?) offload — the code using it is outside this chunk;
 * confirm against the DSACK path before relying on this description.
 */
static int tcp_dsack_ignore_hw_duplicates = 0;

/* The two knobs above are tunable via sysctl only on DEVELOPMENT/DEBUG kernels. */
#if (DEVELOPMENT || DEBUG)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, detect_reordering,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_detect_reordering, 0, "");

SYSCTL_INT(_net_inet_tcp, OID_AUTO, ignore_hw_duplicates,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    &tcp_dsack_ignore_hw_duplicates, 0, "");
#endif /* (DEVELOPMENT || DEBUG) */

/* Allocation zone for struct sackhole; defined in another translation unit. */
extern struct zone *sack_hole_zone;
/*
 * Sanity-check one received SACK block (_sb_) against connection state:
 * the block must be non-empty (end > start), lie strictly above both
 * snd_una and the cumulative ACK (_ack_), and fall entirely within
 * (snd_una, snd_max].  All comparisons use sequence-space arithmetic.
 */
#define TCP_VALIDATE_SACK_SEQ_NUMBERS(_tp_, _sb_, _ack_) \
    (SEQ_GT((_sb_)->end, (_sb_)->start) && \
    SEQ_GT((_sb_)->start, (_tp_)->snd_una) && \
    SEQ_GT((_sb_)->start, (_ack_)) && \
    SEQ_LT((_sb_)->start, (_tp_)->snd_max) && \
    SEQ_GT((_sb_)->end, (_tp_)->snd_una) && \
    SEQ_LEQ((_sb_)->end, (_tp_)->snd_max))
/*
 * This function is called upon receipt of new valid data (while not in
 * header prediction mode), and it updates the ordered list of SACK blocks
 * (tp->sackblks[]) that the receiver will report back to the sender.
 * The block for the just-received segment [rcv_start, rcv_end) is merged
 * with overlapping existing blocks and placed first, per RFC 2018.
 *
 * NOTE(review): the extraction of this chunk dropped several lines
 * (return type, closing braces, and a few statements); the code below
 * preserves only the visible tokens.
 */
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
        /*
         * First reported block MUST be the most recent one.  Subsequent
         * blocks SHOULD be in the order in which they arrived at the
         * receiver.  These two conditions make the implementation fully
         * compliant with RFC 2018.
         */
        struct sackblk head_blk, saved_blks[MAX_SACK_BLKS];
        int num_head, num_saved, i;

        /* SACK block for the received segment. */
        head_blk.start = rcv_start;
        head_blk.end = rcv_end;

        /*
         * Merge updated SACK blocks into head_blk, and
         * save unchanged SACK blocks into saved_blks[].
         * num_saved will have the number of the saved SACK blocks.
         */
        for (i = 0; i < tp->rcv_numsacks; i++) {
                tcp_seq start = tp->sackblks[i].start;
                tcp_seq end = tp->sackblks[i].end;
                if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
                        /*
                         * Discard this SACK block: it is empty/invalid or
                         * rcv_nxt has already advanced past its start.
                         */
                } else if (SEQ_LEQ(head_blk.start, end) &&
                    SEQ_GEQ(head_blk.end, start)) {
                        /*
                         * Merge this SACK block into head_blk.
                         * This SACK block itself will be discarded.
                         */
                        if (SEQ_GT(head_blk.start, start)) {
                                head_blk.start = start;
                        /* NOTE(review): extraction gap — matching end-extension
                         * statement and braces dropped here. */
                        if (SEQ_LT(head_blk.end, end)) {
                        /*
                         * Save this SACK block.
                         */
                        saved_blks[num_saved].start = start;
                        saved_blks[num_saved].end = end;

        /*
         * Update SACK list in tp->sackblks[].
         */
        if (SEQ_GT(head_blk.start, tp->rcv_nxt)) {
                /*
                 * The received data segment is an out-of-order segment.
                 * Put head_blk at the top of SACK list.
                 */
                tp->sackblks[0] = head_blk;
                /*
                 * If the number of saved SACK blocks exceeds its limit,
                 * discard the last SACK block.
                 */
                if (num_saved >= MAX_SACK_BLKS) {
                /*
                 * Copy the saved SACK blocks back.
                 */
                bcopy(saved_blks, &tp->sackblks[num_head],
                    sizeof(struct sackblk) * num_saved);

        /* Save the number of SACK blocks. */
        tp->rcv_numsacks = num_head + num_saved;

        /* If we are requesting SACK recovery, reset the stretch-ack state
         * so that connection will generate more acks after recovery and
         * sender's cwnd will open.
         */
        if ((tp->t_flags & TF_STRETCHACK) != 0 && tp->rcv_numsacks > 0) {
                tcp_reset_stretch_ack(tp);

        /* NOTE(review): body of this inter-arrival-jitter branch was dropped
         * by the extraction; it is guarded by TRAFFIC_MGT below. */
        if (tp->acc_iaj > 0 && tp->rcv_numsacks > 0) {
#endif /* TRAFFIC_MGT */
/*
 * Delete all receiver-side SACK information: drop the block count and
 * zero the whole tp->sackblks[] array.
 */
tcp_clean_sackreport( struct tcpcb *tp)
        tp->rcv_numsacks = 0;
        bzero(&tp->sackblks[0], sizeof(struct sackblk) * MAX_SACK_BLKS);
/*
 * Allocate struct sackhole covering [start, end).
 * Refuses (bumping tcps_sack_sboverflow) when either the per-connection
 * limit (tcp_sack_maxholes) or the global limit (tcp_sack_globalmaxholes)
 * has been reached; otherwise allocates from sack_hole_zone and bumps the
 * global hole count atomically.
 */
static struct sackhole *
tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end)
        struct sackhole *hole;

        if (tp->snd_numholes >= tcp_sack_maxholes ||
            tcp_sack_globalholes >= tcp_sack_globalmaxholes) {
                tcpstat.tcps_sack_sboverflow++;
                /* NOTE(review): extraction gap — failure return dropped here. */

        hole = (struct sackhole *)zalloc(sack_hole_zone);
        /* NOTE(review): extraction gap — hole field initialization and the
         * per-connection counter update were dropped here. */

        OSIncrementAtomic(&tcp_sack_globalholes);
/*
 * Free struct sackhole back to sack_hole_zone and decrement the global
 * hole count atomically.
 * NOTE(review): extraction gap — the per-connection counter update that
 * mirrors tcp_sackhole_alloc() appears to have been dropped here.
 */
tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole)
        zfree(sack_hole_zone, hole);

        OSDecrementAtomic(&tcp_sack_globalholes);
/*
 * Insert new SACK hole [start, end) into the scoreboard: after `after`
 * when given, else at the tail of tp->snd_holes.  Stamps rxmit_start with
 * tcp_now (used later for reorder-window measurement) and seeds the
 * sackhint.nexthole hint when it is unset.
 */
static struct sackhole *
tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end,
    struct sackhole *after)
        struct sackhole *hole;

        /* Allocate a new SACK hole. */
        hole = tcp_sackhole_alloc(tp, start, end);
        /* NOTE(review): extraction gap — NULL check on the allocation was
         * dropped here. */
        hole->rxmit_start = tcp_now;
        /* Insert the new SACK hole into scoreboard */
        /* NOTE(review): extraction gap — the if/else selecting between the
         * two insertions was dropped; only the two calls remain visible. */
                TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink);
                TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink);

        /* Update SACK hint. */
        if (tp->sackhint.nexthole == NULL) {
                tp->sackhint.nexthole = hole;
/*
 * Remove SACK hole from scoreboard: advance the nexthole hint past it if
 * necessary, unlink it from tp->snd_holes, and free it.
 */
tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole)
        /* Update SACK hint so it never dangles on the removed hole. */
        if (tp->sackhint.nexthole == hole) {
                tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink);

        /* Remove this SACK hole. */
        TAILQ_REMOVE(&tp->snd_holes, hole, scblink);

        /* Free this SACK hole. */
        tcp_sackhole_free(tp, hole);
/*
 * When a new ack with SACK is received, check if it indicates packet
 * reordering. If there is packet reordering, the socket is marked and
 * the late time offset by which the packet was reordered with
 * respect to its closest neighboring packets is computed.
 *
 * Side effects on reordering: sets TF_PKTS_REORDERED (when the
 * tcp_detect_reordering knob is on), bumps reorder statistics, pushes the
 * ECN heuristic toward aggressive fallback, and grows tp->t_reorderwin
 * from the measured retransmit-to-SACK delay.
 */
tcp_sack_detect_reordering(struct tcpcb *tp, struct sackhole *s,
    tcp_seq sacked_seq, tcp_seq snd_fack)
        int32_t rext = 0, reordered = 0;

        /*
         * If the SACK hole is past snd_fack, this is from new SACK
         * information, so we can ignore it.
         */
        if (SEQ_GT(s->end, snd_fack)) {

        /*
         * If there has been a retransmit timeout, then the timestamp on
         * the SACK segment will be newer. This might lead to a
         * false-positive. Avoid re-ordering detection in this case.
         */
        if (tp->t_rxtshift > 0) {

        /*
         * Detect reordering from SACK information by checking
         * if recently sacked data was never retransmitted from this hole.
         */
        if (SEQ_LT(s->rxmit, sacked_seq)) {
                tcpstat.tcps_avoid_rxmt++;

                if (tcp_detect_reordering == 1 &&
                    !(tp->t_flagsext & TF_PKTS_REORDERED)) {
                        tp->t_flagsext |= TF_PKTS_REORDERED;
                        tcpstat.tcps_detect_reordering++;

                tcpstat.tcps_reordered_pkts++;
                tp->t_reordered_pkts++;

                /*
                 * If reordering is seen on a connection with ECN enabled,
                 * increment the heuristic
                 */
                if (TCP_ECN_ENABLED(tp)) {
                        INP_INC_IFNET_STAT(tp->t_inpcb, ecn_fallback_reorder);
                        tcpstat.tcps_ecn_fallback_reorder++;
                        tcp_heuristic_ecn_aggressive(tp);

                VERIFY(SEQ_GEQ(snd_fack, s->rxmit));

                /* rxmit_start > 0 means this hole was actually retransmitted,
                 * so the elapsed time since then is a reordering offset. */
                if (s->rxmit_start > 0) {
                        rext = timer_diff(tcp_now, 0, s->rxmit_start, 0);
                        /*
                         * We take the maximum reorder window to schedule
                         * DELAYFR timer as that will take care of jitter
                         * on the network path.
                         *
                         * Computing average and standard deviation seems
                         * to cause unnecessary retransmissions when there
                         * is reordering. [NOTE(review): original comment
                         * truncated by extraction]
                         *
                         * We set a maximum of SRTT/2 and a minimum of
                         * 10 ms on the reorder window.
                         */
                        tp->t_reorderwin = max(tp->t_reorderwin, rext);
                        tp->t_reorderwin = min(tp->t_reorderwin,
                            (tp->t_srtt >> (TCP_RTT_SHIFT - 1)));
                        tp->t_reorderwin = max(tp->t_reorderwin, 10);
/*
 * Process cumulative ACK and the TCP SACK option to update the scoreboard.
 * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
 * the sequence space).  *newbytes_acked accumulates bytes newly covered by
 * this segment, whether by the cumulative ACK or by SACK blocks.
 *
 * NOTE(review): the extraction of this chunk dropped many lines (return
 * type, braces, else keywords, early returns, and the initialization of
 * num_sack_blks); only the visible tokens are preserved below.
 */
tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th,
    u_int32_t *newbytes_acked)
        struct sackhole *cur, *temp;
        struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
        int i, j, num_sack_blks;
        tcp_seq old_snd_fack = 0, th_ack = th->th_ack;

        /*
         * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
         * treat [SND.UNA, SEG.ACK) as if it is a SACK block.
         */
        if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
                sack_blocks[num_sack_blks].start = tp->snd_una;
                sack_blocks[num_sack_blks++].end = th_ack;

        /*
         * Append received valid SACK blocks to sack_blocks[].
         * Check that the SACK block range is valid.
         */
        for (i = 0; i < to->to_nsacks; i++) {
                /* bcopy: option bytes may be unaligned in the packet. */
                bcopy((to->to_sacks + i * TCPOLEN_SACK),
                    &sack, sizeof(sack));
                sack.start = ntohl(sack.start);
                sack.end = ntohl(sack.end);
                if (TCP_VALIDATE_SACK_SEQ_NUMBERS(tp, &sack, th_ack)) {
                        sack_blocks[num_sack_blks++] = sack;

        /*
         * Return if SND.UNA is not advanced and no valid SACK block
         * was received.
         */
        if (num_sack_blks == 0) {

        VERIFY(num_sack_blks <= (TCP_MAX_SACK + 1));
        /*
         * Sort the SACK blocks so we can update the scoreboard
         * with just one pass. The overhead of sorting upto 4+1 elements
         * is less than making upto 4+1 passes over the scoreboard.
         */
        for (i = 0; i < num_sack_blks; i++) {
                for (j = i + 1; j < num_sack_blks; j++) {
                        /* Bubble sort ascending by block end. */
                        if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
                                sack = sack_blocks[i];
                                sack_blocks[i] = sack_blocks[j];
                                sack_blocks[j] = sack;

        if (TAILQ_EMPTY(&tp->snd_holes)) {
                /*
                 * Empty scoreboard. Need to initialize snd_fack (it may be
                 * uninitialized or have a bogus value). Scoreboard holes
                 * (from the sack blocks received) are created later below (in
                 * the logic that adds holes to the tail of the scoreboard).
                 */
                tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
                *newbytes_acked += (tp->snd_fack - tp->snd_una);

        old_snd_fack = tp->snd_fack;
        /*
         * In the while-loop below, incoming SACK blocks (sack_blocks[])
         * and SACK holes (snd_holes) are traversed from their tails with
         * just one pass in order to reduce the number of compares especially
         * when the bandwidth-delay product is large.
         * Note: Typically, in the first RTT of SACK recovery, the highest
         * three or four SACK blocks with the same ack number are received.
         * In the second RTT, if retransmitted data segments are not lost,
         * the highest three or four SACK blocks with ack number advancing
         * are received.
         */
        sblkp = &sack_blocks[num_sack_blks - 1];        /* Last SACK block */
        if (SEQ_LT(tp->snd_fack, sblkp->start)) {
                /*
                 * The highest SACK block is beyond fack.
                 * Append new SACK hole at the tail.
                 * If the second or later highest SACK blocks are also
                 * beyond the current fack, they will be inserted by
                 * way of hole splitting in the while-loop below.
                 */
                temp = tcp_sackhole_insert(tp, tp->snd_fack, sblkp->start, NULL);
                /* NOTE(review): extraction gap — the if (temp != NULL) test
                 * selecting between the two branches below was dropped. */
                        tp->snd_fack = sblkp->end;
                        *newbytes_acked += (sblkp->end - sblkp->start);

                        /* Go to the previous sack block. */
                        /*
                         * We failed to add a new hole based on the current
                         * sack block.  Skip over all the sack blocks that
                         * fall completely to the right of snd_fack and proceed
                         * to trim the scoreboard based on the remaining sack
                         * blocks. This also trims the scoreboard for th_ack
                         * (which is sack_blocks[0]).
                         */
                        while (sblkp >= sack_blocks &&
                            SEQ_LT(tp->snd_fack, sblkp->start)) {
                        if (sblkp >= sack_blocks &&
                            SEQ_LT(tp->snd_fack, sblkp->end)) {
                                *newbytes_acked += (sblkp->end - tp->snd_fack);
                                tp->snd_fack = sblkp->end;
        } else if (SEQ_LT(tp->snd_fack, sblkp->end)) {
                /* fack is advanced. */
                *newbytes_acked += (sblkp->end - tp->snd_fack);
                tp->snd_fack = sblkp->end;

        /* We must have at least one SACK hole in scoreboard */
        cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole */
        /*
         * Since the incoming sack blocks are sorted, we can process them
         * making one sweep of the scoreboard.
         */
        while (sblkp >= sack_blocks && cur != NULL) {
                if (SEQ_GEQ(sblkp->start, cur->end)) {
                        /*
                         * SACKs data beyond the current hole.
                         * Go to the previous sack block.
                         */
                if (SEQ_LEQ(sblkp->end, cur->start)) {
                        /*
                         * SACKs data before the current hole.
                         * Go to the previous hole.
                         */
                        cur = TAILQ_PREV(cur, sackhole_head, scblink);

                /* Subtract the hole's current rexmit contribution; it is
                 * re-added (possibly changed) after the hole is adjusted. */
                tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start);
                if (SEQ_LEQ(sblkp->start, cur->start)) {
                        /* Data acks at least the beginning of hole */
                        if (SEQ_GEQ(sblkp->end, cur->end)) {
                                /* Acks entire hole, so delete hole */
                                *newbytes_acked += (cur->end - cur->start);
                                tcp_sack_detect_reordering(tp, cur,
                                    cur->end, old_snd_fack);
                                cur = TAILQ_PREV(cur, sackhole_head, scblink);
                                tcp_sackhole_remove(tp, temp);
                                /*
                                 * The sack block may ack all or part of the next
                                 * hole too, so continue onto the next hole.
                                 */
                                /* Move start of hole forward */
                                *newbytes_acked += (sblkp->end - cur->start);
                                tcp_sack_detect_reordering(tp, cur,
                                    sblkp->end, old_snd_fack);
                                cur->start = sblkp->end;
                                cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
                        /* Data acks at least the end of hole */
                        if (SEQ_GEQ(sblkp->end, cur->end)) {
                                /* Move end of hole backward */
                                *newbytes_acked += (cur->end - sblkp->start);
                                tcp_sack_detect_reordering(tp, cur,
                                    cur->end, old_snd_fack);
                                cur->end = sblkp->start;
                                cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
                                /*
                                 * ACKs some data in the middle of a hole;
                                 * need to split current hole
                                 */
                                *newbytes_acked += (sblkp->end - sblkp->start);
                                tcp_sack_detect_reordering(tp, cur,
                                    sblkp->end, old_snd_fack);
                                /* NOTE(review): extraction gap — second
                                 * argument line of this call and the NULL
                                 * check on temp were dropped. */
                                temp = tcp_sackhole_insert(tp, sblkp->end,
                                        if (SEQ_GT(cur->rxmit, temp->rxmit)) {
                                                temp->rxmit = cur->rxmit;
                                                tp->sackhint.sack_bytes_rexmit
                                        cur->end = sblkp->start;
                                        cur->rxmit = SEQ_MIN(cur->rxmit,
                                        /*
                                         * Reset the rxmit_start to that of
                                         * the current hole as that will
                                         * help to compute the reorder
                                         * window correctly.
                                         */
                                        temp->rxmit_start = cur->rxmit_start;

                /* Re-add the (possibly updated) rexmit contribution. */
                tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start);
                /*
                 * Testing sblkp->start against cur->start tells us whether
                 * we're done with the sack block or the sack hole.
                 * Accordingly, we advance one or the other.
                 */
                if (SEQ_LEQ(sblkp->start, cur->start)) {
                        cur = TAILQ_PREV(cur, sackhole_head, scblink);
/*
 * Free all SACK holes to clear the scoreboard, then reset the SACK hints
 * and the new-data marker.
 * NOTE(review): extraction gap — the declaration of `q` was dropped.
 */
tcp_free_sackholes(struct tcpcb *tp)
        while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL) {
                tcp_sackhole_remove(tp, q);
        tp->sackhint.sack_bytes_rexmit = 0;
        tp->sackhint.nexthole = NULL;
        tp->sack_newdata = 0;
/*
 * Partial ack handling within a sack recovery episode.
 * Keeping this very simple for now. When a partial ack
 * is received, force snd_cwnd to a value that will allow
 * the sender to transmit no more than 2 segments.
 * If necessary, a better scheme can be adopted at a
 * later point, but for now, the goal is to prevent the
 * sender from bursting a large amount of data in the midst
 * of sack recovery.
 *
 * NOTE(review): extraction gap — the declaration/initialization of
 * num_segs and some braces were dropped from the visible text.
 */
tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
        /* Partial ack arrived: stop the retransmit timer. */
        tp->t_timer[TCPT_REXMT] = 0;

        /* send one or 2 segments based on how much new data was acked */
        if (((BYTES_ACKED(th, tp)) / tp->t_maxseg) > 2) {

        /* cwnd = outstanding rexmit bytes + new data beyond sack_newdata
         * + a budget of num_segs segments, clamped to ssthresh below. */
        tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
            (tp->snd_nxt - tp->sack_newdata) +
            num_segs * tp->t_maxseg);
        if (tp->snd_cwnd > tp->snd_ssthresh) {
                tp->snd_cwnd = tp->snd_ssthresh;

        if (SEQ_LT(tp->snd_fack, tp->snd_recover) &&
            tp->snd_fack == th->th_ack && TAILQ_EMPTY(&tp->snd_holes)) {
                struct sackhole *temp;
                /*
                 * we received a partial ack but there is no sack_hole
                 * that will cover the remaining seq space. In this case,
                 * create a hole from snd_fack to snd_recover so that
                 * the sack recovery will continue.
                 */
                temp = tcp_sackhole_insert(tp, tp->snd_fack,
                    tp->snd_recover, NULL);
                /* NOTE(review): extraction gap — NULL check on temp dropped. */
                        tp->snd_fack = tp->snd_recover;

        (void) tcp_output(tp);
/*
 * Debug version of tcp_sack_output() that walks the scoreboard. Used for
 * now to sanity check the hint.  Recomputes *sack_bytes_rexmt from scratch.
 * NOTE(review): extraction gap — declaration of `p`, the early returns,
 * and closing braces were dropped from the visible text.
 */
static struct sackhole *
tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt)
        *sack_bytes_rexmt = 0;
        TAILQ_FOREACH(p, &tp->snd_holes, scblink) {
                if (SEQ_LT(p->rxmit, p->end)) {
                        if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
                        *sack_bytes_rexmt += (p->rxmit - p->start);
                *sack_bytes_rexmt += (p->rxmit - p->start);
/*
 * Returns the next hole to retransmit and the number of retransmitted bytes
 * from the scoreboard. We store both the next hole and the number of
 * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK
 * reception). This avoids scoreboard traversals completely.
 *
 * The loop here will traverse *at most* one link. Here's the argument.
 * For the loop to traverse more than 1 link before finding the next hole to
 * retransmit, we would need to have at least 1 node following the current hint
 * with (rxmit == end). But, for all holes following the current hint,
 * (start == rxmit), since we have not yet retransmitted from them. Therefore,
 * in order to traverse more 1 link in the loop below, we need to have at least
 * one node following the current hint with (start == rxmit == end).
 * But that can't happen, (start == end) means that all the data in that hole
 * has been sacked, in which case, the hole would have been removed from the
 * scoreboard.
 *
 * The hinted result is cross-checked against tcp_sack_output_debug(); any
 * mismatch is logged and the recomputed byte count wins.
 * NOTE(review): extraction gap — declaration of dbg_bytes_rexmt and
 * several braces/returns were dropped from the visible text.
 */
tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
        struct sackhole *hole = NULL, *dbg_hole = NULL;

        dbg_hole = tcp_sack_output_debug(tp, &dbg_bytes_rexmt);
        *sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit;
        hole = tp->sackhint.nexthole;
        if (hole == NULL || SEQ_LT(hole->rxmit, hole->end)) {
        while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) {
                if (SEQ_LT(hole->rxmit, hole->end)) {
                        tp->sackhint.nexthole = hole;

        if (dbg_hole != hole) {
                printf("%s: Computed sack hole not the same as cached value\n", __func__);
        if (*sack_bytes_rexmt != dbg_bytes_rexmt) {
                printf("%s: Computed sack_bytes_retransmitted (%d) not "
                    "the same as cached value (%d)\n",
                    __func__, dbg_bytes_rexmt, *sack_bytes_rexmt);
                *sack_bytes_rexmt = dbg_bytes_rexmt;
/*
 * After a timeout, the SACK list may be rebuilt.  This SACK information
 * should be used to avoid retransmitting SACKed data.  This function
 * traverses the SACK list to see if snd_nxt should be moved forward.
 * NOTE(review): extraction gap — the NULL check on `cur` and several
 * braces were dropped from the visible text.
 */
tcp_sack_adjust(struct tcpcb *tp)
        struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes);

                return; /* No holes */
        if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) {
                return; /* We're already beyond any SACKed blocks */

        /*
         * Two cases for which we want to advance snd_nxt:
         * i) snd_nxt lies between end of one hole and beginning of another
         * ii) snd_nxt lies between end of last hole and snd_fack
         */
        while ((p = TAILQ_NEXT(cur, scblink)) != NULL) {
                if (SEQ_LT(tp->snd_nxt, cur->end)) {
                if (SEQ_GEQ(tp->snd_nxt, p->start)) {
                        tp->snd_nxt = p->start;
        if (SEQ_LT(tp->snd_nxt, cur->end)) {
        tp->snd_nxt = tp->snd_fack;
/*
 * This function returns TRUE if more than (tcprexmtthresh - 1) * SMSS
 * bytes with sequence numbers greater than snd_una have been SACKed.
 * Skips the check (early exit) when SACK is disabled, the connection is
 * already in fast recovery, the scoreboard is empty, or reordering has
 * been detected on this connection.
 * NOTE(review): extraction gap — the early return and closing braces were
 * dropped from the visible text.
 */
tcp_sack_byte_islost(struct tcpcb *tp)
        u_int32_t unacked_bytes, sndhole_bytes = 0;
        struct sackhole *sndhole;
        if (!SACK_ENABLED(tp) || IN_FASTRECOVERY(tp) ||
            TAILQ_EMPTY(&tp->snd_holes) ||
            (tp->t_flagsext & TF_PKTS_REORDERED)) {

        unacked_bytes = tp->snd_max - tp->snd_una;

        /* Bytes inside holes are not SACKed; everything else above snd_una is. */
        TAILQ_FOREACH(sndhole, &tp->snd_holes, scblink) {
                sndhole_bytes += (sndhole->end - sndhole->start);

        VERIFY(unacked_bytes >= sndhole_bytes);
        return (unacked_bytes - sndhole_bytes) >
               ((tcprexmtthresh - 1) * tp->t_maxseg);
 852  * Process any DSACK options that might be present on an input packet 
 856 tcp_sack_process_dsack(struct tcpcb 
*tp
, struct tcpopt 
*to
, 
 859         struct sackblk first_sack
, second_sack
; 
 860         struct tcp_rxt_seg 
*rxseg
; 
 862         bcopy(to
->to_sacks
, &first_sack
, sizeof(first_sack
)); 
 863         first_sack
.start 
= ntohl(first_sack
.start
); 
 864         first_sack
.end 
= ntohl(first_sack
.end
); 
 866         if (to
->to_nsacks 
> 1) { 
 867                 bcopy((to
->to_sacks 
+ TCPOLEN_SACK
), &second_sack
, 
 868                     sizeof(second_sack
)); 
 869                 second_sack
.start 
= ntohl(second_sack
.start
); 
 870                 second_sack
.end 
= ntohl(second_sack
.end
); 
 873         if (SEQ_LT(first_sack
.start
, th
->th_ack
) && 
 874             SEQ_LEQ(first_sack
.end
, th
->th_ack
)) { 
 876                  * There is a dsack option reporting a duplicate segment 
 877                  * also covered by cumulative acknowledgement. 
 879                  * Validate the sequence numbers before looking at dsack 
 880                  * option. The duplicate notification can come after 
 881                  * snd_una moves forward. In order to set a window of valid 
 882                  * sequence numbers to look for, we set a maximum send 
 883                  * window within which the DSACK option will be processed. 
 885                 if (!(TCP_DSACK_SEQ_IN_WINDOW(tp
, first_sack
.start
, th
->th_ack
) && 
 886                     TCP_DSACK_SEQ_IN_WINDOW(tp
, first_sack
.end
, th
->th_ack
))) { 
 888                         to
->to_sacks 
+= TCPOLEN_SACK
; 
 889                         tcpstat
.tcps_dsack_recvd_old
++; 
 892                          * returning true here so that the ack will not be 
 893                          * treated as duplicate ack. 
 897         } else if (to
->to_nsacks 
> 1 && 
 898             SEQ_LEQ(second_sack
.start
, first_sack
.start
) && 
 899             SEQ_GEQ(second_sack
.end
, first_sack
.end
)) { 
 901                  * there is a dsack option in the first block not 
 902                  * covered by the cumulative acknowledgement but covered 
 903                  * by the second sack block. 
 905                  * verify the sequence numbes on the second sack block 
 906                  * before processing the DSACK option. Returning false 
 907                  * here will treat the ack as a duplicate ack. 
 909                 if (!TCP_VALIDATE_SACK_SEQ_NUMBERS(tp
, &second_sack
, 
 912                         to
->to_sacks 
+= TCPOLEN_SACK
; 
 913                         tcpstat
.tcps_dsack_recvd_old
++; 
 917                 /* no dsack options, proceed with processing the sack */ 
 921         /* Update the tcpopt pointer to exclude dsack block */ 
 923         to
->to_sacks 
+= TCPOLEN_SACK
; 
 924         tcpstat
.tcps_dsack_recvd
++; 
 927         /* ignore DSACK option, if DSACK is disabled */ 
 928         if (tp
->t_flagsext 
& TF_DISABLE_DSACK
) { 
 932         /* If the DSACK is for TLP mark it as such */ 
 933         if ((tp
->t_flagsext 
& TF_SENT_TLPROBE
) && 
 934             first_sack
.end 
== tp
->t_tlphighrxt
) { 
 935                 if ((rxseg 
= tcp_rxtseg_find(tp
, first_sack
.start
, 
 936                     (first_sack
.end 
- 1))) != NULL
) { 
 937                         rxseg
->rx_flags 
|= TCP_RXT_DSACK_FOR_TLP
; 
 940         /* Update the sender's retransmit segment state */ 
 941         if (((tp
->t_rxtshift 
== 1 && first_sack
.start 
== tp
->snd_una
) || 
 942             ((tp
->t_flagsext 
& TF_SENT_TLPROBE
) && 
 943             first_sack
.end 
== tp
->t_tlphighrxt
)) && 
 944             TAILQ_EMPTY(&tp
->snd_holes
) && 
 945             SEQ_GT(th
->th_ack
, tp
->snd_una
)) { 
 947                  * If the dsack is for a retransmitted packet and one of 
 948                  * the two cases is true, it indicates ack loss: 
 949                  * - retransmit timeout and first_sack.start == snd_una 
 950                  * - TLP probe and first_sack.end == tlphighrxt 
 952                  * Ignore dsack and do not update state when there is 
 955                 tcpstat
.tcps_dsack_ackloss
++; 
 958         } else if ((rxseg 
= tcp_rxtseg_find(tp
, first_sack
.start
, 
 959             (first_sack
.end 
- 1))) == NULL
) { 
 961                  * Duplicate notification was not triggered by a 
 962                  * retransmission. This might be due to network duplication, 
 963                  * disable further DSACK processing. 
 965                 if (!tcp_dsack_ignore_hw_duplicates
) { 
 966                         tp
->t_flagsext 
|= TF_DISABLE_DSACK
; 
 967                         tcpstat
.tcps_dsack_disable
++; 
 971                  * If the segment was retransmitted only once, mark it as 
 972                  * spurious. Otherwise ignore the duplicate notification. 
 974                 if (rxseg
->rx_count 
== 1) { 
 975                         rxseg
->rx_flags 
|= TCP_RXT_SPURIOUS
; 
 977                         rxseg
->rx_flags 
&= ~TCP_RXT_SPURIOUS
;