bsd/netinet/tcp_sack.c

   1 /*
   2  * Copyright (c) 2004-2016 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  30  *      The Regents of the University of California.  All rights reserved.
  31  *
  32  * Redistribution and use in source and binary forms, with or without
  33  * modification, are permitted provided that the following conditions
  34  * are met:
  35  * 1. Redistributions of source code must retain the above copyright
  36  *      notice, this list of conditions and the following disclaimer.
  37  * 2. Redistributions in binary form must reproduce the above copyright
  38  *      notice, this list of conditions and the following disclaimer in the
  39  *      documentation and/or other materials provided with the distribution.
  40  * 3. All advertising materials mentioning features or use of this software
  41  *      must display the following acknowledgement:
  42  *      This product includes software developed by the University of
  43  *      California, Berkeley and its contributors.
  44  * 4. Neither the name of the University nor the names of its contributors
  45  *      may be used to endorse or promote products derived from this software
  46  *      without specific prior written permission.
  47  *
  48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  58  * SUCH DAMAGE.
  59  *
  60  */
  61
  62 #define _IP_VHL
  63
  64
  65 #include <sys/param.h>
  66 #include <sys/systm.h>
  67 #include <sys/kernel.h>
  68 #include <sys/sysctl.h>
  69 #include <sys/mbuf.h>
  70 #include <sys/domain.h>
  71 #include <sys/protosw.h>
  72 #include <sys/socket.h>
  73 #include <sys/socketvar.h>
  74
  75 #include <kern/zalloc.h>
  76
  77 #include <net/route.h>
  78
  79 #include <netinet/in.h>
  80 #include <netinet/in_systm.h>
  81 #include <netinet/ip.h>
  82 #include <netinet/in_pcb.h>
  83 #include <netinet/ip_var.h>
  84 #include <netinet6/in6_pcb.h>
  85 #include <netinet/ip6.h>
  86 #include <netinet6/ip6_var.h>
  87 #include <netinet/tcp.h>
  88 #include <netinet/tcp_fsm.h>
  89 #include <netinet/tcp_seq.h>
  90 #include <netinet/tcp_timer.h>
  91 #include <netinet/tcp_var.h>
  92 #include <netinet/tcpip.h>
  93 #include <netinet/tcp_cache.h>
  94 #if TCPDEBUG
  95 #include <netinet/tcp_debug.h>
  96 #endif
  97 #include <sys/kdebug.h>
  98
  99 #if IPSEC
 100 #include <netinet6/ipsec.h>
 101 #endif /*IPSEC*/
 102
 103 #include <libkern/OSAtomic.h>
 104
 105 SYSCTL_SKMEM_TCP_INT(OID_AUTO, sack, CTLFLAG_RW | CTLFLAG_LOCKED,
 106     int, tcp_do_sack, 1, "Enable/Disable TCP SACK support");
 107 SYSCTL_SKMEM_TCP_INT(OID_AUTO, sack_maxholes, CTLFLAG_RW | CTLFLAG_LOCKED,
 108     static int, tcp_sack_maxholes, 128,
 109     "Maximum number of TCP SACK holes allowed per connection");
 110
 111 SYSCTL_SKMEM_TCP_INT(OID_AUTO, sack_globalmaxholes,
 112     CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_sack_globalmaxholes, 65536,
 113     "Global maximum number of TCP SACK holes");
 114
 115 static SInt32 tcp_sack_globalholes = 0;
 116 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_globalholes, CTLFLAG_RD | CTLFLAG_LOCKED,
 117     &tcp_sack_globalholes, 0,
 118     "Global number of TCP SACK holes currently allocated");
 119
 120 extern struct zone *sack_hole_zone;
 121
 122 #define TCP_VALIDATE_SACK_SEQ_NUMBERS(_tp_, _sb_, _ack_) \
 123     (SEQ_GT((_sb_)->end, (_sb_)->start) && \
 124     SEQ_GT((_sb_)->start, (_tp_)->snd_una) && \
 125     SEQ_GT((_sb_)->start, (_ack_)) && \
 126     SEQ_LT((_sb_)->start, (_tp_)->snd_max) && \
 127     SEQ_GT((_sb_)->end, (_tp_)->snd_una) && \
 128     SEQ_LEQ((_sb_)->end, (_tp_)->snd_max))
 129
 130 /*
 131  * This function is called upon receipt of new valid data (while not in header
 132  * prediction mode), and it updates the ordered list of sacks.
 133  */
 134 void
 135 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
 136 {
 137         /*
 138          * First reported block MUST be the most recent one.  Subsequent
 139          * blocks SHOULD be in the order in which they arrived at the
 140          * receiver.  These two conditions make the implementation fully
 141          * compliant with RFC 2018.
 142          */
 143         struct sackblk head_blk, saved_blks[MAX_SACK_BLKS];
 144         int num_head, num_saved, i;
 145
 146         /* SACK block for the received segment. */
 147         head_blk.start = rcv_start;
 148         head_blk.end = rcv_end;
 149
 150         /*
 151          * Merge updated SACK blocks into head_blk, and
 152          * save unchanged SACK blocks into saved_blks[].
 153          * num_saved will have the number of the saved SACK blocks.
 154          */
 155         num_saved = 0;
 156         for (i = 0; i < tp->rcv_numsacks; i++) {
 157                 tcp_seq start = tp->sackblks[i].start;
 158                 tcp_seq end = tp->sackblks[i].end;
 159                 if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
 160                         /*
 161                          * Discard this SACK block.
 162                          */
 163                 } else if (SEQ_LEQ(head_blk.start, end) &&
 164                     SEQ_GEQ(head_blk.end, start)) {
 165                         /*
 166                          * Merge this SACK block into head_blk.
 167                          * This SACK block itself will be discarded.
 168                          */
 169                         if (SEQ_GT(head_blk.start, start)) {
 170                                 head_blk.start = start;
 171                         }
 172                         if (SEQ_LT(head_blk.end, end)) {
 173                                 head_blk.end = end;
 174                         }
 175                 } else {
 176                         /*
 177                          * Save this SACK block.
 178                          */
 179                         saved_blks[num_saved].start = start;
 180                         saved_blks[num_saved].end = end;
 181                         num_saved++;
 182                 }
 183         }
 184
 185         /*
 186          * Update SACK list in tp->sackblks[].
 187          */
 188         num_head = 0;
 189         if (SEQ_GT(head_blk.start, tp->rcv_nxt)) {
 190                 /*
 191                  * The received data segment is an out-of-order segment.
 192                  * Put head_blk at the top of SACK list.
 193                  */
 194                 tp->sackblks[0] = head_blk;
 195                 num_head = 1;
 196                 /*
 197                  * If the number of saved SACK blocks exceeds its limit,
 198                  * discard the last SACK block.
 199                  */
 200                 if (num_saved >= MAX_SACK_BLKS) {
 201                         num_saved--;
 202                 }
 203         }
 204         if (num_saved > 0) {
 205                 /*
 206                  * Copy the saved SACK blocks back.
 207                  */
 208                 bcopy(saved_blks, &tp->sackblks[num_head], sizeof(struct sackblk) * num_saved);
 209         }
 210
 211         /* Save the number of SACK blocks. */
 212         tp->rcv_numsacks = num_head + num_saved;
 213
 214         /* If we are requesting SACK recovery, reset the stretch-ack state
 215          * so that connection will generate more acks after recovery and
 216          * sender's cwnd will open.
 217          */
 218         if ((tp->t_flags & TF_STRETCHACK) != 0 && tp->rcv_numsacks > 0) {
 219                 tcp_reset_stretch_ack(tp);
 220         }
 221         if (tp->rcv_numsacks > 0) {
 222                 tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
 223         }
 224
 225 #if TRAFFIC_MGT
 226         if (tp->acc_iaj > 0 && tp->rcv_numsacks > 0) {
 227                 reset_acc_iaj(tp);
 228         }
 229 #endif /* TRAFFIC_MGT */
 230 }
 231
 232 /*
 233  * Delete all receiver-side SACK information.
 234  */
 235 void
 236 tcp_clean_sackreport( struct tcpcb *tp)
 237 {
 238         tp->rcv_numsacks = 0;
 239         bzero(&tp->sackblks[0], sizeof(struct sackblk) * MAX_SACK_BLKS);
 240 }
 241
 242 /*
 243  * Allocate struct sackhole.
 244  */
 245 static struct sackhole *
 246 tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end)
 247 {
 248         struct sackhole *hole;
 249
 250         if (tp->snd_numholes >= tcp_sack_maxholes ||
 251             tcp_sack_globalholes >= tcp_sack_globalmaxholes) {
 252                 tcpstat.tcps_sack_sboverflow++;
 253                 return NULL;
 254         }
 255
 256         hole = (struct sackhole *)zalloc(sack_hole_zone);
 257         if (hole == NULL) {
 258                 return NULL;
 259         }
 260
 261         hole->start = start;
 262         hole->end = end;
 263         hole->rxmit = start;
 264
 265         tp->snd_numholes++;
 266         OSIncrementAtomic(&tcp_sack_globalholes);
 267
 268         return hole;
 269 }
 270
 271 /*
 272  * Free struct sackhole.
 273  */
 274 static void
 275 tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole)
 276 {
 277         zfree(sack_hole_zone, hole);
 278
 279         tp->snd_numholes--;
 280         OSDecrementAtomic(&tcp_sack_globalholes);
 281 }
 282
 283 /*
 284  * Insert new SACK hole into scoreboard.
 285  */
 286 static struct sackhole *
 287 tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end,
 288     struct sackhole *after)
 289 {
 290         struct sackhole *hole;
 291
 292         /* Allocate a new SACK hole. */
 293         hole = tcp_sackhole_alloc(tp, start, end);
 294         if (hole == NULL) {
 295                 return NULL;
 296         }
 297         hole->rxmit_start = tcp_now;
 298         /* Insert the new SACK hole into scoreboard */
 299         if (after != NULL) {
 300                 TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink);
 301         } else {
 302                 TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink);
 303         }
 304
 305         /* Update SACK hint. */
 306         if (tp->sackhint.nexthole == NULL) {
 307                 tp->sackhint.nexthole = hole;
 308         }
 309
 310         return hole;
 311 }
 312
 313 /*
 314  * Remove SACK hole from scoreboard.
 315  */
 316 static void
 317 tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole)
 318 {
 319         /* Update SACK hint. */
 320         if (tp->sackhint.nexthole == hole) {
 321                 tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink);
 322         }
 323
 324         /* Remove this SACK hole. */
 325         TAILQ_REMOVE(&tp->snd_holes, hole, scblink);
 326
 327         /* Free this SACK hole. */
 328         tcp_sackhole_free(tp, hole);
 329 }
 330 /*
 331  * When a new ack with SACK is received, check if it indicates packet
 332  * reordering. If there is packet reordering, the socket is marked and
 333  * the late time offset by which the packet was reordered with
 334  * respect to its closest neighboring packets is computed.
 335  */
 336 static void
 337 tcp_sack_detect_reordering(struct tcpcb *tp, struct sackhole *s,
 338     tcp_seq sacked_seq, tcp_seq snd_fack)
 339 {
 340         int32_t rext = 0, reordered = 0;
 341
 342         /*
 343          * If the SACK hole is past snd_fack, this is from new SACK
 344          * information, so we can ignore it.
 345          */
 346         if (SEQ_GT(s->end, snd_fack)) {
 347                 return;
 348         }
 349         /*
 350          * If there has been a retransmit timeout, then the timestamp on
 351          * the SACK segment will be newer. This might lead to a
 352          * false-positive. Avoid re-ordering detection in this case.
 353          */
 354         if (tp->t_rxtshift > 0) {
 355                 return;
 356         }
 357
 358         /*
 359          * Detect reordering from SACK information by checking
 360          * if recently sacked data was never retransmitted from this hole.
 361          *
 362          * First, we look for the byte in the list of retransmitted segments. This one
 363          * will contain even the segments that are retransmitted thanks to RTO/TLP.
 364          *
 365          * Then, we check the sackhole which indicates whether or not the sackhole
 366          * was subject to retransmission.
 367          */
 368         if (SEQ_LT(s->rxmit, sacked_seq) &&
 369             (!tcp_do_better_lr || tcp_rxtseg_find(tp, sacked_seq - 1, sacked_seq - 1) == NULL)) {
 370                 reordered = 1;
 371                 tcpstat.tcps_avoid_rxmt++;
 372         }
 373
 374         if (reordered) {
 375                 if (!(tp->t_flagsext & TF_PKTS_REORDERED)) {
 376                         tp->t_flagsext |= TF_PKTS_REORDERED;
 377                         tcpstat.tcps_detect_reordering++;
 378                 }
 379
 380                 tcpstat.tcps_reordered_pkts++;
 381                 tp->t_reordered_pkts++;
 382
 383                 /*
 384                  * If reordering is seen on a connection wth ECN enabled,
 385                  * increment the heuristic
 386                  */
 387                 if (TCP_ECN_ENABLED(tp)) {
 388                         INP_INC_IFNET_STAT(tp->t_inpcb, ecn_fallback_reorder);
 389                         tcpstat.tcps_ecn_fallback_reorder++;
 390                         tcp_heuristic_ecn_aggressive(tp);
 391                 }
 392
 393                 VERIFY(SEQ_GEQ(snd_fack, s->rxmit));
 394
 395                 if (s->rxmit_start > 0) {
 396                         rext = timer_diff(tcp_now, 0, s->rxmit_start, 0);
 397                         if (rext < 0) {
 398                                 return;
 399                         }
 400
 401                         /*
 402                          * We take the maximum reorder window to schedule
 403                          * DELAYFR timer as that will take care of jitter
 404                          * on the network path.
 405                          *
 406                          * Computing average and standard deviation seems
 407                          * to cause unnecessary retransmissions when there
 408                          * is high jitter.
 409                          *
 410                          * We set a maximum of SRTT/2 and a minimum of
 411                          * 10 ms on the reorder window.
 412                          */
 413                         tp->t_reorderwin = max(tp->t_reorderwin, rext);
 414                         tp->t_reorderwin = min(tp->t_reorderwin,
 415                             (tp->t_srtt >> (TCP_RTT_SHIFT - 1)));
 416                         tp->t_reorderwin = max(tp->t_reorderwin, 10);
 417                 }
 418         }
 419 }
 420
 421 static void
 422 tcp_sack_update_byte_counter(struct tcpcb *tp, uint32_t start, uint32_t end,
 423     uint32_t *newbytes_acked, uint32_t *towards_fr_acked)
 424 {
 425         *newbytes_acked += (end - start);
 426         if (SEQ_GEQ(start, tp->send_highest_sack)) {
 427                 *towards_fr_acked += (end - start);
 428         }
 429 }
 430
 431 /*
 432  * Process cumulative ACK and the TCP SACK option to update the scoreboard.
 433  * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
 434  * the sequence space).
 435  */
 436 void
 437 tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th,
 438     u_int32_t *newbytes_acked, uint32_t *after_rexmit_acked)
 439 {
 440         struct sackhole *cur, *temp;
 441         struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
 442         int i, j, num_sack_blks;
 443         tcp_seq old_snd_fack = 0, th_ack = th->th_ack;
 444
 445         num_sack_blks = 0;
 446         /*
 447          * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
 448          * treat [SND.UNA, SEG.ACK) as if it is a SACK block.
 449          */
 450         if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
 451                 sack_blocks[num_sack_blks].start = tp->snd_una;
 452                 sack_blocks[num_sack_blks++].end = th_ack;
 453         }
 454         /*
 455          * Append received valid SACK blocks to sack_blocks[].
 456          * Check that the SACK block range is valid.
 457          */
 458         for (i = 0; i < to->to_nsacks; i++) {
 459                 bcopy((to->to_sacks + i * TCPOLEN_SACK),
 460                     &sack, sizeof(sack));
 461                 sack.start = ntohl(sack.start);
 462                 sack.end = ntohl(sack.end);
 463                 if (TCP_VALIDATE_SACK_SEQ_NUMBERS(tp, &sack, th_ack)) {
 464                         sack_blocks[num_sack_blks++] = sack;
 465                 }
 466         }
 467
 468         /*
 469          * Return if SND.UNA is not advanced and no valid SACK block
 470          * is received.
 471          */
 472         if (num_sack_blks == 0) {
 473                 return;
 474         }
 475
 476         VERIFY(num_sack_blks <= (TCP_MAX_SACK + 1));
 477         /*
 478          * Sort the SACK blocks so we can update the scoreboard
 479          * with just one pass. The overhead of sorting upto 4+1 elements
 480          * is less than making upto 4+1 passes over the scoreboard.
 481          */
 482         for (i = 0; i < num_sack_blks; i++) {
 483                 for (j = i + 1; j < num_sack_blks; j++) {
 484                         if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
 485                                 sack = sack_blocks[i];
 486                                 sack_blocks[i] = sack_blocks[j];
 487                                 sack_blocks[j] = sack;
 488                         }
 489                 }
 490         }
 491         if (TAILQ_EMPTY(&tp->snd_holes)) {
 492                 /*
 493                  * Empty scoreboard. Need to initialize snd_fack (it may be
 494                  * uninitialized or have a bogus value). Scoreboard holes
 495                  * (from the sack blocks received) are created later below (in
 496                  * the logic that adds holes to the tail of the scoreboard).
 497                  */
 498                 tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
 499         }
 500
 501         old_snd_fack = tp->snd_fack;
 502         /*
 503          * In the while-loop below, incoming SACK blocks (sack_blocks[])
 504          * and SACK holes (snd_holes) are traversed from their tails with
 505          * just one pass in order to reduce the number of compares especially
 506          * when the bandwidth-delay product is large.
 507          * Note: Typically, in the first RTT of SACK recovery, the highest
 508          * three or four SACK blocks with the same ack number are received.
 509          * In the second RTT, if retransmitted data segments are not lost,
 510          * the highest three or four SACK blocks with ack number advancing
 511          * are received.
 512          */
 513         sblkp = &sack_blocks[num_sack_blks - 1];        /* Last SACK block */
 514         if (SEQ_LT(tp->snd_fack, sblkp->start)) {
 515                 /*
 516                  * The highest SACK block is beyond fack.
 517                  * Append new SACK hole at the tail.
 518                  * If the second or later highest SACK blocks are also
 519                  * beyond the current fack, they will be inserted by
 520                  * way of hole splitting in the while-loop below.
 521                  */
 522                 temp = tcp_sackhole_insert(tp, tp->snd_fack, sblkp->start, NULL);
 523                 if (temp != NULL) {
 524                         tp->snd_fack = sblkp->end;
 525                         tcp_sack_update_byte_counter(tp, sblkp->start, sblkp->end, newbytes_acked, after_rexmit_acked);
 526
 527                         /* Go to the previous sack block. */
 528                         sblkp--;
 529                 } else {
 530                         /*
 531                          * We failed to add a new hole based on the current
 532                          * sack block.  Skip over all the sack blocks that
 533                          * fall completely to the right of snd_fack and proceed
 534                          * to trim the scoreboard based on the remaining sack
 535                          * blocks. This also trims the scoreboard for th_ack
 536                          * (which is sack_blocks[0]).
 537                          */
 538                         while (sblkp >= sack_blocks &&
 539                             SEQ_LT(tp->snd_fack, sblkp->start)) {
 540                                 sblkp--;
 541                         }
 542                         if (sblkp >= sack_blocks &&
 543                             SEQ_LT(tp->snd_fack, sblkp->end)) {
 544                                 tcp_sack_update_byte_counter(tp, tp->snd_fack, sblkp->end, newbytes_acked, after_rexmit_acked);
 545                                 tp->snd_fack = sblkp->end;
 546                         }
 547                 }
 548         } else if (SEQ_LT(tp->snd_fack, sblkp->end)) {
 549                 /* fack is advanced. */
 550                 tcp_sack_update_byte_counter(tp, tp->snd_fack, sblkp->end, newbytes_acked, after_rexmit_acked);
 551                 tp->snd_fack = sblkp->end;
 552         }
 553         /* We must have at least one SACK hole in scoreboard */
 554         cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole */
 555         /*
 556          * Since the incoming sack blocks are sorted, we can process them
 557          * making one sweep of the scoreboard.
 558          */
 559         while (sblkp >= sack_blocks && cur != NULL) {
 560                 if (SEQ_GEQ(sblkp->start, cur->end)) {
 561                         /*
 562                          * SACKs data beyond the current hole.
 563                          * Go to the previous sack block.
 564                          */
 565                         sblkp--;
 566                         continue;
 567                 }
 568                 if (SEQ_LEQ(sblkp->end, cur->start)) {
 569                         /*
 570                          * SACKs data before the current hole.
 571                          * Go to the previous hole.
 572                          */
 573                         cur = TAILQ_PREV(cur, sackhole_head, scblink);
 574                         continue;
 575                 }
 576                 tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start);
 577                 if (tp->sackhint.sack_bytes_rexmit < 0) {
 578                         tp->sackhint.sack_bytes_rexmit = 0;
 579                 }
 580
 581                 if (SEQ_LEQ(sblkp->start, cur->start)) {
 582                         /* Data acks at least the beginning of hole */
 583                         if (SEQ_GEQ(sblkp->end, cur->end)) {
 584                                 /* Acks entire hole, so delete hole */
 585                                 tcp_sack_update_byte_counter(tp, cur->start, cur->end, newbytes_acked, after_rexmit_acked);
 586
 587                                 tcp_sack_detect_reordering(tp, cur,
 588                                     cur->end, old_snd_fack);
 589                                 temp = cur;
 590                                 cur = TAILQ_PREV(cur, sackhole_head, scblink);
 591                                 tcp_sackhole_remove(tp, temp);
 592                                 /*
 593                                  * The sack block may ack all or part of the next
 594                                  * hole too, so continue onto the next hole.
 595                                  */
 596                                 continue;
 597                         } else {
 598                                 /* Move start of hole forward */
 599                                 tcp_sack_update_byte_counter(tp, cur->start, sblkp->end, newbytes_acked, after_rexmit_acked);
 600                                 tcp_sack_detect_reordering(tp, cur,
 601                                     sblkp->end, old_snd_fack);
 602                                 cur->start = sblkp->end;
 603                                 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
 604                         }
 605                 } else {
 606                         /* Data acks at least the end of hole */
 607                         if (SEQ_GEQ(sblkp->end, cur->end)) {
 608                                 /* Move end of hole backward */
 609                                 tcp_sack_update_byte_counter(tp, sblkp->start, cur->end, newbytes_acked, after_rexmit_acked);
 610                                 tcp_sack_detect_reordering(tp, cur,
 611                                     cur->end, old_snd_fack);
 612                                 cur->end = sblkp->start;
 613                                 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
 614                         } else {
 615                                 /*
 616                                  * ACKs some data in the middle of a hole;
 617                                  * need to split current hole
 618                                  */
 619                                 tcp_sack_detect_reordering(tp, cur,
 620                                     sblkp->end, old_snd_fack);
 621                                 temp = tcp_sackhole_insert(tp, sblkp->end,
 622                                     cur->end, cur);
 623                                 if (temp != NULL) {
 624                                         tcp_sack_update_byte_counter(tp, sblkp->start, sblkp->end, newbytes_acked, after_rexmit_acked);
 625                                         if (SEQ_GT(cur->rxmit, temp->rxmit)) {
 626                                                 temp->rxmit = cur->rxmit;
 627                                                 tp->sackhint.sack_bytes_rexmit
 628                                                         += (temp->rxmit
 629                                                     - temp->start);
 630                                         }
 631                                         cur->end = sblkp->start;
 632                                         cur->rxmit = SEQ_MIN(cur->rxmit,
 633                                             cur->end);
 634                                         /*
 635                                          * Reset the rxmit_start to that of
 636                                          * the current hole as that will
 637                                          * help to compute the reorder
 638                                          * window correctly
 639                                          */
 640                                         temp->rxmit_start = cur->rxmit_start;
 641                                 }
 642                         }
 643                 }
 644                 tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start);
 645                 /*
 646                  * Testing sblkp->start against cur->start tells us whether
 647                  * we're done with the sack block or the sack hole.
 648                  * Accordingly, we advance one or the other.
 649                  */
 650                 if (SEQ_LEQ(sblkp->start, cur->start)) {
 651                         cur = TAILQ_PREV(cur, sackhole_head, scblink);
 652                 } else {
 653                         sblkp--;
 654                 }
 655         }
 656 }
 657
 658 /*
 659  * Free all SACK holes to clear the scoreboard.
 660  */
 661 void
 662 tcp_free_sackholes(struct tcpcb *tp)
 663 {
 664         struct sackhole *q;
 665
 666         while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL) {
 667                 tcp_sackhole_remove(tp, q);
 668         }
 669         tp->sackhint.sack_bytes_rexmit = 0;
 670         tp->sackhint.sack_bytes_acked = 0;
 671         tp->t_new_dupacks = 0;
 672         tp->sackhint.nexthole = NULL;
 673         tp->sack_newdata = 0;
 674 }
 675
 676 /*
 677  * Partial ack handling within a sack recovery episode.
 678  * Keeping this very simple for now. When a partial ack
 679  * is received, force snd_cwnd to a value that will allow
 680  * the sender to transmit no more than 2 segments.
 681  * If necessary, a better scheme can be adopted at a
 682  * later point, but for now, the goal is to prevent the
 683  * sender from bursting a large amount of data in the midst
 684  * of sack recovery.
 685  */
 686 void
 687 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
 688 {
 689         int num_segs = 1;
 690
 691         tp->t_timer[TCPT_REXMT] = 0;
 692         tp->t_rtttime = 0;
 693         /* send one or 2 segments based on how much new data was acked */
 694         if (((BYTES_ACKED(th, tp)) / tp->t_maxseg) > 2) {
 695                 num_segs = 2;
 696         }
 697         if (tcp_do_better_lr) {
 698                 tp->snd_cwnd = tcp_flight_size(tp) + num_segs * tp->t_maxseg;
 699         } else {
 700                 tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
 701                     (tp->snd_nxt - tp->sack_newdata) +
 702                     num_segs * tp->t_maxseg);
 703         }
 704         if (tp->snd_cwnd > tp->snd_ssthresh) {
 705                 tp->snd_cwnd = tp->snd_ssthresh;
 706         }
 707         if (SEQ_LT(tp->snd_fack, tp->snd_recover) &&
 708             tp->snd_fack == th->th_ack && TAILQ_EMPTY(&tp->snd_holes)) {
 709                 struct sackhole *temp;
 710                 /*
 711                  * we received a partial ack but there is no sack_hole
 712                  * that will cover the remaining seq space. In this case,
 713                  * create a hole from snd_fack to snd_recover so that
 714                  * the sack recovery will continue.
 715                  */
 716                 temp = tcp_sackhole_insert(tp, tp->snd_fack,
 717                     tp->snd_recover, NULL);
 718                 if (temp != NULL) {
 719                         tp->snd_fack = tp->snd_recover;
 720                 }
 721         }
 722         (void) tcp_output(tp);
 723 }
 724
 725 /*
 726  * Debug version of tcp_sack_output() that walks the scoreboard. Used for
 727  * now to sanity check the hint.
 728  */
 729 static struct sackhole *
 730 tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt)
 731 {
 732         struct sackhole *p;
 733
 734         *sack_bytes_rexmt = 0;
 735         TAILQ_FOREACH(p, &tp->snd_holes, scblink) {
 736                 if (SEQ_LT(p->rxmit, p->end)) {
 737                         if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
 738                                 continue;
 739                         }
 740                         *sack_bytes_rexmt += (p->rxmit - p->start);
 741                         break;
 742                 }
 743                 *sack_bytes_rexmt += (p->rxmit - p->start);
 744         }
 745         return p;
 746 }
 747
 748 /*
 749  * Returns the next hole to retransmit and the number of retransmitted bytes
 750  * from the scoreboard. We store both the next hole and the number of
 751  * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK
 752  * reception). This avoids scoreboard traversals completely.
 753  *
 754  * The loop here will traverse *at most* one link. Here's the argument.
 755  * For the loop to traverse more than 1 link before finding the next hole to
 756  * retransmit, we would need to have at least 1 node following the current hint
 757  * with (rxmit == end). But, for all holes following the current hint,
 758  * (start == rxmit), since we have not yet retransmitted from them. Therefore,
 759  * in order to traverse more 1 link in the loop below, we need to have at least
 760  * one node following the current hint with (start == rxmit == end).
 761  * But that can't happen, (start == end) means that all the data in that hole
 762  * has been sacked, in which case, the hole would have been removed from the
 763  * scoreboard.
 764  */
 765 struct sackhole *
 766 tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
 767 {
 768         struct sackhole *hole = NULL, *dbg_hole = NULL;
 769         int dbg_bytes_rexmt;
 770
 771         dbg_hole = tcp_sack_output_debug(tp, &dbg_bytes_rexmt);
 772         *sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit;
 773         hole = tp->sackhint.nexthole;
 774         if (hole == NULL || SEQ_LT(hole->rxmit, hole->end)) {
 775                 goto out;
 776         }
 777         while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) {
 778                 if (SEQ_LT(hole->rxmit, hole->end)) {
 779                         tp->sackhint.nexthole = hole;
 780                         break;
 781                 }
 782         }
 783 out:
 784         if (dbg_hole != hole) {
 785                 printf("%s: Computed sack hole not the same as cached value\n", __func__);
 786                 hole = dbg_hole;
 787         }
 788         if (*sack_bytes_rexmt != dbg_bytes_rexmt) {
 789                 printf("%s: Computed sack_bytes_retransmitted (%d) not "
 790                     "the same as cached value (%d)\n",
 791                     __func__, dbg_bytes_rexmt, *sack_bytes_rexmt);
 792                 *sack_bytes_rexmt = dbg_bytes_rexmt;
 793         }
 794         return hole;
 795 }
 796
 797 void
 798 tcp_sack_lost_rexmit(struct tcpcb *tp)
 799 {
 800         struct sackhole *hole = TAILQ_FIRST(&tp->snd_holes);
 801
 802         while (hole) {
 803                 hole->rxmit = hole->start;
 804                 hole->rxmit_start = tcp_now;
 805
 806                 hole = TAILQ_NEXT(hole, scblink);
 807         }
 808
 809         tp->sackhint.nexthole = TAILQ_FIRST(&tp->snd_holes);
 810         tp->sackhint.sack_bytes_rexmit = 0;
 811         tp->sack_newdata = tp->snd_nxt;
 812 }
 813
 814 /*
 815  * After a timeout, the SACK list may be rebuilt.  This SACK information
 816  * should be used to avoid retransmitting SACKed data.  This function
 817  * traverses the SACK list to see if snd_nxt should be moved forward.
 818  */
 819 void
 820 tcp_sack_adjust(struct tcpcb *tp)
 821 {
 822         struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes);
 823
 824         if (cur == NULL) {
 825                 return; /* No holes */
 826         }
 827         if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) {
 828                 return; /* We're already beyond any SACKed blocks */
 829         }
 830         /*
 831          * Two cases for which we want to advance snd_nxt:
 832          * i) snd_nxt lies between end of one hole and beginning of another
 833          * ii) snd_nxt lies between end of last hole and snd_fack
 834          */
 835         while ((p = TAILQ_NEXT(cur, scblink)) != NULL) {
 836                 if (SEQ_LT(tp->snd_nxt, cur->end)) {
 837                         return;
 838                 }
 839                 if (SEQ_GEQ(tp->snd_nxt, p->start)) {
 840                         cur = p;
 841                 } else {
 842                         tp->snd_nxt = p->start;
 843                         return;
 844                 }
 845         }
 846         if (SEQ_LT(tp->snd_nxt, cur->end)) {
 847                 return;
 848         }
 849         tp->snd_nxt = tp->snd_fack;
 850         return;
 851 }
 852
 853 /*
 854  * This function returns TRUE if more than (tcprexmtthresh - 1) * SMSS
 855  * bytes with sequence numbers greater than snd_una have been SACKed.
 856  */
 857 boolean_t
 858 tcp_sack_byte_islost(struct tcpcb *tp)
 859 {
 860         u_int32_t unacked_bytes, sndhole_bytes = 0;
 861         struct sackhole *sndhole;
 862         if (!SACK_ENABLED(tp) || IN_FASTRECOVERY(tp) ||
 863             TAILQ_EMPTY(&tp->snd_holes) ||
 864             (tp->t_flagsext & TF_PKTS_REORDERED)) {
 865                 return FALSE;
 866         }
 867
 868         unacked_bytes = tp->snd_max - tp->snd_una;
 869
 870         TAILQ_FOREACH(sndhole, &tp->snd_holes, scblink) {
 871                 sndhole_bytes += (sndhole->end - sndhole->start);
 872         }
 873
 874         VERIFY(unacked_bytes >= sndhole_bytes);
 875         return (unacked_bytes - sndhole_bytes) >
 876                ((tcprexmtthresh - 1) * tp->t_maxseg);
 877 }
 878
 879 /*
 880  * Process any DSACK options that might be present on an input packet
 881  */
 882
 883 boolean_t
 884 tcp_sack_process_dsack(struct tcpcb *tp, struct tcpopt *to,
 885     struct tcphdr *th)
 886 {
 887         struct sackblk first_sack, second_sack;
 888         struct tcp_rxt_seg *rxseg;
 889
 890         bcopy(to->to_sacks, &first_sack, sizeof(first_sack));
 891         first_sack.start = ntohl(first_sack.start);
 892         first_sack.end = ntohl(first_sack.end);
 893
 894         if (to->to_nsacks > 1) {
 895                 bcopy((to->to_sacks + TCPOLEN_SACK), &second_sack,
 896                     sizeof(second_sack));
 897                 second_sack.start = ntohl(second_sack.start);
 898                 second_sack.end = ntohl(second_sack.end);
 899         }
 900
 901         if (SEQ_LT(first_sack.start, th->th_ack) &&
 902             SEQ_LEQ(first_sack.end, th->th_ack)) {
 903                 /*
 904                  * There is a dsack option reporting a duplicate segment
 905                  * also covered by cumulative acknowledgement.
 906                  *
 907                  * Validate the sequence numbers before looking at dsack
 908                  * option. The duplicate notification can come after
 909                  * snd_una moves forward. In order to set a window of valid
 910                  * sequence numbers to look for, we set a maximum send
 911                  * window within which the DSACK option will be processed.
 912                  */
 913                 if (!(TCP_DSACK_SEQ_IN_WINDOW(tp, first_sack.start, th->th_ack) &&
 914                     TCP_DSACK_SEQ_IN_WINDOW(tp, first_sack.end, th->th_ack))) {
 915                         to->to_nsacks--;
 916                         to->to_sacks += TCPOLEN_SACK;
 917                         tcpstat.tcps_dsack_recvd_old++;
 918
 919                         /*
 920                          * returning true here so that the ack will not be
 921                          * treated as duplicate ack.
 922                          */
 923                         return TRUE;
 924                 }
 925         } else if (to->to_nsacks > 1 &&
 926             SEQ_LEQ(second_sack.start, first_sack.start) &&
 927             SEQ_GEQ(second_sack.end, first_sack.end)) {
 928                 /*
 929                  * there is a dsack option in the first block not
 930                  * covered by the cumulative acknowledgement but covered
 931                  * by the second sack block.
 932                  *
 933                  * verify the sequence numbes on the second sack block
 934                  * before processing the DSACK option. Returning false
 935                  * here will treat the ack as a duplicate ack.
 936                  */
 937                 if (!TCP_VALIDATE_SACK_SEQ_NUMBERS(tp, &second_sack,
 938                     th->th_ack)) {
 939                         to->to_nsacks--;
 940                         to->to_sacks += TCPOLEN_SACK;
 941                         tcpstat.tcps_dsack_recvd_old++;
 942                         return TRUE;
 943                 }
 944         } else {
 945                 /* no dsack options, proceed with processing the sack */
 946                 return FALSE;
 947         }
 948
 949         /* Update the tcpopt pointer to exclude dsack block */
 950         to->to_nsacks--;
 951         to->to_sacks += TCPOLEN_SACK;
 952         tcpstat.tcps_dsack_recvd++;
 953         tp->t_dsack_recvd++;
 954
 955         /* If the DSACK is for TLP mark it as such */
 956         if ((tp->t_flagsext & TF_SENT_TLPROBE) &&
 957             first_sack.end == tp->t_tlphighrxt) {
 958                 if ((rxseg = tcp_rxtseg_find(tp, first_sack.start,
 959                     (first_sack.end - 1))) != NULL) {
 960                         rxseg->rx_flags |= TCP_RXT_DSACK_FOR_TLP;
 961                 }
 962         }
 963         /* Update the sender's retransmit segment state */
 964         if (((tp->t_rxtshift == 1 && first_sack.start == tp->snd_una) ||
 965             ((tp->t_flagsext & TF_SENT_TLPROBE) &&
 966             first_sack.end == tp->t_tlphighrxt)) &&
 967             TAILQ_EMPTY(&tp->snd_holes) &&
 968             SEQ_GT(th->th_ack, tp->snd_una)) {
 969                 /*
 970                  * If the dsack is for a retransmitted packet and one of
 971                  * the two cases is true, it indicates ack loss:
 972                  * - retransmit timeout and first_sack.start == snd_una
 973                  * - TLP probe and first_sack.end == tlphighrxt
 974                  *
 975                  * Ignore dsack and do not update state when there is
 976                  * ack loss
 977                  */
 978                 tcpstat.tcps_dsack_ackloss++;
 979
 980                 return TRUE;
 981         } else {
 982                 tcp_rxtseg_set_spurious(tp, first_sack.start, (first_sack.end - 1));
 983         }
 984         return TRUE;
 985 }