bsd/netinet/tcp_newreno.c

   1 /*
   2  * Copyright (c) 2010 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 #include <sys/param.h>
  29 #include <sys/systm.h>
  30 #include <sys/kernel.h>
  31 #include <sys/protosw.h>
  32
  33 #include <net/route.h>
  34 #include <netinet/in.h>
  35 #include <netinet/in_systm.h>
  36 #include <netinet/ip.h>
  37
  38 #if INET6
  39 #include <netinet/ip6.h>
  40 #endif
  41 #include <netinet/ip_var.h>
  42 #include <netinet/tcp.h>
  43 #include <netinet/tcp_fsm.h>
  44 #include <netinet/tcp_timer.h>
  45 #include <netinet/tcp_var.h>
  46 #include <netinet/tcpip.h>
  47 #include <netinet/tcp_cc.h>
  48 #include <libkern/OSAtomic.h>
  49
  50 int tcp_newreno_init(struct tcpcb *tp);
  51 int tcp_newreno_cleanup(struct tcpcb *tp);
  52 void tcp_newreno_cwnd_init_or_reset(struct tcpcb *tp);
  53 void tcp_newreno_inseq_ack_rcvd(struct tcpcb *tp, struct tcphdr *th);
  54 void tcp_newreno_ack_rcvd(struct tcpcb *tp, struct tcphdr *th);
  55 void tcp_newreno_pre_fr(struct tcpcb *tp, struct tcphdr *th);
  56 void tcp_newreno_post_fr(struct tcpcb *tp, struct tcphdr *th);
  57 void tcp_newreno_after_idle(struct tcpcb *tp);
  58 void tcp_newreno_after_timeout(struct tcpcb *tp);
  59 int tcp_newreno_delay_ack(struct tcpcb *tp, struct tcphdr *th);
  60 void tcp_newreno_switch_cc(struct tcpcb *tp, uint16_t old_index);
  61
  62 struct tcp_cc_algo tcp_cc_newreno = {
  63         .name = "newreno",
  64         .init = tcp_newreno_init,
  65         .cleanup = tcp_newreno_cleanup,
  66         .cwnd_init = tcp_newreno_cwnd_init_or_reset,
  67         .inseq_ack_rcvd = tcp_newreno_inseq_ack_rcvd,
  68         .ack_rcvd = tcp_newreno_ack_rcvd,
  69         .pre_fr = tcp_newreno_pre_fr,
  70         .post_fr = tcp_newreno_post_fr,
  71         .after_idle = tcp_newreno_cwnd_init_or_reset,
  72         .after_timeout = tcp_newreno_after_timeout,
  73         .delay_ack = tcp_newreno_delay_ack,
  74         .switch_to = tcp_newreno_switch_cc
  75 };
  76
/* Tunables defined elsewhere and consulted by the routines in this file. */

/* Non-zero: tcp_newreno_ack_rcvd() uses RFC 3465 byte counting. */
extern int tcp_do_rfc3465;
/* Non-zero: the slow-start byte-counting limit may be 2 segments instead of 1. */
extern int tcp_do_rfc3465_lim2;
/* Bound on t_unacksegs used by tcp_newreno_delay_ack() when stretching ACKs. */
extern int maxseg_unacked;
  80
  81 int tcp_newreno_init(struct tcpcb *tp) {
  82 #pragma unused(tp)
  83         OSIncrementAtomic((volatile SInt32 *)&tcp_cc_newreno.num_sockets);
  84         return 0;
  85 }
  86
  87 int tcp_newreno_cleanup(struct tcpcb *tp) {
  88 #pragma unused(tp)
  89         OSDecrementAtomic((volatile SInt32 *)&tcp_cc_newreno.num_sockets);
  90         return 0;
  91 }
  92
  93 /* Initialize the congestion window for a connection or
  94  * handles connections that have been idle for
  95  * some time. In this state, no acks are
  96  * expected to clock out any data we send --
  97  * slow start to get ack "clock" running again.
  98  *
  99  * Set the slow-start flight size depending on whether
 100  * this is a local network or not.
 101  */
 102 void
 103 tcp_newreno_cwnd_init_or_reset(struct tcpcb *tp) {
 104         if ( tp->t_flags & TF_LOCAL )
 105                 tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local;
 106         else {
 107                 /* Calculate initial cwnd according to RFC3390,
 108                  * - On a standard link, this will result in a higher cwnd
 109                  * and improve initial transfer rate.
 110                  * - Keep the old ss_fltsz sysctl for ABI compabitility issues.
 111                  * but it will be overriden if tcp_do_rfc3390 sysctl is set.
 112                  */
 113
 114                 if (tcp_do_rfc3390)
 115                         tp->snd_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380));
 116
 117                 else
 118                         tp->snd_cwnd = tp->t_maxseg * ss_fltsz;
 119         }
 120 }
 121
 122
 123 /* Function to handle an in-sequence ack during congestion avoidance phase.
 124  * This will get called from header prediction code.
 125  */
 126 void
 127 tcp_newreno_inseq_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) {
 128         int acked = 0;
 129         acked = th->th_ack - tp->snd_una;
 130         /*
 131          * Grow the congestion window, if the
 132          * connection is cwnd bound.
 133          */
 134         if (tp->snd_cwnd < tp->snd_wnd) {
 135                 tp->t_bytes_acked += acked;
 136                 if (tp->t_bytes_acked > tp->snd_cwnd) {
 137                         tp->t_bytes_acked -= tp->snd_cwnd;
 138                         tp->snd_cwnd += tp->t_maxseg;
 139                 }
 140         }
 141 }
 142 /* Function to process an ack.
 143  */
 144 void
 145 tcp_newreno_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) {
 146         /*
 147          * RFC 3465 - Appropriate Byte Counting.
 148          *
 149          * If the window is currently less than ssthresh,
 150          * open the window by the number of bytes ACKed by
 151          * the last ACK, however clamp the window increase
 152          * to an upper limit "L".
 153          *
 154          * In congestion avoidance phase, open the window by
 155          * one segment each time "bytes_acked" grows to be
 156          * greater than or equal to the congestion window.
 157          */
 158
 159         register u_int cw = tp->snd_cwnd;
 160         register u_int incr = tp->t_maxseg;
 161         int acked = 0;
 162
 163         acked = th->th_ack - tp->snd_una;
 164         if (tcp_do_rfc3465) {
 165
 166                 if (cw >= tp->snd_ssthresh) {
 167                         tp->t_bytes_acked += acked;
 168                         if (tp->t_bytes_acked >= cw) {
 169                                 /* Time to increase the window. */
 170                                 tp->t_bytes_acked -= cw;
 171                         } else {
 172                                 /* No need to increase yet. */
 173                                 incr = 0;
 174                         }
 175                 } else {
 176                         /*
 177                          * If the user explicitly enables RFC3465
 178                          * use 2*SMSS for the "L" param.  Otherwise
 179                          * use the more conservative 1*SMSS.
 180                          *
 181                          * (See RFC 3465 2.3 Choosing the Limit)
 182                          */
 183                         u_int abc_lim;
 184
 185                         abc_lim = (tcp_do_rfc3465_lim2 &&
 186                                 tp->snd_nxt == tp->snd_max) ? incr * 2 : incr;
 187
 188                         incr = lmin(acked, abc_lim);
 189                 }
 190         } else {
 191                 /*
 192                  * If the window gives us less than ssthresh packets
 193                  * in flight, open exponentially (segsz per packet).
 194                  * Otherwise open linearly: segsz per window
 195                  * (segsz^2 / cwnd per packet).
 196                  */
 197
 198                 if (cw >= tp->snd_ssthresh)
 199                         incr = max((incr * incr / cw), 1);
 200         }
 201         tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
 202 }
 203
 204 void
 205 tcp_newreno_pre_fr(struct tcpcb *tp, struct tcphdr *th) {
 206 #pragma unused(th)
 207
 208         uint32_t win;
 209
 210         win = min(tp->snd_wnd, tp->snd_cwnd) /
 211                 2 / tp->t_maxseg;
 212         if ( win < 2 )
 213                 win = 2;
 214         tp->snd_ssthresh = win * tp->t_maxseg;
 215 }
 216
 217 void
 218 tcp_newreno_post_fr(struct tcpcb *tp, struct tcphdr *th) {
 219         int32_t ss;
 220
 221         ss = tp->snd_max - th->th_ack;
 222
 223         /*
 224          * Complete ack.  Inflate the congestion window to
 225          * ssthresh and exit fast recovery.
 226          *
 227          * Window inflation should have left us with approx.
 228          * snd_ssthresh outstanding data.  But in case we
 229          * would be inclined to send a burst, better to do
 230          * it via the slow start mechanism.
 231          */
 232         if (ss < (int32_t)tp->snd_ssthresh)
 233                 tp->snd_cwnd = ss + tp->t_maxseg;
 234         else
 235                 tp->snd_cwnd = tp->snd_ssthresh;
 236         tp->t_bytes_acked = 0;
 237 }
 238
 239 /* Function to change the congestion window when the retransmit
 240  * timer fires.
 241  */
 242 void
 243 tcp_newreno_after_timeout(struct tcpcb *tp) {
 244         /*
 245          * Close the congestion window down to one segment
 246          * (we'll open it by one segment for each ack we get).
 247          * Since we probably have a window's worth of unacked
 248          * data accumulated, this "slow start" keeps us from
 249          * dumping all that data as back-to-back packets (which
 250          * might overwhelm an intermediate gateway).
 251          *
 252          * There are two phases to the opening: Initially we
 253          * open by one mss on each ack.  This makes the window
 254          * size increase exponentially with time.  If the
 255          * window is larger than the path can handle, this
 256          * exponential growth results in dropped packet(s)
 257          * almost immediately.  To get more time between
 258          * drops but still "push" the network to take advantage
 259          * of improving conditions, we switch from exponential
 260          * to linear window opening at some threshhold size.
 261          * For a threshhold, we use half the current window
 262          * size, truncated to a multiple of the mss.
 263          *
 264          * (the minimum cwnd that will give us exponential
 265          * growth is 2 mss.  We don't allow the threshhold
 266          * to go below this.)
 267          */
 268         if (tp->t_state >=  TCPS_ESTABLISHED) {
 269                 u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
 270                 if (win < 2)
 271                         win = 2;
 272                 tp->snd_cwnd = tp->t_maxseg;
 273                 tp->snd_ssthresh = win * tp->t_maxseg;
 274                 tp->t_bytes_acked = 0;
 275                 tp->t_dupacks = 0;
 276         }
 277 }
 278
 279 /*
 280  * Indicate whether this ack should be delayed.
 281  * We can delay the ack if:
 282  *  - delayed acks are enabled and set to 1, same as when value is set to 2.
 283  *    We kept this for binary compatibility.
 284  *  - delayed acks are enabled and set to 2, will "ack every other packet"
 285  *      - if our last ack wasn't a 0-sized window.
 286  *      - if the peer hasn't sent us a TH_PUSH data packet (this solves 3649245).
 287  *              If TH_PUSH is set, take this as a clue that we need to ACK
 288  *              with no delay. This helps higher level protocols who won't send
 289  *              us more data even if the window is open because their
 290  *              last "segment" hasn't been ACKed
 291  *  - delayed acks are enabled and set to 3,  will do "streaming detection"
 292  *    (see the comment in tcp_input.c) and
 293  *      - if we receive more than "maxseg_unacked" full packets in the last 100ms
 294  *      - if the connection is not in slow-start or idle or loss/recovery states
 295  *      - if those criteria aren't met, it will ack every other packet.
 296  */
 297
 298 int
 299 tcp_newreno_delay_ack(struct tcpcb *tp, struct tcphdr *th) {
 300         switch (tcp_delack_enabled) {
 301         case 1:
 302         case 2:
 303                 if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
 304                         (th->th_flags & TH_PUSH) == 0 &&
 305                         (tp->t_flags & TF_DELACK) == 0)
 306                         return(1);
 307                 break;
 308         case 3:
 309                 if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
 310                         (th->th_flags & TH_PUSH) == 0 &&
 311                         ((tp->t_unacksegs == 0) ||
 312                         ((tp->t_flags & TF_STRETCHACK) != 0 &&
 313                         tp->t_unacksegs < (maxseg_unacked - 1))))
 314                         return(1);
 315                 break;
 316         }
 317         return(0);
 318 }
 319
 320 /* Switch to newreno from a different CC. If the connection is in
 321  * congestion avoidance state, it can continue to use the current
 322  * congestion window because it is going to be conservative. But
 323  * if the connection is in slow-start, we will halve the congestion
 324  * window and let newreno work from there.
 325  */
 326 void
 327 tcp_newreno_switch_cc(struct tcpcb *tp, uint16_t old_index) {
 328 #pragma unused(old_index)
 329
 330         uint32_t cwnd = min(tp->snd_wnd, tp->snd_cwnd);
 331         if (tp->snd_cwnd >= tp->snd_ssthresh) {
 332                 cwnd = cwnd / tp->t_maxseg;
 333         } else {
 334                 cwnd = cwnd / 2 / tp->t_maxseg;
 335         }
 336         if (cwnd < 1)
 337                 cwnd = 1;
 338         tp->snd_cwnd = cwnd * tp->t_maxseg;
 339
 340         /* Start counting bytes for RFC 3465 again */
 341         tp->t_bytes_acked = 0;
 342
 343         OSIncrementAtomic((volatile SInt32 *)&tcp_cc_newreno.num_sockets);
 344 }