]>
Commit | Line | Data |
---|---|---|
1c79356b A |
1 | /* |
2 | * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * The contents of this file constitute Original Code as defined in and | |
7 | * are subject to the Apple Public Source License Version 1.1 (the | |
8 | * "License"). You may not use this file except in compliance with the | |
9 | * License. Please obtain a copy of the License at | |
10 | * http://www.apple.com/publicsource and read it before using this file. | |
11 | * | |
12 | * This Original Code and all software distributed under the License are | |
13 | * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
14 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
15 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the | |
17 | * License for the specific language governing rights and limitations | |
18 | * under the License. | |
19 | * | |
20 | * @APPLE_LICENSE_HEADER_END@ | |
21 | */ | |
22 | /* | |
23 | * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 | |
24 | * The Regents of the University of California. All rights reserved. | |
25 | * | |
26 | * Redistribution and use in source and binary forms, with or without | |
27 | * modification, are permitted provided that the following conditions | |
28 | * are met: | |
29 | * 1. Redistributions of source code must retain the above copyright | |
30 | * notice, this list of conditions and the following disclaimer. | |
31 | * 2. Redistributions in binary form must reproduce the above copyright | |
32 | * notice, this list of conditions and the following disclaimer in the | |
33 | * documentation and/or other materials provided with the distribution. | |
34 | * 3. All advertising materials mentioning features or use of this software | |
35 | * must display the following acknowledgement: | |
36 | * This product includes software developed by the University of | |
37 | * California, Berkeley and its contributors. | |
38 | * 4. Neither the name of the University nor the names of its contributors | |
39 | * may be used to endorse or promote products derived from this software | |
40 | * without specific prior written permission. | |
41 | * | |
42 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
43 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
44 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
45 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
46 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
47 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
48 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
49 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
50 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
51 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
52 | * SUCH DAMAGE. | |
53 | * | |
54 | * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 | |
55 | */ | |
56 | ||
57 | #if ISFB31 | |
58 | #include "opt_tcpdebug.h" | |
59 | #endif | |
60 | #define _IP_VHL | |
61 | ||
62 | #include <stddef.h> | |
63 | ||
64 | #include <sys/param.h> | |
65 | #include <sys/systm.h> | |
66 | #include <sys/mbuf.h> | |
67 | #include <sys/domain.h> | |
68 | #include <sys/protosw.h> | |
69 | #include <sys/socket.h> | |
70 | #include <sys/socketvar.h> | |
71 | ||
72 | #include <net/route.h> | |
73 | ||
74 | #include <netinet/in.h> | |
75 | #include <netinet/in_systm.h> | |
76 | #include <netinet/ip.h> | |
77 | #include <netinet/ip_var.h> | |
78 | #if INET6 | |
79 | #include <netinet/ip6.h> | |
80 | #include <netinet/ip_var.h> | |
81 | #include <netinet6/ip6_var.h> | |
82 | #endif | |
83 | #include <netinet/in_pcb.h> | |
84 | #include <netinet/tcp.h> | |
85 | #define TCPOUTFLAGS | |
86 | #include <netinet/tcp_fsm.h> | |
87 | #include <netinet/tcp_seq.h> | |
88 | #include <netinet/tcp_timer.h> | |
89 | #include <netinet/tcp_var.h> | |
90 | #include <netinet/tcpip.h> | |
91 | #if TCPDEBUG | |
92 | #include <netinet/tcp_debug.h> | |
93 | #endif | |
94 | #include <sys/kdebug.h> | |
95 | ||
96 | #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 1) | |
97 | #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 3) | |
98 | #define DBG_FNC_TCP_OUTPUT NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1) | |
99 | ||
100 | ||
101 | #ifdef notyet | |
102 | extern struct mbuf *m_copypack(); | |
103 | #endif | |
104 | ||
105 | ||
106 | /* | |
107 | * Tcp output routine: figure out what should be sent and send it. | |
108 | */ | |
109 | int | |
110 | tcp_output(tp) | |
111 | register struct tcpcb *tp; | |
112 | { | |
113 | register struct socket *so = tp->t_inpcb->inp_socket; | |
114 | register long len, win; | |
115 | int off, flags, error; | |
116 | register struct mbuf *m; | |
117 | struct ip *ip = NULL; | |
118 | struct ipovly *ipov = NULL; | |
119 | #if INET6 | |
120 | struct ip6_hdr *ip6 = NULL; | |
121 | #endif /* INET6 */ | |
122 | struct tcphdr *th; | |
123 | u_char opt[TCP_MAXOLEN]; | |
124 | unsigned ipoptlen, optlen, hdrlen; | |
125 | int idle, sendalot; | |
126 | struct rmxp_tao *taop; | |
127 | struct rmxp_tao tao_noncached; | |
128 | #if INET6 | |
129 | int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0; | |
130 | #endif | |
131 | ||
132 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); | |
133 | KERNEL_DEBUG(DBG_LAYER_BEG, | |
0b4e3aa0 A |
134 | ((tp->t_template->tt_dport << 16) | tp->t_template->tt_sport), |
135 | (((tp->t_template->tt_src.s_addr & 0xffff) << 16) | | |
136 | (tp->t_template->tt_dst.s_addr & 0xffff)), | |
1c79356b A |
137 | 0,0,0); |
138 | ||
139 | /* | |
140 | * Determine length of data that should be transmitted, | |
141 | * and flags that will be used. | |
142 | * If there is some data or critical controls (SYN, RST) | |
143 | * to send, then transmit; otherwise, investigate further. | |
144 | */ | |
145 | idle = (tp->snd_max == tp->snd_una); | |
146 | if (idle && tp->t_idle >= tp->t_rxtcur) | |
147 | /* | |
148 | * We have been idle for "a while" and no acks are | |
149 | * expected to clock out any data we send -- | |
150 | * slow start to get ack "clock" running again. | |
151 | */ | |
152 | tp->snd_cwnd = tp->t_maxseg; | |
0b4e3aa0 A |
153 | |
154 | /* Never send data that's already been acked */ | |
155 | if (SEQ_GT(tp->snd_una, tp->snd_nxt)) | |
156 | tp->snd_nxt = tp->snd_una; | |
1c79356b A |
157 | again: |
158 | sendalot = 0; | |
159 | off = tp->snd_nxt - tp->snd_una; | |
160 | win = min(tp->snd_wnd, tp->snd_cwnd); | |
161 | ||
162 | flags = tcp_outflags[tp->t_state]; | |
163 | /* | |
164 | * Get standard flags, and add SYN or FIN if requested by 'hidden' | |
165 | * state flags. | |
166 | */ | |
167 | if (tp->t_flags & TF_NEEDFIN) | |
168 | flags |= TH_FIN; | |
169 | if (tp->t_flags & TF_NEEDSYN) | |
170 | flags |= TH_SYN; | |
171 | ||
172 | /* | |
173 | * If in persist timeout with window of 0, send 1 byte. | |
174 | * Otherwise, if window is small but nonzero | |
175 | * and timer expired, we will send what we can | |
176 | * and go to transmit state. | |
177 | */ | |
178 | if (tp->t_force) { | |
179 | if (win == 0) { | |
180 | /* | |
181 | * If we still have some data to send, then | |
182 | * clear the FIN bit. Usually this would | |
183 | * happen below when it realizes that we | |
184 | * aren't sending all the data. However, | |
185 | * if we have exactly 1 byte of unsent data, | |
186 | * then it won't clear the FIN bit below, | |
187 | * and if we are in persist state, we wind | |
188 | * up sending the packet without recording | |
189 | * that we sent the FIN bit. | |
190 | * | |
191 | * We can't just blindly clear the FIN bit, | |
192 | * because if we don't have any more data | |
193 | * to send then the probe will be the FIN | |
194 | * itself. | |
195 | */ | |
196 | if (off < so->so_snd.sb_cc) | |
197 | flags &= ~TH_FIN; | |
198 | win = 1; | |
199 | } else { | |
200 | tp->t_timer[TCPT_PERSIST] = 0; | |
201 | tp->t_rxtshift = 0; | |
202 | } | |
203 | } | |
204 | ||
205 | len = (long)ulmin(so->so_snd.sb_cc, win) - off; | |
206 | ||
207 | if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { | |
208 | taop = &tao_noncached; | |
209 | bzero(taop, sizeof(*taop)); | |
210 | } | |
211 | ||
212 | /* | |
213 | * Lop off SYN bit if it has already been sent. However, if this | |
214 | * is SYN-SENT state and if segment contains data and if we don't | |
215 | * know that foreign host supports TAO, suppress sending segment. | |
216 | */ | |
217 | if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { | |
218 | flags &= ~TH_SYN; | |
219 | off--, len++; | |
220 | if (len > 0 && tp->t_state == TCPS_SYN_SENT && | |
221 | taop->tao_ccsent == 0) { | |
222 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); | |
223 | return 0; | |
224 | } | |
225 | } | |
226 | ||
227 | /* | |
228 | * Be careful not to send data and/or FIN on SYN segments | |
229 | * in cases when no CC option will be sent. | |
230 | * This measure is needed to prevent interoperability problems | |
231 | * with not fully conformant TCP implementations. | |
232 | */ | |
233 | if ((flags & TH_SYN) && | |
234 | ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) || | |
235 | ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) { | |
236 | len = 0; | |
237 | flags &= ~TH_FIN; | |
238 | } | |
239 | ||
240 | if (len < 0) { | |
241 | /* | |
242 | * If FIN has been sent but not acked, | |
243 | * but we haven't been called to retransmit, | |
244 | * len will be -1. Otherwise, window shrank | |
245 | * after we sent into it. If window shrank to 0, | |
246 | * cancel pending retransmit, pull snd_nxt back | |
247 | * to (closed) window, and set the persist timer | |
248 | * if it isn't already going. If the window didn't | |
249 | * close completely, just wait for an ACK. | |
250 | */ | |
251 | len = 0; | |
252 | if (win == 0) { | |
253 | tp->t_timer[TCPT_REXMT] = 0; | |
254 | tp->t_rxtshift = 0; | |
255 | tp->snd_nxt = tp->snd_una; | |
256 | if (tp->t_timer[TCPT_PERSIST] == 0) | |
257 | tcp_setpersist(tp); | |
258 | } | |
259 | } | |
260 | if (len > tp->t_maxseg) { | |
261 | len = tp->t_maxseg; | |
262 | sendalot = 1; | |
263 | } | |
264 | if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) | |
265 | flags &= ~TH_FIN; | |
266 | ||
267 | win = sbspace(&so->so_rcv); | |
268 | ||
269 | /* | |
270 | * Sender silly window avoidance. If connection is idle | |
271 | * and can send all data, a maximum segment, | |
272 | * at least a maximum default-size segment do it, | |
273 | * or are forced, do it; otherwise don't bother. | |
274 | * If peer's buffer is tiny, then send | |
275 | * when window is at least half open. | |
276 | * If retransmitting (possibly after persist timer forced us | |
277 | * to send into a small window), then must resend. | |
278 | */ | |
279 | if (len) { | |
280 | if (len == tp->t_maxseg) | |
281 | goto send; | |
282 | if (!(tp->t_flags & TF_MORETOCOME) && | |
283 | (idle || tp->t_flags & TF_NODELAY) && | |
284 | (tp->t_flags & TF_NOPUSH) == 0 && | |
285 | len + off >= so->so_snd.sb_cc) | |
286 | goto send; | |
287 | if (tp->t_force) | |
288 | goto send; | |
289 | if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) | |
290 | goto send; | |
291 | if (SEQ_LT(tp->snd_nxt, tp->snd_max)) | |
292 | goto send; | |
293 | } | |
294 | ||
295 | /* | |
296 | * Compare available window to amount of window | |
297 | * known to peer (as advertised window less | |
298 | * next expected input). If the difference is at least two | |
299 | * max size segments, or at least 50% of the maximum possible | |
300 | * window, then want to send a window update to peer. | |
301 | */ | |
302 | if (win > 0) { | |
303 | /* | |
304 | * "adv" is the amount we can increase the window, | |
305 | * taking into account that we are limited by | |
306 | * TCP_MAXWIN << tp->rcv_scale. | |
307 | */ | |
308 | long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) - | |
309 | (tp->rcv_adv - tp->rcv_nxt); | |
310 | ||
311 | if (adv >= (long) (2 * tp->t_maxseg)) | |
312 | goto send; | |
313 | if (2 * adv >= (long) so->so_rcv.sb_hiwat) | |
314 | goto send; | |
315 | } | |
316 | ||
317 | /* | |
318 | * Send if we owe peer an ACK. | |
319 | */ | |
320 | if (tp->t_flags & TF_ACKNOW) | |
321 | goto send; | |
322 | if ((flags & TH_RST) || | |
323 | ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) | |
324 | goto send; | |
325 | if (SEQ_GT(tp->snd_up, tp->snd_una)) | |
326 | goto send; | |
327 | /* | |
328 | * If our state indicates that FIN should be sent | |
329 | * and we have not yet done so, or we're retransmitting the FIN, | |
330 | * then we need to send. | |
331 | */ | |
332 | if (flags & TH_FIN && | |
333 | ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) | |
334 | goto send; | |
335 | ||
336 | /* | |
337 | * TCP window updates are not reliable, rather a polling protocol | |
338 | * using ``persist'' packets is used to insure receipt of window | |
339 | * updates. The three ``states'' for the output side are: | |
340 | * idle not doing retransmits or persists | |
341 | * persisting to move a small or zero window | |
342 | * (re)transmitting and thereby not persisting | |
343 | * | |
344 | * tp->t_timer[TCPT_PERSIST] | |
345 | * is set when we are in persist state. | |
346 | * tp->t_force | |
347 | * is set when we are called to send a persist packet. | |
348 | * tp->t_timer[TCPT_REXMT] | |
349 | * is set when we are retransmitting | |
350 | * The output side is idle when both timers are zero. | |
351 | * | |
352 | * If send window is too small, there is data to transmit, and no | |
353 | * retransmit or persist is pending, then go to persist state. | |
354 | * If nothing happens soon, send when timer expires: | |
355 | * if window is nonzero, transmit what we can, | |
356 | * otherwise force out a byte. | |
357 | */ | |
358 | if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && | |
359 | tp->t_timer[TCPT_PERSIST] == 0) { | |
360 | tp->t_rxtshift = 0; | |
361 | tcp_setpersist(tp); | |
362 | } | |
363 | ||
364 | /* | |
365 | * No reason to send a segment, just return. | |
366 | */ | |
367 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); | |
368 | return (0); | |
369 | ||
370 | send: | |
371 | /* | |
372 | * Before ESTABLISHED, force sending of initial options | |
373 | * unless TCP set not to do any options. | |
374 | * NOTE: we assume that the IP/TCP header plus TCP options | |
375 | * always fit in a single mbuf, leaving room for a maximum | |
376 | * link header, i.e. | |
377 | * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN | |
378 | */ | |
379 | optlen = 0; | |
380 | #if INET6 | |
381 | if (isipv6) | |
382 | hdrlen = sizeof (struct tcpip6hdr); | |
383 | else | |
384 | #endif | |
385 | hdrlen = sizeof (struct tcpiphdr); | |
386 | if (flags & TH_SYN) { | |
387 | tp->snd_nxt = tp->iss; | |
388 | if ((tp->t_flags & TF_NOOPT) == 0) { | |
389 | u_short mss; | |
390 | ||
391 | opt[0] = TCPOPT_MAXSEG; | |
392 | opt[1] = TCPOLEN_MAXSEG; | |
393 | mss = htons((u_short) tcp_mssopt(tp, isipv6)); | |
394 | (void)memcpy(opt + 2, &mss, sizeof(mss)); | |
395 | optlen = TCPOLEN_MAXSEG; | |
396 | ||
397 | if ((tp->t_flags & TF_REQ_SCALE) && | |
398 | ((flags & TH_ACK) == 0 || | |
399 | (tp->t_flags & TF_RCVD_SCALE))) { | |
400 | *((u_int32_t *)(opt + optlen)) = htonl( | |
401 | TCPOPT_NOP << 24 | | |
402 | TCPOPT_WINDOW << 16 | | |
403 | TCPOLEN_WINDOW << 8 | | |
404 | tp->request_r_scale); | |
405 | optlen += 4; | |
406 | } | |
407 | } | |
408 | } | |
409 | ||
410 | /* | |
411 | * Send a timestamp and echo-reply if this is a SYN and our side | |
412 | * wants to use timestamps (TF_REQ_TSTMP is set) or both our side | |
413 | * and our peer have sent timestamps in our SYN's. | |
414 | */ | |
415 | if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && | |
416 | (flags & TH_RST) == 0 && | |
417 | ((flags & TH_ACK) == 0 || | |
418 | (tp->t_flags & TF_RCVD_TSTMP))) { | |
419 | u_int32_t *lp = (u_int32_t *)(opt + optlen); | |
420 | ||
421 | /* Form timestamp option as shown in appendix A of RFC 1323. */ | |
422 | *lp++ = htonl(TCPOPT_TSTAMP_HDR); | |
423 | *lp++ = htonl(tcp_now); | |
424 | *lp = htonl(tp->ts_recent); | |
425 | optlen += TCPOLEN_TSTAMP_APPA; | |
426 | } | |
427 | ||
428 | /* | |
429 | * Send `CC-family' options if our side wants to use them (TF_REQ_CC), | |
430 | * options are allowed (!TF_NOOPT) and it's not a RST. | |
431 | */ | |
432 | if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && | |
433 | (flags & TH_RST) == 0) { | |
434 | switch (flags & (TH_SYN|TH_ACK)) { | |
435 | /* | |
436 | * This is a normal ACK, send CC if we received CC before | |
437 | * from our peer. | |
438 | */ | |
439 | case TH_ACK: | |
440 | if (!(tp->t_flags & TF_RCVD_CC)) | |
441 | break; | |
442 | /*FALLTHROUGH*/ | |
443 | ||
444 | /* | |
445 | * We can only get here in T/TCP's SYN_SENT* state, when | |
446 | * we're a sending a non-SYN segment without waiting for | |
447 | * the ACK of our SYN. A check above assures that we only | |
448 | * do this if our peer understands T/TCP. | |
449 | */ | |
450 | case 0: | |
451 | opt[optlen++] = TCPOPT_NOP; | |
452 | opt[optlen++] = TCPOPT_NOP; | |
453 | opt[optlen++] = TCPOPT_CC; | |
454 | opt[optlen++] = TCPOLEN_CC; | |
455 | *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); | |
456 | ||
457 | optlen += 4; | |
458 | break; | |
459 | ||
460 | /* | |
461 | * This is our initial SYN, check whether we have to use | |
462 | * CC or CC.new. | |
463 | */ | |
464 | case TH_SYN: | |
465 | opt[optlen++] = TCPOPT_NOP; | |
466 | opt[optlen++] = TCPOPT_NOP; | |
467 | opt[optlen++] = tp->t_flags & TF_SENDCCNEW ? | |
468 | TCPOPT_CCNEW : TCPOPT_CC; | |
469 | opt[optlen++] = TCPOLEN_CC; | |
470 | *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); | |
471 | optlen += 4; | |
472 | break; | |
473 | ||
474 | /* | |
475 | * This is a SYN,ACK; send CC and CC.echo if we received | |
476 | * CC from our peer. | |
477 | */ | |
478 | case (TH_SYN|TH_ACK): | |
479 | if (tp->t_flags & TF_RCVD_CC) { | |
480 | opt[optlen++] = TCPOPT_NOP; | |
481 | opt[optlen++] = TCPOPT_NOP; | |
482 | opt[optlen++] = TCPOPT_CC; | |
483 | opt[optlen++] = TCPOLEN_CC; | |
484 | *(u_int32_t *)&opt[optlen] = | |
485 | htonl(tp->cc_send); | |
486 | optlen += 4; | |
487 | opt[optlen++] = TCPOPT_NOP; | |
488 | opt[optlen++] = TCPOPT_NOP; | |
489 | opt[optlen++] = TCPOPT_CCECHO; | |
490 | opt[optlen++] = TCPOLEN_CC; | |
491 | *(u_int32_t *)&opt[optlen] = | |
492 | htonl(tp->cc_recv); | |
493 | optlen += 4; | |
494 | } | |
495 | break; | |
496 | } | |
497 | } | |
498 | ||
499 | hdrlen += optlen; | |
500 | #if INET6 | |
501 | if (isipv6) | |
502 | ipoptlen = ip6_optlen(tp->t_inpcb); | |
503 | else | |
504 | #endif | |
505 | if (tp->t_inpcb->inp_options) { | |
506 | ipoptlen = tp->t_inpcb->inp_options->m_len - | |
507 | offsetof(struct ipoption, ipopt_list); | |
508 | } else { | |
509 | ipoptlen = 0; | |
510 | } | |
511 | #if IPSEC | |
512 | #if INET6 | |
513 | ipoptlen += ipsec_hdrsiz_tcp(tp, isipv6); | |
514 | #else | |
515 | ipoptlen += ipsec_hdrsiz_tcp(tp, 0); | |
516 | #endif | |
517 | #endif | |
518 | ||
519 | /* | |
520 | * Adjust data length if insertion of options will | |
521 | * bump the packet length beyond the t_maxopd length. | |
522 | * Clear the FIN bit because we cut off the tail of | |
523 | * the segment. | |
524 | */ | |
525 | if (len + optlen + ipoptlen > tp->t_maxopd) { | |
526 | /* | |
527 | * If there is still more to send, don't close the connection. | |
528 | */ | |
529 | flags &= ~TH_FIN; | |
530 | len = tp->t_maxopd - optlen - ipoptlen; | |
531 | sendalot = 1; | |
532 | } | |
533 | ||
534 | /*#ifdef DIAGNOSTIC*/ | |
535 | if (max_linkhdr + hdrlen > MHLEN) | |
536 | panic("tcphdr too big"); | |
537 | /*#endif*/ | |
538 | ||
539 | /* | |
540 | * Grab a header mbuf, attaching a copy of data to | |
541 | * be transmitted, and initialize the header from | |
542 | * the template for sends on this connection. | |
543 | */ | |
544 | if (len) { | |
545 | if (tp->t_force && len == 1) | |
546 | tcpstat.tcps_sndprobe++; | |
547 | else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { | |
548 | tcpstat.tcps_sndrexmitpack++; | |
549 | tcpstat.tcps_sndrexmitbyte += len; | |
550 | } else { | |
551 | tcpstat.tcps_sndpack++; | |
552 | tcpstat.tcps_sndbyte += len; | |
553 | } | |
554 | #ifdef notyet | |
555 | if ((m = m_copypack(so->so_snd.sb_mb, off, | |
556 | (int)len, max_linkhdr + hdrlen)) == 0) { | |
557 | error = ENOBUFS; | |
558 | goto out; | |
559 | } | |
560 | /* | |
561 | * m_copypack left space for our hdr; use it. | |
562 | */ | |
563 | m->m_len += hdrlen; | |
564 | m->m_data -= hdrlen; | |
565 | #else | |
566 | MGETHDR(m, M_DONTWAIT, MT_HEADER); | |
567 | if (m == NULL) { | |
568 | error = ENOBUFS; | |
569 | goto out; | |
570 | } | |
571 | #if INET6 | |
572 | if (MHLEN < hdrlen + max_linkhdr) { | |
573 | MCLGET(m, M_DONTWAIT); | |
574 | if ((m->m_flags & M_EXT) == 0) { | |
575 | m_freem(m); | |
576 | error = ENOBUFS; | |
577 | goto out; | |
578 | } | |
579 | } | |
580 | #endif | |
581 | m->m_data += max_linkhdr; | |
582 | m->m_len = hdrlen; | |
583 | if (len <= MHLEN - hdrlen - max_linkhdr) { | |
584 | m_copydata(so->so_snd.sb_mb, off, (int) len, | |
585 | mtod(m, caddr_t) + hdrlen); | |
586 | m->m_len += len; | |
587 | } else { | |
588 | m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); | |
589 | if (m->m_next == 0) { | |
590 | (void) m_free(m); | |
591 | error = ENOBUFS; | |
592 | goto out; | |
593 | } | |
594 | } | |
595 | #endif | |
596 | /* | |
597 | * If we're sending everything we've got, set PUSH. | |
598 | * (This will keep happy those implementations which only | |
599 | * give data to the user when a buffer fills or | |
600 | * a PUSH comes in.) | |
601 | */ | |
602 | if (off + len == so->so_snd.sb_cc) | |
603 | flags |= TH_PUSH; | |
604 | } else { | |
605 | if (tp->t_flags & TF_ACKNOW) | |
606 | tcpstat.tcps_sndacks++; | |
607 | else if (flags & (TH_SYN|TH_FIN|TH_RST)) | |
608 | tcpstat.tcps_sndctrl++; | |
609 | else if (SEQ_GT(tp->snd_up, tp->snd_una)) | |
610 | tcpstat.tcps_sndurg++; | |
611 | else | |
612 | tcpstat.tcps_sndwinup++; | |
613 | ||
614 | MGETHDR(m, M_DONTWAIT, MT_HEADER); | |
615 | if (m == NULL) { | |
616 | error = ENOBUFS; | |
617 | goto out; | |
618 | } | |
619 | #if INET6 | |
620 | if (isipv6) { | |
621 | MH_ALIGN(m, hdrlen); | |
622 | } else | |
623 | #endif | |
624 | m->m_data += max_linkhdr; | |
625 | m->m_len = hdrlen; | |
626 | } | |
627 | m->m_pkthdr.rcvif = (struct ifnet *)0; | |
628 | if (tp->t_template == 0) | |
629 | panic("tcp_output"); | |
630 | #if INET6 | |
631 | if (isipv6) { | |
632 | ip6 = mtod(m, struct ip6_hdr *); | |
633 | th = (struct tcphdr *)(ip6 + 1); | |
634 | bcopy((caddr_t)&tp->t_template->tt_i6, (caddr_t)ip6, | |
635 | sizeof(struct ip6_hdr)); | |
636 | bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, | |
637 | sizeof(struct tcphdr)); | |
638 | } else { | |
639 | #endif /* INET6 */ | |
640 | ip = mtod(m, struct ip *); | |
641 | ipov = (struct ipovly *)ip; | |
642 | th = (struct tcphdr *)(ip + 1); | |
643 | bcopy((caddr_t)&tp->t_template->tt_i, (caddr_t)ip, sizeof(struct ip)); | |
644 | bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, | |
645 | sizeof(struct tcphdr)); | |
646 | #if INET6 | |
647 | } | |
648 | #endif /* INET6 */ | |
649 | ||
650 | /* | |
651 | * Fill in fields, remembering maximum advertised | |
652 | * window for use in delaying messages about window sizes. | |
653 | * If resending a FIN, be sure not to use a new sequence number. | |
654 | */ | |
655 | if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && | |
656 | tp->snd_nxt == tp->snd_max) | |
657 | tp->snd_nxt--; | |
658 | /* | |
659 | * If we are doing retransmissions, then snd_nxt will | |
660 | * not reflect the first unsent octet. For ACK only | |
661 | * packets, we do not want the sequence number of the | |
662 | * retransmitted packet, we want the sequence number | |
663 | * of the next unsent octet. So, if there is no data | |
664 | * (and no SYN or FIN), use snd_max instead of snd_nxt | |
665 | * when filling in ti_seq. But if we are in persist | |
666 | * state, snd_max might reflect one byte beyond the | |
667 | * right edge of the window, so use snd_nxt in that | |
668 | * case, since we know we aren't doing a retransmission. | |
669 | * (retransmit and persist are mutually exclusive...) | |
670 | */ | |
671 | if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST]) | |
672 | th->th_seq = htonl(tp->snd_nxt); | |
673 | else | |
674 | th->th_seq = htonl(tp->snd_max); | |
675 | th->th_ack = htonl(tp->rcv_nxt); | |
676 | if (optlen) { | |
677 | bcopy(opt, th + 1, optlen); | |
678 | th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; | |
679 | } | |
680 | th->th_flags = flags; | |
681 | /* | |
682 | * Calculate receive window. Don't shrink window, | |
683 | * but avoid silly window syndrome. | |
684 | */ | |
685 | if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) | |
686 | win = 0; | |
687 | if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) | |
688 | win = (long)(tp->rcv_adv - tp->rcv_nxt); | |
689 | if (win > (long)TCP_MAXWIN << tp->rcv_scale) | |
690 | win = (long)TCP_MAXWIN << tp->rcv_scale; | |
691 | th->th_win = htons((u_short) (win>>tp->rcv_scale)); | |
692 | if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { | |
693 | th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); | |
694 | th->th_flags |= TH_URG; | |
695 | } else | |
696 | /* | |
697 | * If no urgent pointer to send, then we pull | |
698 | * the urgent pointer to the left edge of the send window | |
699 | * so that it doesn't drift into the send window on sequence | |
700 | * number wraparound. | |
701 | */ | |
702 | tp->snd_up = tp->snd_una; /* drag it along */ | |
703 | ||
704 | /* | |
705 | * Put TCP length in extended header, and then | |
706 | * checksum extended header and data. | |
707 | */ | |
708 | m->m_pkthdr.len = hdrlen + len; | |
709 | #if INET6 | |
710 | if (isipv6) { | |
711 | #if 0 /* ip6_plen will be filled in ip6_output. */ | |
712 | ip6->ip6_plen = htons((u_short)(sizeof(struct tcphdr) + | |
713 | optlen + len)); | |
714 | #endif | |
715 | ||
716 | th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), | |
717 | sizeof(struct tcphdr) + optlen + len); | |
0b4e3aa0 | 718 | } else |
1c79356b | 719 | #endif /* INET6 */ |
0b4e3aa0 A |
720 | { |
721 | ||
1c79356b A |
722 | if (len + optlen) |
723 | ipov->ih_len = htons((u_short)(sizeof (struct tcphdr) + | |
724 | optlen + len)); | |
0b4e3aa0 A |
725 | m->m_pkthdr.csum_flags = CSUM_TCP; |
726 | m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); | |
727 | if (len + optlen) { | |
728 | th->th_sum = in_addword(th->th_sum, | |
729 | htons((u_short)(optlen + len))); | |
730 | } | |
731 | ||
1c79356b | 732 | } |
1c79356b A |
733 | |
734 | /* | |
735 | * In transmit state, time the transmission and arrange for | |
736 | * the retransmit. In persist state, just set snd_max. | |
737 | */ | |
738 | if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { | |
739 | tcp_seq startseq = tp->snd_nxt; | |
740 | ||
741 | /* | |
742 | * Advance snd_nxt over sequence space of this segment. | |
743 | */ | |
744 | if (flags & (TH_SYN|TH_FIN)) { | |
745 | if (flags & TH_SYN) | |
746 | tp->snd_nxt++; | |
747 | if (flags & TH_FIN) { | |
748 | tp->snd_nxt++; | |
749 | tp->t_flags |= TF_SENTFIN; | |
750 | } | |
751 | } | |
752 | tp->snd_nxt += len; | |
753 | if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { | |
754 | tp->snd_max = tp->snd_nxt; | |
755 | /* | |
756 | * Time this transmission if not a retransmission and | |
757 | * not currently timing anything. | |
758 | */ | |
759 | if (tp->t_rtt == 0) { | |
760 | tp->t_rtt = 1; | |
761 | tp->t_rtseq = startseq; | |
762 | tcpstat.tcps_segstimed++; | |
763 | } | |
764 | } | |
765 | ||
766 | /* | |
767 | * Set retransmit timer if not currently set, | |
768 | * and not doing an ack or a keep-alive probe. | |
769 | * Initial value for retransmit timer is smoothed | |
770 | * round-trip time + 2 * round-trip time variance. | |
771 | * Initialize shift counter which is used for backoff | |
772 | * of retransmit time. | |
773 | */ | |
774 | if (tp->t_timer[TCPT_REXMT] == 0 && | |
775 | tp->snd_nxt != tp->snd_una) { | |
776 | tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; | |
777 | if (tp->t_timer[TCPT_PERSIST]) { | |
778 | tp->t_timer[TCPT_PERSIST] = 0; | |
779 | tp->t_rxtshift = 0; | |
780 | } | |
781 | } | |
782 | } else | |
783 | if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) | |
784 | tp->snd_max = tp->snd_nxt + len; | |
785 | ||
786 | #if TCPDEBUG | |
787 | /* | |
788 | * Trace. | |
789 | */ | |
790 | if (so->so_options & SO_DEBUG) { | |
791 | #if INET6 | |
792 | if (isipv6) | |
793 | ip6->ip6_vfc = IPV6_VERSION; | |
794 | else | |
795 | ip->ip_vhl = IP_MAKE_VHL(IPVERSION, | |
796 | IP_VHL_HL(ip->ip_vhl)); | |
797 | #endif /* INET6 */ | |
798 | tcp_trace(TA_OUTPUT, tp->t_state, tp, | |
799 | #if INET6 | |
800 | isipv6 ? (void *)ip6 : | |
801 | #endif /* INET6 */ | |
802 | ip, | |
803 | th, 0); | |
804 | ||
805 | } | |
806 | #endif /* TCPDEBUG */ | |
807 | ||
808 | /* | |
809 | * Fill in IP length and desired time to live and | |
810 | * send to IP level. There should be a better way | |
811 | * to handle ttl and tos; we could keep them in | |
812 | * the template, but need a way to checksum without them. | |
813 | */ | |
814 | #if INET6 | |
815 | if (isipv6) { | |
816 | /* | |
817 | * we separately set hoplimit for every segment, since the | |
818 | * user might want to change the value via setsockopt. | |
819 | * Also, desired default hop limit might be changed via | |
820 | * Neighbor Discovery. | |
821 | */ | |
822 | ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, | |
823 | tp->t_inpcb->in6p_route.ro_rt ? | |
824 | tp->t_inpcb->in6p_route.ro_rt->rt_ifp | |
825 | : NULL); | |
826 | ||
827 | /* TODO: IPv6 IP6TOS_ECT bit on */ | |
828 | #if IPSEC | |
829 | ipsec_setsocket(m, so); | |
830 | #endif /*IPSEC*/ | |
831 | error = ip6_output(m, | |
832 | tp->t_inpcb->in6p_outputopts, | |
833 | &tp->t_inpcb->in6p_route, | |
834 | (so->so_options & SO_DONTROUTE) /* | IP6_DONTFRAG */, | |
835 | NULL, NULL); | |
836 | } else | |
837 | #endif /* INET6 */ | |
838 | { | |
839 | #if 1 | |
840 | struct rtentry *rt; | |
841 | #endif | |
842 | ip->ip_len = m->m_pkthdr.len; | |
843 | #if INET6 | |
844 | if (INP_CHECK_SOCKAF(so, AF_INET6)) | |
845 | ip->ip_ttl = in6_selecthlim(tp->t_inpcb, | |
846 | tp->t_inpcb->in6p_route.ro_rt ? | |
847 | tp->t_inpcb->in6p_route.ro_rt->rt_ifp | |
848 | : NULL); | |
849 | else | |
850 | #endif /* INET6 */ | |
851 | ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */ | |
852 | ip->ip_tos = tp->t_inpcb->inp_ip_tos; /* XXX */ | |
853 | ||
0b4e3aa0 A |
854 | #define thtoti(x) \ |
855 | ((struct tcpiphdr *)(((char *)(x)) - (sizeof (struct ip)))) | |
856 | ||
1c79356b | 857 | KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), |
0b4e3aa0 | 858 | (((thtoti(th)->ti_src.s_addr & 0xffff) << 16) | (thtoti(th)->ti_dst.s_addr & 0xffff)), |
1c79356b A |
859 | th->th_seq, th->th_ack, th->th_win); |
860 | ||
861 | ||
862 | #if 1 | |
863 | /* | |
864 | * See if we should do MTU discovery. We do it only if the following | |
865 | * are true: | |
866 | * 1) we have a valid route to the destination | |
867 | * 2) the MTU is not locked (if it is, then discovery has been | |
868 | * disabled) | |
869 | */ | |
870 | if ((rt = tp->t_inpcb->inp_route.ro_rt) | |
871 | && rt->rt_flags & RTF_UP | |
872 | && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { | |
873 | ip->ip_off |= IP_DF; | |
874 | } | |
875 | #endif | |
876 | ||
877 | #if IPSEC | |
878 | ipsec_setsocket(m, so); | |
879 | #endif /*IPSEC*/ | |
880 | ||
881 | error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, | |
882 | so->so_options & SO_DONTROUTE, 0); | |
883 | } | |
884 | if (error) { | |
885 | out: | |
886 | if (error == ENOBUFS) { | |
0b4e3aa0 A |
887 | if (!tp->t_timer[TCPT_REXMT] && |
888 | !tp->t_timer[TCPT_PERSIST]) | |
889 | tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; | |
1c79356b A |
890 | tcp_quench(tp->t_inpcb, 0); |
891 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); | |
892 | return (0); | |
893 | } | |
894 | #if 1 | |
895 | if (error == EMSGSIZE) { | |
896 | /* | |
897 | * ip_output() will have already fixed the route | |
898 | * for us. tcp_mtudisc() will, as its last action, | |
899 | * initiate retransmission, so it is important to | |
900 | * not do so here. | |
901 | */ | |
902 | tcp_mtudisc(tp->t_inpcb, 0); | |
903 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); | |
904 | return 0; | |
905 | } | |
906 | #endif | |
907 | if ((error == EHOSTUNREACH || error == ENETDOWN) | |
908 | && TCPS_HAVERCVDSYN(tp->t_state)) { | |
909 | tp->t_softerror = error; | |
910 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); | |
911 | return (0); | |
912 | } | |
913 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); | |
914 | return (error); | |
915 | } | |
916 | tcpstat.tcps_sndtotal++; | |
917 | ||
918 | /* | |
919 | * Data sent (as far as we can tell). | |
920 | * If this advertises a larger window than any other segment, | |
921 | * then remember the size of the advertised window. | |
922 | * Any pending ACK has now been sent. | |
923 | */ | |
924 | if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) | |
925 | tp->rcv_adv = tp->rcv_nxt + win; | |
926 | tp->last_ack_sent = tp->rcv_nxt; | |
927 | tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); | |
928 | if (sendalot) | |
929 | goto again; | |
930 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); | |
931 | return (0); | |
932 | } | |
933 | ||
934 | void | |
935 | tcp_setpersist(tp) | |
936 | register struct tcpcb *tp; | |
937 | { | |
938 | register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; | |
939 | ||
940 | if (tp->t_timer[TCPT_REXMT]) | |
941 | panic("tcp_output REXMT"); | |
942 | /* | |
943 | * Start/restart persistance timer. | |
944 | */ | |
945 | TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], | |
946 | t * tcp_backoff[tp->t_rxtshift], | |
947 | TCPTV_PERSMIN, TCPTV_PERSMAX); | |
948 | if (tp->t_rxtshift < TCP_MAXRXTSHIFT) | |
949 | tp->t_rxtshift++; | |
950 | } |