]>
Commit | Line | Data |
---|---|---|
1c79356b A |
1 | /* |
2 | * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * The contents of this file constitute Original Code as defined in and | |
7 | * are subject to the Apple Public Source License Version 1.1 (the | |
8 | * "License"). You may not use this file except in compliance with the | |
9 | * License. Please obtain a copy of the License at | |
10 | * http://www.apple.com/publicsource and read it before using this file. | |
11 | * | |
12 | * This Original Code and all software distributed under the License are | |
13 | * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
14 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
15 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the | |
17 | * License for the specific language governing rights and limitations | |
18 | * under the License. | |
19 | * | |
20 | * @APPLE_LICENSE_HEADER_END@ | |
21 | */ | |
22 | /* | |
23 | * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 | |
24 | * The Regents of the University of California. All rights reserved. | |
25 | * | |
26 | * Redistribution and use in source and binary forms, with or without | |
27 | * modification, are permitted provided that the following conditions | |
28 | * are met: | |
29 | * 1. Redistributions of source code must retain the above copyright | |
30 | * notice, this list of conditions and the following disclaimer. | |
31 | * 2. Redistributions in binary form must reproduce the above copyright | |
32 | * notice, this list of conditions and the following disclaimer in the | |
33 | * documentation and/or other materials provided with the distribution. | |
34 | * 3. All advertising materials mentioning features or use of this software | |
35 | * must display the following acknowledgement: | |
36 | * This product includes software developed by the University of | |
37 | * California, Berkeley and its contributors. | |
38 | * 4. Neither the name of the University nor the names of its contributors | |
39 | * may be used to endorse or promote products derived from this software | |
40 | * without specific prior written permission. | |
41 | * | |
42 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
43 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
44 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
45 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
46 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
47 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
48 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
49 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
50 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
51 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
52 | * SUCH DAMAGE. | |
53 | * | |
54 | * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 | |
55 | */ | |
56 | ||
57 | #if ISFB31 | |
58 | #include "opt_tcpdebug.h" | |
59 | #endif | |
60 | #define _IP_VHL | |
61 | ||
62 | #include <stddef.h> | |
63 | ||
64 | #include <sys/param.h> | |
65 | #include <sys/systm.h> | |
66 | #include <sys/mbuf.h> | |
67 | #include <sys/domain.h> | |
68 | #include <sys/protosw.h> | |
69 | #include <sys/socket.h> | |
70 | #include <sys/socketvar.h> | |
71 | ||
72 | #include <net/route.h> | |
73 | ||
74 | #include <netinet/in.h> | |
75 | #include <netinet/in_systm.h> | |
76 | #include <netinet/ip.h> | |
77 | #include <netinet/ip_var.h> | |
78 | #if INET6 | |
79 | #include <netinet/ip6.h> | |
80 | #include <netinet/ip_var.h> | |
81 | #include <netinet6/ip6_var.h> | |
82 | #endif | |
83 | #include <netinet/in_pcb.h> | |
84 | #include <netinet/tcp.h> | |
85 | #define TCPOUTFLAGS | |
86 | #include <netinet/tcp_fsm.h> | |
87 | #include <netinet/tcp_seq.h> | |
88 | #include <netinet/tcp_timer.h> | |
89 | #include <netinet/tcp_var.h> | |
90 | #include <netinet/tcpip.h> | |
91 | #if TCPDEBUG | |
92 | #include <netinet/tcp_debug.h> | |
93 | #endif | |
94 | #include <sys/kdebug.h> | |
95 | ||
fa4905b1 | 96 | |
1c79356b A |
97 | #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 1) /* logged by tcp_output() right after its function-start trace */ |
98 | #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 3) /* logged by tcp_output() on the IPv4 path just before ip_output() */ | |
99 | #define DBG_FNC_TCP_OUTPUT NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1) /* combined with DBG_FUNC_START / DBG_FUNC_END at every entry and exit of tcp_output() */ | |
100 | ||
101 | ||
102 | #ifdef notyet | |
103 | extern struct mbuf *m_copypack(); | |
104 | #endif | |
105 | ||
106 | ||
107 | /* | |
108 | * Tcp output routine: figure out what should be sent and send it. | |
| * | |
| * tp is the TCP control block of the connection to service. The routine | |
| * works out how much data and which flags are due, builds the TCP options | |
| * and the IP/TCP header in an mbuf, and hands the segment to ip_output() | |
| * (or ip6_output() when isipv6 is set), jumping back to "again" for as | |
| * long as sendalot is set. | |
| * | |
| * Returns 0 when nothing needed to be sent or when the segment was passed | |
| * to the IP layer without error. ENOBUFS, EMSGSIZE, and (when | |
| * TCPS_HAVERCVDSYN(tp->t_state) holds) EHOSTUNREACH / ENETDOWN are handled | |
| * here and also reported as 0; any other error is returned to the caller. | |
109 | */ | |
110 | int | |
111 | tcp_output(tp) | |
112 | register struct tcpcb *tp; | |
113 | { | |
114 | register struct socket *so = tp->t_inpcb->inp_socket; | |
115 | register long len, win; | |
116 | int off, flags, error; | |
117 | register struct mbuf *m; | |
118 | struct ip *ip = NULL; | |
119 | struct ipovly *ipov = NULL; | |
120 | #if INET6 | |
121 | struct ip6_hdr *ip6 = NULL; | |
122 | #endif /* INET6 */ | |
123 | struct tcphdr *th; | |
124 | u_char opt[TCP_MAXOLEN]; | |
125 | unsigned ipoptlen, optlen, hdrlen; | |
126 | int idle, sendalot; | |
127 | struct rmxp_tao *taop; | |
128 | struct rmxp_tao tao_noncached; | |
129 | #if INET6 | |
130 | int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0; | |
131 | #endif | |
fa4905b1 A |
132 | int last_off; /* off + len of the previous m_copym_with_hdrs() copy made below */ |
133 | int m_off; | |
134 | struct mbuf *m_last = 0; /* passed by address to m_copym_with_hdrs(); cleared when a copy is not contiguous with the previous one */ | |
135 | struct mbuf *m_head = 0; /* so_snd.sb_mb as seen by the previous m_copym_with_hdrs() copy */ | |
136 | ||
1c79356b A |
137 | |
138 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); | |
fa4905b1 | 139 | |
1c79356b | 140 | KERNEL_DEBUG(DBG_LAYER_BEG, |
0b4e3aa0 A |
141 | ((tp->t_template->tt_dport << 16) | tp->t_template->tt_sport), |
142 | (((tp->t_template->tt_src.s_addr & 0xffff) << 16) | | |
143 | (tp->t_template->tt_dst.s_addr & 0xffff)), | |
1c79356b | 144 | 0,0,0); |
1c79356b A |
145 | /* |
146 | * Determine length of data that should be transmitted, | |
147 | * and flags that will be used. | |
148 | * If there is some data or critical controls (SYN, RST) | |
149 | * to send, then transmit; otherwise, investigate further. | |
150 | */ | |
151 | idle = (tp->snd_max == tp->snd_una); | |
152 | if (idle && tp->t_idle >= tp->t_rxtcur) | |
153 | /* | |
154 | * We have been idle for "a while" and no acks are | |
155 | * expected to clock out any data we send -- | |
156 | * slow start to get ack "clock" running again. | |
157 | */ | |
158 | tp->snd_cwnd = tp->t_maxseg; | |
0b4e3aa0 A |
159 | |
160 | /* Never send data that's already been acked */ | |
161 | if (SEQ_GT(tp->snd_una, tp->snd_nxt)) | |
162 | tp->snd_nxt = tp->snd_una; | |
1c79356b A |
163 | again: |
164 | sendalot = 0; | |
165 | off = tp->snd_nxt - tp->snd_una; | |
166 | win = min(tp->snd_wnd, tp->snd_cwnd); | |
167 | ||
168 | flags = tcp_outflags[tp->t_state]; | |
169 | /* | |
170 | * Get standard flags, and add SYN or FIN if requested by 'hidden' | |
171 | * state flags. | |
172 | */ | |
173 | if (tp->t_flags & TF_NEEDFIN) | |
174 | flags |= TH_FIN; | |
175 | if (tp->t_flags & TF_NEEDSYN) | |
176 | flags |= TH_SYN; | |
177 | ||
178 | /* | |
179 | * If in persist timeout with window of 0, send 1 byte. | |
180 | * Otherwise, if window is small but nonzero | |
181 | * and timer expired, we will send what we can | |
182 | * and go to transmit state. | |
183 | */ | |
184 | if (tp->t_force) { | |
185 | if (win == 0) { | |
186 | /* | |
187 | * If we still have some data to send, then | |
188 | * clear the FIN bit. Usually this would | |
189 | * happen below when it realizes that we | |
190 | * aren't sending all the data. However, | |
191 | * if we have exactly 1 byte of unsent data, | |
192 | * then it won't clear the FIN bit below, | |
193 | * and if we are in persist state, we wind | |
194 | * up sending the packet without recording | |
195 | * that we sent the FIN bit. | |
196 | * | |
197 | * We can't just blindly clear the FIN bit, | |
198 | * because if we don't have any more data | |
199 | * to send then the probe will be the FIN | |
200 | * itself. | |
201 | */ | |
202 | if (off < so->so_snd.sb_cc) | |
203 | flags &= ~TH_FIN; | |
204 | win = 1; | |
205 | } else { | |
206 | tp->t_timer[TCPT_PERSIST] = 0; | |
207 | tp->t_rxtshift = 0; | |
208 | } | |
209 | } | |
210 | ||
211 | len = (long)ulmin(so->so_snd.sb_cc, win) - off; | |
212 | ||
213 | if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { | |
214 | taop = &tao_noncached; | |
215 | bzero(taop, sizeof(*taop)); | |
216 | } | |
217 | ||
218 | /* | |
219 | * Lop off SYN bit if it has already been sent. However, if this | |
220 | * is SYN-SENT state and if segment contains data and if we don't | |
221 | * know that foreign host supports TAO, suppress sending segment. | |
222 | */ | |
223 | if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { | |
224 | flags &= ~TH_SYN; | |
225 | off--, len++; | |
226 | if (len > 0 && tp->t_state == TCPS_SYN_SENT && | |
227 | taop->tao_ccsent == 0) { | |
228 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); | |
229 | return 0; | |
230 | } | |
231 | } | |
232 | ||
233 | /* | |
234 | * Be careful not to send data and/or FIN on SYN segments | |
235 | * in cases when no CC option will be sent. | |
236 | * This measure is needed to prevent interoperability problems | |
237 | * with not fully conformant TCP implementations. | |
238 | */ | |
239 | if ((flags & TH_SYN) && | |
240 | ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) || | |
241 | ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) { | |
242 | len = 0; | |
243 | flags &= ~TH_FIN; | |
244 | } | |
245 | ||
246 | if (len < 0) { | |
247 | /* | |
248 | * If FIN has been sent but not acked, | |
249 | * but we haven't been called to retransmit, | |
250 | * len will be -1. Otherwise, window shrank | |
251 | * after we sent into it. If window shrank to 0, | |
252 | * cancel pending retransmit, pull snd_nxt back | |
253 | * to (closed) window, and set the persist timer | |
254 | * if it isn't already going. If the window didn't | |
255 | * close completely, just wait for an ACK. | |
256 | */ | |
257 | len = 0; | |
258 | if (win == 0) { | |
259 | tp->t_timer[TCPT_REXMT] = 0; | |
260 | tp->t_rxtshift = 0; | |
261 | tp->snd_nxt = tp->snd_una; | |
262 | if (tp->t_timer[TCPT_PERSIST] == 0) | |
263 | tcp_setpersist(tp); | |
264 | } | |
265 | } | |
266 | if (len > tp->t_maxseg) { | |
267 | len = tp->t_maxseg; | |
268 | sendalot = 1; | |
269 | } | |
270 | if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) | |
271 | flags &= ~TH_FIN; | |
272 | ||
273 | win = sbspace(&so->so_rcv); | |
274 | ||
275 | /* | |
276 | * Sender silly window avoidance. If connection is idle | |
277 | * and can send all data, a maximum segment, | |
278 | * at least a maximum default-size segment do it, | |
279 | * or are forced, do it; otherwise don't bother. | |
280 | * If peer's buffer is tiny, then send | |
281 | * when window is at least half open. | |
282 | * If retransmitting (possibly after persist timer forced us | |
283 | * to send into a small window), then must resend. | |
284 | */ | |
285 | if (len) { | |
286 | if (len == tp->t_maxseg) | |
287 | goto send; | |
288 | if (!(tp->t_flags & TF_MORETOCOME) && | |
289 | (idle || tp->t_flags & TF_NODELAY) && | |
290 | (tp->t_flags & TF_NOPUSH) == 0 && | |
291 | len + off >= so->so_snd.sb_cc) | |
292 | goto send; | |
293 | if (tp->t_force) | |
294 | goto send; | |
295 | if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) | |
296 | goto send; | |
297 | if (SEQ_LT(tp->snd_nxt, tp->snd_max)) | |
298 | goto send; | |
299 | } | |
300 | ||
301 | /* | |
302 | * Compare available window to amount of window | |
303 | * known to peer (as advertised window less | |
304 | * next expected input). If the difference is at least two | |
305 | * max size segments, or at least 50% of the maximum possible | |
306 | * window, then want to send a window update to peer. | |
307 | */ | |
308 | if (win > 0) { | |
309 | /* | |
310 | * "adv" is the amount we can increase the window, | |
311 | * taking into account that we are limited by | |
312 | * TCP_MAXWIN << tp->rcv_scale. | |
313 | */ | |
314 | long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) - | |
315 | (tp->rcv_adv - tp->rcv_nxt); | |
316 | ||
317 | if (adv >= (long) (2 * tp->t_maxseg)) | |
318 | goto send; | |
319 | if (2 * adv >= (long) so->so_rcv.sb_hiwat) | |
320 | goto send; | |
321 | } | |
322 | ||
323 | /* | |
324 | * Send if we owe peer an ACK. | |
325 | */ | |
326 | if (tp->t_flags & TF_ACKNOW) | |
327 | goto send; | |
328 | if ((flags & TH_RST) || | |
329 | ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) | |
330 | goto send; | |
331 | if (SEQ_GT(tp->snd_up, tp->snd_una)) | |
332 | goto send; | |
333 | /* | |
334 | * If our state indicates that FIN should be sent | |
335 | * and we have not yet done so, or we're retransmitting the FIN, | |
336 | * then we need to send. | |
337 | */ | |
338 | if (flags & TH_FIN && | |
339 | ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) | |
340 | goto send; | |
341 | ||
342 | /* | |
343 | * TCP window updates are not reliable, rather a polling protocol | |
344 | * using ``persist'' packets is used to ensure receipt of window | |
345 | * updates. The three ``states'' for the output side are: | |
346 | * idle not doing retransmits or persists | |
347 | * persisting to move a small or zero window | |
348 | * (re)transmitting and thereby not persisting | |
349 | * | |
350 | * tp->t_timer[TCPT_PERSIST] | |
351 | * is set when we are in persist state. | |
352 | * tp->t_force | |
353 | * is set when we are called to send a persist packet. | |
354 | * tp->t_timer[TCPT_REXMT] | |
355 | * is set when we are retransmitting | |
356 | * The output side is idle when both timers are zero. | |
357 | * | |
358 | * If send window is too small, there is data to transmit, and no | |
359 | * retransmit or persist is pending, then go to persist state. | |
360 | * If nothing happens soon, send when timer expires: | |
361 | * if window is nonzero, transmit what we can, | |
362 | * otherwise force out a byte. | |
363 | */ | |
364 | if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && | |
365 | tp->t_timer[TCPT_PERSIST] == 0) { | |
366 | tp->t_rxtshift = 0; | |
367 | tcp_setpersist(tp); | |
368 | } | |
369 | ||
370 | /* | |
371 | * No reason to send a segment, just return. | |
372 | */ | |
373 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); | |
374 | return (0); | |
375 | ||
376 | send: | |
377 | /* | |
378 | * Before ESTABLISHED, force sending of initial options | |
379 | * unless TCP set not to do any options. | |
380 | * NOTE: we assume that the IP/TCP header plus TCP options | |
381 | * always fit in a single mbuf, leaving room for a maximum | |
382 | * link header, i.e. | |
383 | * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN | |
384 | */ | |
385 | optlen = 0; | |
386 | #if INET6 | |
387 | if (isipv6) | |
388 | hdrlen = sizeof (struct tcpip6hdr); | |
389 | else | |
390 | #endif | |
391 | hdrlen = sizeof (struct tcpiphdr); | |
392 | if (flags & TH_SYN) { | |
393 | tp->snd_nxt = tp->iss; | |
394 | if ((tp->t_flags & TF_NOOPT) == 0) { | |
395 | u_short mss; | |
396 | ||
397 | opt[0] = TCPOPT_MAXSEG; | |
398 | opt[1] = TCPOLEN_MAXSEG; | |
399 | mss = htons((u_short) tcp_mssopt(tp, isipv6)); | |
400 | (void)memcpy(opt + 2, &mss, sizeof(mss)); | |
401 | optlen = TCPOLEN_MAXSEG; | |
402 | ||
403 | if ((tp->t_flags & TF_REQ_SCALE) && | |
404 | ((flags & TH_ACK) == 0 || | |
405 | (tp->t_flags & TF_RCVD_SCALE))) { | |
406 | *((u_int32_t *)(opt + optlen)) = htonl( | |
407 | TCPOPT_NOP << 24 | | |
408 | TCPOPT_WINDOW << 16 | | |
409 | TCPOLEN_WINDOW << 8 | | |
410 | tp->request_r_scale); | |
411 | optlen += 4; | |
412 | } | |
413 | } | |
414 | } | |
415 | ||
416 | /* | |
417 | * Send a timestamp and echo-reply if this is a SYN and our side | |
418 | * wants to use timestamps (TF_REQ_TSTMP is set) or both our side | |
419 | * and our peer have sent timestamps in our SYN's. | |
420 | */ | |
421 | if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && | |
422 | (flags & TH_RST) == 0 && | |
423 | ((flags & TH_ACK) == 0 || | |
424 | (tp->t_flags & TF_RCVD_TSTMP))) { | |
425 | u_int32_t *lp = (u_int32_t *)(opt + optlen); | |
426 | ||
427 | /* Form timestamp option as shown in appendix A of RFC 1323. */ | |
428 | *lp++ = htonl(TCPOPT_TSTAMP_HDR); | |
429 | *lp++ = htonl(tcp_now); | |
430 | *lp = htonl(tp->ts_recent); | |
431 | optlen += TCPOLEN_TSTAMP_APPA; | |
432 | } | |
433 | ||
434 | /* | |
435 | * Send `CC-family' options if our side wants to use them (TF_REQ_CC), | |
436 | * options are allowed (!TF_NOOPT) and it's not a RST. | |
437 | */ | |
438 | if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && | |
439 | (flags & TH_RST) == 0) { | |
440 | switch (flags & (TH_SYN|TH_ACK)) { | |
441 | /* | |
442 | * This is a normal ACK, send CC if we received CC before | |
443 | * from our peer. | |
444 | */ | |
445 | case TH_ACK: | |
446 | if (!(tp->t_flags & TF_RCVD_CC)) | |
447 | break; | |
448 | /*FALLTHROUGH*/ | |
449 | ||
450 | /* | |
451 | * We can only get here in T/TCP's SYN_SENT* state, when | |
452 | * we're sending a non-SYN segment without waiting for | |
453 | * the ACK of our SYN. A check above assures that we only | |
454 | * do this if our peer understands T/TCP. | |
455 | */ | |
456 | case 0: | |
457 | opt[optlen++] = TCPOPT_NOP; | |
458 | opt[optlen++] = TCPOPT_NOP; | |
459 | opt[optlen++] = TCPOPT_CC; | |
460 | opt[optlen++] = TCPOLEN_CC; | |
461 | *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); | |
462 | ||
463 | optlen += 4; | |
464 | break; | |
465 | ||
466 | /* | |
467 | * This is our initial SYN, check whether we have to use | |
468 | * CC or CC.new. | |
469 | */ | |
470 | case TH_SYN: | |
471 | opt[optlen++] = TCPOPT_NOP; | |
472 | opt[optlen++] = TCPOPT_NOP; | |
473 | opt[optlen++] = tp->t_flags & TF_SENDCCNEW ? | |
474 | TCPOPT_CCNEW : TCPOPT_CC; | |
475 | opt[optlen++] = TCPOLEN_CC; | |
476 | *(u_int32_t *)&opt[optlen] = htonl(tp->cc_send); | |
477 | optlen += 4; | |
478 | break; | |
479 | ||
480 | /* | |
481 | * This is a SYN,ACK; send CC and CC.echo if we received | |
482 | * CC from our peer. | |
483 | */ | |
484 | case (TH_SYN|TH_ACK): | |
485 | if (tp->t_flags & TF_RCVD_CC) { | |
486 | opt[optlen++] = TCPOPT_NOP; | |
487 | opt[optlen++] = TCPOPT_NOP; | |
488 | opt[optlen++] = TCPOPT_CC; | |
489 | opt[optlen++] = TCPOLEN_CC; | |
490 | *(u_int32_t *)&opt[optlen] = | |
491 | htonl(tp->cc_send); | |
492 | optlen += 4; | |
493 | opt[optlen++] = TCPOPT_NOP; | |
494 | opt[optlen++] = TCPOPT_NOP; | |
495 | opt[optlen++] = TCPOPT_CCECHO; | |
496 | opt[optlen++] = TCPOLEN_CC; | |
497 | *(u_int32_t *)&opt[optlen] = | |
498 | htonl(tp->cc_recv); | |
499 | optlen += 4; | |
500 | } | |
501 | break; | |
502 | } | |
503 | } | |
504 | ||
505 | hdrlen += optlen; | |
506 | #if INET6 | |
507 | if (isipv6) | |
508 | ipoptlen = ip6_optlen(tp->t_inpcb); | |
509 | else | |
510 | #endif | |
511 | if (tp->t_inpcb->inp_options) { | |
512 | ipoptlen = tp->t_inpcb->inp_options->m_len - | |
513 | offsetof(struct ipoption, ipopt_list); | |
514 | } else { | |
515 | ipoptlen = 0; | |
516 | } | |
517 | #if IPSEC | |
518 | #if INET6 | |
519 | ipoptlen += ipsec_hdrsiz_tcp(tp, isipv6); | |
520 | #else | |
521 | ipoptlen += ipsec_hdrsiz_tcp(tp, 0); | |
522 | #endif | |
523 | #endif | |
524 | ||
525 | /* | |
526 | * Adjust data length if insertion of options will | |
527 | * bump the packet length beyond the t_maxopd length. | |
528 | * Clear the FIN bit because we cut off the tail of | |
529 | * the segment. | |
530 | */ | |
531 | if (len + optlen + ipoptlen > tp->t_maxopd) { | |
532 | /* | |
533 | * If there is still more to send, don't close the connection. | |
534 | */ | |
535 | flags &= ~TH_FIN; | |
536 | len = tp->t_maxopd - optlen - ipoptlen; | |
537 | sendalot = 1; | |
538 | } | |
539 | ||
540 | /*#ifdef DIAGNOSTIC*/ | |
541 | if (max_linkhdr + hdrlen > MHLEN) | |
542 | panic("tcphdr too big"); | |
543 | /*#endif*/ | |
544 | ||
545 | /* | |
546 | * Grab a header mbuf, attaching a copy of data to | |
547 | * be transmitted, and initialize the header from | |
548 | * the template for sends on this connection. | |
549 | */ | |
550 | if (len) { | |
551 | if (tp->t_force && len == 1) | |
552 | tcpstat.tcps_sndprobe++; | |
553 | else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { | |
554 | tcpstat.tcps_sndrexmitpack++; | |
555 | tcpstat.tcps_sndrexmitbyte += len; | |
556 | } else { | |
557 | tcpstat.tcps_sndpack++; | |
558 | tcpstat.tcps_sndbyte += len; | |
559 | } | |
560 | #ifdef notyet | |
561 | if ((m = m_copypack(so->so_snd.sb_mb, off, | |
562 | (int)len, max_linkhdr + hdrlen)) == 0) { | |
563 | error = ENOBUFS; | |
564 | goto out; | |
565 | } | |
566 | /* | |
567 | * m_copypack left space for our hdr; use it. | |
568 | */ | |
569 | m->m_len += hdrlen; | |
570 | m->m_data -= hdrlen; | |
571 | #else | |
fa4905b1 | 572 | m = NULL; |
1c79356b A |
573 | #if INET6 |
574 | if (MHLEN < hdrlen + max_linkhdr) { | |
fa4905b1 A |
575 | MGETHDR(m, M_DONTWAIT, MT_HEADER); |
576 | if (m == NULL) { | |
577 | error = ENOBUFS; | |
578 | goto out; | |
579 | } | |
1c79356b A |
580 | MCLGET(m, M_DONTWAIT); |
581 | if ((m->m_flags & M_EXT) == 0) { | |
582 | m_freem(m); | |
583 | error = ENOBUFS; | |
584 | goto out; | |
585 | } | |
fa4905b1 A |
586 | m->m_data += max_linkhdr; |
587 | m->m_len = hdrlen; | |
1c79356b A |
588 | } |
589 | #endif | |
1c79356b | 590 | if (len <= MHLEN - hdrlen - max_linkhdr) { |
fa4905b1 A |
591 | if (m == NULL) { |
592 | MGETHDR(m, M_DONTWAIT, MT_HEADER); | |
593 | if (m == NULL) { | |
594 | error = ENOBUFS; | |
595 | goto out; | |
596 | } | |
597 | m->m_data += max_linkhdr; | |
598 | m->m_len = hdrlen; | |
599 | } | |
1c79356b A |
600 | m_copydata(so->so_snd.sb_mb, off, (int) len, |
601 | mtod(m, caddr_t) + hdrlen); | |
602 | m->m_len += len; | |
603 | } else { | |
fa4905b1 A |
604 | if (m != NULL) { |
605 | m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); | |
606 | if (m->m_next == 0) { | |
607 | (void) m_free(m); | |
608 | error = ENOBUFS; | |
609 | goto out; | |
610 | } | |
611 | } else { | |
612 | if (m_head != so->so_snd.sb_mb || last_off != off) | |
613 | m_last = NULL; /* this copy does not start where the previous one ended on the same sb_mb. NOTE(review): m_last/m_off look like a resume hint for m_copym_with_hdrs() -- confirm against its definition */ | |
614 | last_off = off + len; | |
615 | m_head = so->so_snd.sb_mb; | |
616 | ||
617 | if ((m = m_copym_with_hdrs(so->so_snd.sb_mb, off, (int) len, M_DONTWAIT, &m_last, &m_off)) == NULL) { | |
618 | error = ENOBUFS; | |
619 | goto out; | |
620 | } | |
621 | m->m_data += max_linkhdr; | |
622 | m->m_len = hdrlen; | |
1c79356b A |
623 | } |
624 | } | |
625 | #endif | |
626 | /* | |
627 | * If we're sending everything we've got, set PUSH. | |
628 | * (This will keep happy those implementations which only | |
629 | * give data to the user when a buffer fills or | |
630 | * a PUSH comes in.) | |
631 | */ | |
632 | if (off + len == so->so_snd.sb_cc) | |
633 | flags |= TH_PUSH; | |
634 | } else { | |
635 | if (tp->t_flags & TF_ACKNOW) | |
636 | tcpstat.tcps_sndacks++; | |
637 | else if (flags & (TH_SYN|TH_FIN|TH_RST)) | |
638 | tcpstat.tcps_sndctrl++; | |
639 | else if (SEQ_GT(tp->snd_up, tp->snd_una)) | |
640 | tcpstat.tcps_sndurg++; | |
641 | else | |
642 | tcpstat.tcps_sndwinup++; | |
643 | ||
644 | MGETHDR(m, M_DONTWAIT, MT_HEADER); | |
645 | if (m == NULL) { | |
646 | error = ENOBUFS; | |
647 | goto out; | |
648 | } | |
649 | #if INET6 | |
650 | if (isipv6) { | |
651 | MH_ALIGN(m, hdrlen); | |
652 | } else | |
653 | #endif | |
654 | m->m_data += max_linkhdr; | |
655 | m->m_len = hdrlen; | |
656 | } | |
657 | m->m_pkthdr.rcvif = (struct ifnet *)0; | |
658 | if (tp->t_template == 0) | |
659 | panic("tcp_output"); | |
660 | #if INET6 | |
661 | if (isipv6) { | |
662 | ip6 = mtod(m, struct ip6_hdr *); | |
663 | th = (struct tcphdr *)(ip6 + 1); | |
664 | bcopy((caddr_t)&tp->t_template->tt_i6, (caddr_t)ip6, | |
665 | sizeof(struct ip6_hdr)); | |
666 | bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, | |
667 | sizeof(struct tcphdr)); | |
668 | } else { | |
669 | #endif /* INET6 */ | |
670 | ip = mtod(m, struct ip *); | |
671 | ipov = (struct ipovly *)ip; | |
672 | th = (struct tcphdr *)(ip + 1); | |
673 | bcopy((caddr_t)&tp->t_template->tt_i, (caddr_t)ip, sizeof(struct ip)); | |
674 | bcopy((caddr_t)&tp->t_template->tt_t, (caddr_t)th, | |
675 | sizeof(struct tcphdr)); | |
676 | #if INET6 | |
677 | } | |
678 | #endif /* INET6 */ | |
679 | ||
680 | /* | |
681 | * Fill in fields, remembering maximum advertised | |
682 | * window for use in delaying messages about window sizes. | |
683 | * If resending a FIN, be sure not to use a new sequence number. | |
684 | */ | |
685 | if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && | |
686 | tp->snd_nxt == tp->snd_max) | |
687 | tp->snd_nxt--; | |
688 | /* | |
689 | * If we are doing retransmissions, then snd_nxt will | |
690 | * not reflect the first unsent octet. For ACK only | |
691 | * packets, we do not want the sequence number of the | |
692 | * retransmitted packet, we want the sequence number | |
693 | * of the next unsent octet. So, if there is no data | |
694 | * (and no SYN or FIN), use snd_max instead of snd_nxt | |
695 | * when filling in ti_seq. But if we are in persist | |
696 | * state, snd_max might reflect one byte beyond the | |
697 | * right edge of the window, so use snd_nxt in that | |
698 | * case, since we know we aren't doing a retransmission. | |
699 | * (retransmit and persist are mutually exclusive...) | |
700 | */ | |
701 | if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST]) | |
702 | th->th_seq = htonl(tp->snd_nxt); | |
703 | else | |
704 | th->th_seq = htonl(tp->snd_max); | |
705 | th->th_ack = htonl(tp->rcv_nxt); | |
706 | if (optlen) { | |
707 | bcopy(opt, th + 1, optlen); | |
708 | th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; | |
709 | } | |
710 | th->th_flags = flags; | |
711 | /* | |
712 | * Calculate receive window. Don't shrink window, | |
713 | * but avoid silly window syndrome. | |
714 | */ | |
715 | if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) | |
716 | win = 0; | |
717 | if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) | |
718 | win = (long)(tp->rcv_adv - tp->rcv_nxt); | |
719 | if (win > (long)TCP_MAXWIN << tp->rcv_scale) | |
720 | win = (long)TCP_MAXWIN << tp->rcv_scale; | |
721 | th->th_win = htons((u_short) (win>>tp->rcv_scale)); | |
722 | if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { | |
723 | th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); | |
724 | th->th_flags |= TH_URG; | |
725 | } else | |
726 | /* | |
727 | * If no urgent pointer to send, then we pull | |
728 | * the urgent pointer to the left edge of the send window | |
729 | * so that it doesn't drift into the send window on sequence | |
730 | * number wraparound. | |
731 | */ | |
732 | tp->snd_up = tp->snd_una; /* drag it along */ | |
733 | ||
fa4905b1 | 734 | |
1c79356b A |
735 | /* |
736 | * Put TCP length in extended header, and then | |
737 | * checksum extended header and data. | |
738 | */ | |
739 | m->m_pkthdr.len = hdrlen + len; | |
740 | #if INET6 | |
741 | if (isipv6) { | |
742 | #if 0 /* ip6_plen will be filled in ip6_output. */ | |
743 | ip6->ip6_plen = htons((u_short)(sizeof(struct tcphdr) + | |
744 | optlen + len)); | |
745 | #endif | |
746 | ||
747 | th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), | |
748 | sizeof(struct tcphdr) + optlen + len); | |
0b4e3aa0 | 749 | } else |
1c79356b | 750 | #endif /* INET6 */ |
0b4e3aa0 A |
751 | { |
752 | ||
1c79356b A |
753 | if (len + optlen) |
754 | ipov->ih_len = htons((u_short)(sizeof (struct tcphdr) + | |
755 | optlen + len)); | |
0b4e3aa0 A |
756 | m->m_pkthdr.csum_flags = CSUM_TCP; |
757 | m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); | |
758 | if (len + optlen) { | |
759 | th->th_sum = in_addword(th->th_sum, | |
760 | htons((u_short)(optlen + len))); | |
761 | } | |
762 | ||
1c79356b | 763 | } |
1c79356b A |
764 | |
765 | /* | |
766 | * In transmit state, time the transmission and arrange for | |
767 | * the retransmit. In persist state, just set snd_max. | |
768 | */ | |
769 | if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { | |
770 | tcp_seq startseq = tp->snd_nxt; | |
771 | ||
772 | /* | |
773 | * Advance snd_nxt over sequence space of this segment. | |
774 | */ | |
775 | if (flags & (TH_SYN|TH_FIN)) { | |
776 | if (flags & TH_SYN) | |
777 | tp->snd_nxt++; | |
778 | if (flags & TH_FIN) { | |
779 | tp->snd_nxt++; | |
780 | tp->t_flags |= TF_SENTFIN; | |
781 | } | |
782 | } | |
783 | tp->snd_nxt += len; | |
784 | if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { | |
785 | tp->snd_max = tp->snd_nxt; | |
786 | /* | |
787 | * Time this transmission if not a retransmission and | |
788 | * not currently timing anything. | |
789 | */ | |
790 | if (tp->t_rtt == 0) { | |
791 | tp->t_rtt = 1; | |
792 | tp->t_rtseq = startseq; | |
793 | tcpstat.tcps_segstimed++; | |
794 | } | |
795 | } | |
796 | ||
797 | /* | |
798 | * Set retransmit timer if not currently set, | |
799 | * and not doing an ack or a keep-alive probe. | |
800 | * Initial value for retransmit timer is smoothed | |
801 | * round-trip time + 2 * round-trip time variance. | |
802 | * Initialize shift counter which is used for backoff | |
803 | * of retransmit time. | |
804 | */ | |
805 | if (tp->t_timer[TCPT_REXMT] == 0 && | |
806 | tp->snd_nxt != tp->snd_una) { | |
807 | tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; | |
808 | if (tp->t_timer[TCPT_PERSIST]) { | |
809 | tp->t_timer[TCPT_PERSIST] = 0; | |
810 | tp->t_rxtshift = 0; | |
811 | } | |
812 | } | |
813 | } else | |
814 | if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) | |
815 | tp->snd_max = tp->snd_nxt + len; | |
816 | ||
817 | #if TCPDEBUG | |
818 | /* | |
819 | * Trace. | |
820 | */ | |
821 | if (so->so_options & SO_DEBUG) { | |
822 | #if INET6 | |
823 | if (isipv6) | |
824 | ip6->ip6_vfc = IPV6_VERSION; | |
825 | else | |
826 | ip->ip_vhl = IP_MAKE_VHL(IPVERSION, | |
827 | IP_VHL_HL(ip->ip_vhl)); | |
828 | #endif /* INET6 */ | |
829 | tcp_trace(TA_OUTPUT, tp->t_state, tp, | |
830 | #if INET6 | |
831 | isipv6 ? (void *)ip6 : | |
832 | #endif /* INET6 */ | |
833 | ip, | |
834 | th, 0); | |
835 | ||
836 | } | |
837 | #endif /* TCPDEBUG */ | |
838 | ||
839 | /* | |
840 | * Fill in IP length and desired time to live and | |
841 | * send to IP level. There should be a better way | |
842 | * to handle ttl and tos; we could keep them in | |
843 | * the template, but need a way to checksum without them. | |
844 | */ | |
845 | #if INET6 | |
846 | if (isipv6) { | |
847 | /* | |
848 | * we separately set hoplimit for every segment, since the | |
849 | * user might want to change the value via setsockopt. | |
850 | * Also, desired default hop limit might be changed via | |
851 | * Neighbor Discovery. | |
852 | */ | |
853 | ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, | |
854 | tp->t_inpcb->in6p_route.ro_rt ? | |
855 | tp->t_inpcb->in6p_route.ro_rt->rt_ifp | |
856 | : NULL); | |
857 | ||
858 | /* TODO: IPv6 IP6TOS_ECT bit on */ | |
859 | #if IPSEC | |
860 | ipsec_setsocket(m, so); | |
861 | #endif /*IPSEC*/ | |
862 | error = ip6_output(m, | |
863 | tp->t_inpcb->in6p_outputopts, | |
864 | &tp->t_inpcb->in6p_route, | |
865 | (so->so_options & SO_DONTROUTE) /* | IP6_DONTFRAG */, | |
866 | NULL, NULL); | |
867 | } else | |
868 | #endif /* INET6 */ | |
869 | { | |
870 | #if 1 | |
871 | struct rtentry *rt; | |
872 | #endif | |
873 | ip->ip_len = m->m_pkthdr.len; | |
874 | #if INET6 | |
875 | if (INP_CHECK_SOCKAF(so, AF_INET6)) | |
876 | ip->ip_ttl = in6_selecthlim(tp->t_inpcb, | |
877 | tp->t_inpcb->in6p_route.ro_rt ? | |
878 | tp->t_inpcb->in6p_route.ro_rt->rt_ifp | |
879 | : NULL); | |
880 | else | |
881 | #endif /* INET6 */ | |
882 | ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */ | |
883 | ip->ip_tos = tp->t_inpcb->inp_ip_tos; /* XXX */ | |
884 | ||
0b4e3aa0 A |
885 | #define thtoti(x) \ |
886 | ((struct tcpiphdr *)(((char *)(x)) - (sizeof (struct ip)))) | |
887 | ||
1c79356b | 888 | KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), |
0b4e3aa0 | 889 | (((thtoti(th)->ti_src.s_addr & 0xffff) << 16) | (thtoti(th)->ti_dst.s_addr & 0xffff)), |
1c79356b | 890 | th->th_seq, th->th_ack, th->th_win); |
1c79356b A |
891 | #if 1 |
892 | /* | |
893 | * See if we should do MTU discovery. We do it only if the following | |
894 | * are true: | |
895 | * 1) we have a valid route to the destination | |
896 | * 2) the MTU is not locked (if it is, then discovery has been | |
897 | * disabled) | |
898 | */ | |
899 | if ((rt = tp->t_inpcb->inp_route.ro_rt) | |
900 | && rt->rt_flags & RTF_UP | |
901 | && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { | |
902 | ip->ip_off |= IP_DF; | |
903 | } | |
904 | #endif | |
905 | ||
906 | #if IPSEC | |
907 | ipsec_setsocket(m, so); | |
908 | #endif /*IPSEC*/ | |
909 | ||
910 | error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, | |
911 | so->so_options & SO_DONTROUTE, 0); | |
912 | } | |
913 | if (error) { | |
914 | out: /* also entered by goto, with error == ENOBUFS, when an mbuf allocation or copy above fails */ | |
915 | if (error == ENOBUFS) { | |
0b4e3aa0 A |
916 | if (!tp->t_timer[TCPT_REXMT] && |
917 | !tp->t_timer[TCPT_PERSIST]) | |
918 | tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; | |
1c79356b A |
919 | tcp_quench(tp->t_inpcb, 0); |
920 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); | |
921 | return (0); | |
922 | } | |
923 | #if 1 | |
924 | if (error == EMSGSIZE) { | |
925 | /* | |
926 | * ip_output() will have already fixed the route | |
927 | * for us. tcp_mtudisc() will, as its last action, | |
928 | * initiate retransmission, so it is important to | |
929 | * not do so here. | |
930 | */ | |
931 | tcp_mtudisc(tp->t_inpcb, 0); | |
932 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); | |
933 | return 0; | |
934 | } | |
935 | #endif | |
936 | if ((error == EHOSTUNREACH || error == ENETDOWN) | |
937 | && TCPS_HAVERCVDSYN(tp->t_state)) { | |
938 | tp->t_softerror = error; | |
939 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); | |
940 | return (0); | |
941 | } | |
942 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); | |
943 | return (error); | |
944 | } | |
945 | tcpstat.tcps_sndtotal++; | |
946 | ||
947 | /* | |
948 | * Data sent (as far as we can tell). | |
949 | * If this advertises a larger window than any other segment, | |
950 | * then remember the size of the advertised window. | |
951 | * Any pending ACK has now been sent. | |
952 | */ | |
953 | if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) | |
954 | tp->rcv_adv = tp->rcv_nxt + win; | |
955 | tp->last_ack_sent = tp->rcv_nxt; | |
956 | tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); | |
957 | if (sendalot) | |
958 | goto again; | |
959 | KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); | |
960 | return (0); | |
961 | } | |
962 | ||
963 | void | |
964 | tcp_setpersist(tp) | |
965 | register struct tcpcb *tp; | |
966 | { | |
967 | register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; | |
968 | ||
969 | if (tp->t_timer[TCPT_REXMT]) | |
970 | panic("tcp_output REXMT"); | |
971 | /* | |
972 | * Start/restart persistance timer. | |
973 | */ | |
974 | TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], | |
975 | t * tcp_backoff[tp->t_rxtshift], | |
976 | TCPTV_PERSMIN, TCPTV_PERSMAX); | |
977 | if (tp->t_rxtshift < TCP_MAXRXTSHIFT) | |
978 | tp->t_rxtshift++; | |
979 | } |