2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
23 * @APPLE_LICENSE_HEADER_END@
26 * Copyright (c) 1982, 1986, 1988, 1993
27 * The Regents of the University of California. All rights reserved.
29 * Redistribution and use in source and binary forms, with or without
30 * modification, are permitted provided that the following conditions
32 * 1. Redistributions of source code must retain the above copyright
33 * notice, this list of conditions and the following disclaimer.
34 * 2. Redistributions in binary form must reproduce the above copyright
35 * notice, this list of conditions and the following disclaimer in the
36 * documentation and/or other materials provided with the distribution.
37 * 3. All advertising materials mentioning features or use of this software
38 * must display the following acknowledgement:
39 * This product includes software developed by the University of
40 * California, Berkeley and its contributors.
41 * 4. Neither the name of the University nor the names of its contributors
42 * may be used to endorse or promote products derived from this software
43 * without specific prior written permission.
45 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * $FreeBSD: src/sys/netinet/ip_divert.c,v 1.42.2.4 2001/07/29 19:32:40 ume Exp $
62 #error "IPDIVERT requires INET."
65 #include <sys/param.h>
66 #include <sys/kernel.h>
67 #include <sys/malloc.h>
69 #include <sys/socket.h>
70 #include <sys/protosw.h>
71 #include <sys/socketvar.h>
72 #include <sys/sysctl.h>
73 #include <sys/systm.h>
78 #include <net/route.h>
80 #include <netinet/in.h>
81 #include <netinet/in_systm.h>
82 #include <netinet/ip.h>
83 #include <netinet/in_pcb.h>
84 #include <netinet/in_var.h>
85 #include <netinet/ip_var.h>
92 * Allocate enough space to hold a full IP packet
94 #define DIVSNDQ (65536 + 100)
95 #define DIVRCVQ (65536 + 100)
98 * A 16 bit cookie is passed to and from the user process.
99 * The user process can send it back to help the caller know
100 * something about where the packet originally came from.
102 * In the case of ipfw, the cookie is the rule that sent
103 * us here. On reinjection it is the rule after which processing
104 * should continue. Leaving it the same will make processing start
105 * at the rule number after that which sent it here. Setting it to
106 * 0 will restart processing at the beginning.
108 * For divert_packet(), ip_divert_cookie is an input value only.
109 * For div_output(), ip_divert_cookie is an output value only.
111 u_int16_t ip_divert_cookie
;
113 /* Internal variables */
114 static struct inpcbhead divcb
;
115 static struct inpcbinfo divcbinfo
;
117 static u_long div_sendspace
= DIVSNDQ
; /* XXX sysctl ? */
118 static u_long div_recvspace
= DIVRCVQ
; /* XXX sysctl ? */
120 /* Optimization: have this preinitialized */
121 static struct sockaddr_in divsrc
= { sizeof(divsrc
), AF_INET
};
123 /* Internal functions */
124 static int div_output(struct socket
*so
,
125 struct mbuf
*m
, struct sockaddr
*addr
, struct mbuf
*control
);
128 * Initialize divert connection block queue.
134 divcbinfo
.listhead
= &divcb
;
136 * XXX We don't use the hash list for divert IP, but it's easier
137 * to allocate a one entry hash list than it is to check all
138 * over the place for hashbase == NULL.
140 divcbinfo
.hashbase
= hashinit(1, M_PCB
, &divcbinfo
.hashmask
);
141 divcbinfo
.porthashbase
= hashinit(1, M_PCB
, &divcbinfo
.porthashmask
);
142 divcbinfo
.ipi_zone
= (void *) zinit(sizeof(struct inpcb
),(maxsockets
* sizeof(struct inpcb
)),
146 * ### LD 08/03: init IP forwarding at this point [ipfw is not a module yet]
154 * IPPROTO_DIVERT is not a real IP protocol; don't allow any packets
155 * with that protocol number to enter the system from the outside.
158 div_input(struct mbuf
*m
, int off
)
160 ipstat
.ips_noproto
++;
165 * Divert a packet by passing it up to the divert socket at port 'port'.
167 * Setup generic address and protocol structures for div_input routine,
168 * then pass them along with mbuf chain.
171 divert_packet(struct mbuf
*m
, int incoming
, int port
)
179 KASSERT(port
!= 0, ("%s: port=0", __FUNCTION__
));
181 /* Record and reset divert cookie */
182 divsrc
.sin_port
= ip_divert_cookie
;
183 ip_divert_cookie
= 0;
186 if (m
->m_len
< sizeof(struct ip
) &&
187 (m
= m_pullup(m
, sizeof(struct ip
))) == 0) {
190 ip
= mtod(m
, struct ip
*);
193 * Record receive interface address, if any.
194 * But only for incoming packets.
196 divsrc
.sin_addr
.s_addr
= 0;
201 KASSERT((m
->m_flags
& M_PKTHDR
), ("%s: !PKTHDR", __FUNCTION__
));
203 /* Find IP address for receive interface */
204 TAILQ_FOREACH(ifa
, &m
->m_pkthdr
.rcvif
->if_addrhead
, ifa_link
) {
205 if (ifa
->ifa_addr
== NULL
)
207 if (ifa
->ifa_addr
->sa_family
!= AF_INET
)
210 ((struct sockaddr_in
*) ifa
->ifa_addr
)->sin_addr
;
215 * Record the incoming interface name whenever we have one.
217 bzero(&divsrc
.sin_zero
, sizeof(divsrc
.sin_zero
));
218 if (m
->m_pkthdr
.rcvif
) {
220 * Hide the actual interface name in there in the
221 * sin_zero array. XXX This needs to be moved to a
222 * different sockaddr type for divert, e.g.
223 * sockaddr_div with multiple fields like
224 * sockaddr_dl. Presently we have only 7 bytes
225 * but that will do for now as most interfaces
226 * are 4 or less + 2 or less bytes for unit.
227 * There is probably a faster way of doing this,
228 * possibly taking it from the sockaddr_dl on the iface.
229 * This solves the problem of a P2P link and a LAN interface
230 * having the same address, which can result in the wrong
231 * interface being assigned to the packet when fed back
232 * into the divert socket. Theoretically if the daemon saves
233 * and re-uses the sockaddr_in as suggested in the man pages,
234 * this iface name will come along for the ride.
235 * (see div_output for the other half of this.)
237 snprintf(divsrc
.sin_zero
, sizeof(divsrc
.sin_zero
),
238 "%s%d", m
->m_pkthdr
.rcvif
->if_name
,
239 m
->m_pkthdr
.rcvif
->if_unit
);
242 /* Put packet on socket queue, if any */
244 nport
= htons((u_int16_t
)port
);
245 LIST_FOREACH(inp
, &divcb
, inp_list
) {
246 if (inp
->inp_lport
== nport
)
247 sa
= inp
->inp_socket
;
250 if (sbappendaddr(&sa
->so_rcv
, (struct sockaddr
*)&divsrc
,
251 m
, (struct mbuf
*)0) == 0)
257 ipstat
.ips_noproto
++;
258 ipstat
.ips_delivered
--;
263 * Deliver packet back into the IP processing machinery.
265 * If no address specified, or address is 0.0.0.0, send to ip_output();
266 * otherwise, send to ip_input() and mark as having been received on
267 * the interface with that address.
270 div_output(so
, m
, addr
, control
)
272 register struct mbuf
*m
;
273 struct sockaddr
*addr
;
274 struct mbuf
*control
;
276 register struct inpcb
*const inp
= sotoinpcb(so
);
277 register struct ip
*const ip
= mtod(m
, struct ip
*);
278 struct sockaddr_in
*sin
= (struct sockaddr_in
*)addr
;
282 m_freem(control
); /* XXX */
284 /* Loopback avoidance and state recovery */
287 char *c
= sin
->sin_zero
;
289 ip_divert_cookie
= sin
->sin_port
;
292 * Find receive interface with the given name or IP address.
293 * The name is user supplied data so don't trust its size or
294 * that it is zero terminated. The name has priority.
295 * We are presently assuming that the sockaddr_in
296 * has not been replaced by a sockaddr_div, so we limit it
297 * to 16 bytes in total. The name is stuffed (if it exists)
298 * in the sin_zero[] field.
300 while (*c
++ && (len
++ < sizeof(sin
->sin_zero
)));
301 if ((len
> 0) && (len
< sizeof(sin
->sin_zero
)))
302 m
->m_pkthdr
.rcvif
= ifunit(sin
->sin_zero
);
304 ip_divert_cookie
= 0;
307 /* Reinject packet into the system as incoming or outgoing */
308 if (!sin
|| sin
->sin_addr
.s_addr
== 0) {
310 * Don't allow both user specified and setsockopt options,
311 * and don't allow packet length sizes that will crash
313 if (((ip
->ip_hl
!= (sizeof (*ip
) >> 2)) && inp
->inp_options
) ||
314 ((u_short
)ntohs(ip
->ip_len
) > m
->m_pkthdr
.len
)) {
319 /* Convert fields to host order for ip_output() */
323 /* Send packet to output processing */
324 ipstat
.ips_rawout
++; /* XXX */
325 error
= ip_output(m
, inp
->inp_options
, &inp
->inp_route
,
326 (so
->so_options
& SO_DONTROUTE
) |
327 IP_ALLOWBROADCAST
| IP_RAWOUTPUT
,
332 /* If no luck with the name above, check by IP address. */
333 if (m
->m_pkthdr
.rcvif
== NULL
) {
335 * Make sure there are no distractions
336 * for ifa_ifwithaddr. Clear the port and the ifname.
337 * Maybe zap all 8 bytes at once using a 64bit write?
339 bzero(sin
->sin_zero
, sizeof(sin
->sin_zero
));
340 /* *((u_int64_t *)sin->sin_zero) = 0; */ /* XXX ?? */
342 if (!(ifa
= ifa_ifwithaddr((struct sockaddr
*) sin
))) {
343 error
= EADDRNOTAVAIL
;
346 m
->m_pkthdr
.rcvif
= ifa
->ifa_ifp
;
349 /* Send packet to input processing */
353 /* paranoid: Reset for next time (and other packets) */
354 /* almost definitely already done in the ipfw filter but.. */
355 ip_divert_cookie
= 0;
360 ip_divert_cookie
= 0;
365 div_attach(struct socket
*so
, int proto
, struct proc
*p
)
373 if (p
&& (error
= suser(p
->p_ucred
, &p
->p_acflag
)) != 0)
376 error
= soreserve(so
, div_sendspace
, div_recvspace
);
380 error
= in_pcballoc(so
, &divcbinfo
, p
);
384 inp
= (struct inpcb
*)so
->so_pcb
;
385 inp
->inp_ip_p
= proto
;
386 inp
->inp_vflag
|= INP_IPV4
;
387 inp
->inp_flags
|= INP_HDRINCL
;
388 /* The socket is always "connected" because
389 we always know "where" to send the packet */
390 so
->so_state
|= SS_ISCONNECTED
;
395 div_detach(struct socket
*so
)
407 div_abort(struct socket
*so
)
409 soisdisconnected(so
);
410 return div_detach(so
);
414 div_disconnect(struct socket
*so
)
416 if ((so
->so_state
& SS_ISCONNECTED
) == 0)
418 return div_abort(so
);
422 div_bind(struct socket
*so
, struct sockaddr
*nam
, struct proc
*p
)
430 /* in_pcbbind assumes that the address is a sockaddr_in
431 * and in_pcbbind requires a valid address. Since divert
432 * sockets don't, we need to make sure the address is
433 * filled in properly.
434 * XXX -- divert should not be abusing in_pcbbind
435 * and should probably have its own family.
437 if (nam
->sa_family
!= AF_INET
) {
438 error
= EAFNOSUPPORT
;
440 ((struct sockaddr_in
*)nam
)->sin_addr
.s_addr
= INADDR_ANY
;
441 error
= in_pcbbind(inp
, nam
, p
);
448 div_shutdown(struct socket
*so
)
455 div_send(struct socket
*so
, int flags
, struct mbuf
*m
, struct sockaddr
*nam
,
456 struct mbuf
*control
, struct proc
*p
)
458 /* Packet must have a header (but that's about it) */
459 if (m
->m_len
< sizeof (struct ip
) &&
460 (m
= m_pullup(m
, sizeof (struct ip
))) == 0) {
461 ipstat
.ips_toosmall
++;
467 return div_output(so
, m
, nam
, control
);
471 div_pcblist SYSCTL_HANDLER_ARGS
474 struct inpcb
*inp
, **inp_list
;
479 * The process of preparing the PCB list is too time-consuming and
480 * resource-intensive to repeat twice on every request.
482 if (req
->oldptr
== 0) {
483 n
= divcbinfo
.ipi_count
;
484 req
->oldidx
= 2 * (sizeof xig
)
485 + (n
+ n
/8) * sizeof(struct xinpcb
);
489 if (req
->newptr
!= 0)
493 * OK, now we're committed to doing something.
496 gencnt
= divcbinfo
.ipi_gencnt
;
497 n
= divcbinfo
.ipi_count
;
500 xig
.xig_len
= sizeof xig
;
502 xig
.xig_gen
= gencnt
;
503 xig
.xig_sogen
= so_gencnt
;
504 error
= SYSCTL_OUT(req
, &xig
, sizeof xig
);
508 inp_list
= _MALLOC(n
* sizeof *inp_list
, M_TEMP
, M_WAITOK
);
513 for (inp
= LIST_FIRST(divcbinfo
.listhead
), i
= 0; inp
&& i
< n
;
514 inp
= LIST_NEXT(inp
, inp_list
)) {
516 if (inp
->inp_gencnt
<= gencnt
)
518 if (inp
->inp_gencnt
<= gencnt
&& !prison_xinpcb(req
->p
, inp
))
526 for (i
= 0; i
< n
; i
++) {
528 if (inp
->inp_gencnt
<= gencnt
) {
530 xi
.xi_len
= sizeof xi
;
531 /* XXX should avoid extra copy */
532 bcopy(inp
, &xi
.xi_inp
, sizeof *inp
);
534 sotoxsocket(inp
->inp_socket
, &xi
.xi_socket
);
535 error
= SYSCTL_OUT(req
, &xi
, sizeof xi
);
540 * Give the user an updated idea of our state.
541 * If the generation differs from what we told
542 * her before, she knows that something happened
543 * while we were processing this request, and it
544 * might be necessary to retry.
547 xig
.xig_gen
= divcbinfo
.ipi_gencnt
;
548 xig
.xig_sogen
= so_gencnt
;
549 xig
.xig_count
= divcbinfo
.ipi_count
;
551 error
= SYSCTL_OUT(req
, &xig
, sizeof xig
);
553 FREE(inp_list
, M_TEMP
);
557 #warning Fix SYSCTL net_inet_divert
559 SYSCTL_DECL(_net_inet_divert
);
560 SYSCTL_PROC(_net_inet_divert
, OID_AUTO
, pcblist
, CTLFLAG_RD
, 0, 0,
561 div_pcblist
, "S,xinpcb", "List of active divert sockets");
564 struct pr_usrreqs div_usrreqs
= {
565 div_abort
, pru_accept_notsupp
, div_attach
, div_bind
,
566 pru_connect_notsupp
, pru_connect2_notsupp
, in_control
, div_detach
,
567 div_disconnect
, pru_listen_notsupp
, in_setpeeraddr
, pru_rcvd_notsupp
,
568 pru_rcvoob_notsupp
, div_send
, pru_sense_null
, div_shutdown
,
569 in_setsockaddr
, sosend
, soreceive
, sopoll