2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
20 * @APPLE_LICENSE_HEADER_END@
23 * Copyright (c) 1982, 1986, 1988, 1993
24 * The Regents of the University of California. All rights reserved.
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
29 * 1. Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * 2. Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in the
33 * documentation and/or other materials provided with the distribution.
34 * 3. All advertising materials mentioning features or use of this software
35 * must display the following acknowledgement:
36 * This product includes software developed by the University of
37 * California, Berkeley and its contributors.
38 * 4. Neither the name of the University nor the names of its contributors
39 * may be used to endorse or promote products derived from this software
40 * without specific prior written permission.
42 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
54 * $FreeBSD: src/sys/netinet/ip_divert.c,v 1.42.2.4 2001/07/29 19:32:40 ume Exp $
59 #error "IPDIVERT requires INET."
62 #include <sys/param.h>
63 #include <sys/kernel.h>
64 #include <sys/malloc.h>
66 #include <sys/socket.h>
67 #include <sys/protosw.h>
68 #include <sys/socketvar.h>
69 #include <sys/sysctl.h>
70 #include <sys/systm.h>
75 #include <net/route.h>
77 #include <netinet/in.h>
78 #include <netinet/in_systm.h>
79 #include <netinet/ip.h>
80 #include <netinet/in_pcb.h>
81 #include <netinet/in_var.h>
82 #include <netinet/ip_var.h>
89 * Allocate enough space to hold a full IP packet
91 #define DIVSNDQ (65536 + 100)
92 #define DIVRCVQ (65536 + 100)
95 * A 16 bit cookie is passed to and from the user process.
96 * The user process can send it back to help the caller know
97 * something about where the packet originally came from.
99 * In the case of ipfw, the cookie is the rule that sent
100 * us here. On reinjection it is the rule after which processing
101 * should continue. Leaving it the same will make processing start
102 * at the rule number after that which sent it here. Setting it to
103 * 0 will restart processing at the beginning.
105 * For divert_packet(), ip_divert_cookie is an input value only.
106 * For div_output(), ip_divert_cookie is an output value only.
108 u_int16_t ip_divert_cookie
;
110 /* Internal variables */
111 static struct inpcbhead divcb
;
112 static struct inpcbinfo divcbinfo
;
114 static u_long div_sendspace
= DIVSNDQ
; /* XXX sysctl ? */
115 static u_long div_recvspace
= DIVRCVQ
; /* XXX sysctl ? */
117 /* Optimization: have this preinitialized */
118 static struct sockaddr_in divsrc
= { sizeof(divsrc
), AF_INET
};
120 /* Internal functions */
121 static int div_output(struct socket
*so
,
122 struct mbuf
*m
, struct sockaddr
*addr
, struct mbuf
*control
);
125 * Initialize divert connection block queue.
131 divcbinfo
.listhead
= &divcb
;
133 * XXX We don't use the hash list for divert IP, but it's easier
134 * to allocate a one entry hash list than it is to check all
135 * over the place for hashbase == NULL.
137 divcbinfo
.hashbase
= hashinit(1, M_PCB
, &divcbinfo
.hashmask
);
138 divcbinfo
.porthashbase
= hashinit(1, M_PCB
, &divcbinfo
.porthashmask
);
139 divcbinfo
.ipi_zone
= (void *) zinit(sizeof(struct inpcb
),(maxsockets
* sizeof(struct inpcb
)),
143 * ### LD 08/03: init IP forwarding at this point [ipfw is not a module yet]
151 * IPPROTO_DIVERT is not a real IP protocol; don't allow any packets
152 * with that protocol number to enter the system from the outside.
155 div_input(struct mbuf
*m
, int off
)
157 ipstat
.ips_noproto
++;
162 * Divert a packet by passing it up to the divert socket at port 'port'.
164 * Setup generic address and protocol structures for div_input routine,
165 * then pass them along with mbuf chain.
168 divert_packet(struct mbuf
*m
, int incoming
, int port
)
176 KASSERT(port
!= 0, ("%s: port=0", __FUNCTION__
));
178 /* Record and reset divert cookie */
179 divsrc
.sin_port
= ip_divert_cookie
;
180 ip_divert_cookie
= 0;
183 if (m
->m_len
< sizeof(struct ip
) &&
184 (m
= m_pullup(m
, sizeof(struct ip
))) == 0) {
187 ip
= mtod(m
, struct ip
*);
190 * Record receive interface address, if any.
191 * But only for incoming packets.
193 divsrc
.sin_addr
.s_addr
= 0;
198 KASSERT((m
->m_flags
& M_PKTHDR
), ("%s: !PKTHDR", __FUNCTION__
));
200 /* Find IP address for receive interface */
201 TAILQ_FOREACH(ifa
, &m
->m_pkthdr
.rcvif
->if_addrhead
, ifa_link
) {
202 if (ifa
->ifa_addr
== NULL
)
204 if (ifa
->ifa_addr
->sa_family
!= AF_INET
)
207 ((struct sockaddr_in
*) ifa
->ifa_addr
)->sin_addr
;
212 * Record the incoming interface name whenever we have one.
214 bzero(&divsrc
.sin_zero
, sizeof(divsrc
.sin_zero
));
215 if (m
->m_pkthdr
.rcvif
) {
217 * Hide the actual interface name in there in the
218 * sin_zero array. XXX This needs to be moved to a
219 * different sockaddr type for divert, e.g.
220 * sockaddr_div with multiple fields like
221 * sockaddr_dl. Presently we have only 7 bytes
222 * but that will do for now as most interfaces
223 * are 4 or less + 2 or less bytes for unit.
224 * There is probably a faster way of doing this,
225 * possibly taking it from the sockaddr_dl on the iface.
226 * This solves the problem of a P2P link and a LAN interface
227 * having the same address, which can result in the wrong
228 * interface being assigned to the packet when fed back
229 * into the divert socket. Theoretically if the daemon saves
230 * and re-uses the sockaddr_in as suggested in the man pages,
231 * this iface name will come along for the ride.
232 * (see div_output for the other half of this.)
234 snprintf(divsrc
.sin_zero
, sizeof(divsrc
.sin_zero
),
235 "%s%d", m
->m_pkthdr
.rcvif
->if_name
,
236 m
->m_pkthdr
.rcvif
->if_unit
);
239 /* Put packet on socket queue, if any */
241 nport
= htons((u_int16_t
)port
);
242 LIST_FOREACH(inp
, &divcb
, inp_list
) {
243 if (inp
->inp_lport
== nport
)
244 sa
= inp
->inp_socket
;
247 if (sbappendaddr(&sa
->so_rcv
, (struct sockaddr
*)&divsrc
,
248 m
, (struct mbuf
*)0) == 0)
254 ipstat
.ips_noproto
++;
255 ipstat
.ips_delivered
--;
260 * Deliver packet back into the IP processing machinery.
262 * If no address specified, or address is 0.0.0.0, send to ip_output();
263 * otherwise, send to ip_input() and mark as having been received on
264 * the interface with that address.
267 div_output(so
, m
, addr
, control
)
269 register struct mbuf
*m
;
270 struct sockaddr
*addr
;
271 struct mbuf
*control
;
273 register struct inpcb
*const inp
= sotoinpcb(so
);
274 register struct ip
*const ip
= mtod(m
, struct ip
*);
275 struct sockaddr_in
*sin
= (struct sockaddr_in
*)addr
;
279 m_freem(control
); /* XXX */
281 /* Loopback avoidance and state recovery */
284 char *c
= sin
->sin_zero
;
286 ip_divert_cookie
= sin
->sin_port
;
289 * Find receive interface with the given name or IP address.
290 * The name is user-supplied data, so don't trust its size or
291 * that it is zero-terminated. The name has priority.
292 * We are presently assuming that the sockaddr_in
293 * has not been replaced by a sockaddr_div, so we limit it
294 * to 16 bytes in total. The name is stuffed (if it exists)
295 * in the sin_zero[] field.
297 while (*c
++ && (len
++ < sizeof(sin
->sin_zero
)));
298 if ((len
> 0) && (len
< sizeof(sin
->sin_zero
)))
299 m
->m_pkthdr
.rcvif
= ifunit(sin
->sin_zero
);
301 ip_divert_cookie
= 0;
304 /* Reinject packet into the system as incoming or outgoing */
305 if (!sin
|| sin
->sin_addr
.s_addr
== 0) {
307 * Don't allow both user specified and setsockopt options,
308 * and don't allow packet length sizes that will crash
310 if (((ip
->ip_hl
!= (sizeof (*ip
) >> 2)) && inp
->inp_options
) ||
311 ((u_short
)ntohs(ip
->ip_len
) > m
->m_pkthdr
.len
)) {
316 /* Convert fields to host order for ip_output() */
320 /* Send packet to output processing */
321 ipstat
.ips_rawout
++; /* XXX */
322 error
= ip_output(m
, inp
->inp_options
, &inp
->inp_route
,
323 (so
->so_options
& SO_DONTROUTE
) |
324 IP_ALLOWBROADCAST
| IP_RAWOUTPUT
,
329 /* If no luck with the name above. check by IP address. */
330 if (m
->m_pkthdr
.rcvif
== NULL
) {
332 * Make sure there are no distractions
333 * for ifa_ifwithaddr. Clear the port and the ifname.
334 * Maybe zap all 8 bytes at once using a 64bit write?
336 bzero(sin
->sin_zero
, sizeof(sin
->sin_zero
));
337 /* *((u_int64_t *)sin->sin_zero) = 0; */ /* XXX ?? */
339 if (!(ifa
= ifa_ifwithaddr((struct sockaddr
*) sin
))) {
340 error
= EADDRNOTAVAIL
;
343 m
->m_pkthdr
.rcvif
= ifa
->ifa_ifp
;
346 /* Send packet to input processing */
350 /* paranoid: Reset for next time (and other packets) */
351 /* almost definitely already done in the ipfw filter but.. */
352 ip_divert_cookie
= 0;
357 ip_divert_cookie
= 0;
362 div_attach(struct socket
*so
, int proto
, struct proc
*p
)
370 if (p
&& (error
= suser(p
->p_ucred
, &p
->p_acflag
)) != 0)
373 error
= soreserve(so
, div_sendspace
, div_recvspace
);
377 error
= in_pcballoc(so
, &divcbinfo
, p
);
381 inp
= (struct inpcb
*)so
->so_pcb
;
382 inp
->inp_ip_p
= proto
;
383 inp
->inp_vflag
|= INP_IPV4
;
384 inp
->inp_flags
|= INP_HDRINCL
;
385 /* The socket is always "connected" because
386 we always know "where" to send the packet */
387 so
->so_state
|= SS_ISCONNECTED
;
392 div_detach(struct socket
*so
)
404 div_abort(struct socket
*so
)
406 soisdisconnected(so
);
407 return div_detach(so
);
411 div_disconnect(struct socket
*so
)
413 if ((so
->so_state
& SS_ISCONNECTED
) == 0)
415 return div_abort(so
);
419 div_bind(struct socket
*so
, struct sockaddr
*nam
, struct proc
*p
)
427 /* in_pcbbind assumes that the socket is a sockaddr_in
428 * and in_pcbbind requires a valid address. Since divert
429 * sockets don't, we need to make sure the address is
430 * filled in properly.
431 * XXX -- divert should not be abusing in_pcbbind
432 * and should probably have its own family.
434 if (nam
->sa_family
!= AF_INET
) {
435 error
= EAFNOSUPPORT
;
437 ((struct sockaddr_in
*)nam
)->sin_addr
.s_addr
= INADDR_ANY
;
438 error
= in_pcbbind(inp
, nam
, p
);
445 div_shutdown(struct socket
*so
)
452 div_send(struct socket
*so
, int flags
, struct mbuf
*m
, struct sockaddr
*nam
,
453 struct mbuf
*control
, struct proc
*p
)
455 /* Packet must have a header (but that's about it) */
456 if (m
->m_len
< sizeof (struct ip
) &&
457 (m
= m_pullup(m
, sizeof (struct ip
))) == 0) {
458 ipstat
.ips_toosmall
++;
464 return div_output(so
, m
, nam
, control
);
468 div_pcblist SYSCTL_HANDLER_ARGS
471 struct inpcb
*inp
, **inp_list
;
476 * The process of preparing the TCB list is too time-consuming and
477 * resource-intensive to repeat twice on every request.
479 if (req
->oldptr
== 0) {
480 n
= divcbinfo
.ipi_count
;
481 req
->oldidx
= 2 * (sizeof xig
)
482 + (n
+ n
/8) * sizeof(struct xinpcb
);
486 if (req
->newptr
!= 0)
490 * OK, now we're committed to doing something.
493 gencnt
= divcbinfo
.ipi_gencnt
;
494 n
= divcbinfo
.ipi_count
;
497 xig
.xig_len
= sizeof xig
;
499 xig
.xig_gen
= gencnt
;
500 xig
.xig_sogen
= so_gencnt
;
501 error
= SYSCTL_OUT(req
, &xig
, sizeof xig
);
505 inp_list
= _MALLOC(n
* sizeof *inp_list
, M_TEMP
, M_WAITOK
);
510 for (inp
= LIST_FIRST(divcbinfo
.listhead
), i
= 0; inp
&& i
< n
;
511 inp
= LIST_NEXT(inp
, inp_list
)) {
513 if (inp
->inp_gencnt
<= gencnt
)
515 if (inp
->inp_gencnt
<= gencnt
&& !prison_xinpcb(req
->p
, inp
))
523 for (i
= 0; i
< n
; i
++) {
525 if (inp
->inp_gencnt
<= gencnt
) {
527 xi
.xi_len
= sizeof xi
;
528 /* XXX should avoid extra copy */
529 bcopy(inp
, &xi
.xi_inp
, sizeof *inp
);
531 sotoxsocket(inp
->inp_socket
, &xi
.xi_socket
);
532 error
= SYSCTL_OUT(req
, &xi
, sizeof xi
);
537 * Give the user an updated idea of our state.
538 * If the generation differs from what we told
539 * her before, she knows that something happened
540 * while we were processing this request, and it
541 * might be necessary to retry.
544 xig
.xig_gen
= divcbinfo
.ipi_gencnt
;
545 xig
.xig_sogen
= so_gencnt
;
546 xig
.xig_count
= divcbinfo
.ipi_count
;
548 error
= SYSCTL_OUT(req
, &xig
, sizeof xig
);
550 FREE(inp_list
, M_TEMP
);
554 #warning Fix SYSCTL net_inet_divert
556 SYSCTL_DECL(_net_inet_divert
);
557 SYSCTL_PROC(_net_inet_divert
, OID_AUTO
, pcblist
, CTLFLAG_RD
, 0, 0,
558 div_pcblist
, "S,xinpcb", "List of active divert sockets");
561 struct pr_usrreqs div_usrreqs
= {
562 div_abort
, pru_accept_notsupp
, div_attach
, div_bind
,
563 pru_connect_notsupp
, pru_connect2_notsupp
, in_control
, div_detach
,
564 div_disconnect
, pru_listen_notsupp
, in_setpeeraddr
, pru_rcvd_notsupp
,
565 pru_rcvoob_notsupp
, div_send
, pru_sense_null
, div_shutdown
,
566 in_setsockaddr
, sosend
, soreceive
, sopoll