git.saurik.com Git - apple/xnu.git/commitdiff
snapshot: xnu-1228.12.14.tar.gz   tags: mac-os-x-1057, v1228.12.14
author     Apple <opensource@apple.com>
           Mon, 11 May 2009 20:47:03 +0000 (20:47 +0000)
committer  Apple <opensource@apple.com>
           Mon, 11 May 2009 20:47:03 +0000 (20:47 +0000)
206 files changed:
bsd/conf/MASTER
bsd/conf/Makefile.i386
bsd/conf/Makefile.ppc
bsd/dev/dtrace/dtrace.c
bsd/dev/dtrace/dtrace_ptss.c
bsd/dev/dtrace/fasttrap.c
bsd/dev/dtrace/lockstat.c
bsd/dev/dtrace/profile_prvd.c
bsd/dev/dtrace/sdt.c
bsd/dev/dtrace/systrace.c
bsd/dev/i386/sysctl.c
bsd/dev/unix_startup.c
bsd/hfs/hfs.h
bsd/hfs/hfs_encodings.c
bsd/hfs/hfs_link.c
bsd/hfs/hfs_vfsops.c
bsd/hfs/hfs_vfsutils.c
bsd/kern/kdebug.c
bsd/kern/kern_credential.c
bsd/kern/kern_exec.c
bsd/kern/kern_lockf.c
bsd/kern/kern_mib.c
bsd/kern/kern_sysctl.c
bsd/kern/kpi_socketfilter.c
bsd/kern/mach_loader.c
bsd/kern/pthread_synch.c
bsd/kern/uipc_mbuf.c
bsd/net/if.c
bsd/net/if_var.h
bsd/net/radix.c
bsd/net/radix.h
bsd/net/route.c
bsd/net/route.h
bsd/net/rtsock.c
bsd/netinet/in.c
bsd/netinet/in.h
bsd/netinet/in_arp.c
bsd/netinet/in_gif.c
bsd/netinet/in_pcb.c
bsd/netinet/in_pcb.h
bsd/netinet/in_rmx.c
bsd/netinet/in_var.h
bsd/netinet/ip_divert.c
bsd/netinet/ip_dummynet.c
bsd/netinet/ip_dummynet.h
bsd/netinet/ip_flow.c
bsd/netinet/ip_fw2.c
bsd/netinet/ip_fw2.h
bsd/netinet/ip_icmp.c
bsd/netinet/ip_input.c
bsd/netinet/ip_output.c
bsd/netinet/ip_var.h
bsd/netinet/kpi_ipfilter.c
bsd/netinet/raw_ip.c
bsd/netinet/tcp_input.c
bsd/netinet/tcp_output.c
bsd/netinet/tcp_subr.c
bsd/netinet/tcp_timer.c
bsd/netinet/tcp_usrreq.c
bsd/netinet/tcp_var.h
bsd/netinet/udp_usrreq.c
bsd/netinet6/icmp6.c
bsd/netinet6/in6_pcb.c
bsd/netinet6/in6_rmx.c
bsd/netinet6/ip6_fw.c
bsd/netinet6/ip6_output.c
bsd/sys/disk.h
bsd/sys/dtrace.h
bsd/sys/fcntl.h
bsd/sys/lockf.h
bsd/sys/lockstat.h
bsd/sys/mbuf.h
bsd/sys/vnode.h
bsd/sys/vnode_internal.h
bsd/vfs/kpi_vfs.c
bsd/vfs/vfs_journal.c
bsd/vfs/vfs_lookup.c
bsd/vfs/vfs_subr.c
bsd/vfs/vfs_syscalls.c
bsd/vfs/vfs_vnops.c
bsd/vfs/vfs_xattr.c
config/Libkern.exports
config/Libkern.i386.exports
config/Libkern.ppc.exports
config/Makefile
config/MasterVersion
config/System6.0.exports
config/System6.0.i386.exports
config/System6.0.ppc.exports
iokit/IOKit/IOBufferMemoryDescriptor.h
iokit/IOKit/IOMemoryDescriptor.h
iokit/Kernel/IOBufferMemoryDescriptor.cpp
iokit/Kernel/IOCatalogue.cpp
iokit/Kernel/IOHibernateIO.cpp
iokit/Kernel/IOMemoryDescriptor.cpp
iokit/Kernel/IOTimerEventSource.cpp
iokit/Kernel/IOUserClient.cpp
kgmacros
libkern/Makefile
libkern/c++/OSMetaClass.cpp
libkern/kmod/Makefile.kmod
libsa/catalogue.cpp
libsyscall/BSDmakefile
libsyscall/Makefile
libsyscall/Makefile.xbs
libsyscall/mach/Makefile.inc
makedefs/MakeInc.def
makedefs/MakeInc.rule
osfmk/conf/files.i386
osfmk/i386/AT386/model_dep.c
osfmk/i386/commpage/bcopy_sse3x.s
osfmk/i386/commpage/bcopy_sse3x_64.s
osfmk/i386/commpage/bcopy_sse42.s [new file with mode: 0644]
osfmk/i386/commpage/bcopy_sse42_64.s [new file with mode: 0644]
osfmk/i386/commpage/bzero_sse2.s
osfmk/i386/commpage/bzero_sse2_64.s
osfmk/i386/commpage/bzero_sse42.s [new file with mode: 0644]
osfmk/i386/commpage/bzero_sse42_64.s [new file with mode: 0644]
osfmk/i386/commpage/commpage_asm.s
osfmk/i386/commpage/commpage_mach_absolute_time.s
osfmk/i386/cpu_data.h
osfmk/i386/cpu_threads.c
osfmk/i386/cpu_topology.c
osfmk/i386/cpu_topology.h
osfmk/i386/cpuid.c
osfmk/i386/cpuid.h
osfmk/i386/etimer.c
osfmk/i386/i386_lock.s
osfmk/i386/i386_vm_init.c
osfmk/i386/lapic.c
osfmk/i386/lapic.h
osfmk/i386/machine_check.c
osfmk/i386/machine_check.h
osfmk/i386/machine_routines.c
osfmk/i386/machine_routines_asm.s
osfmk/i386/mp.c
osfmk/i386/mp.h
osfmk/i386/mp_desc.c
osfmk/i386/mp_events.h
osfmk/i386/pcb.c
osfmk/i386/pmCPU.c
osfmk/i386/pmCPU.h
osfmk/i386/pmap.c
osfmk/i386/proc_reg.h
osfmk/i386/rtclock.c
osfmk/i386/rtclock.h
osfmk/i386/tsc.c
osfmk/i386/tsc.h
osfmk/i386/vmx/vmx_cpu.c
osfmk/ipc/ipc_kmsg.h
osfmk/ipc/ipc_mqueue.c
osfmk/ipc/ipc_mqueue.h
osfmk/ipc/ipc_notify.c
osfmk/ipc/ipc_right.c
osfmk/ipc/mach_msg.c
osfmk/kern/ast.c
osfmk/kern/call_entry.h
osfmk/kern/clock.h
osfmk/kern/debug.c
osfmk/kern/debug.h
osfmk/kern/hibernate.c
osfmk/kern/host.c
osfmk/kern/ipc_mig.c
osfmk/kern/kmod.c
osfmk/kern/machine.c
osfmk/kern/misc_protos.h
osfmk/kern/printf.c
osfmk/kern/priority.c
osfmk/kern/processor.c
osfmk/kern/processor.h
osfmk/kern/processor_data.c
osfmk/kern/processor_data.h
osfmk/kern/sched.h
osfmk/kern/sched_prim.c
osfmk/kern/stack.c
osfmk/kern/startup.c
osfmk/kern/syscall_subr.c
osfmk/kern/thread_call.c
osfmk/kern/thread_call.h
osfmk/kern/timer_call.c
osfmk/kern/timer_call.h
osfmk/kern/timer_queue.h [new file with mode: 0644]
osfmk/kern/zalloc.c
osfmk/mach/Makefile
osfmk/mach/kext_panic_report.h [new file with mode: 0644]
osfmk/mach/kmod.h
osfmk/mach/machine.h
osfmk/mach/port.h
osfmk/ppc/Diagnostics.c
osfmk/ppc/cpu.c
osfmk/ppc/etimer.c
osfmk/ppc/exception.h
osfmk/ppc/machine_routines.c
osfmk/ppc/model_dep.c
osfmk/ppc/ppc_init.c
osfmk/ppc/rtclock.c
osfmk/ppc/rtclock.h
osfmk/vm/memory_object.c
osfmk/vm/vm_apple_protect.c
osfmk/vm/vm_fault.c
osfmk/vm/vm_init.c
pexpert/i386/pe_misc.s
pexpert/pexpert/pexpert.h
security/mac_framework.h
security/mac_policy.h
security/mac_vfs.c

diff --git a/bsd/conf/MASTER b/bsd/conf/MASTER
index 5419f96bba63b432a207200e2a1193f7df8f41b3..9459048eb2e6bc1bcb5f25a0d166baa2cbfa82a8 100644 (file)
@@ -165,6 +165,7 @@ options             CONFIG_SOWUPCALL        # SB_UPCALL on sowwakeup        # <config_sowupcall>
 options                CONFIG_FORCE_OUT_IFP    # Force IP output to use an interface # <config_force_out_ifp>
 options                CONFIG_MBUF_NOEXPAND    # limit mbuf expansion  # <config_mbuf_noexpand>
 options                CONFIG_MBUF_JUMBO       # jumbo cluster pool    # <config_mbuf_jumbo>
+options                CONFIG_SCOPEDROUTING    # scoped routing on by default  # <config_scopedrouting>
 options                CONFIG_IP_EDGEHOLE      # Drop tagged packets at EDGE interface # <config_ip_edgehole>
 
 options                CONFIG_WORKQUEUE        # <config_workqueue>
diff --git a/bsd/conf/Makefile.i386 b/bsd/conf/Makefile.i386
index 07c0222081ba1c8b518f14a9bf36ff7474827b49..ec78b385ce9394d6825052d1b5002cc9564f1e89 100644 (file)
@@ -50,7 +50,6 @@ OBJS_NO_WERROR =              \
        ip_fw2_compat.o         \
        kpi_ipfilter.o          \
        in_gif.o                \
-       in_pcb.o                \
        ip_divert.o             \
        ip_dummynet.o           \
        ip_icmp.o               \
diff --git a/bsd/conf/Makefile.ppc b/bsd/conf/Makefile.ppc
index ac870fd86b36136f24291fe08171737c64470abd..89d8109668aad4c7b2984b4e91885ead0e5af5ac 100644 (file)
@@ -51,7 +51,6 @@ OBJS_NO_WERROR =              \
        ip_fw2_compat.o         \
        kpi_ipfilter.o          \
        in_gif.o                \
-       in_pcb.o                \
        ip_divert.o             \
        ip_dummynet.o           \
        ip_icmp.o               \
diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c
index eebffddbbe8321269119e4502c98b2433edaf5f9..bdbe6a8746acf34b8caf87bd2286425b469227c7 100644 (file)
@@ -970,6 +970,8 @@ dtrace_priv_proc_common_zone(dtrace_state_t *state)
 
        return (0);
 #else
+#pragma unused(state)
+
        return 1; /* Darwin doesn't do zones. */
 #endif /* __APPLE__ */
 }
@@ -1124,7 +1126,7 @@ dtrace_dynvar_clean(dtrace_dstate_t *dstate)
        dtrace_dstate_percpu_t *dcpu;
        int i, work = 0;
 
-       for (i = 0; i < NCPU; i++) {
+       for (i = 0; i < (int)NCPU; i++) {
                dcpu = &dstate->dtds_percpu[i];
 
                ASSERT(dcpu->dtdsc_rinsing == NULL);
@@ -1174,7 +1176,7 @@ dtrace_dynvar_clean(dtrace_dstate_t *dstate)
 
        dtrace_sync();
 
-       for (i = 0; i < NCPU; i++) {
+       for (i = 0; i < (int)NCPU; i++) {
                dcpu = &dstate->dtds_percpu[i];
 
                if (dcpu->dtdsc_rinsing == NULL)
@@ -1519,7 +1521,7 @@ retry:
                                case DTRACE_DSTATE_CLEAN: {
                                        void *sp = &dstate->dtds_state;
 
-                                       if (++cpu >= NCPU)
+                                       if (++cpu >= (int)NCPU)
                                                cpu = 0;
 
                                        if (dcpu->dtdsc_dirty != NULL &&
@@ -1667,6 +1669,7 @@ retry:
 static void
 dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
 {
+#pragma unused(arg)
        if (nval < *oval)
                *oval = nval;
 }
@@ -1675,6 +1678,7 @@ dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
 static void
 dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
 {
+#pragma unused(arg)
        if (nval > *oval)
                *oval = nval;
 }
@@ -1744,6 +1748,7 @@ dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
 static void
 dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
 {
+#pragma unused(arg)
        data[0]++;
        data[1] += nval;
 }
@@ -1752,6 +1757,7 @@ dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
 static void
 dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
 {
+#pragma unused(nval,arg)
        *oval = *oval + 1;
 }
 
@@ -1759,6 +1765,7 @@ dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
 static void
 dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
 {
+#pragma unused(arg)
        *oval += nval;
 }
 
@@ -1773,6 +1780,7 @@ static void
 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
     intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
 {
+#pragma unused(arg)
        dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
        uint32_t i, ndx, size, fsize;
        uint32_t align = sizeof (uint64_t) - 1;
@@ -3532,7 +3540,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
                 * string -- setting a bit in the map for every character
                 * found in the token string.
                 */
-               for (i = 0; i < sizeof (tokmap); i++)
+               for (i = 0; i < (int)sizeof (tokmap); i++)
                        tokmap[i] = 0;
 
                for (; tokaddr < toklimit; tokaddr++) {
@@ -4578,7 +4586,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
                                size_t sz = v->dtdv_type.dtdt_size;
 
                                sz += sizeof (uint64_t);
-                               ASSERT(svar->dtsv_size == NCPU * sz);
+                               ASSERT(svar->dtsv_size == (int)NCPU * sz);
                                a += CPU->cpu_id * sz;
 
                                if (*(uint8_t *)a == UINT8_MAX) {
@@ -4595,7 +4603,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
                                break;
                        }
 
-                       ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
+                       ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
                        tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
                        regs[rd] = tmp[CPU->cpu_id];
                        break;
@@ -4617,7 +4625,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
                                size_t sz = v->dtdv_type.dtdt_size;
 
                                sz += sizeof (uint64_t);
-                               ASSERT(svar->dtsv_size == NCPU * sz);
+                               ASSERT(svar->dtsv_size == (int)NCPU * sz);
                                a += CPU->cpu_id * sz;
 
                                if (regs[rd] == NULL) {
@@ -4633,7 +4641,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
                                break;
                        }
 
-                       ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
+                       ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
                        tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
                        tmp[CPU->cpu_id] = regs[rd];
                        break;
@@ -5403,7 +5411,7 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
 #ifdef lint
                uint64_t val = 0;
 #else
-               uint64_t val;
+               uint64_t val = 0;
 #endif
 
                mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
@@ -6535,6 +6543,7 @@ dtrace_match_string(const char *s, const char *p, int depth)
 static int
 dtrace_match_nul(const char *s, const char *p, int depth)
 {
+#pragma unused(s,p,depth)
        return (1); /* always match the empty pattern */
 }
 
@@ -6542,6 +6551,7 @@ dtrace_match_nul(const char *s, const char *p, int depth)
 static int
 dtrace_match_nonzero(const char *s, const char *p, int depth)
 {
+#pragma unused(p,depth)
        return (s != NULL && s[0] != '\0');
 }
 
@@ -7296,7 +7306,6 @@ dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
        }
 
        do {
-               kmod_info_t *ktl;
                /*
                 * First, call the blanket provide operation.
                 */
@@ -7322,10 +7331,10 @@ dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
 
                lck_mtx_unlock(&mod_lock);
 #else
-#if 0 /* XXX Workaround for PR_4643546 XXX */
+#if 0 /* FIXME: Workaround for PR_4643546 */
                simple_lock(&kmod_lock);
                
-               ktl = kmod;
+               kmod_info_t *ktl = kmod;
                while (ktl) {
                        prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ktl);
                        ktl = ktl->next;
@@ -8561,10 +8570,10 @@ dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
                        svarp = &vstate->dtvs_locals;
 
                        if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
-                               dsize = NCPU * (v->dtdv_type.dtdt_size +
+                               dsize = (int)NCPU * (v->dtdv_type.dtdt_size +
                                    sizeof (uint64_t));
                        else
-                               dsize = NCPU * sizeof (uint64_t);
+                               dsize = (int)NCPU * sizeof (uint64_t);
 
                        break;
 
@@ -9100,7 +9109,7 @@ dtrace_ecb_resize(dtrace_ecb_t *ecb)
                         */
                        diff = offs + sizeof (dtrace_aggid_t);
 
-                       if (diff = (diff & (sizeof (uint64_t) - 1)))
+                       if ((diff = (diff & (sizeof (uint64_t) - 1))))
                                offs += sizeof (uint64_t) - diff;
 
                        aggbase = offs - sizeof (dtrace_aggid_t);
@@ -9795,12 +9804,12 @@ dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
                 * of creating our own (saving both time and space).
                 */
                dtrace_ecb_t *cached = dtrace_ecb_create_cache;
-               dtrace_action_t *act = cached->dte_action;
+               dtrace_action_t *act_if = cached->dte_action;
 
-               if (act != NULL) {
-                       ASSERT(act->dta_refcnt > 0);
-                       act->dta_refcnt++;
-                       ecb->dte_action = act;
+               if (act_if != NULL) {
+                       ASSERT(act_if->dta_refcnt > 0);
+                       act_if->dta_refcnt++;
+                       ecb->dte_action = act_if;
                        ecb->dte_action_last = cached->dte_action_last;
                        ecb->dte_needed = cached->dte_needed;
                        ecb->dte_size = cached->dte_size;
@@ -9961,7 +9970,7 @@ dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
                return (EFBIG);
 
 #if defined(__APPLE__)
-       if (size > (sane_size / 8) / NCPU) /* As in kdbg_set_nkdbufs(), roughly. */
+       if (size > (sane_size / 8) / (int)NCPU) /* As in kdbg_set_nkdbufs(), roughly. */
                return (ENOMEM);
 #endif /* __APPLE__ */
 
@@ -10056,7 +10065,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
        intptr_t offs = buf->dtb_offset, soffs;
        intptr_t woffs;
        caddr_t tomax;
-       size_t total;
+       size_t total_off;
 
        if (buf->dtb_flags & DTRACEBUF_INACTIVE)
                return (-1);
@@ -10100,7 +10109,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
                goto out;
        }
 
-       total = needed + (offs & (align - 1));
+       total_off = needed + (offs & (align - 1));
 
        /*
         * For a ring buffer, life is quite a bit more complicated.  Before
@@ -10109,15 +10118,15 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
         * is required.)
         */
        if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
-           offs + total > buf->dtb_size) {
+           offs + total_off > buf->dtb_size) {
                woffs = buf->dtb_xamot_offset;
 
-               if (offs + total > buf->dtb_size) {
+               if (offs + total_off > buf->dtb_size) {
                        /*
                         * We can't fit in the end of the buffer.  First, a
                         * sanity check that we can fit in the buffer at all.
                         */
-                       if (total > buf->dtb_size) {
+                       if (total_off > buf->dtb_size) {
                                dtrace_buffer_drop(buf);
                                return (-1);
                        }
@@ -10160,7 +10169,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
                         * that the top of the buffer is aligned.
                         */
                        offs = 0;
-                       total = needed;
+                       total_off = needed;
                        buf->dtb_flags |= DTRACEBUF_WRAPPED;
                } else {
                        /*
@@ -10186,7 +10195,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
                        }
                }
 
-               while (offs + total > woffs) {
+               while (offs + total_off > woffs) {
                        dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
                        size_t size;
 
@@ -10226,7 +10235,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
                                if (offs == 0) {
                                        buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
                                        buf->dtb_offset = 0;
-                                       woffs = total;
+                                       woffs = total_off;
 
                                        while (woffs < buf->dtb_size)
                                                tomax[woffs++] = 0;
@@ -10333,7 +10342,7 @@ dtrace_buffer_free(dtrace_buffer_t *bufs)
 {
        int i;
 
-       for (i = 0; i < NCPU; i++) {
+       for (i = 0; i < (int)NCPU; i++) {
                dtrace_buffer_t *buf = &bufs[i];
 
                if (buf->dtb_tomax == NULL) {
@@ -10714,7 +10723,7 @@ static int
 dtrace_enabling_matchstate(dtrace_state_t *state, int *nmatched)
 {
        dtrace_enabling_t *enab;
-       int matched, total = 0, err;
+       int matched, total_matched = 0, err;
 
        lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
        lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
@@ -10728,11 +10737,11 @@ dtrace_enabling_matchstate(dtrace_state_t *state, int *nmatched)
                if ((err = dtrace_enabling_match(enab, &matched)) != 0)
                        return (err);
 
-               total += matched;
+               total_matched += matched;
        }
 
        if (nmatched != NULL)
-               *nmatched = total;
+               *nmatched = total_matched;
 
        return (0);
 }
@@ -10824,6 +10833,7 @@ dtrace_enabling_provide(dtrace_provider_t *prv)
 static void
 dtrace_dof_error(dof_hdr_t *dof, const char *str)
 {
+#pragma unused(dof)
        if (dtrace_err_verbose)
                cmn_err(CE_WARN, "failed to process DOF: %s", str);
 
@@ -11155,7 +11165,7 @@ dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
        size_t ttl = 0;
        dof_difohdr_t *dofd;
        uintptr_t daddr = (uintptr_t)dof;
-       size_t max = dtrace_difo_maxsize;
+       size_t max_size = dtrace_difo_maxsize;
        int i, l, n;
 
        static const struct {
@@ -11220,7 +11230,7 @@ dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
                    dofd->dofd_links[l])) == NULL)
                        goto err; /* invalid section link */
 
-               if (ttl + subsec->dofs_size > max) {
+               if (ttl + subsec->dofs_size > max_size) {
                        dtrace_dof_error(dof, "exceeds maximum size");
                        goto err;
                }
@@ -11887,7 +11897,7 @@ static
 int
 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
 {
-       size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
+       size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize;
        void *base;
        uintptr_t limit;
        dtrace_dynvar_t *dvar, *next, *start;
@@ -11901,8 +11911,8 @@ dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
        if ((dstate->dtds_chunksize = chunksize) == 0)
                dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
 
-       if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
-               size = min;
+       if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
+               size = min_size;
 
        if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
                return (ENOMEM);
@@ -11910,7 +11920,7 @@ dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
        dstate->dtds_size = size;
        dstate->dtds_base = base;
        dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
-       bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
+       bzero(dstate->dtds_percpu, (int)NCPU * sizeof (dtrace_dstate_percpu_t));
 
        hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
 
@@ -11941,10 +11951,10 @@ dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
            ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
        limit = (uintptr_t)base + size;
 
-       maxper = (limit - (uintptr_t)start) / NCPU;
+       maxper = (limit - (uintptr_t)start) / (int)NCPU;
        maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
 
-       for (i = 0; i < NCPU; i++) {
+       for (i = 0; i < (int)NCPU; i++) {
                dstate->dtds_percpu[i].dtdsc_free = dvar = start;
 
                /*
@@ -11954,7 +11964,7 @@ dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
                 * whatever is left over.  In either case, we set the limit to
                 * be the limit of the dynamic variable space.
                 */
-               if (maxper == 0 || i == NCPU - 1) {
+               if (maxper == 0 || i == (int)NCPU - 1) {
                        limit = (uintptr_t)base + size;
                        start = NULL;
                } else {
@@ -12071,7 +12081,7 @@ dtrace_state_create(dev_t *devp, cred_t *cr)
        char c[30];
        dtrace_state_t *state;
        dtrace_optval_t *opt;
-       int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
+       int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
 
        lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
        lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
@@ -12310,7 +12320,7 @@ static int
 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
 {
        dtrace_optval_t *opt = state->dts_options, size;
-       processorid_t cpu;
+       processorid_t cpu = 0;
        int flags = 0, rval;
 
        lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
@@ -12430,7 +12440,7 @@ dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
        dtrace_buffer_t *buf;
        cyc_handler_t hdlr;
        cyc_time_t when;
-       int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
+       int rval = 0, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
        dtrace_icookie_t cookie;
 
        lck_mtx_lock(&cpu_lock);
@@ -12808,7 +12818,7 @@ dtrace_state_destroy(dtrace_state_t *state)
        dtrace_ecb_t *ecb;
        dtrace_vstate_t *vstate = &state->dts_vstate;
        minor_t minor = getminor(state->dts_dev);
-       int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
+       int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
        dtrace_speculation_t *spec = state->dts_speculations;
        int nspec = state->dts_nspeculations;
        uint32_t match;
@@ -13100,7 +13110,7 @@ dtrace_helper_trace(dtrace_helper_action_t *helper,
                if ((svar = vstate->dtvs_locals[i]) == NULL)
                        continue;
 
-               ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
+               ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t));
                ent->dtht_locals[i] =
                    ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
        }
@@ -13113,7 +13123,7 @@ dtrace_helper(int which, dtrace_mstate_t *mstate,
        uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
        uint64_t sarg0 = mstate->dtms_arg[0];
        uint64_t sarg1 = mstate->dtms_arg[1];
-       uint64_t rval;
+       uint64_t rval = 0;
        dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
        dtrace_helper_action_t *helper;
        dtrace_vstate_t *vstate;
@@ -13262,7 +13272,7 @@ dtrace_helper_destroygen(proc_t* p, int gen)
         * given generation number.
         */
        for (;;) {
-               dtrace_helper_provider_t *prov;
+               dtrace_helper_provider_t *prov = NULL;
 
                /*
                 * Look for a helper provider with the right generation. We
@@ -14840,7 +14850,7 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
            1, INT_MAX, 0);
 
        dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
-           sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
+           sizeof (dtrace_dstate_percpu_t) * (int)NCPU, DTRACE_STATE_ALIGN,
            NULL, NULL, NULL, NULL, NULL, 0);
 
        lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
@@ -15075,6 +15085,7 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
 static int
 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
 {
+#pragma unused(flag,otyp,cred_p)
        minor_t minor = getminor(dev);
        dtrace_state_t *state;
 
@@ -15294,6 +15305,8 @@ dtrace_ioctl_helper(int cmd, caddr_t arg, int *rv)
 static int
 dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
 {
+#pragma unused(md)
+
        minor_t minor = getminor(dev);
        dtrace_state_t *state;
        int rval;
@@ -15798,7 +15811,7 @@ dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
                if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
                        return (EFAULT);
 
-               if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
+               if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= (int)NCPU)
                        return (EINVAL);
 
                lck_mtx_lock(&dtrace_lock);
@@ -15964,7 +15977,7 @@ dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
                nerrs = state->dts_errors;
                dstate = &state->dts_vstate.dtvs_dynvars;
 
-               for (i = 0; i < NCPU; i++) {
+               for (i = 0; i < (int)NCPU; i++) {
                        dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
 
                        stat.dtst_dyndrops += dcpu->dtdsc_drops;
diff --git a/bsd/dev/dtrace/dtrace_ptss.c b/bsd/dev/dtrace/dtrace_ptss.c
index 8e2ec272e66e6861fa9251ad5daa5b5156a6b025..f4503c9efef09a72520e89ff1b217f458536120c 100644 (file)
@@ -161,6 +161,16 @@ dtrace_ptss_allocate_page(struct proc* p)
        mach_vm_address_t addr = 0LL;
        mach_vm_size_t size = PAGE_SIZE; // We need some way to assert that this matches vm_map_round_page() !!!
 
+#if CONFIG_EMBEDDED
+       /* The embedded OS has extra permissions for writable and executable pages. We can't pass in the flags
+        * we need for the correct permissions from mach_vm_allocate, so we need to call mach_vm_map directly. */
+       vm_map_offset_t map_addr = 0;
+       kern_return_t kr = mach_vm_map(map, &map_addr, size, 0, VM_FLAGS_ANYWHERE, IPC_PORT_NULL, 0, FALSE, VM_PROT_READ|VM_PROT_EXECUTE, VM_PROT_READ|VM_PROT_EXECUTE, VM_INHERIT_DEFAULT);
+       if (kr != KERN_SUCCESS) {
+               goto err;
+       }
+       addr = map_addr;
+#else
        kern_return_t kr = mach_vm_allocate(map, &addr, size, VM_FLAGS_ANYWHERE);
        if (kr != KERN_SUCCESS) {
                goto err;
@@ -171,6 +181,7 @@ dtrace_ptss_allocate_page(struct proc* p)
                mach_vm_deallocate(map, addr, size);
                goto err;
        }       
+#endif
 
        // Chain the page entries.
        int i;
diff --git a/bsd/dev/dtrace/fasttrap.c b/bsd/dev/dtrace/fasttrap.c
index b0828e6dc517a0b3853508fe6afd714111d0218f..3cb1b62e6d62ba95141208e6d144884e4cc7afc9 100644 (file)
@@ -1771,6 +1771,7 @@ fasttrap_add_probe(fasttrap_probe_spec_t *pdata)
                        tp->ftt_pc = pdata->ftps_offs[i] + pdata->ftps_pc;
                        tp->ftt_pid = pdata->ftps_pid;
 
+
                        pp->ftp_tps[0].fit_tp = tp;
                        pp->ftp_tps[0].fit_id.fti_probe = pp;
 #if defined(__APPLE__)
@@ -2368,6 +2369,8 @@ fasttrap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
         * Yes, this is a WAG.
         */
        fasttrap_max = (sane_size >> 28) * 100000;
+       if (fasttrap_max == 0)
+               fasttrap_max = 50000;
 #endif
        fasttrap_total = 0;
 
diff --git a/bsd/dev/dtrace/lockstat.c b/bsd/dev/dtrace/lockstat.c
index 3c5602be9d45bdb86be9857d67baad26bcddcd4a..82539d98bfe7efc937551a078788401da5def91f 100644 (file)
@@ -183,6 +183,7 @@ vm_offset_t *assembly_probes[] = {
  */
 void lockstat_hot_patch(boolean_t active)
 {
+#pragma unused(active)
        int i;
 
 
@@ -224,6 +225,7 @@ static dtrace_provider_id_t lockstat_id;
 static void
 lockstat_enable(void *arg, dtrace_id_t id, void *parg)
 {
+#pragma unused(arg)
        lockstat_probe_t *probe = parg;
 
        ASSERT(!lockstat_probemap[probe->lsp_probe]);
@@ -243,6 +245,7 @@ lockstat_enable(void *arg, dtrace_id_t id, void *parg)
 static void
 lockstat_disable(void *arg, dtrace_id_t id, void *parg)
 {
+#pragma unused(arg,id)
        lockstat_probe_t *probe = parg;
        int i;
 
@@ -272,6 +275,7 @@ lockstat_disable(void *arg, dtrace_id_t id, void *parg)
 static void
 lockstat_provide(void *arg, const dtrace_probedesc_t *desc)
 {
+#pragma unused(arg,desc)
        int i = 0;
 
        for (i = 0; lockstat_probes[i].lsp_func != NULL; i++) {
@@ -293,6 +297,7 @@ lockstat_provide(void *arg, const dtrace_probedesc_t *desc)
 static void
 lockstat_destroy(void *arg, dtrace_id_t id, void *parg)
 {
+#pragma unused(arg,id)
        lockstat_probe_t *probe = parg;
 
        ASSERT(!lockstat_probemap[probe->lsp_probe]);
diff --git a/bsd/dev/dtrace/profile_prvd.c b/bsd/dev/dtrace/profile_prvd.c
index 14895f8d983e8fd09e8f0263148be40930cfaf0f..cd561c2df27180c1a67899b7bd2785fa10f2586d 100644 (file)
@@ -206,6 +206,7 @@ profile_fire(void *arg)
            CPU->cpu_profile_upc, late, 0, 0);
 #else
 #if defined(__ppc__) || defined(__ppc64__)
+       {
        struct savearea *sv = find_kern_regs(current_thread());
 
        if (sv) {
@@ -218,7 +219,9 @@ profile_fire(void *arg)
                dtrace_probe(prof->prof_id, 0xcafebabe,
                0x0, late, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */
        }
+       }
 #elif defined(__i386__) || defined(__x86_64__)
+       {
        x86_saved_state32_t *kern_regs = find_kern_regs(current_thread());
 
        if (NULL != kern_regs) {
@@ -242,6 +245,7 @@ profile_fire(void *arg)
                        dtrace_probe(prof->prof_id, 0x0, regs->eip, 0, 0, 0);
                }       
        }
+       }
 #else
 #error Unknown architecture
 #endif
@@ -258,6 +262,7 @@ profile_tick(void *arg)
            CPU->cpu_profile_upc, 0, 0, 0);
 #else
 #if defined(__ppc__) || defined(__ppc64__)
+       {
        struct savearea *sv = find_kern_regs(current_thread());
 
        if (sv) {
@@ -270,7 +275,9 @@ profile_tick(void *arg)
                dtrace_probe(prof->prof_id, 0xcafebabe,
                0x0, 0, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */
        }
+       }
 #elif defined(__i386__) || defined(__x86_64__)
+       {
        x86_saved_state32_t *kern_regs = find_kern_regs(current_thread());
 
        if (NULL != kern_regs) {
@@ -294,6 +301,7 @@ profile_tick(void *arg)
                        dtrace_probe(prof->prof_id, 0x0, regs->eip, 0, 0, 0);
                }       
        }
+       }
 #else
 #error Unknown architecture
 #endif
diff --git a/bsd/dev/dtrace/sdt.c b/bsd/dev/dtrace/sdt.c
index 640bfae341d87fba6c9930b3c7a3476638f46f46..946c6a4c617d21f2de878e741efd8d1fd71a311e 100644 (file)
@@ -657,6 +657,7 @@ void
 sdt_provide_module(void *arg, struct modctl *ctl)
 {
 #pragma unused(ctl)
+#pragma unused(arg)
     __sdt_provide_module(arg, &g_sdt_kernctl);
 
        sdt_probedesc_t *sdpd = g_sdt_mach_module.sdt_probes;
diff --git a/bsd/dev/dtrace/systrace.c b/bsd/dev/dtrace/systrace.c
index 35601e943fec93c912b68554450657e5e7abda5f..52362b6401973c4238c438bc8ffac4f6a1900863 100644 (file)
@@ -161,8 +161,12 @@ dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv)
        // Bounds "check" the value of code a la unix_syscall
        sy = (code >= NUM_SYSENT) ? &systrace_sysent[63] : &systrace_sysent[code];
 
-       if ((id = sy->stsy_entry) != DTRACE_IDNONE)
-               (*systrace_probe)(id, *ip, *(ip+1), *(ip+2), *(ip+3), *(ip+4));
+       if ((id = sy->stsy_entry) != DTRACE_IDNONE) {
+               if (ip)
+                       (*systrace_probe)(id, *ip, *(ip+1), *(ip+2), *(ip+3), *(ip+4));
+               else
+                       (*systrace_probe)(id, 0, 0, 0, 0, 0);
+       }
 
 #if 0 /* XXX */
        /*
diff --git a/bsd/dev/i386/sysctl.c b/bsd/dev/i386/sysctl.c
index 2637c26545866835692453b25d42a607b9d61cf7..3cb481c40d525b6415b919bbc46bafa861767438 100644 (file)
@@ -101,6 +101,59 @@ hw_cpu_logical_per_package SYSCTL_HANDLER_ARGS
                          sizeof(cpu_info->cpuid_logical_per_package));
 }
 
+static int
+hw_cpu_sysctl_nehalem SYSCTL_HANDLER_ARGS
+{
+       i386_cpu_info_t *cpu_info = cpuid_info();
+
+       if (cpu_info->cpuid_model != 26)
+               return ENOENT;
+
+       return hw_cpu_sysctl(oidp, arg1, arg2, req);
+}
+
+static int
+hw_cpu_flex_ratio_desired SYSCTL_HANDLER_ARGS
+{
+       __unused struct sysctl_oid *unused_oidp = oidp;
+       __unused void *unused_arg1 = arg1;
+       __unused int unused_arg2 = arg2;
+       i386_cpu_info_t *cpu_info = cpuid_info();
+
+       if (cpu_info->cpuid_model != 26)
+               return ENOENT;
+
+       return SYSCTL_OUT(req, &flex_ratio, sizeof(flex_ratio));
+}
+
+static int
+hw_cpu_flex_ratio_min SYSCTL_HANDLER_ARGS
+{
+       __unused struct sysctl_oid *unused_oidp = oidp;
+       __unused void *unused_arg1 = arg1;
+       __unused int unused_arg2 = arg2;
+       i386_cpu_info_t *cpu_info = cpuid_info();
+
+       if (cpu_info->cpuid_model != 26)
+               return ENOENT;
+
+       return SYSCTL_OUT(req, &flex_ratio_min, sizeof(flex_ratio_min));
+}
+
+static int
+hw_cpu_flex_ratio_max SYSCTL_HANDLER_ARGS
+{
+       __unused struct sysctl_oid *unused_oidp = oidp;
+       __unused void *unused_arg1 = arg1;
+       __unused int unused_arg2 = arg2;
+       i386_cpu_info_t *cpu_info = cpuid_info();
+
+       if (cpu_info->cpuid_model != 26)
+               return ENOENT;
+
+       return SYSCTL_OUT(req, &flex_ratio_max, sizeof(flex_ratio_max));
+}
+
 SYSCTL_NODE(_machdep, OID_AUTO, cpu, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
        "CPU info");
 
@@ -353,6 +406,23 @@ SYSCTL_PROC(_machdep_cpu, OID_AUTO, thread_count,
            sizeof(uint32_t),
            hw_cpu_sysctl, "I", "Number of enabled threads per package");
 
+SYSCTL_NODE(_machdep_cpu, OID_AUTO, flex_ratio, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
+       "Flex ratio");
+
+SYSCTL_PROC(_machdep_cpu_flex_ratio, OID_AUTO, desired,
+           CTLTYPE_INT | CTLFLAG_RD, 
+           0, 0,
+           hw_cpu_flex_ratio_desired, "I", "Flex ratio desired (0 disabled)");
+
+SYSCTL_PROC(_machdep_cpu_flex_ratio, OID_AUTO, min,
+           CTLTYPE_INT | CTLFLAG_RD, 
+           0, 0,
+           hw_cpu_flex_ratio_min, "I", "Flex ratio min (efficiency)");
+
+SYSCTL_PROC(_machdep_cpu_flex_ratio, OID_AUTO, max,
+           CTLTYPE_INT | CTLFLAG_RD, 
+           0, 0,
+           hw_cpu_flex_ratio_max, "I", "Flex ratio max (non-turbo)");
 
 uint64_t pmap_pv_hashlist_walks;
 uint64_t pmap_pv_hashlist_cnts;
diff --git a/bsd/dev/unix_startup.c b/bsd/dev/unix_startup.c
index d2dd20b11e1997d6b22dd74824492db65ea36a3f..1522646eafd573c356f8a16b95a6a2924f7c29db 100644 (file)
@@ -46,6 +46,7 @@
 #include <sys/vnode.h>
 #include <sys/sysctl.h>
 #include <dev/ppc/cons.h>
+#include <pexpert/pexpert.h>
 
 extern vm_map_t mb_map;
 
@@ -81,6 +82,7 @@ SYSCTL_INT (_kern, OID_AUTO, maxnbuf, CTLFLAG_RW, &max_nbuf_headers, 0, "");
 __private_extern__ int customnbuf = 0;
 int             srv = 0;       /* Flag indicates a server boot when set */
 int             ncl = 0;
+static unsigned int mbuf_poolsz;
 
 vm_map_t        buffer_map;
 vm_map_t        bufferhdr_map;
@@ -209,6 +211,9 @@ bsd_bufferinit(void)
        bufinit();
 }
 
+/* 512 MB hard limit on size of the mbuf pool */
+#define MAX_MBUF_POOL   (512 << MBSHIFT)
+#define MAX_NCL         (MAX_MBUF_POOL >> MCLSHIFT)
 
 /*
  * this has been broken out into a separate routine that
@@ -220,8 +225,13 @@ bsd_bufferinit(void)
 int
 bsd_mbuf_cluster_reserve(void)
 {
-        if (sane_size > (64 * 1024 * 1024) || ncl) {
+       /* If called more than once, return the previously calculated size */
+        if (mbuf_poolsz != 0)
+                goto done;
+
+       PE_parse_boot_argn("ncl", &ncl, sizeof (ncl));
 
+        if (sane_size > (64 * 1024 * 1024) || ncl) {
                if ((nmbclusters = ncl) == 0) {
                        if ((nmbclusters = ((sane_size / 16)/MCLBYTES)) > 32768)
                                nmbclusters = 32768;
@@ -229,7 +239,13 @@ bsd_mbuf_cluster_reserve(void)
                /* Make sure it's not odd in case ncl is manually set */
                if (nmbclusters & 0x1)
                        --nmbclusters;
-       }
 
+                /* And obey the upper limit */
+                if (nmbclusters > MAX_NCL)
+                       nmbclusters = MAX_NCL;
+
+       }
+       mbuf_poolsz = nmbclusters << MCLSHIFT;
+done:
        return (nmbclusters * MCLBYTES);
 }
diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h
index cfed9e65dee611b3cf59128ed2d541696c5ae135..f3c12bb41e29e15473a84a243c2a0ef96aeb9d8b 100644 (file)
@@ -266,6 +266,7 @@ typedef struct hfsmount {
 
        lck_mtx_t      hfs_mutex;      /* protects access to hfsmount data */
        void          *hfs_freezing_proc;  /* who froze the fs */
+       void          *hfs_downgrading_proc; /* thread that is downgrading to rdonly */
        lck_rw_t       hfs_insync;     /* protects sync/freeze interaction */
 
        /* Resize variables: */
@@ -341,6 +342,9 @@ enum privdirtype {FILE_HARDLINKS, DIR_HARDLINKS};
 #define HFS_VIRTUAL_DEVICE        0x20000
 /* When set, we're in hfs_changefs, so hfs_sync should do nothing. */
 #define HFS_IN_CHANGEFS           0x40000
+/* When set, we are in the process of downgrading or have downgraded to read-only, 
+ * so hfs_start_transaction should return EROFS. */
+#define HFS_RDONLY_DOWNGRADE      0x80000
 
 
 /* Macro to update next allocation block in the HFS mount structure.  If 
diff --git a/bsd/hfs/hfs_encodings.c b/bsd/hfs/hfs_encodings.c
index d0e89e8d8d47c5a5aba3d71fd1172d4de43f57f5..c531aa28be47f2e52ed6101cd9b515465590b42f 100644 (file)
@@ -211,6 +211,7 @@ hfs_relconverter(u_int32_t encoding)
                                lck_mtx_unlock(&encodinglst_mutex);
  
                                FREE(encp, M_TEMP);
+                record_kext_unload(id);
                                kmod_destroy((host_priv_t) host_priv_self(), id);
                                return (0);
                        }
diff --git a/bsd/hfs/hfs_link.c b/bsd/hfs/hfs_link.c
index f6c5e8409a82dd48ffe799fdbbf3a03f217cde2e..ba47918e3f261dd7bd680a59519403d12f4db171 100644 (file)
@@ -438,6 +438,25 @@ hfs_vnop_link(struct vnop_link_args *ap)
        }
        tdcp = VTOC(tdvp);
        cp = VTOC(vp);
+       
+       /*
+        * Make sure we don't race the src or dst parent directories with rmdir.
+        * Note that we should only have a src parent directory cnode lock 
+        * if we're dealing with a directory hardlink here.
+        */
+       if (fdcp) {
+               if (fdcp->c_flag & (C_NOEXISTS | C_DELETED)) {
+                       error = ENOENT;
+                       goto out;
+               }
+       }
+       
+       if (tdcp->c_flag & (C_NOEXISTS | C_DELETED)) {
+               error = ENOENT;
+               goto out;
+       }
+       
+       /* Check src for errors: too many links, immutable, race with unlink */
        if (cp->c_linkcount >= HFS_LINK_MAX) {
                error = EMLINK;
                goto out;
diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c
index 8eac4e20e796cea10527b6cb2eefe63c120e310b..b2e71a03485f9ac3355314cadc74ebc60818b42e 100644 (file)
@@ -220,17 +220,32 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte
                    vfs_isrdonly(mp)) {
                        int flags;
 
+                       /* Set flag to indicate that a downgrade to read-only
+                        * is in progress and therefore block any further 
+                        * modifications to the file system.
+                        */
+                       hfs_global_exclusive_lock_acquire(hfsmp);
+                       hfsmp->hfs_flags |= HFS_RDONLY_DOWNGRADE;
+                       hfsmp->hfs_downgrading_proc = current_thread();
+                       hfs_global_exclusive_lock_release(hfsmp);
+
                        /* use VFS_SYNC to push out System (btree) files */
                        retval = VFS_SYNC(mp, MNT_WAIT, context);
-                       if (retval && ((cmdflags & MNT_FORCE) == 0))
+                       if (retval && ((cmdflags & MNT_FORCE) == 0)) {
+                               hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
+                               hfsmp->hfs_downgrading_proc = NULL;
                                goto out;
+                       }
                
                        flags = WRITECLOSE;
                        if (cmdflags & MNT_FORCE)
                                flags |= FORCECLOSE;
                                
-                       if ((retval = hfs_flushfiles(mp, flags, p)))
+                       if ((retval = hfs_flushfiles(mp, flags, p))) {
+                               hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
+                               hfsmp->hfs_downgrading_proc = NULL;
                                goto out;
+                       }
 
                        /* mark the volume cleanly unmounted */
                        hfsmp->vcbAtrb |= kHFSVolumeUnmountedMask;
@@ -248,6 +263,8 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte
                                }
                        }
                        if (retval) {
+                               hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
+                               hfsmp->hfs_downgrading_proc = NULL;
                                hfsmp->hfs_flags &= ~HFS_READ_ONLY;
                                goto out;
                        }
@@ -263,6 +280,8 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte
 
                            hfs_global_exclusive_lock_release(hfsmp);
                        }
+
+                       hfsmp->hfs_downgrading_proc = NULL;
                }
 
                /* Change to a writable file system. */
@@ -317,6 +336,13 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte
                        /* Only clear HFS_READ_ONLY after a successful write */
                        hfsmp->hfs_flags &= ~HFS_READ_ONLY;
 
+                       /* If this mount point was downgraded from read-write 
+                        * to read-only, clear that information as we are now 
+                        * moving back to read-write.
+                        */
+                       hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
+                       hfsmp->hfs_downgrading_proc = NULL;
+
                        /* mark the volume dirty (clear clean unmount bit) */
                        hfsmp->vcbAtrb &= ~kHFSVolumeUnmountedMask;
 
@@ -885,8 +911,13 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args,
         * block size to be 4k if there are more than 31-bits
         * worth of blocks but to insure compatibility with
         * pre-Tiger systems we have to do it.
+        *
+        * If the device size is not a multiple of 4K (8 * 512), then
+        * switching the logical block size isn't going to help because
+        * we will be unable to write the alternate volume header.
+        * In this case, just leave the logical block size unchanged.
         */
-       if (log_blkcnt > 0x000000007fffffff) {
+       if (log_blkcnt > 0x000000007fffffff && (log_blkcnt & 7) == 0) {
                minblksize = log_blksize = 4096;
                if (phys_blksize < log_blksize)
                        phys_blksize = log_blksize;
@@ -1024,6 +1055,8 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args,
                        }
                        hfsmp->hfs_logical_block_size = log_blksize;
                        hfsmp->hfs_logical_block_count = log_blkcnt;
+                       hfsmp->hfs_physical_block_size = log_blksize;
+                       hfsmp->hfs_log_per_phys = 1;
                }
                if (args) {
                        hfsmp->hfs_encoding = args->hfs_encoding;
@@ -1078,6 +1111,11 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args,
                                hfsmp->hfs_logical_block_count *=
                                    hfsmp->hfs_logical_block_size / log_blksize;
                                hfsmp->hfs_logical_block_size = log_blksize;
+                               
+                               /* Update logical/physical block size */
+                               hfsmp->hfs_physical_block_size = log_blksize;
+                               phys_blksize = log_blksize;
+                               hfsmp->hfs_log_per_phys = 1;
                        }
 
                        disksize = (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.blockCount) *
@@ -1218,6 +1256,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args,
                        /* Note: relative block count adjustment (in case this is an embedded volume). */
                        hfsmp->hfs_logical_block_count *= hfsmp->hfs_logical_block_size / log_blksize;
                        hfsmp->hfs_logical_block_size = log_blksize;
+                       hfsmp->hfs_log_per_phys = hfsmp->hfs_physical_block_size / log_blksize;
  
                        if (hfsmp->jnl) {
                            // close and re-open this with the new block size
@@ -3155,9 +3194,6 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context)
                /* If ioctl is not supported, force physical and logical sector size to be same */
                phys_sectorsize = sectorsize;
        }
-       if (phys_sectorsize != hfsmp->hfs_physical_block_size) {
-               return (ENXIO);
-       }
        oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
 
        /*
@@ -4493,19 +4529,29 @@ end_iteration:
        /* Now move any files that are in the way. */
        for (i = 0; i < filecnt; ++i) {
                struct vnode * rvp;
+        struct cnode * cp;
 
                if (hfs_vget(hfsmp, cnidbufp[i], &vp, 0) != 0)
                        continue;
 
+        /* Relocating directory hard links is not supported, so we
+         * punt (see radar 6217026). */
+        cp = VTOC(vp);
+        if ((cp->c_flag & C_HARDLINK) && vnode_isdir(vp)) {
+            printf("hfs_reclaimspace: unable to relocate directory hard link %d\n", cp->c_cnid);
+            error = EINVAL;
+            goto out;
+        }
+
                /* Relocate any data fork blocks. */
-               if (VTOF(vp)->ff_blocks > 0) {
+               if (VTOF(vp) && VTOF(vp)->ff_blocks > 0) {
                        error = hfs_relocate(vp, hfsmp->hfs_metazone_end + 1, kauth_cred_get(), current_proc());
                }
                if (error) 
                        break;
 
                /* Relocate any resource fork blocks. */
-               if ((VTOC((vp))->c_blocks - VTOF((vp))->ff_blocks) > 0) {
+               if ((cp->c_blocks - (VTOF(vp) ? VTOF((vp))->ff_blocks : 0)) > 0) {
                        error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE);
                        if (error)
                                break;
@@ -4514,7 +4560,7 @@ end_iteration:
                        if (error)
                                break;
                }
-               hfs_unlock(VTOC(vp));
+               hfs_unlock(cp);
                vnode_put(vp);
                vp = NULL;
 
diff --git a/bsd/hfs/hfs_vfsutils.c b/bsd/hfs/hfs_vfsutils.c
index 43e5ae8be942e7b46ba2332a411a2083546e953d..ce577ec74165a91d876ad56cfc7e3aef5e61b3b1 100644 (file)
@@ -346,6 +346,7 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
         */
        if (blockSize < hfsmp->hfs_physical_block_size) {
                hfsmp->hfs_physical_block_size = hfsmp->hfs_logical_block_size;
+               hfsmp->hfs_log_per_phys = 1;
        }
 
        /*
@@ -438,14 +439,18 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
        retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork,
                                 &hfsmp->hfs_extents_vp);
        if (retval)
+       {
                goto ErrorExit;
+       }
        hfsmp->hfs_extents_cp = VTOC(hfsmp->hfs_extents_vp);
        hfs_unlock(hfsmp->hfs_extents_cp);
 
        retval = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_extents_vp),
                                          (KeyCompareProcPtr) CompareExtentKeysPlus));
        if (retval)
+       {
                goto ErrorExit;
+       }
        /*
         * Set up Catalog B-tree vnode
         */ 
@@ -2372,6 +2377,16 @@ hfs_start_transaction(struct hfsmount *hfsmp)
        unlock_on_err = 1;
     }
 
+       /* If a downgrade to read-only mount is in progress, no other
+        * process than the downgrade process is allowed to modify 
+        * the file system.
+        */
+       if ((hfsmp->hfs_flags & HFS_RDONLY_DOWNGRADE) && 
+                       (hfsmp->hfs_downgrading_proc != thread)) {
+               ret = EROFS;
+               goto out;
+       }
+
     if (hfsmp->jnl) {
        ret = journal_start_transaction(hfsmp->jnl);
        if (ret == 0) {
@@ -2381,6 +2396,7 @@ hfs_start_transaction(struct hfsmount *hfsmp)
        ret = 0;
     }
 
+out:
     if (ret != 0 && unlock_on_err) {
        lck_rw_unlock_shared(&hfsmp->hfs_global_lock);
     }
diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c
index 7377435358ca785d5148746628e36094185ec332..ee4b63f4013b8a99500d0dbe8dd536e754e498a3 100644 (file)
@@ -1043,17 +1043,27 @@ kdbg_control_chud(int val, void *fn)
 
        
 int
-kdbg_control(int *name, __unused u_int namelen, user_addr_t where, size_t *sizep)
+kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep)
 {
         int ret=0;
        size_t size=*sizep;
-       unsigned int value = name[1];
+       unsigned int value = 0;
        kd_regtype kd_Reg;
        kbufinfo_t kd_bufinfo;
        pid_t curpid;
        struct proc *p, *curproc;
 
-
+       if (name[0] == KERN_KDGETENTROPY ||
+               name[0] == KERN_KDEFLAGS ||
+               name[0] == KERN_KDDFLAGS ||
+               name[0] == KERN_KDENABLE ||
+               name[0] == KERN_KDSETBUF) {
+               
+               if ( namelen < 2 )
+                       return(EINVAL);
+               value = name[1];
+       }
+       
        kdbg_lock_init();
 
        if ( !(kdebug_flags & KDBG_LOCKINIT))
diff --git a/bsd/kern/kern_credential.c b/bsd/kern/kern_credential.c
index 89d2e8dbf2ddfc777cd164b00cf685d5e6ed61bf..c7b4ca7a896f360961f6fc781eded5d1766ff6fd 100644 (file)
@@ -3239,9 +3239,15 @@ kauth_cred_label_update(kauth_cred_t cred, struct label *label)
  *             vp                              The exec vnode
  *             scriptl                         The script MAC label
  *             execl                           The executable MAC label
+ *             disjointp                       Pointer to flag to set if old
+ *                                             and returned credentials are
+ *                                             disjoint
  *
  * Returns:    (kauth_cred_t)                  The updated credential
  *
+ * Implicit returns:
+ *             *disjointp                      Set to 1 for disjoint creds
+ *
  * IMPORTANT:  This function is implemented via kauth_cred_update(), which,
  *             if it returns a credential other than the one it is passed,
  *             will have dropped the reference on the passed credential.  All
@@ -3257,7 +3263,8 @@ kauth_cred_label_update(kauth_cred_t cred, struct label *label)
 static
 kauth_cred_t
 kauth_cred_label_update_execve(kauth_cred_t cred, vfs_context_t ctx,
-       struct vnode *vp, struct label *scriptl, struct label *execl)
+       struct vnode *vp, struct label *scriptl, struct label *execl,
+       int *disjointp)
 {
        kauth_cred_t newcred;
        struct ucred temp_cred;
@@ -3266,8 +3273,8 @@ kauth_cred_label_update_execve(kauth_cred_t cred, vfs_context_t ctx,
 
        mac_cred_label_init(&temp_cred);
        mac_cred_label_associate(cred, &temp_cred);
-       mac_cred_label_update_execve(ctx, &temp_cred, 
-                                    vp, scriptl, execl);
+       *disjointp = mac_cred_label_update_execve(ctx, &temp_cred, 
+                                                    vp, scriptl, execl);
 
        newcred = kauth_cred_update(cred, &temp_cred, TRUE);
        mac_cred_label_destroy(&temp_cred);
@@ -3349,14 +3356,21 @@ int kauth_proc_label_update(struct proc *p, struct label *label)
  *             scriptl                 The script MAC label
  *             execl                   The executable MAC label
  *
+ * Returns:    0                       Label update did not make credential
+ *                                     disjoint
+ *             1                       Label update caused credential to be
+ *                                     disjoint
+ *
  * Notes:      The credential associated with the process WILL change as a
  *             result of this call.  The caller should not assume the process
  *             reference to the old credential still exists.
  */
-int kauth_proc_label_update_execve(struct proc *p, vfs_context_t ctx,
+int
+kauth_proc_label_update_execve(struct proc *p, vfs_context_t ctx,
        struct vnode *vp, struct label *scriptl, struct label *execl)
 {
        kauth_cred_t my_cred, my_new_cred;
+       int disjoint = 0;
 
        my_cred = kauth_cred_proc_ref(p);
 
@@ -3372,7 +3386,7 @@ int kauth_proc_label_update_execve(struct proc *p, vfs_context_t ctx,
                 * passed in.  The subsequent compare is safe, because it is
                 * a pointer compare rather than a contents compare.
                 */
-               my_new_cred = kauth_cred_label_update_execve(my_cred, ctx, vp, scriptl, execl);
+               my_new_cred = kauth_cred_label_update_execve(my_cred, ctx, vp, scriptl, execl, &disjoint);
                if (my_cred != my_new_cred) {
 
                        DEBUG_CRED_CHANGE("kauth_proc_label_update_execve_unlocked CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags);
@@ -3400,7 +3414,7 @@ int kauth_proc_label_update_execve(struct proc *p, vfs_context_t ctx,
        /* Drop old proc reference or our extra reference */
        kauth_cred_unref(&my_cred);
        
-       return (0);
+       return (disjoint);
 }
 
 #if 1
diff --git a/bsd/kern/kern_exec.c b/bsd/kern/kern_exec.c
index 43ab48894ec26a8a6dc295f16e6db945ee4a27fa..71dd14c128373537f1cbe708c4737ca36eb63f5d 100644 (file)
@@ -2543,24 +2543,33 @@ exec_handle_sugid(struct image_params *imgp)
        kauth_cred_t            cred = vfs_context_ucred(imgp->ip_vfs_context);
        proc_t                  p = vfs_context_proc(imgp->ip_vfs_context);
        int                     i;
-       int                     is_member = 0;
+       int                     leave_sugid_clear = 0;
        int                     error = 0;
        struct vnode    *dev_null = NULLVP;
-#if CONFIG_MACF
-       kauth_cred_t    my_cred;
-#endif
-
 #if CONFIG_MACF
        int                     mac_transition;
-       mac_transition = mac_cred_check_label_update_execve(imgp->ip_vfs_context, imgp->ip_vp,
-           imgp->ip_scriptlabelp, imgp->ip_execlabelp, p);
+
+       /*
+        * Determine whether a call to update the MAC label will result in the
+        * credential changing.
+        *
+        * Note:        MAC policies which do not actually end up modifying
+        *              the label subsequently are strongly encouraged to
+        *              return 0 for this check, since a non-zero answer will
+        *              slow down the exec fast path for normal binaries.
+        */
+       mac_transition = mac_cred_check_label_update_execve(
+                                                       imgp->ip_vfs_context,
+                                                       imgp->ip_vp,
+                                                       imgp->ip_scriptlabelp,
+                                                       imgp->ip_execlabelp, p);
 #endif
 
        OSBitAndAtomic(~((uint32_t)P_SUGID), (UInt32 *)&p->p_flag);
 
        /*
         * Order of the following is important; group checks must go last,
-        * as we use the success of the 'is_member' check combined with the
+        * as we use the success of the 'ismember' check combined with the
         * failure of the explicit match to indicate that we will be setting
         * the egid of the process even though the new process did not
         * require VSUID/VSGID bits in order for it to set the new group as
@@ -2574,13 +2583,15 @@ exec_handle_sugid(struct image_params *imgp)
         */
        if (((imgp->ip_origvattr->va_mode & VSUID) != 0 &&
             kauth_cred_getuid(cred) != imgp->ip_origvattr->va_uid) ||
-#if CONFIG_MACF
-               mac_transition ||       /* A policy wants to transition */
-#endif
            ((imgp->ip_origvattr->va_mode & VSGID) != 0 &&
-                ((kauth_cred_ismember_gid(cred, imgp->ip_origvattr->va_gid, &is_member) || !is_member) ||
+                ((kauth_cred_ismember_gid(cred, imgp->ip_origvattr->va_gid, &leave_sugid_clear) || !leave_sugid_clear) ||
                 (cred->cr_gid != imgp->ip_origvattr->va_gid)))) {
 
+#if CONFIG_MACF
+/* label for MAC transition and neither VSUID nor VSGID */
+handle_mac_transition:
+#endif
+
                /*
                 * Replace the credential with a copy of itself if euid or
                 * egid change.
@@ -2606,28 +2617,36 @@ exec_handle_sugid(struct image_params *imgp)
 
 #if CONFIG_MACF
                /* 
-                * XXXMAC: In FreeBSD, we set P_SUGID on a MAC transition
-                * to protect against debuggers being attached by an 
-                * insufficiently privileged process onto the result of
-                * a transition to a more privileged credential.  This is
-                * too conservative on FreeBSD, but we need to do
-                * something similar here, or risk vulnerability.
-                *
-                * Before we make the call into the MAC policies, get a new
+                * If a policy has indicated that it will transition the label,
+                * before making the call into the MAC policies, get a new
                 * duplicate credential, so they can modify it without
                 * modifying any others sharing it.
                 */
-               if (mac_transition && !imgp->ip_no_trans) { 
-                       kauth_proc_label_update_execve(p,
-                               imgp->ip_vfs_context,
-                               imgp->ip_vp, 
-                               imgp->ip_scriptlabelp, imgp->ip_execlabelp);
+               if (mac_transition) { 
+                       kauth_cred_t    my_cred;
+                       if (kauth_proc_label_update_execve(p,
+                                               imgp->ip_vfs_context,
+                                               imgp->ip_vp, 
+                                               imgp->ip_scriptlabelp,
+                                               imgp->ip_execlabelp)) {
+                               /*
+                                * If updating the MAC label resulted in a
+                                * disjoint credential, flag that we need to
+                                * set the P_SUGID bit.  This protects
+                                * against debuggers being attached by an
+                                * insufficiently privileged process onto the
+                                * result of a transition to a more privileged
+                                * credential.
+                                */
+                               leave_sugid_clear = 0;
+                       }
 
                        my_cred = kauth_cred_proc_ref(p);
                        mac_task_label_update_cred(my_cred, p->task);
                        kauth_cred_unref(&my_cred);
                }
-#endif
+#endif /* CONFIG_MACF */
+
                /*
                 * Have mach reset the task and thread ports.
                 * We don't want anyone who had the ports before
@@ -2640,13 +2659,15 @@ exec_handle_sugid(struct image_params *imgp)
                }
 
                /*
-                * If 'is_member' is non-zero, then we passed the VSUID and
-                * MACF checks, and successfully determined that the previous
-                * cred was a member of the VSGID group, but that it was not
-                * the default at the time of the execve.  So we don't set the
-                * P_SUGID on the basis of simply running this code.
+                * If 'leave_sugid_clear' is non-zero, then we passed the
+                * VSUID and MACF checks, and successfully determined that
+                * the previous cred was a member of the VSGID group, but
+                * that it was not the default at the time of the execve,
+                * and that the post-labelling credential was not disjoint.
+                * So we don't set the P_SUGID on the basis of simply
+                * running this code.
                 */
-               if (!is_member)
+               if (!leave_sugid_clear)
                        OSBitOrAtomic(P_SUGID, (UInt32 *)&p->p_flag);
 
                /* Cache the vnode for /dev/null the first time around */
@@ -2713,6 +2734,21 @@ exec_handle_sugid(struct image_params *imgp)
                        dev_null = NULLVP;
                }
        }
+#if CONFIG_MACF
+       else {
+               /*
+                * We are here because we were told that the MAC label will
+                * be transitioned, and the binary is not VSUID or VSGID; to
+                * deal with this case, we could either duplicate a lot of
+                * code or simply indicate that we want the P_SUGID bit
+                * left clear by default and jump back up.
+                */
+               if (mac_transition) {
+                       leave_sugid_clear = 1;
+                       goto handle_mac_transition;
+               }
+       }
+#endif /* CONFIG_MACF */
 
        /*
         * Implement the semantic where the effective user and group become
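
Editor's note: the interplay of leave_sugid_clear, the MAC transition, and the new goto is easier to verify as a truth table. Below is a stand-alone model, not the kernel code, and it deliberately ignores the VSGID group-membership subtlety; it shows when P_SUGID ends up set:

#include <stdio.h>

/* Model: returns 1 if P_SUGID should be set.  Inputs mirror the checks
 * in exec_handle_sugid(): a set[ug]id mismatch, a MAC transition, and
 * whether the label update produced a disjoint credential. */
static int
sets_p_sugid(int sugid_binary, int mac_transition, int disjoint)
{
	int leave_sugid_clear = 0;

	if (!sugid_binary) {
		if (!mac_transition)
			return (0);	/* plain binary: fast path */
		leave_sugid_clear = 1;	/* MAC-only transition */
	}
	if (mac_transition && disjoint)
		leave_sugid_clear = 0;	/* disjoint cred: protect it */
	return (!leave_sugid_clear);
}

int
main(void)
{
	printf("suid binary, no MAC:           %d\n", sets_p_sugid(1, 0, 0));
	printf("MAC transition, not disjoint:  %d\n", sets_p_sugid(0, 1, 0));
	printf("MAC transition, disjoint cred: %d\n", sets_p_sugid(0, 1, 1));
	return (0);
}
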
index 4e61180b6a492e0980c21a1d6c9fd72d98c58c82..7269357e438ac48709acbd74f5f8da28223a64e2 100644 (file)
@@ -131,7 +131,15 @@ static struct lockf *lf_getblock(struct lockf *);
 static int      lf_getlock(struct lockf *, struct flock *);
 static int      lf_setlock(struct lockf *);
 static int      lf_split(struct lockf *, struct lockf *);
-static void     lf_wakelock(struct lockf *);
+static void     lf_wakelock(struct lockf *, boolean_t);
+
+
+/*
+ * In order to mitigate risk, don't switch to the new wake-one method
+ * unless there are at least this many waiters to wake up.
+ */
+#define SAFE_WAITER_LIMIT    20
 
 
 /*
@@ -259,9 +267,13 @@ lf_advlock(struct vnop_advlock_args *ap)
        lock->lf_type = fl->l_type;
        lock->lf_head = head;
        lock->lf_next = (struct lockf *)0;
+       lock->lf_waiters = 0;
        TAILQ_INIT(&lock->lf_blkhd);
        lock->lf_flags = ap->a_flags;
 
+       if (ap->a_flags & F_FLOCK)
+               lock->lf_flags |= F_WAKE1_SAFE;
+
        lck_mtx_lock(&vp->v_lock);      /* protect the lockf list */
        /*
         * Do the requested operation.
@@ -502,6 +514,11 @@ lf_setlock(struct lockf *lock)
                 */
                lock->lf_next = block;
                TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
+               block->lf_waiters++;
+
+               if ( !(lock->lf_flags & F_FLOCK))
+                       block->lf_flags &= ~F_WAKE1_SAFE;
+
 #ifdef LOCKF_DEBUGGING
                if (lockf_debug & 1) {
                        lf_print("lf_setlock: blocking on", block);
@@ -509,6 +526,20 @@ lf_setlock(struct lockf *lock)
                }
 #endif /* LOCKF_DEBUGGING */
                error = msleep(lock, &vp->v_lock, priority, lockstr, 0);
+
+               if (!TAILQ_EMPTY(&lock->lf_blkhd)) {
+                       struct lockf *tlock;
+
+                       if ((block = lf_getblock(lock))) {
+                               TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) {
+                                       tlock->lf_next = block;
+                               }
+                               TAILQ_CONCAT(&block->lf_blkhd, &lock->lf_blkhd, lf_block);
+
+                               block->lf_waiters += lock->lf_waiters;
+                               lock->lf_waiters = 0;
+                       }
+               }
                if (error) {    /* XXX */
                        /*
                         * We may have been awakened by a signal and/or by a
@@ -520,8 +551,12 @@ lf_setlock(struct lockf *lock)
                         */
                        if (lock->lf_next) {
                                TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
+                               lock->lf_next->lf_waiters--;
                                lock->lf_next = NOLOCKF;
                        }
+                       if (!TAILQ_EMPTY(&lock->lf_blkhd))
+                               lf_wakelock(lock, TRUE);
+
                        FREE(lock, M_LOCKF);
                        return (error);
                }       /* XXX */
@@ -565,7 +600,7 @@ lf_setlock(struct lockf *lock)
                         */
                        if (lock->lf_type == F_RDLCK &&
                            overlap->lf_type == F_WRLCK)
-                               lf_wakelock(overlap);
+                               lf_wakelock(overlap, TRUE);
                        overlap->lf_type = lock->lf_type;
                        FREE(lock, M_LOCKF);
                        lock = overlap; /* for lf_coelesce_adjacent() */
@@ -595,7 +630,7 @@ lf_setlock(struct lockf *lock)
                                        return (ENOLCK);
                                }
                        }
-                       lf_wakelock(overlap);
+                       lf_wakelock(overlap, TRUE);
                        break;
 
                case OVERLAP_CONTAINED_BY_LOCK:
@@ -605,14 +640,18 @@ lf_setlock(struct lockf *lock)
                         */
                        if (lock->lf_type == F_RDLCK &&
                            overlap->lf_type == F_WRLCK) {
-                               lf_wakelock(overlap);
+                               lf_wakelock(overlap, TRUE);
                        } else {
                                while (!TAILQ_EMPTY(&overlap->lf_blkhd)) {
                                        ltmp = TAILQ_FIRST(&overlap->lf_blkhd);
                                        TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
                                            lf_block);
+                                       overlap->lf_waiters--;
+
                                        TAILQ_INSERT_TAIL(&lock->lf_blkhd,
                                            ltmp, lf_block);
+                                       lock->lf_waiters++;
+
                                        ltmp->lf_next = lock;
                                }
                        }
@@ -637,7 +676,7 @@ lf_setlock(struct lockf *lock)
                        overlap->lf_next = lock;
                        overlap->lf_end = lock->lf_start - 1;
                        prev = &lock->lf_next;
-                       lf_wakelock(overlap);
+                       lf_wakelock(overlap, TRUE);
                        needtolink = 0;
                        continue;
 
@@ -650,7 +689,7 @@ lf_setlock(struct lockf *lock)
                                lock->lf_next = overlap;
                        }
                        overlap->lf_start = lock->lf_end + 1;
-                       lf_wakelock(overlap);
+                       lf_wakelock(overlap, TRUE);
                        break;
                }
                break;
@@ -704,7 +743,7 @@ lf_clearlock(struct lockf *unlock)
                /*
                 * Wakeup the list of locks to be retried.
                 */
-               lf_wakelock(overlap);
+               lf_wakelock(overlap, FALSE);
 
                switch (ovcase) {
                case OVERLAP_NONE:      /* satisfy compiler enum/switch */
@@ -1048,19 +1087,42 @@ lf_split(struct lockf *lock1, struct lockf *lock2)
  *             in a real-world performance problem.
  */
 static void
-lf_wakelock(struct lockf *listhead)
+lf_wakelock(struct lockf *listhead, boolean_t force_all)
 {
        struct lockf *wakelock;
+       boolean_t wake_all = TRUE;
+
+       if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE) && listhead->lf_waiters > SAFE_WAITER_LIMIT)
+               wake_all = FALSE;
 
        while (!TAILQ_EMPTY(&listhead->lf_blkhd)) {
                wakelock = TAILQ_FIRST(&listhead->lf_blkhd);
                TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
+               listhead->lf_waiters--;
+
                wakelock->lf_next = NOLOCKF;
 #ifdef LOCKF_DEBUGGING
                if (lockf_debug & 2)
                        lf_print("lf_wakelock: awakening", wakelock);
 #endif /* LOCKF_DEBUGGING */
+               if (wake_all == FALSE) {
+
+                       TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block);
+                       wakelock->lf_waiters = listhead->lf_waiters;
+                       listhead->lf_waiters = 0;
+
+                       if (!TAILQ_EMPTY(&wakelock->lf_blkhd)) {
+                               struct lockf *tlock;
+
+                               TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) {
+                                       tlock->lf_next = wakelock;
+                               }
+                       }
+               }
                wakeup(wakelock);
+
+               if (wake_all == FALSE)
+                       break;
        }
 }
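
Editor's note: the wake-one handoff above (pull the first waiter off, then splice the remaining waiters onto it in O(1)) can be exercised in user space with a BSD <sys/queue.h> that provides TAILQ_CONCAT, as xnu's does. A minimal sketch; the kernel version also repoints each moved waiter's lf_next, which this model omits:

#include <stdio.h>
#include <sys/queue.h>

struct waiter {
	TAILQ_ENTRY(waiter) link;
	int id;
};
TAILQ_HEAD(whead, waiter);

int
main(void)
{
	struct whead blocked = TAILQ_HEAD_INITIALIZER(blocked);
	struct whead handoff = TAILQ_HEAD_INITIALIZER(handoff);
	struct waiter *w, wk[4];

	for (int i = 0; i < 4; i++) {
		wk[i].id = i;
		TAILQ_INSERT_TAIL(&blocked, &wk[i], link);
	}

	/* Wake-one: remove the first waiter, then hand it the rest of
	 * the queue in O(1), as lf_wakelock() does with TAILQ_CONCAT. */
	w = TAILQ_FIRST(&blocked);
	TAILQ_REMOVE(&blocked, w, link);
	TAILQ_CONCAT(&handoff, &blocked, link);

	printf("woken: %d; inherited waiters:", w->id);
	TAILQ_FOREACH(w, &handoff, link)
		printf(" %d", w->id);
	printf("\n");
	return (0);
}
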
 
index 01ac7e637e1e1664379268f83eb495bb40161361..0788ed33ecb251d81ce380287136be31b8ff897f 100644 (file)
@@ -654,6 +654,4 @@ sysctl_mib_init(void)
 # warning we do not support this platform yet
 #endif /* __ppc__ */
 
-
 }
-
index b8673b4bc0f862b19e35f5f1dba54b1c208f4d6d..d1985d21d540fa589b10bbf8a283d9c45009c67e 100644 (file)
@@ -1588,6 +1588,9 @@ kdebug_ops(int *name, u_int namelen, user_addr_t where,
 {
        int ret=0;
 
+       if (namelen == 0)
+               return(ENOTSUP);
+
     ret = suser(kauth_cred_get(), &p->p_acflag);
        if (ret)
                return(ret);
@@ -1637,7 +1640,7 @@ sysctl_procargs2(int *name, u_int namelen, user_addr_t where,
 }
 
 static int
-sysctl_procargsx(int *name, __unused u_int namelen, user_addr_t where, 
+sysctl_procargsx(int *name, u_int namelen, user_addr_t where, 
                  size_t *sizep, proc_t cur_proc, int argc_yes)
 {
        proc_t p;
@@ -1657,6 +1660,9 @@ sysctl_procargsx(int *name, __unused u_int namelen, user_addr_t where,
        kauth_cred_t my_cred;
        uid_t uid;
 
+       if (namelen < 1)
+               return(EINVAL);
+
        if (argc_yes)
                buflen -= sizeof(int);          /* reserve first word to return argc */
 
index cefe3047326b8b6be6f5173cf816845d714c0312..76cb302f78ebe66363c9f86e380302bf8e12225f 100644 (file)
@@ -36,6 +36,8 @@
 #include <kern/locks.h>
 #include <net/kext_net.h>
 
+#include <libkern/libkern.h>
+
 #include <string.h>
 
 static struct socket_filter_list       sock_filter_head;
@@ -327,8 +329,7 @@ sflt_detach_private(
                        lck_mtx_unlock(sock_filter_lock);
                        return;
                }
-       }
-       else {
+       } else {
                /*
                 * Clear the removing flag. We will perform the detach here or
                 * request a delayed detach.  Since we do an extra ref release
@@ -344,9 +345,19 @@ sflt_detach_private(
        if (entry->sfe_socket->so_filteruse != 0) {
                entry->sfe_flags |= SFEF_DETACHUSEZERO;
                lck_mtx_unlock(sock_filter_lock);
+
+               if (unregistering) {
+#if DEBUG
+                       printf("sflt_detach_private unregistering SFEF_DETACHUSEZERO "
+                               "so%p so_filteruse %u so_usecount %d\n",
+                               entry->sfe_socket, entry->sfe_socket->so_filteruse, 
+                               entry->sfe_socket->so_usecount);
+#endif
+                       socket_unlock(entry->sfe_socket, 0);    
+               }
+
                return;
-       }
-       else {
+       } else {
                /*
                 * Check if we are removing the last attached filter and
                 * the parent filter is being unregistered.
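
Editor's note: the SFEF_DETACHUSEZERO handling above is a deferred-teardown idiom: if the socket is still in use, record the detach as a pending flag and let whoever drops the use count to zero complete it (the new unlock just keeps the unregister path from leaving the socket locked while deferring). A generic sketch with invented names:

#include <stdio.h>

struct obj {
	int	in_use;		/* like so_filteruse */
	int	detach_pending;	/* like SFEF_DETACHUSEZERO */
};

static void
do_detach(struct obj *o)
{
	printf("detached (in_use=%d)\n", o->in_use);
	o->detach_pending = 0;
}

static void
request_detach(struct obj *o)
{
	if (o->in_use != 0) {
		o->detach_pending = 1;	/* defer: still busy */
		return;
	}
	do_detach(o);
}

static void
release_use(struct obj *o)
{
	if (--o->in_use == 0 && o->detach_pending)
		do_detach(o);		/* last user finishes the job */
}

int
main(void)
{
	struct obj o = { 2, 0 };

	request_detach(&o);	/* deferred: two users remain */
	release_use(&o);	/* still one user */
	release_use(&o);	/* triggers the detach */
	return (0);
}
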
index 123339d3aacc15abbaaee7853b490d76f7fd51d3..f6ec97d7df7c80387bf29c2fcfae3371ce418b80 100644 (file)
@@ -568,10 +568,13 @@ parse_machfile(
                                        (struct encryption_info_command *) lcp,
                                        addr, map, vp);
                                if (ret != LOAD_SUCCESS) {
-                                       printf("proc %d: set unprotect error %d "
+                                       printf("proc %d: set_code_unprotect() error %d "
                                               "for file \"%s\"\n",
                                               p->p_pid, ret, vp->v_name);
-                                       ret = LOAD_SUCCESS; /* ignore error */
+                                       /* Don't let the app run if it's 
+                                        * encrypted but we failed to set up the
+                                        * decrypter */
+                                        psignal(p, SIGKILL);
                                }
                                break;
 #endif
@@ -1451,7 +1454,7 @@ set_code_unprotect(
                        cryptname="com.apple.null";
                        break;
                default:
-                       return LOAD_FAILURE;
+                       return LOAD_BADMACHO;
        }
        
        len = MAXPATHLEN;
@@ -1463,9 +1466,9 @@ set_code_unprotect(
        kr=text_crypter_create(&crypt_info, cryptname, (void*)vpath);
        
        if(kr) {
-               printf("set_code_unprotect: unable to find decrypter %s, kr=%d\n",
+               printf("set_code_unprotect: unable to create decrypter %s, kr=%d\n",
                       cryptname, kr);
-               return LOAD_FAILURE;
+               return LOAD_RESOURCE;
        }
        
        /* this is terrible, but we have to rescan the load commands to find the
@@ -1509,12 +1512,16 @@ set_code_unprotect(
        }
        
        /* if we get here, did not find anything */
-       return LOAD_FAILURE;
+       return LOAD_BADMACHO;
        
 remap_now:
        /* now remap using the decrypter */
        kr = vm_map_apple_protected(map, map_offset, map_offset+map_size, &crypt_info);
-       if(kr) printf("set_code_unprotect(): mapping failed with %x\n", kr);
+       if(kr) {
+               printf("set_code_unprotect(): mapping failed with %x\n", kr);
+               crypt_info.crypt_end(crypt_info.crypt_ops);
+               return LOAD_PROTECT;
+       }
        
        return LOAD_SUCCESS;
 }
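
Editor's note: two of the fixes above are error-path hygiene: map each failure to a distinct LOAD_* code, and release crypt_info (via crypt_end) when the mapping step fails after the decrypter was created. The same unwind idiom in a generic, hedged sketch; acquire/release are invented stand-ins, not the crypter API:

#include <stdio.h>
#include <stdlib.h>

static int  acquire(void **h) { *h = malloc(1); return (*h == NULL); }
static void release(void *h)  { free(h); }

static int
use_resource(int fail_late, void **out)
{
	void *h;
	int err;

	if ((err = acquire(&h)) != 0)
		return (err);		/* nothing to undo yet */
	if (fail_late) {
		release(h);		/* undo before bailing out */
		return (-1);
	}
	*out = h;			/* success: caller owns it */
	return (0);
}

int
main(void)
{
	void *h = NULL;

	printf("fail path: %d\n", use_resource(1, &h));
	if (use_resource(0, &h) == 0) {
		printf("success path: got resource\n");
		release(h);
	}
	return (0);
}
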
index 4ccfd04fcb4389e2958505b0ed98d402eb514215..be4e9c165e8e2782dd739d2703f6e9c42941b728 100644 (file)
@@ -1556,6 +1556,9 @@ workq_ops(struct proc *p, struct workq_ops_args  *uap, __unused register_t *retv
 
                        KERNEL_DEBUG(0xefffd008 | DBG_FUNC_NONE, (int)item, 0, 0, 0, 0);
 
+                       if ((prio < 0) || (prio >= 5))
+                               return (EINVAL);
+
                        workqueue_lock_spin(p);
 
                        if ((wq = (struct workqueue *)p->p_wqptr) == NULL) {
@@ -1568,6 +1571,9 @@ workq_ops(struct proc *p, struct workq_ops_args  *uap, __unused register_t *retv
                        break;
                case WQOPS_QUEUE_REMOVE: {
 
+                       if ((prio < 0) || (prio >= 5))
+                               return (EINVAL);
+
                        workqueue_lock_spin(p);
 
                        if ((wq = (struct workqueue *)p->p_wqptr) == NULL) {
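
Editor's note: both WQOPS cases now reject prio outside [0, 5) before taking the workqueue lock, which suggests prio indexes a fixed-size array of five priority queues. A sketch of the same guard in isolation; WQ_NPRI is an assumption inferred from the check, not taken from a header:

#include <stdio.h>
#include <errno.h>

#define WQ_NPRI	5			/* assumed from the (prio >= 5) check */

static int queue_len[WQ_NPRI];

static int
wq_queue_add(int prio)
{
	/* Validate untrusted user input before it indexes the array. */
	if (prio < 0 || prio >= WQ_NPRI)
		return (EINVAL);
	queue_len[prio]++;
	return (0);
}

int
main(void)
{
	printf("add(2)  -> %d\n", wq_queue_add(2));
	printf("add(7)  -> %d\n", wq_queue_add(7));	/* rejected */
	printf("add(-1) -> %d\n", wq_queue_add(-1));	/* rejected */
	return (0);
}
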
index 1784e5f1d7a4893487fc5a9e5239221e50899622..0c0a278551fc6473ace82c3bbc05430941382fa0 100644 (file)
@@ -388,7 +388,6 @@ typedef struct mcl_slab {
  * whenever a new piece of memory mapped in from the VM crosses the 1MB
  * boundary.
  */
-#define        MBSHIFT         20                              /* 1MB */
 #define        NSLABSPMB       ((1 << MBSHIFT) >> MCLSHIFT)    /* 512 slabs/grp */
 
 typedef struct mcl_slabg {
index 31d3cd0825cbe6913958e10c59daf405084cfa84..2d3d48ae76d55131397fcd89a6c447b36b587613 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
 /*XXX*/
 #include <netinet/in.h>
 #include <netinet/in_var.h>
+#include <netinet/ip_var.h>
 #if INET6
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
@@ -144,6 +145,9 @@ struct      ifnethead ifnet_head = TAILQ_HEAD_INITIALIZER(ifnet_head);
 static int     if_cloners_count;
 LIST_HEAD(, if_clone) if_cloners = LIST_HEAD_INITIALIZER(if_cloners);
 
+static struct ifaddr *ifa_ifwithnet_common(const struct sockaddr *,
+    unsigned int);
+
 #if INET6
 /*
  * XXX: declare here to avoid to include many inet6 related files..
@@ -641,13 +645,77 @@ ifa_ifwithdstaddr(
        return result;
 }
 
+/*
+ * Locate the source address of an interface based on a complete address.
+ */
+struct ifaddr *
+ifa_ifwithaddr_scoped(const struct sockaddr *addr, unsigned int ifscope)
+{
+       struct ifaddr *result = NULL;
+       struct ifnet *ifp;
+
+       if (ifscope == IFSCOPE_NONE)
+               return (ifa_ifwithaddr(addr));
+
+       ifnet_head_lock_shared();
+       if (ifscope > (unsigned int)if_index) {
+               ifnet_head_done();
+               return (NULL);
+       }
+
+       ifp = ifindex2ifnet[ifscope];
+       if (ifp != NULL) {
+               struct ifaddr *ifa = NULL;
+
+               /*
+                * This is suboptimal; there should be a better way
+                * to search for a given address of an interface.
+                */
+               ifnet_lock_shared(ifp);
+               for (ifa = ifp->if_addrhead.tqh_first; ifa != NULL;
+                   ifa = ifa->ifa_link.tqe_next) {
+                       if (ifa->ifa_addr->sa_family != addr->sa_family)
+                               continue;
+                       if (equal(addr, ifa->ifa_addr)) {
+                               result = ifa;
+                               break;
+                       }
+                       if ((ifp->if_flags & IFF_BROADCAST) &&
+                           ifa->ifa_broadaddr != NULL &&
+                           /* IP6 doesn't have broadcast */
+                           ifa->ifa_broadaddr->sa_len != 0 &&
+                           equal(ifa->ifa_broadaddr, addr)) {
+                               result = ifa;
+                               break;
+                       }
+               }
+               if (result != NULL)
+                       ifaref(result);
+               ifnet_lock_done(ifp);
+       }
+       ifnet_head_done();
+
+       return (result);
+}
+
+struct ifaddr *
+ifa_ifwithnet(const struct sockaddr *addr)
+{
+       return (ifa_ifwithnet_common(addr, IFSCOPE_NONE));
+}
+
+struct ifaddr *
+ifa_ifwithnet_scoped(const struct sockaddr *addr, unsigned int ifscope)
+{
+       return (ifa_ifwithnet_common(addr, ifscope));
+}
+
 /*
  * Find an interface on a specific network.  If many, choice
  * is most specific found.
  */
-struct ifaddr *
-ifa_ifwithnet(
-       const struct sockaddr *addr)
+static struct ifaddr *
+ifa_ifwithnet_common(const struct sockaddr *addr, unsigned int ifscope)
 {
        struct ifnet *ifp;
        struct ifaddr *ifa = NULL;
@@ -655,6 +723,9 @@ ifa_ifwithnet(
        u_int af = addr->sa_family;
        const char *addr_data = addr->sa_data, *cplim;
 
+       if (!ip_doscopedroute || addr->sa_family != AF_INET)
+               ifscope = IFSCOPE_NONE;
+
        ifnet_head_lock_shared();
        /*
         * AF_LINK addresses can be looked up directly by their index number,
@@ -711,6 +782,14 @@ next:                              continue;
                        } else
 #endif /* __APPLE__*/
                        {
+                               /*
+                                * If we're looking up with a scope,
+                                * find using a matching interface.
+                                */
+                               if (ifscope != IFSCOPE_NONE &&
+                                   ifp->if_index != ifscope)
+                                       continue;
+
                                /*
                                 * if we have a special address handler,
                                 * then use it instead of the generic one.
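
Editor's note: all of the *_scoped lookups introduced here share one dispatch shape: IFSCOPE_NONE falls through to the classic global search, while a nonzero scope restricts matching to the interface with that index. A stand-alone model; the table and entry type are invented for illustration:

#include <stdio.h>
#include <string.h>

#define IFSCOPE_NONE	0

struct ent { unsigned int ifindex; const char *addr; };

static const struct ent table[] = {
	{ 1, "10.0.0.1" }, { 2, "10.0.0.1" }, { 2, "192.168.1.5" },
};

static const struct ent *
lookup(const char *addr, unsigned int ifscope)
{
	for (size_t i = 0; i < sizeof (table) / sizeof (table[0]); i++) {
		if (strcmp(table[i].addr, addr) != 0)
			continue;
		/* Scoped query: only the named interface may match. */
		if (ifscope != IFSCOPE_NONE && table[i].ifindex != ifscope)
			continue;
		return (&table[i]);
	}
	return (NULL);
}

int
main(void)
{
	const struct ent *e;

	e = lookup("10.0.0.1", IFSCOPE_NONE);	/* first match wins */
	printf("unscoped: if%u\n", e->ifindex);
	e = lookup("10.0.0.1", 2);		/* restricted to if2 */
	printf("scoped:   if%u\n", e->ifindex);
	return (0);
}
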
index 02735e846b6a9d236240bb7258a5e6f61e433c51..f26aebe00073b9b0b115ca667465899534b04b18 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -672,10 +672,14 @@ void      ifma_reference(struct ifmultiaddr *ifma);
 void   ifma_release(struct ifmultiaddr *ifma);
 
 struct ifaddr *ifa_ifwithaddr(const struct sockaddr *);
+struct ifaddr *ifa_ifwithaddr_scoped(const struct sockaddr *, unsigned int);
 struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *);
 struct ifaddr *ifa_ifwithnet(const struct sockaddr *);
+struct ifaddr *ifa_ifwithnet_scoped(const struct sockaddr *, unsigned int);
 struct ifaddr *ifa_ifwithroute(int, const struct sockaddr *, const struct sockaddr *);
 struct ifaddr *ifa_ifwithroute_locked(int, const struct sockaddr *, const struct sockaddr *);
+struct ifaddr *ifa_ifwithroute_scoped_locked(int, const struct sockaddr *,
+    const struct sockaddr *, unsigned int);
 struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *);
 struct ifaddr *ifa_ifpgetprimary(struct ifnet *, int);
 void   ifafree(struct ifaddr *);
index 36aa3bc0d6dd5c8d868f1e5eb4bb406e9c28bbcf..876675d542b5086d68e28f5770b768395e67d3c4 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -112,8 +112,10 @@ static int rn_lexobetter(void *m_arg, void *n_arg);
 static struct radix_mask *
                rn_new_radix_mask(struct radix_node *tt,
                                       struct radix_mask *next);
-static int     rn_satsifies_leaf(char *trial, struct radix_node *leaf,
-                                      int skip);
+static int rn_satisfies_leaf(char *trial, struct radix_node *leaf, int skip,
+    rn_matchf_t *f, void *w);
+
+#define        RN_MATCHF(rn, f, arg)   (f == NULL || (*f)((rn), arg))
 
 /*
  * The data structure for the keys is a radix tree with one way
@@ -208,6 +210,13 @@ rn_refines(void *m_arg, void *n_arg)
 
 struct radix_node *
 rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head)
+{
+       return (rn_lookup_args(v_arg, m_arg, head, NULL, NULL));
+}
+
+struct radix_node *
+rn_lookup_args(void *v_arg, void *m_arg, struct radix_node_head *head,
+    rn_matchf_t *f, void *w)
 {
        struct radix_node *x;
        caddr_t netmask = NULL;
@@ -218,7 +227,7 @@ rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head)
                        return (NULL);
                netmask = x->rn_key;
        }
-       x = rn_match(v_arg, head);
+       x = rn_match_args(v_arg, head, f, w);
        if (x && netmask) {
                while (x && x->rn_mask != netmask)
                        x = x->rn_dupedkey;
@@ -226,8 +235,16 @@ rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head)
        return x;
 }
 
+/*
+ * Returns true if address 'trial' has no bits differing from the
+ * leaf's key when compared under the leaf's mask.  In other words,
+ * returns true when 'trial' matches leaf.  If a leaf-matching
+ * routine is passed in, it is also used to find a match on the
+ * conditions defined by the caller of rn_match.
+ */
 static int
-rn_satsifies_leaf(char *trial, struct radix_node *leaf, int skip)
+rn_satisfies_leaf(char *trial, struct radix_node *leaf, int skip,
+    rn_matchf_t *f, void *w)
 {
        char *cp = trial, *cp2 = leaf->rn_key, *cp3 = leaf->rn_mask;
        char *cplim;
@@ -241,11 +258,19 @@ rn_satsifies_leaf(char *trial, struct radix_node *leaf, int skip)
        for (cp += skip; cp < cplim; cp++, cp2++, cp3++)
                if ((*cp ^ *cp2) & *cp3)
                        return 0;
-       return 1;
+
+       return (RN_MATCHF(leaf, f, w));
 }
 
 struct radix_node *
 rn_match(void *v_arg, struct radix_node_head *head)
+{
+       return (rn_match_args(v_arg, head, NULL, NULL));
+}
+
+struct radix_node *
+rn_match_args(void *v_arg, struct radix_node_head *head,
+    rn_matchf_t *f, void *w)
 {
        caddr_t v = v_arg;
        struct radix_node *t = head->rnh_treetop, *x;
@@ -291,11 +316,26 @@ rn_match(void *v_arg, struct radix_node_head *head)
         */
        if (t->rn_flags & RNF_ROOT)
                t = t->rn_dupedkey;
-       return t;
+       if (t == NULL || RN_MATCHF(t, f, w)) {
+               return (t);
+       } else {
+               /*
+                * Although we found an exact match on the key,
+                * f() is looking for some other criteria as well.
+                * Continue looking as if the exact match failed.
+                */
+               if (t->rn_parent->rn_flags & RNF_ROOT) {
+                       /* Hit the top; have to give up */
+                       return (NULL);
+               }
+               b = 0;
+               goto keeplooking;
+       }
 on1:
        test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */
        for (b = 7; (test >>= 1) > 0;)
                b--;
+keeplooking:
        matched_off = cp - v;
        b += matched_off << 3;
        rn_bit = -1 - b;
@@ -304,17 +344,19 @@ on1:
         */
        if ((saved_t = t)->rn_mask == 0)
                t = t->rn_dupedkey;
-       for (; t; t = t->rn_dupedkey)
+       for (; t; t = t->rn_dupedkey) {
                /*
                 * Even if we don't match exactly as a host,
                 * we may match if the leaf we wound up at is
                 * a route to a net.
                 */
                if (t->rn_flags & RNF_NORMAL) {
-                       if (rn_bit <= t->rn_bit)
-                               return t;
-               } else if (rn_satsifies_leaf(v, t, matched_off))
-                               return t;
+                       if ((rn_bit <= t->rn_bit) && RN_MATCHF(t, f, w))
+                               return (t);
+               } else if (rn_satisfies_leaf(v, t, matched_off, f, w)) {
+                       return (t);
+               }
+       }
        t = saved_t;
        /* start searching up the tree */
        do {
@@ -329,20 +371,21 @@ on1:
                 */
                while (m) {
                        if (m->rm_flags & RNF_NORMAL) {
-                               if (rn_bit <= m->rm_bit)
+                               if ((rn_bit <= m->rm_bit) &&
+                                   RN_MATCHF(m->rm_leaf, f, w))
                                        return (m->rm_leaf);
                        } else {
                                off = min(t->rn_offset, matched_off);
                                x = rn_search_m(v, t, m->rm_mask);
                                while (x && x->rn_mask != m->rm_mask)
                                        x = x->rn_dupedkey;
-                               if (x && rn_satsifies_leaf(v, x, off))
-                                       return x;
+                               if (x && rn_satisfies_leaf(v, x, off, f, w))
+                                       return (x);
                        }
                        m = m->rm_mklist;
                }
        } while (t != top);
-       return NULL;
+       return (NULL);
 }
 
 #ifdef RN_DEBUG
@@ -1093,7 +1136,9 @@ rn_inithead(void **head, int off)
        rnh->rnh_addaddr = rn_addroute;
        rnh->rnh_deladdr = rn_delete;
        rnh->rnh_matchaddr = rn_match;
+       rnh->rnh_matchaddr_args = rn_match_args;
        rnh->rnh_lookup = rn_lookup;
+       rnh->rnh_lookup_args = rn_lookup_args;
        rnh->rnh_walktree = rn_walktree;
        rnh->rnh_walktree_from = rn_walktree_from;
        rnh->rnh_treetop = t;
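
Editor's note: rn_match_args() generalizes rn_match() with an optional acceptance callback: a NULL f accepts every candidate (the RN_MATCHF macro above), while a non-NULL f may veto an otherwise-exact match, forcing the search to continue. The same pattern reduced to a linear search, as a sketch:

#include <stdio.h>

typedef int matchf_t(int value, void *arg);

/* A NULL callback accepts everything, mirroring RN_MATCHF(). */
#define MATCHF(v, f, arg)	((f) == NULL || (*(f))((v), (arg)))

static int
find(const int *a, int n, int key, matchf_t *f, void *arg)
{
	for (int i = 0; i < n; i++)
		if (a[i] == key && MATCHF(a[i], f, arg))
			return (i);
	return (-1);	/* matches vetoed by f don't count */
}

static int
veto_filter(int value, void *arg)
{
	(void)value;
	return (*(int *)arg);	/* arg decides acceptance */
}

int
main(void)
{
	int a[] = { 3, 7, 9 };
	int yes = 1, no = 0;

	printf("no filter:   %d\n", find(a, 3, 7, NULL, NULL));
	printf("veto filter: %d\n", find(a, 3, 7, veto_filter, &no));
	printf("ok filter:   %d\n", find(a, 3, 7, veto_filter, &yes));
	return (0);
}
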
index 3431a2d2530580329f886c6f29585bfb91da4956..6fa9f77cac589a9f54b0df21484c6994abaa7244 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -140,6 +140,7 @@ struct radix_mask {
 #define MKFree(m) { (m)->rm_mklist = rn_mkfreelist; rn_mkfreelist = (m);}
 
 typedef int walktree_f_t(struct radix_node *, void *);
+typedef int rn_matchf_t(struct radix_node *, void *);
 
 struct radix_node_head {
        struct  radix_node *rnh_treetop;
@@ -157,8 +158,16 @@ struct radix_node_head {
                (void *v, void *mask, struct radix_node_head *head);
        struct  radix_node *(*rnh_matchaddr)    /* locate based on sockaddr */
                (void *v, struct radix_node_head *head);
+       /* locate based on sockaddr and rn_matchf_t() */
+       struct  radix_node *(*rnh_matchaddr_args)
+               (void *v, struct radix_node_head *head,
+               rn_matchf_t *f, void *w);
        struct  radix_node *(*rnh_lookup)       /* locate based on sockaddr */
                (void *v, void *mask, struct radix_node_head *head);
+       /* locate based on sockaddr, mask and rn_matchf_t() */
+       struct  radix_node *(*rnh_lookup_args)
+               (void *v, void *mask, struct radix_node_head *head,
+               rn_matchf_t *f, void *);
        struct  radix_node *(*rnh_matchpkt)     /* locate based on packet hdr */
                (void *v, struct radix_node_head *head);
        int     (*rnh_walktree)                 /* traverse tree */
@@ -195,7 +204,10 @@ struct radix_node
                        struct radix_node [2]),
         *rn_delete(void *, void *, struct radix_node_head *),
         *rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head),
-        *rn_match(void *, struct radix_node_head *);
+        *rn_lookup_args(void *v_arg, void *m_arg, struct radix_node_head *head,
+            rn_matchf_t *, void *),
+        *rn_match(void *, struct radix_node_head *),
+        *rn_match_args(void *, struct radix_node_head *, rn_matchf_t *, void *);
 
 #endif /* PRIVATE */
 #endif /* _RADIX_H_ */
index 7f4ec5ac694ebb5e74550c9a9d48ba412ac4763d..4e4fb302ec1823fea2543e2fec5731bf7c0773f5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -76,7 +76,9 @@
 #include <net/route.h>
 
 #include <netinet/in.h>
+#include <netinet/in_var.h>
 #include <netinet/ip_mroute.h>
+#include <netinet/ip_var.h>
 
 #include <net/if_dl.h>
 
@@ -166,10 +168,198 @@ static void rt_maskedcopy(struct sockaddr *,
 static void rtable_init(void **);
 static inline void rtref_audit(struct rtentry_dbg *);
 static inline void rtunref_audit(struct rtentry_dbg *);
+static struct rtentry *rtalloc1_common_locked(struct sockaddr *, int, u_long,
+    unsigned int);
+static int rtrequest_common_locked(int, struct sockaddr *,
+    struct sockaddr *, struct sockaddr *, int, struct rtentry **,
+    unsigned int);
+static void rtalloc_ign_common_locked(struct route *, u_long, unsigned int);
+static inline void sa_set_ifscope(struct sockaddr *, unsigned int);
+static struct sockaddr *sin_copy(struct sockaddr_in *, struct sockaddr_in *,
+    unsigned int);
+static struct sockaddr *mask_copy(struct sockaddr *, struct sockaddr_in *,
+    unsigned int);
+static struct radix_node *node_lookup(struct sockaddr *, struct sockaddr *,
+    unsigned int);
+static struct radix_node *node_lookup_default(void);
+static int rn_match_ifscope(struct radix_node *, void *);
+static struct ifaddr *ifa_ifwithroute_common_locked(int,
+    const struct sockaddr *, const struct sockaddr *, unsigned int);
 
 __private_extern__ u_long route_generation = 0;
 extern int use_routegenid;
 
+/*
+ * sockaddr_in with embedded interface scope; this is used internally
+ * to keep track of scoped route entries in the routing table.  The
+ * fact that such a scope is embedded in the structure is an artifact
+ * of the current implementation which could change in future.
+ */
+struct sockaddr_inifscope {
+       __uint8_t       sin_len;
+       sa_family_t     sin_family;
+       in_port_t       sin_port;
+       struct  in_addr sin_addr;
+       /*
+        * To avoid possible conflict with an overlaid sockaddr_inarp
+        * having sin_other set to SIN_PROXY, we use the first 4 bytes
+        * of sin_zero since sin_srcaddr is one of the unused fields
+        * in sockaddr_inarp.
+        */
+       union {
+               char    sin_zero[8];
+               struct {
+                       __uint32_t      ifscope;
+               } _in_index;
+       } un;
+#define        sin_ifscope     un._in_index.ifscope
+};
+
+#define        SIN(sa)         ((struct sockaddr_in *)(size_t)(sa))
+#define        SINIFSCOPE(sa)  ((struct sockaddr_inifscope *)(size_t)(sa))
+
+#define        ASSERT_SINIFSCOPE(sa) {                                         \
+       if ((sa)->sa_family != AF_INET ||                               \
+           (sa)->sa_len < sizeof (struct sockaddr_in))                 \
+               panic("%s: bad sockaddr_in %p\n", __func__, sa);        \
+}
+
+/*
+ * Argument to leaf-matching routine; at present it is scoped routing
+ * specific but can be expanded in future to include other search filters.
+ */
+struct matchleaf_arg {
+       unsigned int    ifscope;        /* interface scope */
+};
+
+/*
+ * For looking up the non-scoped default route (sockaddr instead
+ * of sockaddr_in for convenience).
+ */
+static struct sockaddr sin_def = {
+       sizeof (struct sockaddr_in), AF_INET, { 0, }
+};
+
+/*
+ * Interface index (scope) of the primary interface; determined at
+ * the time when the default, non-scoped route gets added, changed
+ * or deleted.  Protected by rt_mtx.
+ */
+static unsigned int primary_ifscope = IFSCOPE_NONE;
+
+#define        INET_DEFAULT(dst)       \
+       ((dst)->sa_family == AF_INET && SIN(dst)->sin_addr.s_addr == 0)
+
+#define        RT(r)           ((struct rtentry *)r)
+#define        RT_HOST(r)      (RT(r)->rt_flags & RTF_HOST)
+
+/*
+ * Given a route, determine whether or not it is the non-scoped default
+ * route; dst typically comes from rt_key(rt) but may be coming from
+ * a separate place when rt is in the process of being created.
+ */
+boolean_t
+rt_inet_default(struct rtentry *rt, struct sockaddr *dst)
+{
+       return (INET_DEFAULT(dst) && !(rt->rt_flags & RTF_IFSCOPE));
+}
+
+/*
+ * Set the ifscope of the primary interface; caller holds rt_mtx.
+ */
+void
+set_primary_ifscope(unsigned int ifscope)
+{
+       primary_ifscope = ifscope;
+}
+
+/*
+ * Return the ifscope of the primary interface; caller holds rt_mtx.
+ */
+unsigned int
+get_primary_ifscope(void)
+{
+       return (primary_ifscope);
+}
+
+/*
+ * Embed ifscope into a given sockaddr_in.
+ */
+static inline void
+sa_set_ifscope(struct sockaddr *sa, unsigned int ifscope)
+{
+       /* Caller must pass in sockaddr_in */
+       ASSERT_SINIFSCOPE(sa);
+
+       SINIFSCOPE(sa)->sin_ifscope = ifscope;
+}
+
+/*
+ * Given a sockaddr_in, return the embedded ifscope to the caller.
+ */
+unsigned int
+sa_get_ifscope(struct sockaddr *sa)
+{
+       /* Caller must pass in sockaddr_in */
+       ASSERT_SINIFSCOPE(sa);
+
+       return (SINIFSCOPE(sa)->sin_ifscope);
+}
+
+/*
+ * Copy a sockaddr_in src to dst and embed ifscope into dst.
+ */
+static struct sockaddr *
+sin_copy(struct sockaddr_in *src, struct sockaddr_in *dst, unsigned int ifscope)
+{
+       *dst = *src;
+       sa_set_ifscope(SA(dst), ifscope);
+
+       return (SA(dst));
+}
+
+/*
+ * Copy a mask from src to a sockaddr_in dst and embed ifscope into dst.
+ */
+static struct sockaddr *
+mask_copy(struct sockaddr *src, struct sockaddr_in *dst, unsigned int ifscope)
+{
+       /* We know dst is at least the size of sockaddr{_in} */
+       bzero(dst, sizeof (*dst));
+       rt_maskedcopy(src, SA(dst), src);
+
+       /*
+        * The length of the mask sockaddr would need to be adjusted
+        * to cover the additional sin_ifscope field; when ifscope is
+        * IFSCOPE_NONE, we'd end up clearing the embedded ifscope on
+        * the destination mask in addition to extending the length
+        * of the sockaddr, as a side effect.  This is okay, as any
+        * trailing zeroes would be skipped by rn_addmask prior to
+        * inserting or looking up the mask in the mask tree.
+        */
+       SINIFSCOPE(dst)->sin_ifscope = ifscope;
+       SINIFSCOPE(dst)->sin_len =
+           offsetof(struct sockaddr_inifscope, sin_ifscope) +
+           sizeof (SINIFSCOPE(dst)->sin_ifscope);
+
+       return (SA(dst));
+}
+
+/*
+ * Callback leaf-matching routine for rn_matchaddr_args used
+ * for looking up an exact match for a scoped route entry.
+ */
+static int
+rn_match_ifscope(struct radix_node *rn, void *arg)
+{
+       struct rtentry *rt = (struct rtentry *)rn;
+       struct matchleaf_arg *ma = arg;
+
+       if (!(rt->rt_flags & RTF_IFSCOPE) || rt_key(rt)->sa_family != AF_INET)
+               return (0);
+
+       return (SINIFSCOPE(rt_key(rt))->sin_ifscope == ma->ifscope);
+}
 
 static void
 rtable_init(void **table)
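
Editor's note: the sockaddr_inifscope overlay above works because sockaddr_in ends with 8 bytes of padding (sin_zero) located after every field that ordinary IPv4 code inspects, so scoped entries can stash a 32-bit interface index there. A quick user-space check of that layout assumption:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <netinet/in.h>

int
main(void)
{
	/* sin_zero must start past the fields ordinary code looks at
	 * (8 bytes in) and must be big enough for a 32-bit scope. */
	assert(offsetof(struct sockaddr_in, sin_zero) == 8);
	assert(sizeof (((struct sockaddr_in *)0)->sin_zero) >= 4);

	struct sockaddr_in sa;
	uint32_t scope = 4;		/* e.g. some interface index */

	memset(&sa, 0, sizeof (sa));
	memcpy(sa.sin_zero, &scope, sizeof (scope));	/* embed */
	scope = 0;
	memcpy(&scope, sa.sin_zero, sizeof (scope));	/* extract */
	printf("embedded ifscope: %u\n", (unsigned)scope);
	return (0);
}
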
@@ -232,17 +422,29 @@ rtalloc(struct route *ro)
 
 void
 rtalloc_ign_locked(struct route *ro, u_long ignore)
+{
+       return (rtalloc_ign_common_locked(ro, ignore, IFSCOPE_NONE));
+}
+
+void
+rtalloc_scoped_ign_locked(struct route *ro, u_long ignore, unsigned int ifscope)
+{
+       return (rtalloc_ign_common_locked(ro, ignore, ifscope));
+}
+
+static void
+rtalloc_ign_common_locked(struct route *ro, u_long ignore,
+    unsigned int ifscope)
 {
        struct rtentry *rt;
 
        if ((rt = ro->ro_rt) != NULL) {
                if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
                        return;
-               /* XXX - We are probably always at splnet here already. */
                rtfree_locked(rt);
                ro->ro_rt = NULL;
        }
-       ro->ro_rt = rtalloc1_locked(&ro->ro_dst, 1, ignore);
+       ro->ro_rt = rtalloc1_common_locked(&ro->ro_dst, 1, ignore, ifscope);
        if (ro->ro_rt)
                ro->ro_rt->generation_id = route_generation;
 }
@@ -255,76 +457,99 @@ rtalloc_ign(struct route *ro, u_long ignore)
        lck_mtx_unlock(rt_mtx);
 }
 
+struct rtentry *
+rtalloc1_locked(struct sockaddr *dst, int report, u_long ignflags)
+{
+       return (rtalloc1_common_locked(dst, report, ignflags, IFSCOPE_NONE));
+}
+
+struct rtentry *
+rtalloc1_scoped_locked(struct sockaddr *dst, int report, u_long ignflags,
+    unsigned int ifscope)
+{
+       return (rtalloc1_common_locked(dst, report, ignflags, ifscope));
+}
+
 /*
  * Look up the route that matches the address given
  * Or, at least try.. Create a cloned route if needed.
  */
-struct rtentry *
-rtalloc1_locked(struct sockaddr *dst, int report, u_long ignflags)
+static struct rtentry *
+rtalloc1_common_locked(struct sockaddr *dst, int report, u_long ignflags,
+    unsigned int ifscope)
 {
        struct radix_node_head *rnh = rt_tables[dst->sa_family];
-       struct rtentry *rt;
-       struct radix_node *rn;
-       struct rtentry *newrt = 0;
+       struct rtentry *rt, *newrt = NULL;
        struct rt_addrinfo info;
        u_long nflags;
        int  err = 0, msgtype = RTM_MISS;
+
+       if (rnh == NULL)
+               goto unreachable;
+
        /*
-        * Look up the address in the table for that Address Family
+        * Find the longest prefix or exact (in the scoped case) address match;
+        * callee adds a reference to entry and checks for root node as well
         */
-       if (rnh && (rn = rnh->rnh_matchaddr((caddr_t)dst, rnh)) &&
-           ((rn->rn_flags & RNF_ROOT) == 0)) {
+       rt = rt_lookup(FALSE, dst, NULL, rnh, ifscope);
+       if (rt == NULL)
+               goto unreachable;
+
+       newrt = rt;
+       nflags = rt->rt_flags & ~ignflags;
+       if (report && (nflags & (RTF_CLONING | RTF_PRCLONING))) {
                /*
-                * If we find it and it's not the root node, then
-                * get a refernce on the rtentry associated.
+                * We are apparently adding (report = 0 in delete).
+                * If it requires that it be cloned, do so.
+                * (This implies it wasn't a HOST route.)
                 */
-               newrt = rt = (struct rtentry *)rn;
-               nflags = rt->rt_flags & ~ignflags;
-               if (report && (nflags & (RTF_CLONING | RTF_PRCLONING))) {
+               err = rtrequest_locked(RTM_RESOLVE, dst, NULL, NULL, 0, &newrt);
+               if (err) {
                        /*
-                        * We are apparently adding (report = 0 in delete).
-                        * If it requires that it be cloned, do so.
-                        * (This implies it wasn't a HOST route.)
+                        * If the cloning didn't succeed, maybe what we
+                        * have from lookup above will do.  Return that;
+                        * no need to hold another reference since it's
+                        * already done.
                         */
-                       err = rtrequest_locked(RTM_RESOLVE, dst, SA(0),
-                                             SA(0), 0, &newrt);
-                       if (err) {
-                               /*
-                                * If the cloning didn't succeed, maybe
-                                * what we have will do. Return that.
-                                */
-                               newrt = rt;
-                               rtref(rt);
-                               goto miss;
-                       }
-                       if ((rt = newrt) && (rt->rt_flags & RTF_XRESOLVE)) {
-                               /*
-                                * If the new route specifies it be
-                                * externally resolved, then go do that.
-                                */
-                               msgtype = RTM_RESOLVE;
-                               goto miss;
-                       }
-               } else
-                       rtref(rt);
-       } else {
+                       newrt = rt;
+                       goto miss;
+               }
+
                /*
-                * Either we hit the root or couldn't find any match,
-                * Which basically means
-                * "caint get there frm here"
+                * We cloned it; drop the original route found during lookup.
+                * The resulting cloned route (newrt) now has an extra
+                * reference held during rtrequest.
                 */
-               rtstat.rts_unreach++;
-       miss:   if (report) {
+               rtfree_locked(rt);
+               if ((rt = newrt) && (rt->rt_flags & RTF_XRESOLVE)) {
                        /*
-                        * If required, report the failure to the supervising
-                        * Authorities.
-                        * For a delete, this is not an error. (report == 0)
+                        * If the new route specifies it be
+                        * externally resolved, then go do that.
                         */
-                       bzero((caddr_t)&info, sizeof(info));
-                       info.rti_info[RTAX_DST] = dst;
-                       rt_missmsg(msgtype, &info, 0, err);
+                       msgtype = RTM_RESOLVE;
+                       goto miss;
                }
        }
+       goto done;
+
+unreachable:
+       /*
+        * Either we hit the root or couldn't find any match,
+        * which basically means "can't get there from here".
+        */
+       rtstat.rts_unreach++;
+miss:
+       if (report) {
+               /*
+                * If required, report the failure to the supervising
+                * authorities.  For a delete (report == 0), this is not
+                * an error.
+                */
+               bzero((caddr_t)&info, sizeof(info));
+               info.rti_info[RTAX_DST] = dst;
+               rt_missmsg(msgtype, &info, 0, err);
+       }
+done:
        return (newrt);
 }
 
@@ -370,10 +595,6 @@ rtfree_locked(struct rtentry *rt)
        if (rt->rt_refcnt > 0)
                return;
 
-       if ((rt->rt_flags & RTF_TRACKREFS) != 0)
-               printf("%s rt(%p)->rt_refcnt(%d), caller=%p\n", __FUNCTION__,
-                       rt, rt->rt_refcnt, __builtin_return_address(0));
-       
        /*
         * On last reference give the "close method" a chance to cleanup
         * private state.  This also permits (for IPv4 and IPv6) a chance
@@ -500,10 +721,6 @@ rtref(struct rtentry *p)
                rtref_audit((struct rtentry_dbg *)p);
 
        p->rt_refcnt++;
-       
-       if ((p->rt_flags & RTF_TRACKREFS) != 0)
-               printf("%s rt(%p)->rt_refcnt(%d), caller=%p\n", __FUNCTION__,
-                       p, p->rt_refcnt, __builtin_return_address(0));
 }
 
 static inline void
@@ -580,31 +797,40 @@ ifaref(struct ifaddr *ifa)
  * destination to go through the given gateway.
  * Normally called as a result of a routing redirect
  * message from the network layer.
- *
- * N.B.: must be called at splnet
- *
  */
 void
-rtredirect(struct sockaddr *dst, struct sockaddr *gateway,
-          struct sockaddr *netmask, int flags, struct sockaddr *src,
-          struct rtentry **rtp)
+rtredirect(struct ifnet *ifp, struct sockaddr *dst, struct sockaddr *gateway,
+   struct sockaddr *netmask, int flags, struct sockaddr *src,
+   struct rtentry **rtp)
 {
-       struct rtentry *rt;
+       struct rtentry *rt = NULL;
        int error = 0;
        short *stat = 0;
        struct rt_addrinfo info;
        struct ifaddr *ifa = NULL;
+       unsigned int ifscope = (ifp != NULL) ? ifp->if_index : IFSCOPE_NONE;
+       struct sockaddr_in sin;
 
        lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_NOTOWNED);
        lck_mtx_lock(rt_mtx);
 
-       /* verify the gateway is directly reachable */
-       if ((ifa = ifa_ifwithnet(gateway)) == 0) {
+       /*
+        * Verify the gateway is directly reachable; if scoped routing
+        * is enabled, verify that it is reachable from the interface
+        * on which the ICMP redirect arrived.
+        */
+       if ((ifa = ifa_ifwithnet_scoped(gateway, ifscope)) == NULL) {
                error = ENETUNREACH;
                goto out;
        }
 
-       rt = rtalloc1_locked(dst, 0, RTF_CLONING | RTF_PRCLONING);
+       /* Lookup route to the destination (from the original IP header) */
+       rt = rtalloc1_scoped_locked(dst, 0, RTF_CLONING|RTF_PRCLONING, ifscope);
+
+       /* Embed scope in src for comparison against rt_gateway below */
+       if (ip_doscopedroute && src->sa_family == AF_INET)
+               src = sin_copy(SIN(src), &sin, ifscope);
+
        /*
         * If the redirect isn't from our current router for this dst,
         * it's either old or wrong.  If it redirects us to ourselves,
@@ -647,13 +873,14 @@ rtredirect(struct sockaddr *dst, struct sockaddr *gateway,
                if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
                        /*
                         * Changing from route to net => route to host.
-                        * Create new route, rather than smashing route to net.
+                        * Create new route, rather than smashing route
+                        * to net; similar to cloned routes, the newly
+                        * created host route is scoped as well.
                         */
                create:
                        flags |=  RTF_GATEWAY | RTF_DYNAMIC;
-                       error = rtrequest_locked((int)RTM_ADD, dst, gateway,
-                                   netmask, flags,
-                                   (struct rtentry **)0);
+                       error = rtrequest_scoped_locked(RTM_ADD, dst,
+                           gateway, netmask, flags, NULL, ifscope);
                        stat = &rtstat.rts_dynamic;
                } else {
                        /*
@@ -666,10 +893,11 @@ rtredirect(struct sockaddr *dst, struct sockaddr *gateway,
                        /*
                         * add the key and gateway (in one malloc'd chunk).
                         */
-                       rt_setgate(rt, rt_key(rt), gateway);
+                       error = rt_setgate(rt, rt_key(rt), gateway);
                }
-       } else
+       } else {
                error = EHOSTUNREACH;
+       }
 done:
        if (rt) {
                if (rtp && !error)
@@ -678,10 +906,14 @@ done:
                        rtfree_locked(rt);
        }
 out:
-       if (error)
+       if (error) {
                rtstat.rts_badredirect++;
-       else if (stat != NULL)
-               (*stat)++;
+       } else {
+               if (stat != NULL)
+                       (*stat)++;
+               if (use_routegenid)
+                       route_generation++;
+       }
        bzero((caddr_t)&info, sizeof(info));
        info.rti_info[RTAX_DST] = dst;
        info.rti_info[RTAX_GATEWAY] = gateway;
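
Editor's note: new in this path is the route_generation bump on success. Cached routes (struct route) across the stack record the generation they were resolved under; bumping the counter after an accepted redirect forces them to re-resolve on next use. The technique in miniature:

#include <stdio.h>

static unsigned long route_generation = 0;

struct cached_route {
	int		resolved;	/* stands in for ro_rt */
	unsigned long	generation_id;
};

static void
route_lookup(struct cached_route *ro)
{
	if (ro->resolved && ro->generation_id == route_generation)
		return;			/* cache still valid */
	/* (re)resolve, then stamp with the current generation */
	ro->resolved = 1;
	ro->generation_id = route_generation;
	printf("re-resolved at generation %lu\n", route_generation);
}

int
main(void)
{
	struct cached_route ro = { 0, 0 };

	route_lookup(&ro);		/* resolves */
	route_lookup(&ro);		/* cache hit: silent */
	route_generation++;		/* e.g. a redirect was accepted */
	route_lookup(&ro);		/* stale: resolves again */
	return (0);
}
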
@@ -721,16 +953,47 @@ ifa_ifwithroute(
 }
 
 struct ifaddr *
-ifa_ifwithroute_locked(
-       int flags,
-       const struct sockaddr *dst,
-       const struct sockaddr *gateway)
+ifa_ifwithroute_locked(int flags, const struct sockaddr *dst,
+    const struct sockaddr *gateway)
+{
+       return (ifa_ifwithroute_common_locked((flags & ~RTF_IFSCOPE), dst,
+           gateway, IFSCOPE_NONE));
+}
+
+struct ifaddr *
+ifa_ifwithroute_scoped_locked(int flags, const struct sockaddr *dst,
+    const struct sockaddr *gateway, unsigned int ifscope)
+{
+       if (ifscope != IFSCOPE_NONE)
+               flags |= RTF_IFSCOPE;
+       else
+               flags &= ~RTF_IFSCOPE;
+
+       return (ifa_ifwithroute_common_locked(flags, dst, gateway, ifscope));
+}
+
+static struct ifaddr *
+ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst,
+    const struct sockaddr *gateway, unsigned int ifscope)
 {
        struct ifaddr *ifa = NULL;
        struct rtentry *rt = NULL;
+       struct sockaddr_in dst_in, gw_in;
 
        lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);
 
+       if (ip_doscopedroute) {
+               /*
+                * Just in case the sockaddr passed in by the caller
+                * contains embedded scope, make sure to clear it since
+                * IPv4 interface addresses aren't scoped.
+                */
+               if (dst != NULL && dst->sa_family == AF_INET)
+                       dst = sin_copy(SIN(dst), &dst_in, IFSCOPE_NONE);
+               if (gateway != NULL && gateway->sa_family == AF_INET)
+                       gateway = sin_copy(SIN(gateway), &gw_in, IFSCOPE_NONE);
+       }
+
        if (!(flags & RTF_GATEWAY)) {
                /*
                 * If we are adding a route to an interface,
@@ -743,7 +1006,7 @@ ifa_ifwithroute_locked(
                        ifa = ifa_ifwithdstaddr(dst);
                }
                if (ifa == NULL)
-                       ifa = ifa_ifwithaddr(gateway);
+                       ifa = ifa_ifwithaddr_scoped(gateway, ifscope);
        } else {
                /*
                 * If we are adding a route to a remote net
@@ -753,10 +1016,11 @@ ifa_ifwithroute_locked(
                ifa = ifa_ifwithdstaddr(gateway);
        }
        if (ifa == NULL)
-               ifa = ifa_ifwithnet(gateway);
+               ifa = ifa_ifwithnet_scoped(gateway, ifscope);
        if (ifa == NULL) {
                /* Workaround to avoid gcc warning regarding const variable */
-               rt = rtalloc1_locked((struct sockaddr *)(size_t)dst, 0, 0UL);
+               rt = rtalloc1_scoped_locked((struct sockaddr *)(size_t)dst,
+                   0, 0UL, ifscope);
                if (rt != NULL) {
                        ifa = rt->rt_ifa;
                        if (ifa != NULL)
@@ -784,8 +1048,8 @@ ifa_ifwithroute_locked(
         */
        if ((ifa == NULL ||
            !equal(ifa->ifa_addr, (struct sockaddr *)(size_t)gateway)) &&
-           (rt = rtalloc1_locked((struct sockaddr *)(size_t)gateway,
-           0, 0UL)) != NULL) {
+           (rt = rtalloc1_scoped_locked((struct sockaddr *)(size_t)gateway,
+           0, 0UL, ifscope)) != NULL) {
                if (ifa != NULL)
                        ifafree(ifa);
                ifa = rt->rt_ifa;
@@ -793,6 +1057,17 @@ ifa_ifwithroute_locked(
                        ifaref(ifa);
                rtunref(rt);
        }
+       /*
+        * If an interface scope was specified, the interface index of
+        * the found ifaddr must be equivalent to that of the scope;
+        * otherwise there is no match.
+        */
+       if ((flags & RTF_IFSCOPE) &&
+           ifa != NULL && ifa->ifa_ifp->if_index != ifscope) {
+               ifafree(ifa);
+               ifa = NULL;
+       }
+
        return (ifa);
 }
 
@@ -806,25 +1081,55 @@ struct rtfc_arg {
        struct radix_node_head *rnh;
 };
 
+int
+rtrequest_locked(int req, struct sockaddr *dst, struct sockaddr *gateway,
+    struct sockaddr *netmask, int flags, struct rtentry **ret_nrt)
+{
+       return (rtrequest_common_locked(req, dst, gateway, netmask,
+           (flags & ~RTF_IFSCOPE), ret_nrt, IFSCOPE_NONE));
+}
+
+int
+rtrequest_scoped_locked(int req, struct sockaddr *dst,
+    struct sockaddr *gateway, struct sockaddr *netmask, int flags,
+    struct rtentry **ret_nrt, unsigned int ifscope)
+{
+       if (ifscope != IFSCOPE_NONE)
+               flags |= RTF_IFSCOPE;
+       else
+               flags &= ~RTF_IFSCOPE;
+
+       return (rtrequest_common_locked(req, dst, gateway, netmask,
+           flags, ret_nrt, ifscope));
+}
+
 /*
- * Do appropriate manipulations of a routing tree given
- * all the bits of info needed
+ * Do appropriate manipulations of a routing tree given all the bits of
+ * info needed.
+ *
+ * Embedding the scope in the radix key is an internal job that should be
+ * left to routines in this module.  Callers should specify the scope value
+ * to the "scoped" variants of route routines instead of manipulating the
+ * key itself.  This is typically done when creating a scoped route, e.g.
+ * rtrequest(RTM_ADD).  Once such a route is created and marked with the
+ * RTF_IFSCOPE flag, callers can simply use its rt_key(rt) to clone it
+ * (RTM_RESOLVE) or to remove it (RTM_DELETE).  An exception to this is
+ * during certain routing socket operations where the search key might be
+ * derived from the routing message itself, in which case the caller must
+ * specify the destination address and scope value for RTM_ADD/RTM_DELETE.
  */
-int
-rtrequest_locked(
-       int req,
-       struct sockaddr *dst,
-       struct sockaddr *gateway,
-       struct sockaddr *netmask,
-       int flags,
-       struct rtentry **ret_nrt)
+static int
+rtrequest_common_locked(int req, struct sockaddr *dst0,
+    struct sockaddr *gateway, struct sockaddr *netmask, int flags,
+    struct rtentry **ret_nrt, unsigned int ifscope)
 {
        int error = 0;
        struct rtentry *rt;
        struct radix_node *rn;
        struct radix_node_head *rnh;
        struct ifaddr *ifa = NULL;
-       struct sockaddr *ndst;
+       struct sockaddr *ndst, *dst = dst0;
+       struct sockaddr_in sin, mask;
 #define senderr(x) { error = x ; goto bad; }
 
        lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);
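
The wrappers above fix the calling convention for scoped requests: the scope
travels as a separate argument, and RTF_IFSCOPE is set or cleared here rather
than by the caller.  A minimal sketch of a kernel client adding an
interface-scoped IPv4 default route (assuming "ifp" points to the interface
and "gw" holds the next-hop sockaddr_in; error handling elided):

	struct sockaddr_in dst, mask;
	struct rtentry *rt = NULL;
	int error;

	bzero(&dst, sizeof (dst));	/* 0.0.0.0/0, i.e. default */
	dst.sin_len = sizeof (dst);
	dst.sin_family = AF_INET;
	bzero(&mask, sizeof (mask));
	mask.sin_len = sizeof (mask);
	mask.sin_family = AF_INET;

	lck_mtx_lock(rt_mtx);
	error = rtrequest_scoped_locked(RTM_ADD, (struct sockaddr *)&dst,
	    (struct sockaddr *)&gw, (struct sockaddr *)&mask,
	    RTF_GATEWAY | RTF_STATIC, &rt, ifp->if_index);
	if (error == 0 && rt != NULL)
		rtunref(rt);	/* drop caller's reference, keep the table's */
	lck_mtx_unlock(rt_mtx);

Passing IFSCOPE_NONE as the last argument degenerates to the unscoped
rtrequest_locked() behavior, per the wrapper above.
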
@@ -839,6 +1144,37 @@ rtrequest_locked(
         */
        if (flags & RTF_HOST)
                netmask = 0;
+
+       /*
+        * If RTF_IFSCOPE is specified, use a local copy of the destination
+        * address to embed the scope into.  This logic is repeated below
+        * in the RTM_RESOLVE handler since the caller does not normally
+        * specify such a flag during a resolve; instead it passes in the
+        * route used for cloning, from which the scope info is derived.
+        * Note also that in the case of RTM_DELETE, the address passed in
+        * by the caller might already contain the embedded scope info when
+        * it is the key itself, thus making RTF_IFSCOPE unnecessary; one
+        * instance where it is explicitly set is inside route_output()
+        * as part of handling a routing socket request.
+        */
+       if (req != RTM_RESOLVE && (flags & RTF_IFSCOPE)) {
+               /* Scoped routing is for AF_INET only */
+               if (dst->sa_family != AF_INET ||
+                   (req == RTM_ADD && !ip_doscopedroute))
+                       senderr(EINVAL);
+
+               if (ifscope == IFSCOPE_NONE) {
+                       flags &= ~RTF_IFSCOPE;
+               } else {
+                       /* Embed ifscope into the key (local copy) */
+                       dst = sin_copy(SIN(dst), &sin, ifscope);
+
+                       /* Embed ifscope into netmask (local copy) */
+                       if (netmask != NULL)
+                               netmask = mask_copy(netmask, &mask, ifscope);
+               }
+       }
+
        switch (req) {
        case RTM_DELETE:
                /*
@@ -901,6 +1237,13 @@ rtrequest_locked(
                            (struct rtentry_dbg *)rt, rtd_trash_link);
                }
 
+               /*
+                * If this is the (non-scoped) default route, clear
+                * the interface index used for the primary ifscope.
+                */
+               if (rt_inet_default(rt, rt_key(rt)))
+                       set_primary_ifscope(IFSCOPE_NONE);
+
                /*
                 * If the caller wants it, then it can have it,
                 * but it's up to it to free the rtentry as we won't be
@@ -926,20 +1269,54 @@ rtrequest_locked(
                gateway = rt->rt_gateway;
                if ((netmask = rt->rt_genmask) == 0)
                        flags |= RTF_HOST;
+
+               if (!ip_doscopedroute || dst->sa_family != AF_INET)
+                       goto makeroute;
+               /*
+                * When scoped routing is enabled, cloned entries are
+                * always scoped according to the interface portion of
+        * the parent route.  The exceptions to this are IPv4
+                * link local addresses.
+                */
+               if (!IN_LINKLOCAL(ntohl(SIN(dst)->sin_addr.s_addr))) {
+                       if (flags & RTF_IFSCOPE) {
+                               ifscope = sa_get_ifscope(rt_key(rt));
+                       } else {
+                               ifscope = rt->rt_ifp->if_index;
+                               flags |= RTF_IFSCOPE;
+                       }
+               } else {
+                       ifscope = IFSCOPE_NONE;
+                       flags &= ~RTF_IFSCOPE;
+               }
+
+               /* Embed or clear ifscope into/from the key (local copy) */
+               dst = sin_copy(SIN(dst), &sin, ifscope);
+
+               /* Embed or clear ifscope into/from netmask (local copy) */
+               if (netmask != NULL)
+                       netmask = mask_copy(netmask, &mask, ifscope);
+
                goto makeroute;
 
        case RTM_ADD:
                if ((flags & RTF_GATEWAY) && !gateway)
-                       panic("rtrequest: GATEWAY but no gateway");
+                       panic("rtrequest: RTF_GATEWAY but no gateway");
 
-               if ((ifa = ifa_ifwithroute_locked(flags, dst, gateway)) == 0)
+               if (flags & RTF_IFSCOPE) {
+                       ifa = ifa_ifwithroute_scoped_locked(flags, dst0,
+                           gateway, ifscope);
+               } else {
+                       ifa = ifa_ifwithroute_locked(flags, dst0, gateway);
+               }
+               if (ifa == NULL)
                        senderr(ENETUNREACH);
-
-       makeroute:
+makeroute:
                if ((rt = rte_alloc()) == NULL)
                        senderr(ENOBUFS);
                Bzero(rt, sizeof(*rt));
                rt->rt_flags = RTF_UP | flags;
+
                /*
                 * Add the gateway. Possibly re-malloc-ing the storage for it
                 * also add the rt_gwroute if possible.
@@ -957,9 +1334,9 @@ rtrequest_locked(
                /*
                 * make sure it contains the value we want (masked if needed).
                 */
-               if (netmask) {
+               if (netmask)
                        rt_maskedcopy(dst, ndst, netmask);
-               } else
+               else
                        Bcopy(dst, ndst, dst->sa_len);
 
                /*
@@ -983,8 +1360,13 @@ rtrequest_locked(
                         * mechanism, then we just blow it away and retry
                         * the insertion of the new one.
                         */
-                       rt2 = rtalloc1_locked(dst, 0,
-                           RTF_CLONING | RTF_PRCLONING);
+                       if (flags & RTF_IFSCOPE) {
+                               rt2 = rtalloc1_scoped_locked(dst0, 0,
+                                   RTF_CLONING | RTF_PRCLONING, ifscope);
+                       } else {
+                               rt2 = rtalloc1_locked(dst, 0,
+                                   RTF_CLONING | RTF_PRCLONING);
+                       }
                        if (rt2 && rt2->rt_parent) {
                                rtrequest_locked(RTM_DELETE,
                                          (struct sockaddr *)rt_key(rt2),
@@ -1052,6 +1434,13 @@ rtrequest_locked(
                                               rt_fixchange, &arg);
                }
 
+               /*
+                * If this is the (non-scoped) default route, record
+                * the interface index used for the primary ifscope.
+                */
+               if (rt_inet_default(rt, rt_key(rt)))
+                       set_primary_ifscope(rt->rt_ifp->if_index);
+
                /*
                 * actually return a resultant rtentry and
                 * give the caller a single reference.
@@ -1121,10 +1510,6 @@ rt_fixdelete(struct radix_node *rn, void *vp)
  * routine just for adds.  I'm not sure why I thought it was necessary to do
  * changes this way.
  */
-#ifdef DEBUG
-static int rtfcdebug = 0;
-#endif
-
 static int
 rt_fixchange(struct radix_node *rn, void *vp)
 {
@@ -1135,36 +1520,20 @@ rt_fixchange(struct radix_node *rn, void *vp)
        u_char *xk1, *xm1, *xk2, *xmp;
        int i, len, mlen;
 
-#ifdef DEBUG
-       if (rtfcdebug)
-               printf("rt_fixchange: rt %p, rt0 %p\n", rt, rt0);
-#endif
-
        lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);
 
        if (!rt->rt_parent ||
-           (rt->rt_flags & (RTF_PINNED | RTF_CLONING | RTF_PRCLONING))) {
-#ifdef DEBUG
-               if(rtfcdebug) printf("no parent or pinned\n");
-#endif
-               return 0;
-       }
+           (rt->rt_flags & (RTF_PINNED | RTF_CLONING | RTF_PRCLONING)))
+               return (0);
 
-       if (rt->rt_parent == rt0) {
-#ifdef DEBUG
-               if(rtfcdebug) printf("parent match\n");
-#endif
-               return rtrequest_locked(RTM_DELETE, rt_key(rt),
-                                (struct sockaddr *)0, rt_mask(rt),
-                                rt->rt_flags, (struct rtentry **)0);
-       }
+       if (rt->rt_parent == rt0)
+               goto delete_rt;
 
        /*
         * There probably is a function somewhere which does this...
         * if not, there should be.
         */
-       len = imin(((struct sockaddr *)rt_key(rt0))->sa_len,
-                  ((struct sockaddr *)rt_key(rt))->sa_len);
+       len = imin(rt_key(rt0)->sa_len, rt_key(rt)->sa_len);
 
        xk1 = (u_char *)rt_key(rt0);
        xm1 = (u_char *)rt_mask(rt0);
@@ -1172,140 +1541,168 @@ rt_fixchange(struct radix_node *rn, void *vp)
 
        /* avoid applying a less specific route */
        xmp = (u_char *)rt_mask(rt->rt_parent);
-       mlen = ((struct sockaddr *)rt_key(rt->rt_parent))->sa_len;
-       if (mlen > ((struct sockaddr *)rt_key(rt0))->sa_len) {
-#if DEBUG
-               if (rtfcdebug)
-                       printf("rt_fixchange: inserting a less "
-                              "specific route\n");
-#endif
-               return 0;
-       }
+       mlen = rt_key(rt->rt_parent)->sa_len;
+       if (mlen > rt_key(rt0)->sa_len)
+               return (0);
+
        for (i = rnh->rnh_treetop->rn_offset; i < mlen; i++) {
-               if ((xmp[i] & ~(xmp[i] ^ xm1[i])) != xmp[i]) {
-#if DEBUG
-                       if (rtfcdebug)
-                               printf("rt_fixchange: inserting a less "
-                                      "specific route\n");
-#endif
-                       return 0;
-               }
+               if ((xmp[i] & ~(xmp[i] ^ xm1[i])) != xmp[i])
+                       return (0);
        }
 
        for (i = rnh->rnh_treetop->rn_offset; i < len; i++) {
-               if ((xk2[i] & xm1[i]) != xk1[i]) {
-#ifdef DEBUG
-                       if(rtfcdebug) printf("no match\n");
-#endif
-                       return 0;
-               }
+               if ((xk2[i] & xm1[i]) != xk1[i])
+                       return (0);
        }
 
        /*
         * OK, this node is a clone, and matches the node currently being
         * changed/added under the node's mask.  So, get rid of it.
         */
-#ifdef DEBUG
-       if(rtfcdebug) printf("deleting\n");
-#endif
-       return rtrequest_locked(RTM_DELETE, rt_key(rt), (struct sockaddr *)0,
-                        rt_mask(rt), rt->rt_flags, (struct rtentry **)0);
+delete_rt:
+       return (rtrequest_locked(RTM_DELETE, rt_key(rt), NULL,
+           rt_mask(rt), rt->rt_flags, NULL));
 }
 
 int
-rt_setgate(struct rtentry *rt0, struct sockaddr *dst, struct sockaddr *gate)
+rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
 {
-       caddr_t new, old;
        int dlen = ROUNDUP(dst->sa_len), glen = ROUNDUP(gate->sa_len);
-       struct rtentry *rt = rt0;
        struct radix_node_head *rnh = rt_tables[dst->sa_family];
+
+       lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);
+
        /*
         * A host route with the destination equal to the gateway
         * will interfere with keeping LLINFO in the routing
         * table, so disallow it.
         */
-       
-       lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);
-
-       if (((rt0->rt_flags & (RTF_HOST|RTF_GATEWAY|RTF_LLINFO)) ==
-                                       (RTF_HOST|RTF_GATEWAY)) &&
-           (dst->sa_len == gate->sa_len) &&
+       if (((rt->rt_flags & (RTF_HOST|RTF_GATEWAY|RTF_LLINFO)) ==
+           (RTF_HOST|RTF_GATEWAY)) && (dst->sa_len == gate->sa_len) &&
            (bcmp(dst, gate, dst->sa_len) == 0)) {
                /*
                 * The route might already exist if this is an RTM_CHANGE
                 * or a routing redirect, so try to delete it.
                 */
-               if (rt_key(rt0))
-                       rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt0),
-                           rt0->rt_gateway, rt_mask(rt0), rt0->rt_flags, 0);
-               return EADDRNOTAVAIL;
+               if (rt_key(rt))
+                       rtrequest_locked(RTM_DELETE, rt_key(rt),
+                           rt->rt_gateway, rt_mask(rt), rt->rt_flags, NULL);
+               return (EADDRNOTAVAIL);
        }
 
        /*
-        * Both dst and gateway are stored in the same malloc'd chunk
-        * (If I ever get my hands on....)
-        * if we need to malloc a new chunk, then keep the old one around
-        * till we don't need it any more.
+        * The destination is not directly reachable.  Get a route
+        * to the next-hop gateway and store it in rt_gwroute.
         */
-       if (rt->rt_gateway == 0 || glen > ROUNDUP(rt->rt_gateway->sa_len)) {
-               old = (caddr_t)rt_key(rt);
-               R_Malloc(new, caddr_t, dlen + glen);
-               if (new == 0)
-                       return ENOBUFS;
-               rt->rt_nodes->rn_key = new;
-       } else {
+       if (rt->rt_flags & RTF_GATEWAY) {
+               struct rtentry *gwrt;
+               unsigned int ifscope;
+
+               ifscope = (dst->sa_family == AF_INET) ?
+                   sa_get_ifscope(dst) : IFSCOPE_NONE;
+
+               gwrt = rtalloc1_scoped_locked(gate, 1, RTF_PRCLONING, ifscope);
+
                /*
-                * otherwise just overwrite the old one
+                * Cloning loop avoidance:
+                *
+                * In the presence of protocol-cloning and bad configuration,
+                * it is possible to get stuck in bottomless mutual recursion
+                * (rtrequest rt_setgate rtalloc1).  We avoid this by not
+                * allowing protocol-cloning to operate for gateways (which
+                * is probably the correct choice anyway), and avoid the
+                * resulting reference loops by disallowing any route to run
+                * through itself as a gateway.  This is obviously mandatory
+                * when we get rt->rt_output().  It implies that a route to
+                * the gateway must already be present in the system in order
+                * for the gateway to be referred to by another route.
                 */
-               new = rt->rt_nodes->rn_key;
-               old = 0;
+               if (gwrt == rt) {
+                       rtunref(gwrt);
+                       return (EADDRINUSE); /* failure */
+               }
+
+               /* If scoped, the gateway route must use the same interface */
+               if (ifscope != IFSCOPE_NONE && (rt->rt_flags & RTF_IFSCOPE) &&
+                   gwrt != NULL && gwrt->rt_ifp != NULL &&
+                   gwrt->rt_ifp->if_index != ifscope) {
+                       rtfree_locked(gwrt);
+                       return ((rt->rt_flags & RTF_HOST) ?
+                           EHOSTUNREACH : ENETUNREACH);
+               }
+
+               if (rt->rt_gwroute != NULL)
+                       rtfree_locked(rt->rt_gwroute);
+               rt->rt_gwroute = gwrt;
+
+               /*
+                * In case the (non-scoped) default route gets modified via
+                * an ICMP redirect, record the interface index used for the
+                * primary ifscope.  Also done in rt_setif() to take care
+                * of the non-redirect cases.
+                */
+               if (rt_inet_default(rt, dst) && rt->rt_ifp != NULL)
+                       set_primary_ifscope(rt->rt_ifp->if_index);
+
+               /*
+                * Tell the kernel debugger about the new default gateway
+                * if the gateway route uses the primary interface, or
+                * if we are in a transient state before the non-scoped
+                * default gateway is installed (similar to how the system
+                * was behaving in the past).  In future, it would be good
+                * to do all this only when KDP is enabled.
+                */
+               if ((dst->sa_family == AF_INET) &&
+                   gwrt != NULL && gwrt->rt_gateway->sa_family == AF_LINK &&
+                   (gwrt->rt_ifp->if_index == get_primary_ifscope() ||
+                   get_primary_ifscope() == IFSCOPE_NONE))
+                       kdp_set_gateway_mac(SDL(gwrt->rt_gateway)->sdl_data);
        }
 
        /*
-        * copy the new gateway value into the memory chunk
+        * Prepare to store the gateway in rt_gateway.  Both dst and gateway
+        * are stored one after the other in the same malloc'd chunk.  If we
+        * have room, reuse the old buffer since rt_gateway already points
+        * to the right place.  Otherwise, malloc a new block and update
+        * the 'dst' address and point rt_gateway to the right place.
         */
-       Bcopy(gate, (rt->rt_gateway = (struct sockaddr *)(new + dlen)), glen);
+       if (rt->rt_gateway == NULL || glen > ROUNDUP(rt->rt_gateway->sa_len)) {
+               caddr_t new;
 
-       /*
-        * if we are replacing the chunk (or it's new) we need to
-        * replace the dst as well
-        */
-       if (old) {
+               /* The underlying allocation is done with M_WAITOK set */
+               R_Malloc(new, caddr_t, dlen + glen);
+               if (new == NULL) {
+                       if (rt->rt_gwroute != NULL)
+                               rtfree_locked(rt->rt_gwroute);
+                       rt->rt_gwroute = NULL;
+                       return (ENOBUFS);
+               }
+
+               /*
+                * Copy from 'dst' and not rt_key(rt) because we can get
+                * here to initialize a newly allocated route entry, in
+                * which case rt_key(rt) is NULL (and so is rt_gateway).
+                */
                Bcopy(dst, new, dlen);
-               R_Free(old);
+               R_Free(rt_key(rt));     /* free old block; NULL is okay */
+               rt->rt_nodes->rn_key = new;
+               rt->rt_gateway = (struct sockaddr *)(new + dlen);
        }
 
        /*
-        * If there is already a gwroute, it's now almost definitly wrong
-        * so drop it.
+        * Copy the new gateway value into the memory chunk.
         */
-       if (rt->rt_gwroute) {
-               rt = rt->rt_gwroute; rtfree_locked(rt);
-               rt = rt0; rt->rt_gwroute = 0;
-       }
+       Bcopy(gate, rt->rt_gateway, glen);
+
        /*
-        * Cloning loop avoidance:
-        * In the presence of protocol-cloning and bad configuration,
-        * it is possible to get stuck in bottomless mutual recursion
-        * (rtrequest rt_setgate rtalloc1).  We avoid this by not allowing
-        * protocol-cloning to operate for gateways (which is probably the
-        * correct choice anyway), and avoid the resulting reference loops
-        * by disallowing any route to run through itself as a gateway.
-        * This is obviously mandatory when we get rt->rt_output().
+        * For consistency between rt_gateway and rt_key(gwrt).
         */
-       if (rt->rt_flags & RTF_GATEWAY) {
-               rt->rt_gwroute = rtalloc1_locked(gate, 1, RTF_PRCLONING);
-               if (rt->rt_gwroute == rt) {
-                       rtfree_locked(rt->rt_gwroute);
-                       rt->rt_gwroute = 0;
-                       return EDQUOT; /* failure */
-               }
-               /* Tell the kernel debugger about the new default gateway */
-               if ((AF_INET == rt->rt_gateway->sa_family) && 
-                   rt->rt_gwroute && rt->rt_gwroute->rt_gateway && 
-                   (AF_LINK == rt->rt_gwroute->rt_gateway->sa_family)) {
-                 kdp_set_gateway_mac(((struct sockaddr_dl *)rt0->rt_gwroute->rt_gateway)->sdl_data);
-               }
+       if ((rt->rt_flags & RTF_GATEWAY) && rt->rt_gwroute != NULL &&
+           (rt->rt_gwroute->rt_flags & RTF_IFSCOPE) &&
+           rt->rt_gateway->sa_family == AF_INET &&
+           rt_key(rt->rt_gwroute)->sa_family == AF_INET) {
+               sa_set_ifscope(rt->rt_gateway,
+                   sa_get_ifscope(rt_key(rt->rt_gwroute)));
        }
 
        /*
@@ -1318,10 +1715,10 @@ rt_setgate(struct rtentry *rt0, struct sockaddr *dst, struct sockaddr *gate)
                arg.rnh = rnh;
                arg.rt0 = rt;
                rnh->rnh_walktree_from(rnh, rt_key(rt), rt_mask(rt),
-                                      rt_fixchange, &arg);
+                   rt_fixchange, &arg);
        }
 
-       return 0;
+       return (0);
 }
 
 static void
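
For reference, rt_setgate() above keeps the destination key and the gateway
back to back in one malloc'd chunk, which is why a gateway that outgrows
ROUNDUP(rt_gateway->sa_len) forces a re-malloc of the key as well.  A sketch
of the layout, with a hypothetical accessor mirroring the dlen arithmetic:

	/*
	 * One chunk: [ dst, padded to ROUNDUP(dst->sa_len) ][ gateway ]
	 *              ^ rt_key(rt)                          ^ rt->rt_gateway
	 */
	static struct sockaddr *
	rt_gateway_addr(struct rtentry *rt)	/* hypothetical helper */
	{
		caddr_t key = (caddr_t)rt_key(rt);

		return ((struct sockaddr *)(key + ROUNDUP(rt_key(rt)->sa_len)));
	}
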
@@ -1344,6 +1741,202 @@ rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst,
                bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2));
 }
 
+/*
+ * Lookup an AF_INET scoped or non-scoped route depending on the ifscope
+ * value passed in by the caller (IFSCOPE_NONE implies non-scoped).
+ */
+static struct radix_node *
+node_lookup(struct sockaddr *dst, struct sockaddr *netmask,
+    unsigned int ifscope)
+{
+       struct radix_node_head *rnh = rt_tables[AF_INET];
+       struct radix_node *rn;
+       struct sockaddr_in sin, mask;
+       struct matchleaf_arg ma = { ifscope };
+       rn_matchf_t *f = rn_match_ifscope;
+       void *w = &ma;
+
+       if (dst->sa_family != AF_INET)
+               return (NULL);
+
+       /*
+        * Embed ifscope into the search key; for a non-scoped
+        * search this will clear out any embedded scope value.
+        */
+       dst = sin_copy(SIN(dst), &sin, ifscope);
+
+       /* Embed (or clear) ifscope into netmask */
+       if (netmask != NULL)
+               netmask = mask_copy(netmask, &mask, ifscope);
+
+       if (ifscope == IFSCOPE_NONE)
+               f = w = NULL;
+
+       rn = rnh->rnh_lookup_args(dst, netmask, rnh, f, w);
+       if (rn != NULL && (rn->rn_flags & RNF_ROOT))
+               rn = NULL;
+
+       return (rn);
+}
+
+/*
+ * Lookup the AF_INET non-scoped default route.
+ */
+static struct radix_node *
+node_lookup_default(void)
+{
+       struct radix_node_head *rnh = rt_tables[AF_INET];
+       return (rnh->rnh_lookup(&sin_def, NULL, rnh));
+}
+
+/*
+ * Common routine to lookup/match a route.  It invokes the lookup/matchaddr
+ * callback which could be address family-specific.  The main difference
+ * between the two (at least for AF_INET/AF_INET6) is that a lookup does
+ * not alter the expiring state of a route, whereas a match would unexpire
+ * or revalidate the route.
+ *
+ * The optional scope or interface index property of a route allows for a
+ * per-interface route instance.  This permits multiple route entries having
+ * the same destination (but not necessarily the same gateway) to exist in
+ * the routing table; each of these entries is specific to the corresponding
+ * interface.  This is made possible by embedding the scope value into the
+ * radix key, thus making each route entry unique.  These scoped entries
+ * exist along with the regular, non-scoped entries in the same radix tree
+ * for a given address family (currently AF_INET only); the scope logically
+ * partitions it into multiple per-interface sub-trees.
+ *
+ * When a scoped route lookup is performed, the routing table is searched for
+ * the best match that would result in a route using the same interface as the
+ * one associated with the scope (the exceptions to this are routes that point
+ * to the loopback interface).  The search rule follows the longest matching
+ * prefix with the additional interface constraint.
+ */
+struct rtentry *
+rt_lookup(boolean_t lookup_only, struct sockaddr *dst, struct sockaddr *netmask,
+    struct radix_node_head *rnh, unsigned int ifscope)
+{
+       struct radix_node *rn0, *rn;
+       boolean_t dontcare = (ifscope == IFSCOPE_NONE);
+
+       lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);
+
+       if (!lookup_only)
+               netmask = NULL;
+
+       /*
+        * Non-scoped route lookup.
+        */
+       if (!ip_doscopedroute || dst->sa_family != AF_INET) {
+               if (lookup_only)
+                       rn = rnh->rnh_lookup(dst, netmask, rnh);
+               else
+                       rn = rnh->rnh_matchaddr(dst, rnh);
+               goto done;
+       }
+
+       /*
+        * Scoped route lookup:
+        *
+        * We first perform a non-scoped lookup for the original result.
+        * Afterwards, depending on whether or not the caller has specified
+        * a scope, we perform a more specific scoped search and fall back
+        * to this original result upon failure.
+        */
+       rn0 = rn = node_lookup(dst, netmask, IFSCOPE_NONE);
+
+       /*
+        * If the caller did not specify a scope, use the primary scope
+        * derived from the system's non-scoped default route.  If, for
+        * any reason, there is no primary interface, return what we have.
+        */
+       if (dontcare && (ifscope = get_primary_ifscope()) == IFSCOPE_NONE)
+               goto validate;
+
+       /*
+        * Keep the original result if either of the following is true:
+        *
+        *   1) The interface portion of the route has the same interface
+        *      index as the scope value and it is marked with RTF_IFSCOPE.
+        *   2) The route uses the loopback interface, in which case the
+        *      destination (host/net) is local/loopback.
+        *
+        * Otherwise, do a more specific search using the scope.
+        */
+       if (rn != NULL) {
+               struct rtentry *rt = RT(rn);
+               if (rt->rt_ifp != lo_ifp) {
+                       if (rt->rt_ifp->if_index != ifscope) {
+                               /*
+                                * Wrong interface; keep the original result
+                                * only if the caller did not specify a scope,
+                                * and do a more specific scoped search using
+                                * the scope of the found route.  Otherwise,
+                                * start again from scratch.
+                                */
+                               rn = NULL;
+                               if (dontcare)
+                                       ifscope = rt->rt_ifp->if_index;
+                               else
+                                       rn0 = NULL;
+                       } else if (!(rt->rt_flags & RTF_IFSCOPE)) {
+                               /*
+                                * Right interface, except that this route
+                                * isn't marked with RTF_IFSCOPE.  Do a more
+                                * specific scoped search.  Keep the original
+                                * result and return it in case the scoped
+                                * search fails.
+                                */
+                               rn = NULL;
+                       }
+               }
+       }
+
+       /*
+        * Scoped search.  Find the most specific entry having the same
+        * interface scope as the one requested.  The following will result
+        * in searching for the longest prefix scoped match.
+        */
+       if (rn == NULL)
+               rn = node_lookup(dst, netmask, ifscope);
+
+       /*
+        * Use the original result if either of the following is true:
+        *
+        *   1) The scoped search did not yield any result.
+        *   2) The result from the scoped search is a scoped default route,
+        *      and the original (non-scoped) result is not a default route,
+        *      i.e. the original result is a more specific host/net route.
+        *   3) The scoped search yielded a net route but the original
+        *      result is a host route, i.e. the original result is treated
+        *      as a more specific route.
+        */
+       if (rn == NULL || (rn0 != NULL &&
+           ((INET_DEFAULT(rt_key(RT(rn))) && !INET_DEFAULT(rt_key(RT(rn0)))) ||
+           (!RT_HOST(rn) && RT_HOST(rn0)))))
+               rn = rn0;
+
+       /*
+        * If we still don't have a route, use the non-scoped default
+        * route as long as the interface portion satisfies the scope.
+        */
+       if (rn == NULL && (rn = node_lookup_default()) != NULL &&
+           RT(rn)->rt_ifp->if_index != ifscope)
+               rn = NULL;
+
+validate:
+       if (rn != NULL && !lookup_only)
+               (void) in_validate(rn);
+
+done:
+       if (rn != NULL && (rn->rn_flags & RNF_ROOT))
+               rn = NULL;
+       else if (rn != NULL)
+               rtref(RT(rn));
+
+       return (RT(rn));
+}
+
 /*
  * Set up a routing table entry, normally
  * for an interface.
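
node_lookup(), node_lookup_default() and rt_lookup() above are the only entry
points that understand embedded scopes.  A minimal sketch of a scoped lookup
as a caller would issue it (holding rt_mtx, per the assertion in rt_lookup();
"dst" and "ifp" are assumed to be set up already):

	struct rtentry *rt;

	lck_mtx_lock(rt_mtx);
	/*
	 * lookup_only=TRUE goes through rnh_lookup and leaves the
	 * expiring state of the entry alone; a NULL netmask searches
	 * on the key alone.  Passing IFSCOPE_NONE as the scope would
	 * fall back to the primary interface scope instead.
	 */
	rt = rt_lookup(TRUE, (struct sockaddr *)&dst, NULL,
	    rt_tables[AF_INET], ifp->if_index);
	if (rt != NULL) {
		/* ... use rt; it was returned with a reference held ... */
		rtunref(rt);
	}
	lck_mtx_unlock(rt_mtx);
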
diff --git a/bsd/net/route.h b/bsd/net/route.h
index 9d26a8bbd5f2b3afad4db6779cc794412f40fb3b..cfc95a6aa7f1ea19f31d8b0a41c2789cd892573e 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000,2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -84,8 +84,12 @@ struct  rtentry;
 struct route {
        struct  rtentry *ro_rt;
        struct  sockaddr ro_dst;
-       u_long  reserved[2];    /* for future use if needed */
+       u_int32_t       ro_flags;       /* route flags (see below) */
+       u_int32_t       reserved;       /* for future use if needed */
 };
+
+#define        ROF_SRCIF_SELECTED      0x1 /* source interface was selected */
+
 #else
 struct route;
 #endif /* PRIVATE */
@@ -195,8 +199,8 @@ struct ortentry {
 #define        RTF_LOCAL       0x200000        /* route represents a local address */
 #define        RTF_BROADCAST   0x400000        /* route represents a bcast address */
 #define        RTF_MULTICAST   0x800000        /* route represents a mcast address */
-#define RTF_TRACKREFS  0x1000000       /* Debug references and releases */
-                                       /* 0x1000000 and up unassigned */
+#define RTF_IFSCOPE    0x1000000       /* has valid interface scope */
+                                       /* 0x2000000 and up unassigned */
 
 /*
  * Routing statistics.
@@ -323,6 +327,11 @@ struct route_cb {
 };
 
 #ifdef KERNEL_PRIVATE
+/*
+ * For scoped routing; a zero interface scope value means nil/no scope.
+ */
+#define        IFSCOPE_NONE    0
+
 #define RTFREE(rt)     rtfree(rt)
 extern struct route_cb route_cb;
 extern struct radix_node_head *rt_tables[AF_MAX+1];
@@ -338,11 +347,19 @@ extern void rt_missmsg(int, struct rt_addrinfo *, int, int);
 extern void rt_newaddrmsg(int, struct ifaddr *, int, struct rtentry *);
 extern void rt_newmaddrmsg(int, struct ifmultiaddr *);
 extern int rt_setgate(struct rtentry *, struct sockaddr *, struct sockaddr *);
+extern void set_primary_ifscope(unsigned int);
+extern unsigned int get_primary_ifscope(void);
+extern boolean_t rt_inet_default(struct rtentry *, struct sockaddr *);
+extern struct rtentry *rt_lookup(boolean_t, struct sockaddr *,
+    struct sockaddr *, struct radix_node_head *, unsigned int);
 extern void rtalloc(struct route *);
 extern void rtalloc_ign(struct route *, u_long);
-extern void rtalloc_ign_locked(struct route *, u_long );
+extern void rtalloc_ign_locked(struct route *, u_long);
+extern void rtalloc_scoped_ign_locked(struct route *, u_long, unsigned int);
 extern struct rtentry *rtalloc1(struct sockaddr *, int, u_long);
 extern struct rtentry *rtalloc1_locked(struct sockaddr *, int, u_long);
+extern struct rtentry *rtalloc1_scoped_locked(struct sockaddr *, int,
+    u_long, unsigned int);
 extern void rtfree(struct rtentry *);
 extern void rtfree_locked(struct rtentry *);
 extern void rtref(struct rtentry *);
@@ -356,14 +373,17 @@ extern void rtsetifa(struct rtentry *, struct ifaddr *);
 extern int rtinit(struct ifaddr *, int, int);
 extern int rtinit_locked(struct ifaddr *, int, int);
 extern int rtioctl(int, caddr_t, struct proc *);
-extern void rtredirect(struct sockaddr *, struct sockaddr *,
+extern void rtredirect(struct ifnet *, struct sockaddr *, struct sockaddr *,
     struct sockaddr *, int, struct sockaddr *, struct rtentry **);
 extern int rtrequest(int, struct sockaddr *,
     struct sockaddr *, struct sockaddr *, int, struct rtentry **);
 extern int rtrequest_locked(int, struct sockaddr *,
     struct sockaddr *, struct sockaddr *, int, struct rtentry **);
+extern int rtrequest_scoped_locked(int, struct sockaddr *, struct sockaddr *,
+    struct sockaddr *, int, struct rtentry **, unsigned int);
 extern struct rtentry *rte_alloc(void);
 extern void rte_free(struct rtentry *);
+extern unsigned int sa_get_ifscope(struct sockaddr *);
 #endif KERNEL_PRIVATE
 
 #endif
diff --git a/bsd/net/rtsock.c b/bsd/net/rtsock.c
index b4aee361bb67a881f0b671205f89d8019c706407..b6836fd6c07887efa35aadfdeea5dc8c3797b3f0 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -83,7 +83,6 @@
 #include <machine/spl.h>
 
 extern struct rtstat rtstat;
-extern int rttrash;
 extern u_long route_generation;
 extern int use_routegenid;
 extern int check_routeselfref;
@@ -113,7 +112,9 @@ static int  sysctl_iflist2(int af, struct walkarg *w);
 static int      route_output(struct mbuf *, struct socket *);
 static void     rt_setmetrics(u_long, struct rt_metrics *, struct rt_metrics *);
 static void    rt_setif(struct rtentry *, struct sockaddr *, struct sockaddr *,
-                             struct sockaddr *);
+                   struct sockaddr *, unsigned int);
+
+#define        SIN(sa)         ((struct sockaddr_in *)(size_t)(sa))
 
 /* Sleazy use of local variables throughout file, warning!!!! */
 #define dst    info.rti_info[RTAX_DST]
@@ -308,10 +309,13 @@ route_output(struct mbuf *m, struct socket *so)
 #ifndef __APPLE__
        struct proc  *curproc = current_proc();
 #endif
+       struct sockaddr_in dst_in, gate_in;
        int sendonlytoself = 0;
+       unsigned int ifscope = IFSCOPE_NONE;
 
 #define senderr(e) { error = e; goto flush;}
-       if (m == 0 || ((m->m_len < sizeof(long)) && (m = m_pullup(m, sizeof(long))) == 0))
+       if (m == NULL ||
+           ((m->m_len < sizeof(long)) && (m = m_pullup(m, sizeof(long))) == 0))
                return (ENOBUFS);
        if ((m->m_flags & M_PKTHDR) == 0)
                panic("route_output");
@@ -323,20 +327,20 @@ route_output(struct mbuf *m, struct socket *so)
        len = m->m_pkthdr.len;
        if (len < sizeof(*rtm) ||
            len != mtod(m, struct rt_msghdr *)->rtm_msglen) {
-               dst = 0;
+               dst = NULL;
                senderr(EINVAL);
        }
        R_Malloc(rtm, struct rt_msghdr *, len);
-       if (rtm == 0) {
-               dst = 0;
+       if (rtm == NULL) {
+               dst = NULL;
                senderr(ENOBUFS);
        }
        m_copydata(m, 0, len, (caddr_t)rtm);
        if (rtm->rtm_version != RTM_VERSION) {
-               dst = 0;
+               dst = NULL;
                senderr(EPROTONOSUPPORT);
        }
-       
+
        /*
         * Silent version of RTM_GET for Reachability APIs. We may change
         * all RTM_GETs to be silent in the future, so this is private for now.
@@ -347,26 +351,52 @@ route_output(struct mbuf *m, struct socket *so)
                sendonlytoself = 1;
                rtm->rtm_type = RTM_GET;
        }
-       
+
        /*
         * Perform permission checking, only privileged sockets
         * may perform operations other than RTM_GET
         */
        if (rtm->rtm_type != RTM_GET && (so->so_state & SS_PRIV) == 0) {
-               dst = 0;
+               dst = NULL;
                senderr(EPERM);
        }
 
        rtm->rtm_pid = proc_selfpid();
        info.rti_addrs = rtm->rtm_addrs;
        if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) {
-               dst = 0;
+               dst = NULL;
                senderr(EINVAL);
        }
-       if (dst == 0 || (dst->sa_family >= AF_MAX)
-           || (gate != 0 && (gate->sa_family >= AF_MAX))) {
+       if (dst == NULL || (dst->sa_family >= AF_MAX) ||
+           (gate != NULL && (gate->sa_family >= AF_MAX))) {
                senderr(EINVAL);
        }
+
+       if (dst->sa_family == AF_INET && dst->sa_len != sizeof (dst_in)) {
+               /* At minimum, we need up to sin_addr */
+               if (dst->sa_len < offsetof(struct sockaddr_in, sin_zero))
+                       senderr(EINVAL);
+               bzero(&dst_in, sizeof (dst_in));
+               dst_in.sin_len = sizeof (dst_in);
+               dst_in.sin_family = AF_INET;
+               dst_in.sin_port = SIN(dst)->sin_port;
+               dst_in.sin_addr = SIN(dst)->sin_addr;
+               dst = (struct sockaddr *)&dst_in;
+       }
+
+       if (gate != NULL &&
+           gate->sa_family == AF_INET && gate->sa_len != sizeof (gate_in)) {
+               /* At minimum, we need up to sin_addr */
+               if (gate->sa_len < offsetof(struct sockaddr_in, sin_zero))
+                       senderr(EINVAL);
+               bzero(&gate_in, sizeof (gate_in));
+               gate_in.sin_len = sizeof (gate_in);
+               gate_in.sin_family = AF_INET;
+               gate_in.sin_port = SIN(gate)->sin_port;
+               gate_in.sin_addr = SIN(gate)->sin_addr;
+               gate = (struct sockaddr *)&gate_in;
+       }
+
        if (genmask) {
                struct radix_node *t;
                t = rn_addmask((caddr_t)genmask, 0, 1);
@@ -375,10 +405,21 @@ route_output(struct mbuf *m, struct socket *so)
                else
                        senderr(ENOBUFS);
        }
+
+       /*
+        * If RTF_IFSCOPE flag is set, then rtm_index specifies the scope.
+        */
+       if (rtm->rtm_flags & RTF_IFSCOPE) {
+               /* Scoped routing is for AF_INET only */
+               if (dst->sa_family != AF_INET)
+                       senderr(EINVAL);
+               ifscope = rtm->rtm_index;
+       }
+
        switch (rtm->rtm_type) {
-       
+
                case RTM_ADD:
-                       if (gate == 0)
+                       if (gate == NULL)
                                senderr(EINVAL);
 
 #ifdef __APPLE__
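
Per the convention established above, a scoped request from userland sets
RTF_IFSCOPE in rtm_flags and carries the interface index in rtm_index.  A
rough sketch of a scoped RTM_ADD over a PF_ROUTE socket ("routefd" is
assumed; address payloads and error handling elided):

	struct {
		struct rt_msghdr hdr;
		struct sockaddr_in dst, gw, mask;
	} msg;

	bzero(&msg, sizeof (msg));
	msg.hdr.rtm_msglen = sizeof (msg);
	msg.hdr.rtm_version = RTM_VERSION;
	msg.hdr.rtm_type = RTM_ADD;
	msg.hdr.rtm_flags = RTF_UP | RTF_GATEWAY | RTF_STATIC | RTF_IFSCOPE;
	msg.hdr.rtm_index = if_nametoindex("en0");	/* the scope */
	msg.hdr.rtm_addrs = RTA_DST | RTA_GATEWAY | RTA_NETMASK;
	msg.hdr.rtm_seq = 1;
	/* ... fill in msg.dst, msg.gw and msg.mask ... */
	write(routefd, &msg, msg.hdr.rtm_msglen);
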
@@ -409,8 +450,8 @@ route_output(struct mbuf *m, struct socket *so)
                        }
 }
 #endif 
-                       error = rtrequest_locked(RTM_ADD, dst, gate, netmask,
-                                               rtm->rtm_flags, &saved_nrt);
+                       error = rtrequest_scoped_locked(RTM_ADD, dst, gate,
+                           netmask, rtm->rtm_flags, &saved_nrt, ifscope);
                        if (error == 0 && saved_nrt) {
 #ifdef __APPLE__
                                /* 
@@ -441,21 +482,22 @@ route_output(struct mbuf *m, struct socket *so)
                                 * dwiggins@bbn.com
                                 */
        
-                               rt_setif(saved_nrt, ifpaddr, ifaaddr, gate);
+                               rt_setif(saved_nrt, ifpaddr, ifaaddr, gate,
+                                   ifscope);
 #endif
                                rt_setmetrics(rtm->rtm_inits,
                                        &rtm->rtm_rmx, &saved_nrt->rt_rmx);
                                saved_nrt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits);
                                saved_nrt->rt_rmx.rmx_locks |=
                                        (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks);
-                               rtunref(saved_nrt);
                                saved_nrt->rt_genmask = genmask;
+                               rtunref(saved_nrt);
                        }
                        break;
 
                case RTM_DELETE:
-                       error = rtrequest_locked(RTM_DELETE, dst, gate, netmask,
-                                       rtm->rtm_flags, &saved_nrt);
+                       error = rtrequest_scoped_locked(RTM_DELETE, dst,
+                           gate, netmask, rtm->rtm_flags, &saved_nrt, ifscope);
                        if (error == 0) {
                                rt = saved_nrt;
                                goto report;
@@ -465,13 +507,17 @@ route_output(struct mbuf *m, struct socket *so)
                case RTM_GET:
                case RTM_CHANGE:
                case RTM_LOCK:
-                       if ((rnh = rt_tables[dst->sa_family]) == 0) {
+                       if ((rnh = rt_tables[dst->sa_family]) == NULL)
                                senderr(EAFNOSUPPORT);
-                       } else if ((rt = (struct rtentry *)
-                                       rnh->rnh_lookup(dst, netmask, rnh)) != NULL)
-                               rtref(rt);
-                       else
+
+                       /*
+                        * Lookup the best match based on the key-mask pair;
+                        * callee adds a reference and checks for root node.
+                        */
+                       rt = rt_lookup(TRUE, dst, netmask, rnh, ifscope);
+                       if (rt == NULL)
                                senderr(ESRCH);
+
                        switch(rtm->rtm_type) {
 
                                case RTM_GET: {
@@ -534,7 +580,8 @@ route_output(struct mbuf *m, struct socket *so)
                                         * equivalent to the code found at this very spot
                                         * in BSD.
                                         */
-                                       rt_setif(rt, ifpaddr, ifaaddr, gate);
+                                       rt_setif(rt, ifpaddr, ifaaddr, gate,
+                                           ifscope);
 #endif
                
                                        rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx,
@@ -638,11 +685,8 @@ rt_setmetrics(u_long which, struct rt_metrics *in, struct rt_metrics *out)
  * Set route's interface given ifpaddr, ifaaddr, and gateway.
  */
 static void
-rt_setif(
-       struct rtentry *rt,
-       struct sockaddr *Ifpaddr,
-       struct sockaddr *Ifaaddr,
-       struct sockaddr *Gate)
+rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr,
+    struct sockaddr *Gate, unsigned int ifscope)
 {
        struct ifaddr *ifa = 0;
        struct ifnet  *ifp = 0;
@@ -653,17 +697,16 @@ rt_setif(
        if (use_routegenid)
                route_generation++;
 
-       /* new gateway could require new ifaddr, ifp;
-          flags may also be different; ifp may be specified
-          by ll sockaddr when protocol address is ambiguous */
-       if (Ifpaddr && (ifa = ifa_ifwithnet(Ifpaddr)) &&
+       /*
+        * New gateway could require new ifaddr, ifp; flags may also
+        * be different; ifp may be specified by ll sockaddr when
+        * protocol address is ambiguous.
+        */
+       if (Ifpaddr && (ifa = ifa_ifwithnet_scoped(Ifpaddr, ifscope)) &&
            (ifp = ifa->ifa_ifp) && (Ifaaddr || Gate)) {
-           ifafree(ifa);
-               ifa = ifaof_ifpforaddr(Ifaaddr ? Ifaaddr : Gate,
-                                       ifp);
-       }
-       else
-       {
+               ifafree(ifa);
+               ifa = ifaof_ifpforaddr(Ifaaddr ? Ifaaddr : Gate, ifp);
+       } else {
                if (ifa) {
                        ifafree(ifa);
                        ifa = 0;
@@ -671,32 +714,36 @@ rt_setif(
                if (Ifpaddr && (ifp = if_withname(Ifpaddr)) ) {
                        if (Gate) {
                                ifa = ifaof_ifpforaddr(Gate, ifp);
-                       }
-                       else {
+                       } else {
                                ifnet_lock_shared(ifp);
                                ifa = TAILQ_FIRST(&ifp->if_addrhead);
                                ifaref(ifa);
                                ifnet_lock_done(ifp);
                        }
-               }
-               else if (Ifaaddr && (ifa = ifa_ifwithaddr(Ifaaddr))) {
+               } else if (Ifaaddr &&
+                   (ifa = ifa_ifwithaddr_scoped(Ifaaddr, ifscope))) {
                        ifp = ifa->ifa_ifp;
-               }
-               else if (Gate && (ifa = ifa_ifwithroute_locked(rt->rt_flags,
-                                               rt_key(rt), Gate))) {
+               } else if (Gate &&
+                   (ifa = ifa_ifwithroute_scoped_locked(rt->rt_flags,
+                   rt_key(rt), Gate, ifscope))) {
                        ifp = ifa->ifa_ifp;
                }
        }
        if (ifa) {
                struct ifaddr *oifa = rt->rt_ifa;
                if (oifa != ifa) {
-                   if (oifa && oifa->ifa_rtrequest)
-                       oifa->ifa_rtrequest(RTM_DELETE,
-                                               rt, Gate);
+                       if (oifa && oifa->ifa_rtrequest)
+                               oifa->ifa_rtrequest(RTM_DELETE, rt, Gate);
                        rtsetifa(rt, ifa);
-                   rt->rt_ifp = ifp;
-                   rt->rt_rmx.rmx_mtu = ifp->if_mtu;
-                   if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest)
+                       rt->rt_ifp = ifp;
+                       /*
+                        * If this is the (non-scoped) default route, record
+                        * the interface index used for the primary ifscope.
+                        */
+                       if (rt_inet_default(rt, rt_key(rt)))
+                               set_primary_ifscope(rt->rt_ifp->if_index);
+                       rt->rt_rmx.rmx_mtu = ifp->if_mtu;
+                       if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest)
                                rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, Gate);
                } else {
                        ifafree(ifa);
@@ -705,7 +752,7 @@ rt_setif(
                ifafree(ifa);
                return;
        }
-      call_ifareq:
+call_ifareq:
        /* XXX: to reset gateway to correct value, at RTM_CHANGE */
        if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest)
                rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, Gate);
@@ -1311,6 +1358,7 @@ sysctl_rttrash(struct sysctl_req *req)
 static int
 sysctl_rtsock SYSCTL_HANDLER_ARGS
 {
+#pragma unused(oidp)
        int     *name = (int *)arg1;
        u_int   namelen = arg2;
        struct radix_node_head *rnh;
diff --git a/bsd/netinet/in.c b/bsd/netinet/in.c
index 2663dd4c374979e9c10ea5f91a0a8e8c2d9fbe99..3a1c1bc6e951357fac3ad4439ae376b29b355d44 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -785,15 +785,6 @@ in_control(
                 */
                in_ifscrub(ifp, ia, 1);
                ifa = &ia->ia_ifa;
-#if CONFIG_FORCE_OUT_IFP       
-               // Cleanup any pdp hack related route
-               if (ia->ia_route)
-               {
-                       ia->ia_route->rt_flags &= ~RTF_UP;
-                       rtfree_locked(ia->ia_route);
-                       ia->ia_route = NULL;
-               }
-#endif
                lck_mtx_unlock(rt_mtx);
                ifnet_lock_exclusive(ifp);
                if_detach_ifa(ifp, ifa);
diff --git a/bsd/netinet/in.h b/bsd/netinet/in.h
index 7f23a9e6a6168eb1020c76fa158aa36f6cbd5a32..0fcbd52d1a8d05db0e66c4e8bb5a91000df7817e 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -366,7 +366,7 @@ struct sockaddr_in {
        sa_family_t     sin_family;
        in_port_t       sin_port;
        struct  in_addr sin_addr;
-       char            sin_zero[8];            /* XXX bwg2001-004 */
+       char            sin_zero[8];
 };
 
 #define INET_ADDRSTRLEN                 16
@@ -414,7 +414,8 @@ struct ip_opts {
 #ifdef __APPLE__
 #define IP_STRIPHDR            23   /* bool: drop receive of raw IP header */
 #endif
-#define IP_RECVTTL                     24      /* bool; receive reception TTL w/dgram */
+#define IP_RECVTTL             24   /* bool; receive reception TTL w/dgram */
+#define        IP_BOUND_IF             25   /* set/get bound interface */
 
 
 #define        IP_FW_ADD               40   /* add a firewall rule to chain */
@@ -441,8 +442,7 @@ struct ip_opts {
 #define        IP_TRAFFIC_MGT_BACKGROUND       65   /* int*; get background IO flags; set background IO */
 
 #ifdef PRIVATE
-/* This is a hack, this is only a hack. */
-#define        IP_FORCE_OUT_IFP        69      /* char ifname[] - send traffic on this interface */
+#define        IP_FORCE_OUT_IFP        69  /* deprecated; use IP_BOUND_IF instead */
 #endif
 
 /* Background socket configuration flags */
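
Unlike the interface-name hack it replaces, IP_BOUND_IF takes a plain
interface index, matching the scoped routing machinery underneath.  A
minimal userland sketch:

	int s = socket(AF_INET, SOCK_DGRAM, 0);
	int ifindex = if_nametoindex("en0");

	/* constrain this socket's traffic to en0 */
	if (setsockopt(s, IPPROTO_IP, IP_BOUND_IF,
	    &ifindex, sizeof (ifindex)) == -1)
		perror("setsockopt(IP_BOUND_IF)");
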
diff --git a/bsd/netinet/in_arp.c b/bsd/netinet/in_arp.c
index 174aa7742ce058975a915bd41fbf47c6101e2d8d..940b4bf0d701354cd0c1e8b0b9c1fc25e522e705 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -372,7 +372,8 @@ arp_lookup_route(
        const struct in_addr *addr,
        int     create,
        int proxy,
-       route_t *route)
+       route_t *route,
+       unsigned int ifscope)
 {
        struct sockaddr_inarp sin = {sizeof(sin), AF_INET, 0, {0}, {0}, 0, 0};
        const char *why = NULL;
@@ -383,8 +384,9 @@ arp_lookup_route(
 
        sin.sin_addr.s_addr = addr->s_addr;
        sin.sin_other = proxy ? SIN_PROXY : 0;
-       
-       *route = rtalloc1_locked((struct sockaddr*)&sin, create, 0);
+
+       *route = rtalloc1_scoped_locked((struct sockaddr*)&sin,
+           create, 0, ifscope);
        if (*route == NULL)
                return ENETUNREACH;
        
@@ -416,7 +418,7 @@ arp_lookup_route(
        
        if (why && create && log_arp_warnings) {
                char    tmp[MAX_IPv4_STR_LEN];
-               log(LOG_DEBUG, "arplookup %s failed: %s\n",
+               log(LOG_DEBUG, "arplookup link#%d %s failed: %s\n", ifscope,
                        inet_ntop(AF_INET, addr, tmp, sizeof(tmp)), why);
        }
        
@@ -453,7 +455,8 @@ arp_route_to_gateway_route(
                
                if ((route->rt_flags & RTF_UP) == 0) {
                        /* route is down, find a new one */
-                       hint = route = rtalloc1_locked(net_dest, 1, 0);
+                       hint = route = rtalloc1_scoped_locked(net_dest,
+                           1, 0, route->rt_ifp->if_index);
                        if (hint) {
                                rtunref(hint);
                        }
@@ -474,7 +477,9 @@ arp_route_to_gateway_route(
                                if (route->rt_gwroute != 0)
                                        rtfree_locked(route->rt_gwroute);
                                
-                               route->rt_gwroute = rtalloc1_locked(route->rt_gateway, 1, 0);
+                               route->rt_gwroute = rtalloc1_scoped_locked(
+                                   route->rt_gateway, 1, 0,
+                                   route->rt_ifp->if_index);
                                if (route->rt_gwroute == 0) {
                                        lck_mtx_unlock(rt_mtx);
                                        return EHOSTUNREACH;
@@ -560,7 +565,8 @@ arp_lookup_ip(
         * route and link layer information.
         */
        if (route == NULL || route->rt_llinfo == NULL)
-               result = arp_lookup_route(&net_dest->sin_addr, 1, 0, &route);
+               result = arp_lookup_route(&net_dest->sin_addr, 1, 0, &route,
+                   ifp->if_index);
        
        if (result || route == NULL || route->rt_llinfo == NULL) {
                char    tmp[MAX_IPv4_STR_LEN];
@@ -706,10 +712,11 @@ arp_ip_handle_input(
        
        /*
         * Look up the routing entry. If it doesn't exist and we are the
-        * target, go ahead and create one.
+        * target, and the sender isn't 0.0.0.0, go ahead and create one.
         */
-       error = arp_lookup_route(&sender_ip->sin_addr, (target_ip->sin_addr.s_addr ==
-                               best_ia->ia_addr.sin_addr.s_addr), 0, &route);
+       error = arp_lookup_route(&sender_ip->sin_addr,
+           (target_ip->sin_addr.s_addr == best_ia->ia_addr.sin_addr.s_addr &&
+           sender_ip->sin_addr.s_addr != 0), 0, &route, ifp->if_index);
        if (error || route == 0 || route->rt_gateway == 0) {
                if (arpop != ARPOP_REQUEST) {
                        goto respond;
@@ -723,7 +730,8 @@ arp_ip_handle_input(
                         * Verify this ARP probe doesn't conflict with an IPv4LL we know of
                         * on another interface.
                         */
-                       error = arp_lookup_route(&target_ip->sin_addr, 0, 0, &route);
+                       error = arp_lookup_route(&target_ip->sin_addr, 0, 0,
+                           &route, ifp->if_index);
                        if (error == 0 && route && route->rt_gateway) {
                                gateway = SDL(route->rt_gateway);
                                if (route->rt_ifp != ifp && gateway->sdl_alen != 0 
@@ -768,7 +776,8 @@ arp_ip_handle_input(
                        /* don't create entry if link-local address and link-local is disabled */
                        if (!IN_LINKLOCAL(ntohl(sender_ip->sin_addr.s_addr)) 
                            || (ifp->if_eflags & IFEF_ARPLL) != 0) {
-                               error = arp_lookup_route(&sender_ip->sin_addr, 1, 0, &route);
+                               error = arp_lookup_route(&sender_ip->sin_addr,
+                                   1, 0, &route, ifp->if_index);
                                if (error == 0 && route != NULL && route->rt_gateway != NULL) {
                                        created_announcement = 1;
                                }
@@ -877,7 +886,8 @@ respond:
        if (target_ip->sin_addr.s_addr != best_ia->ia_addr.sin_addr.s_addr) {
        
                /* Find a proxy route */
-               error = arp_lookup_route(&target_ip->sin_addr, 0, SIN_PROXY, &route);
+               error = arp_lookup_route(&target_ip->sin_addr, 0, SIN_PROXY,
+                   &route, ifp->if_index);
                if (error || route == NULL) {
                        
                        /* We don't have a route entry indicating we should use proxy */
@@ -888,7 +898,9 @@ respond:
                        }
                        
                        /* See if we have a route to the target ip before we proxy it */
-                       route = rtalloc1_locked((const struct sockaddr*)target_ip, 0, 0);
+                       route = rtalloc1_scoped_locked(
+                           (const struct sockaddr *)target_ip, 0, 0,
+                           ifp->if_index);
                        if (!route) {
                                lck_mtx_unlock(rt_mtx);
                                return 0;
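
A note on the pattern above: every ARP-side lookup now carries an interface index, so a host with the same subnet reachable on two interfaces resolves entries against the intended one. A minimal sketch of the scoped-lookup convention, where the helper itself is hypothetical (not part of this commit) and rt_mtx must be held around the *_locked routing calls, as the gif hunk below also shows:

    /*
     * Hypothetical helper: resolve a route for dst restricted to the
     * interface identified by ifscope; IFSCOPE_NONE requests an
     * ordinary, unscoped lookup.
     */
    static struct rtentry *
    route_lookup_scoped(struct sockaddr *dst, unsigned int ifscope)
    {
            struct rtentry *rt;

            lck_mtx_lock(rt_mtx);
            rt = rtalloc1_scoped_locked(dst, 1, 0, ifscope);
            lck_mtx_unlock(rt_mtx);
            return (rt);    /* holds a reference; rtfree() when done */
    }
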
diff --git a/bsd/netinet/in_gif.c b/bsd/netinet/in_gif.c
index 0c74fc181baa29f9faaaee4a61bc8733f52ed1b0..787c688ceb67b420b46847bf2b95c923f6331c0a 100644 (file)
@@ -113,6 +113,7 @@ in_gif_output(
        struct ip iphdr;        /* capsule IP header, host byte ordered */
        int proto, error;
        u_int8_t tos;
+       struct ip_out_args ipoa = { IFSCOPE_NONE };
 
        if (sin_src == NULL || sin_dst == NULL ||
            sin_src->sin_family != AF_INET ||
@@ -226,7 +227,7 @@ in_gif_output(
 #endif
        }
 
-       error = ip_output(m, NULL, &sc->gif_ro, 0, NULL, NULL);
+       error = ip_output(m, NULL, &sc->gif_ro, IP_OUTARGS, NULL, &ipoa);
        return(error);
 }
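
For reference, this is the calling convention the commit migrates callers to: set IP_OUTARGS in the flags and hand ip_output() a struct ip_out_args whose ipoa_ifscope is either a specific interface index or IFSCOPE_NONE. A hedged sketch; bound_ifp and ro are illustrative variables, not from the commit:

    struct ip_out_args ipoa = { IFSCOPE_NONE };
    int error;

    if (bound_ifp != NULL)          /* illustrative interface binding */
            ipoa.ipoa_ifscope = bound_ifp->if_index;
    error = ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
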
 
@@ -386,7 +387,10 @@ gif_encapcheck4(
                sin.sin_family = AF_INET;
                sin.sin_len = sizeof(struct sockaddr_in);
                sin.sin_addr = ip.ip_src;
-               rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL);
+               lck_mtx_lock(rt_mtx);
+               rt = rtalloc1_scoped_locked((struct sockaddr *)&sin, 0, 0,
+                   m->m_pkthdr.rcvif->if_index);
+               lck_mtx_unlock(rt_mtx);
                if (!rt || rt->rt_ifp != m->m_pkthdr.rcvif) {
 #if 0
                        log(LOG_WARNING, "%s: packet from 0x%x dropped "
diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c
index fce3bb78b8b2a9c8f87bf63f50fa31f961e4c004..af785060a1db4c5ada0846f783a9e4788df22ce0 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -313,6 +313,7 @@ in_pcblookup_local_and_cleanup(
 }
 
 #ifdef __APPLE_API_PRIVATE
+static void
 in_pcb_conflict_post_msg(u_int16_t port)
 {
        /* 
@@ -569,77 +570,6 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
        return (0);
 }
 
-#if CONFIG_FORCE_OUT_IFP
-/*
- * pdp_context_route_locked is losely based on rtalloc_ign_locked with
- * the hope that it can be used anywhere rtalloc_ign_locked is.
- */
-__private_extern__ void
-pdp_context_route_locked(ifnet_t ifp, struct route *ro)
-{
-       struct in_ifaddr        *ia;
-       struct rtentry          *rt;
-
-       if ((rt = ro->ro_rt) != NULL) {
-               if (rt->rt_ifp == ifp && rt->rt_flags & RTF_UP)
-                       return;
-
-               rtfree_locked(rt);
-               ro->ro_rt = NULL;
-       }
-
-       if (ifp == NULL)
-               return;
-
-       /* Find the first IP address, we will use a fake route off of that */
-       TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
-               if (ia->ia_ifp == ifp)
-                       break;
-       }
-
-       /* Hrmm no IP addresses here :( */
-       if (ia == NULL)
-               return;
-
-       rt = ia->ia_route;
-       if (rt == NULL) {
-               struct sockaddr *ifa = ia->ia_ifa.ifa_addr;
-
-               /* Allocate and set up a fake route */
-               if ((rt = rte_alloc()) == NULL)
-                       return;
-
-               bzero(rt, sizeof(*rt));
-               rt->rt_flags = RTF_UP | RTF_STATIC;
-               if (rt_setgate(rt, ifa, ifa) != 0) {
-                       rte_free(rt);
-                       return;
-               }
-               /*
-                * Explicitly zero the key so that:
-                *   rt_tables[rt_key(rt)->sa_family] == rt_tables[0] == NULL
-                */
-               bzero(rt_key(rt), ifa->sa_len);
-
-               rtsetifa(rt, &ia->ia_ifa);
-               rt->rt_ifp = rt->rt_ifa->ifa_ifp;
-
-               /* Take a reference for the ia pointer to this */
-               ia->ia_route = rt;
-               rtref(rt);
-
-               /*
-                * One more rtentry floating around that is not
-                * linked to the routing table.
-                */
-               (void) OSIncrementAtomic((SInt32 *)&rttrash);
-       }
-       rt->generation_id = route_generation;
-       rtref(rt); /* increment the reference count */
-       ro->ro_rt = rt;
-}
-#endif
-
 /*
  *   Transform old in_pcbconnect() into an inner subroutine for new
  *   in_pcbconnect(): Do some validity-checking on the remote
@@ -691,8 +621,11 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
        }
        if (inp->inp_laddr.s_addr == INADDR_ANY) {
                struct route *ro;
+               unsigned int ifscope;
 
                ia = (struct in_ifaddr *)0;
+               ifscope = (inp->inp_flags & INP_BOUND_IF) ?
+                   inp->inp_boundif : IFSCOPE_NONE;
                /*
                 * If route is known or can be allocated now,
                 * our src addr is taken from the i/f, else punt.
@@ -718,14 +651,7 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
                        ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
                        ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
                                sin->sin_addr;
-#if CONFIG_FORCE_OUT_IFP                       
-                       /* If the socket has requested a specific interface, use that address */
-                       if (inp->pdp_ifp != NULL) {
-                               pdp_context_route_locked(inp->pdp_ifp, ro);
-                       }
-                       else 
-#endif /* CONFIG_FORCE_OUT_IFP */
-                               rtalloc_ign_locked(ro, 0UL);
+                       rtalloc_scoped_ign_locked(ro, 0UL, ifscope);
                }
                /*
                 * If we found a route, use the address
@@ -744,7 +670,8 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
                        sin->sin_port = 0;
                        ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin)));
                        if (ia == 0) {
-                               ia = ifatoia(ifa_ifwithnet(sintosa(sin)));
+                               ia = ifatoia(ifa_ifwithnet_scoped(sintosa(sin),
+                                   ifscope));
                        }
                        sin->sin_port = fport;
                        if (ia == 0) {
@@ -963,7 +890,6 @@ in_pcbdispose(struct inpcb *inp)
                so->so_saved_pcb = (caddr_t) inp;
                so->so_pcb = 0; 
                inp->inp_socket = 0;
-               inp->reserved[0] = (u_int32_t)so;
 #if CONFIG_MACF_NET
                mac_inpcb_label_destroy(inp);
 #endif
@@ -1699,7 +1625,6 @@ in_pcb_detach_port(
        in_pcbremlists(inp);
        
        inp->inp_socket = 0;
-       inp->reserved[0] = (u_int32_t) so;
        zfree(pcbinfo->ipi_zone, inp);
        pcbinfo->nat_dummy_socket.so_pcb = (caddr_t)pcbinfo->nat_dummy_pcb; /* restores dummypcb */
 }
diff --git a/bsd/netinet/in_pcb.h b/bsd/netinet/in_pcb.h
index 0186e42a5cfe8900ff0db3d3f6e486e669b71ae2..f3dec2200c751a3469ef6e11f9beffece59d8b91 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -192,17 +192,11 @@ struct inpcb {
 #else
        void      *inpcb_mtx;
 #endif
-       u_int32_t reserved[4];          /* future use (some already used) */
+       unsigned int inp_boundif;       /* interface scope for INP_BOUND_IF */
+       u_int32_t inp_reserved[3];      /* reserved for future use */
 #if CONFIG_MACF_NET
        struct label *inp_label;        /* MAC label */
 #endif
-#if CONFIG_FORCE_OUT_IFP
-#ifdef _KERN_SYS_KERNELTYPES_H_
-       ifnet_t pdp_ifp;
-#else
-       void    *pdp_ifp;
-#endif /* _KERN_SYS_KERNELTYPES_H_ */
-#endif /* CONFIG_EMBEDDED */
 #if CONFIG_IP_EDGEHOLE
        u_int32_t       inpcb_edgehole_flags;
        u_int32_t       inpcb_edgehole_mask;
@@ -448,6 +442,7 @@ struct inpcbinfo {          /* XXX documentation, prefixes */
 
 #define INP_RECVTTL            0x1000
 #define        INP_UDP_NOCKSUM         0x2000  /* Turn off outbound UDP checksum */
+#define        INP_BOUND_IF            0x4000  /* bind socket to an ifindex */
 
 #define IN6P_IPV6_V6ONLY       0x008000 /* restrict AF_INET6 socket for v6 */
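
Throughout the commit the PCB-side test for this flag is a two-step read, as in in_pcbladdr() above: a socket is considered bound only when INP_BOUND_IF is set, in which case inp_boundif holds a valid interface index. Condensed:

    /* Derive an output scope from a PCB (names as introduced above). */
    unsigned int ifscope = (inp->inp_flags & INP_BOUND_IF) ?
        inp->inp_boundif : IFSCOPE_NONE;
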
 
@@ -577,9 +572,6 @@ in_pcb_rem_share_client(struct inpcbinfo *pcbinfo, u_char owner_id);
 void   in_pcbremlists(struct inpcb *inp);
 int    in_pcb_ckeckstate(struct inpcb *, int, int);
 void   inpcb_to_compat(struct inpcb *inp, struct inpcb_compat *inp_compat);
-#if CONFIG_FORCE_OUT_IFP
-void   pdp_context_route_locked(ifnet_t ifp, struct route *ro);
-#endif
 
 #endif /* KERNEL */
 #endif /* KERNEL_PRIVATE */
diff --git a/bsd/netinet/in_rmx.c b/bsd/netinet/in_rmx.c
index 371a6db643559bbeb5405d106f9e30e76fa5f889..419befad8842de490a97882aec2a824571b28798 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000,2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -91,6 +91,9 @@ extern u_long route_generation;
 static void in_rtqtimo(void *rock);
 #endif
 
+static struct radix_node *in_matroute_args(void *, struct radix_node_head *,
+    rn_matchf_t *f, void *);
+
 #define RTPRF_OURS             RTF_PROTO3      /* set on routes we manage */
 
 /*
@@ -154,8 +157,8 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
                 * Find out if it is because of an
                 * ARP entry and delete it if so.
                 */
-               rt2 = rtalloc1_locked((struct sockaddr *)sin, 0,
-                               RTF_CLONING | RTF_PRCLONING);
+               rt2 = rtalloc1_scoped_locked(rt_key(rt), 0,
+                   RTF_CLONING | RTF_PRCLONING, sa_get_ifscope(rt_key(rt)));
                if (rt2) {
                        if (rt2->rt_flags & RTF_LLINFO &&
                                rt2->rt_flags & RTF_HOST &&
@@ -174,24 +177,43 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
        return ret;
 }
 
+/*
+ * Validate (unexpire) an expiring AF_INET route.
+ */
+struct radix_node *
+in_validate(struct radix_node *rn)
+{
+       struct rtentry *rt = (struct rtentry *)rn;
+
+       /* Is this the first reference? */
+       if (rt != NULL && rt->rt_refcnt == 0 && (rt->rt_flags & RTPRF_OURS)) {
+               rt->rt_flags &= ~RTPRF_OURS;
+               rt->rt_rmx.rmx_expire = 0;
+       }
+       return (rn);
+}
+
+/*
+ * Similar to in_matroute_args except without the leaf-matching parameters.
+ */
+static struct radix_node *
+in_matroute(void *v_arg, struct radix_node_head *head)
+{
+       return (in_matroute_args(v_arg, head, NULL, NULL));
+}
+
 /*
  * This code is the inverse of in_clsroute: on first reference, if we
  * were managing the route, stop doing so and set the expiration timer
  * back off again.
  */
 static struct radix_node *
-in_matroute(void *v_arg, struct radix_node_head *head)
+in_matroute_args(void *v_arg, struct radix_node_head *head,
+    rn_matchf_t *f, void *w)
 {
-       struct radix_node *rn = rn_match(v_arg, head);
-       struct rtentry *rt = (struct rtentry *)rn;
+       struct radix_node *rn = rn_match_args(v_arg, head, f, w);
 
-       if(rt && rt->rt_refcnt == 0) { /* this is first reference */
-               if(rt->rt_flags & RTPRF_OURS) {
-                       rt->rt_flags &= ~RTPRF_OURS;
-                       rt->rt_rmx.rmx_expire = 0;
-               }
-       }
-       return rn;
+       return (in_validate(rn));
 }
 
 static int rtq_reallyold = 60*60;
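
The rnh_matchaddr_args hook (registered in in_inithead() below) lets a radix lookup pass a leaf-match predicate down to rn_match_args(), so a search can skip leaves the caller rejects; in_matroute_args() above merely forwards the predicate and then unexpires the result via in_validate(). Assuming the rn_matchf_t type added elsewhere in this commit is a function taking (struct radix_node *, void *) and returning non-zero to accept, a hypothetical predicate might look like:

    /*
     * Hypothetical leaf filter (not part of the commit): accept only
     * route entries whose interface matches a given index.
     */
    static int
    match_ifscope(struct radix_node *rn, void *w)
    {
            struct rtentry *rt = (struct rtentry *)rn;
            unsigned int *ifscope = w;

            return (rt->rt_ifp != NULL &&
                rt->rt_ifp->if_index == *ifscope);
    }
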
@@ -430,6 +452,7 @@ in_inithead(void **head, int off)
        rnh = *head;
        rnh->rnh_addaddr = in_addroute;
        rnh->rnh_matchaddr = in_matroute;
+       rnh->rnh_matchaddr_args = in_matroute_args;
        rnh->rnh_close = in_clsroute;
        in_rtqtimo(rnh);        /* kick off timeout first time */
        return 1;
diff --git a/bsd/netinet/in_var.h b/bsd/netinet/in_var.h
index 35826b52880ae08a8de1b4ecf8dc0ea2c8715cdf..fe1bc48995133fce5c90e4a3fdea399530abcfd7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -94,9 +94,6 @@ struct in_ifaddr {
        struct sockaddr_in      ia_dstaddr;     /* reserve space for broadcast addr */
 #define        ia_broadaddr            ia_dstaddr
        struct sockaddr_in      ia_sockmask;    /* reserve space for general netmask */
-#if CONFIG_FORCE_OUT_IFP
-       struct  rtentry         *ia_route; /* PDP context hack - a faux route we can use */
-#endif
 };
 #endif /* PRIVATE */
 
@@ -307,6 +304,7 @@ void        in_delmulti(struct in_multi **);
 int    in_control(struct socket *, u_long, caddr_t, struct ifnet *,
                        struct proc *);
 void   in_rtqdrain(void);
+extern struct radix_node *in_validate(struct radix_node *);
 void   ip_input(struct mbuf *);
 int    in_ifadown(struct ifaddr *ifa, int);
 void   in_ifscrub(struct ifnet *, struct in_ifaddr *, int);
diff --git a/bsd/netinet/ip_divert.c b/bsd/netinet/ip_divert.c
index ebc0772b0933e4d9911f5468da5b90d79ae5e978..f0708780d44f6431d481f84e8324a7879540b309 100644 (file)
@@ -351,6 +351,8 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr,
 
        /* Reinject packet into the system as incoming or outgoing */
        if (!sin || sin->sin_addr.s_addr == 0) {
+               struct ip_out_args ipoa = { IFSCOPE_NONE };
+
                /*
                 * Don't allow both user specified and setsockopt options,
                 * and don't allow packet length sizes that will crash
@@ -377,8 +379,8 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr,
                error = ip_output(m,
                            inp->inp_options, &inp->inp_route,
                        (so->so_options & SO_DONTROUTE) |
-                       IP_ALLOWBROADCAST | IP_RAWOUTPUT,
-                       inp->inp_moptions, NULL);
+                       IP_ALLOWBROADCAST | IP_RAWOUTPUT | IP_OUTARGS,
+                       inp->inp_moptions, &ipoa);
                socket_lock(so, 0);
        } else {
                struct  ifaddr *ifa;
diff --git a/bsd/netinet/ip_dummynet.c b/bsd/netinet/ip_dummynet.c
index 426b22d5db689c904fa26c1a9ac700a195a5686f..b146d94d62f16029a2415eb041ef1746b93f0446 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -479,6 +479,7 @@ transmit_event(struct dn_pipe *pipe)
                                (void)ip_output(m, NULL, NULL, pkt->flags, NULL, NULL);
                                if (tmp_rt.ro_rt) {
                                        rtfree(tmp_rt.ro_rt);
+                                       tmp_rt.ro_rt = NULL;
                                }
                                break ;
                        }
@@ -1254,6 +1255,8 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
        
        pkt->dn_dst = fwa->dst;
        pkt->flags = fwa->flags;
+       if (fwa->ipoa != NULL)
+               pkt->ipoa = *(fwa->ipoa);
        }
     if (q->head == NULL)
        q->head = m;
@@ -1362,8 +1365,10 @@ dropit:
        struct m_tag *tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL); \
        if (tag) {                                      \
                struct dn_pkt_tag *n = (struct dn_pkt_tag *)(tag+1);    \
-               if (n->ro.ro_rt)                                \
+               if (n->ro.ro_rt) {                              \
                        rtfree(n->ro.ro_rt);    \
+                       n->ro.ro_rt = NULL;     \
+               }                               \
        }                                                                       \
        m_tag_delete(_m, tag);                  \
        m_freem(_m);                                    \
diff --git a/bsd/netinet/ip_dummynet.h b/bsd/netinet/ip_dummynet.h
index 312b7e266d4fa49e0db07972cb52db4c26b6ef58..1994be1ba5984779cbff125512c0669596b339e1 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -148,6 +148,8 @@ struct dn_heap {
  * processing requirements.
  */
 #ifdef KERNEL
+#include <netinet/ip_var.h>    /* for ip_out_args */
+
 struct dn_pkt_tag {
     struct ip_fw *rule;                /* matching rule */
     int dn_dir;                        /* action when packet comes out. */
@@ -160,6 +162,7 @@ struct dn_pkt_tag {
     struct sockaddr_in *dn_dst ;
     struct route ro;           /* route, for ip_output. MUST COPY      */
     int flags ;                        /* flags, for ip_output (IPv6 ?)        */
+    struct ip_out_args ipoa;   /* output args, for ip_output. MUST COPY */
 };
 #else
 struct dn_pkt;
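
The MUST COPY annotations exist because the route and output args handed to ip_output() usually live in the caller's stack frame, which has unwound by the time dummynet re-injects the delayed packet; the tag therefore embeds full copies rather than pointers. Condensed from the dummynet_io() hunk above and the ip_output_list() dequeue hunk later in this commit:

    /* Enqueue (dummynet_io): copy, don't alias, the caller's args. */
    if (fwa->ipoa != NULL)
            pkt->ipoa = *(fwa->ipoa);  /* struct copy into the mbuf tag */

    /* Dequeue (ip_output_list): re-point ipoa at the tag's copy. */
    saved_ipoa = dn_tag->ipoa;
    ipoa = &saved_ipoa;
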
diff --git a/bsd/netinet/ip_flow.c b/bsd/netinet/ip_flow.c
index d6c1f128fa16462976d1cb95535b8ce0c1971b06..4fb3f8596b3a9caf6ef176203a0adaa98c6d420d 100644 (file)
@@ -279,6 +279,7 @@ ipflow_reap(
        LIST_REMOVE(ipf, ipf_next);
        ipflow_addstats(ipf);
        rtfree(ipf->ipf_ro.ro_rt);
+       ipf->ipf_ro.ro_rt = NULL;
        return ipf;
 }
 /* note: called under the ip_mutex lock */
@@ -344,6 +345,7 @@ ipflow_create(
                LIST_REMOVE(ipf, ipf_next);
                ipflow_addstats(ipf);
                rtfree(ipf->ipf_ro.ro_rt);
+               ipf->ipf_ro.ro_rt = NULL;
                ipf->ipf_uses = ipf->ipf_last_uses = 0;
                ipf->ipf_errors = ipf->ipf_dropped = 0;
        }
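
A recurring one-line fix in this commit: after rtfree()/RTFREE(), the cached pointer is cleared. A struct route is typically embedded in a PCB or flow entry and reused across transmits, so a stale ro_rt would otherwise be dereferenced, or freed a second time, on the next send. The invariant being enforced:

    if (ro->ro_rt != NULL) {
            rtfree(ro->ro_rt);
            ro->ro_rt = NULL;   /* never leave a freed route reachable */
    }
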
diff --git a/bsd/netinet/ip_fw2.c b/bsd/netinet/ip_fw2.c
index ea482f0c6fdb005aa7a49b7b629e8835849b980d..400e032b551fb4369df46999ef90a16ec88af13f 100644 (file)
@@ -1309,8 +1309,10 @@ send_pkt(struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags)
        ip_rtaddr(ip->ip_dst, &sro);
        m->m_flags |= M_SKIP_FIREWALL;
        ip_output_list(m, 0, NULL, &sro, 0, NULL, NULL);
-       if (sro.ro_rt)
+       if (sro.ro_rt) {
                RTFREE(sro.ro_rt);
+               sro.ro_rt = NULL;
+       }
 }
 
 /*
diff --git a/bsd/netinet/ip_fw2.h b/bsd/netinet/ip_fw2.h
index 1e36b65a9f70a4006a7527641036faecd9fe5015..24ef2abe62b0ae15e97a29ebdbbd17ade2a52950 100644 (file)
@@ -453,6 +453,7 @@ struct ip_fw_args {
        struct route    *ro;            /* for dummynet                 */
        struct sockaddr_in *dst;        /* for dummynet                 */
        int flags;                      /* for dummynet                 */
+       struct ip_out_args *ipoa;       /* for dummynet                 */
 
        struct ipfw_flow_id f_id;       /* grabbed from IP header       */
        u_int16_t       divert_rule;    /* divert cookie                */
diff --git a/bsd/netinet/ip_icmp.c b/bsd/netinet/ip_icmp.c
index 3ed8a2d459e42e5c2993095ec8694bc4f242e3ef..995ca8346c96b2f22edda5d23aa0c14714451bfb 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -641,10 +641,9 @@ reflect:
                }
 #endif
                icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
-               rtredirect((struct sockaddr *)&icmpsrc,
-                 (struct sockaddr *)&icmpdst,
-                 (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST,
-                 (struct sockaddr *)&icmpgw, (struct rtentry **)0);
+               rtredirect(m->m_pkthdr.rcvif, (struct sockaddr *)&icmpsrc,
+                 (struct sockaddr *)&icmpdst, NULL, RTF_GATEWAY | RTF_HOST,
+                 (struct sockaddr *)&icmpgw, NULL);
                pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc);
 #if IPSEC
                key_sa_routechange((struct sockaddr *)&icmpsrc);
@@ -826,6 +825,10 @@ icmp_send(struct mbuf *m, struct mbuf *opts)
        int hlen;
        struct icmp *icp;
        struct route ro;
+       struct ip_out_args ipoa = { IFSCOPE_NONE };
+
+       if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL)
+               ipoa.ipoa_ifscope = m->m_pkthdr.rcvif->if_index;
 
        hlen = IP_VHL_HL(ip->ip_vhl) << 2;
        m->m_data += hlen;
@@ -849,9 +852,11 @@ icmp_send(struct mbuf *m, struct mbuf *opts)
        }
 #endif
        bzero(&ro, sizeof ro);
-       (void) ip_output(m, opts, &ro, 0, NULL, NULL);
-       if (ro.ro_rt)
+       (void) ip_output(m, opts, &ro, IP_OUTARGS, NULL, &ipoa);
+       if (ro.ro_rt) {
                rtfree(ro.ro_rt);
+               ro.ro_rt = NULL;
+       }
 }
 
 n_time
@@ -1075,6 +1080,10 @@ icmp_dgram_ctloutput(struct socket *so, struct sockopt *sopt)
 #endif
                case IP_STRIPHDR:
                case IP_RECVTTL:
+               case IP_BOUND_IF:
+#if CONFIG_FORCE_OUT_IFP
+               case IP_FORCE_OUT_IFP:
+#endif
                        error = rip_ctloutput(so, sopt);
                        break;
                
diff --git a/bsd/netinet/ip_input.c b/bsd/netinet/ip_input.c
index 8743d9178d9f31d484c35d3ae619d1c513c5c406..7c603ad9fac54cb97e0244e88e737692642f1551 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -189,6 +189,14 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
 
 static int    currentfrags = 0;
 
+#if CONFIG_SCOPEDROUTING
+int    ip_doscopedroute = 1;
+#else
+int    ip_doscopedroute = 0;
+#endif
+SYSCTL_INT(_net_inet_ip, OID_AUTO, scopedroute, CTLFLAG_RW,
+     &ip_doscopedroute, 0, "Enable IPv4 scoped routing");
+
 /*
  * XXX - Setting ip_checkinterface mostly implements the receive side of
  * the Strong ES model described in RFC 1122, but since the routing table
@@ -586,12 +594,14 @@ ip_input(struct mbuf *m)
                panic("ip_input no HDR");
 #endif
 
+#if DUMMYNET
        if (args.rule) {        /* dummynet already filtered us */
             ip = mtod(m, struct ip *);
             hlen = IP_VHL_HL(ip->ip_vhl) << 2;
             inject_filter_ref = ipf_get_inject_filter(m);
             goto iphack ;
        }
+#endif /* DUMMYNET */
 #endif /* IPFIREWALL */
        
        /*
@@ -2080,13 +2090,10 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop, struct route
        n_long dest;
        struct in_addr pkt_dst;
        struct ifnet *destifp;
-       struct ifnet *rcvif = m->m_pkthdr.rcvif;
 #if IPSEC
        struct ifnet dummyifp;
 #endif
 
-       m->m_pkthdr.rcvif = NULL;
-
        dest = 0;
        /*
         * Cache the destination address of the packet; this may be
diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c
index db39fe174d3ca7277baad8bd55a0b98a89bb8c62..047b6b7ceb3dc00b02fb169ba7ebe0065985931a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -81,6 +81,7 @@
 #include <sys/sysctl.h>
 
 #include <net/if.h>
+#include <net/if_dl.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
@@ -145,6 +146,8 @@ static int  ip_pcbopts(int, struct mbuf **, struct mbuf *);
 static int     ip_setmoptions(struct sockopt *, struct ip_moptions **);
 
 static void ip_out_cksum_stats(int, u_int32_t);
+static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int);
+static void ip_bindif(struct inpcb *, unsigned int);
 
 int ip_createmoptions(struct ip_moptions **imop);
 int ip_addmembership(struct ip_moptions *imo, struct ip_mreq *mreq);
@@ -175,6 +178,11 @@ static int forge_ce = 0;
 SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW,
     &forge_ce, 0, "Forge ECN CE");
 #endif /* DEBUG */
+
+static int ip_select_srcif_debug = 0;
+SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW,
+    &ip_select_srcif_debug, 0, "log source interface selection debug info");
+
 /*
  * IP output.  The packet in mbuf chain m contains a skeletal IP
  * header (with len, off, ttl, proto, tos, src, dst).
@@ -188,10 +196,10 @@ ip_output(
        struct route *ro,
        int flags,
        struct ip_moptions *imo,
-       struct ifnet *ifp)
+       struct ip_out_args *ipoa)
 {
        int error;
-       error = ip_output_list(m0, 0, opt, ro, flags, imo, ifp);
+       error = ip_output_list(m0, 0, opt, ro, flags, imo, ipoa);
        return error;
 }
 
@@ -225,11 +233,7 @@ ip_output_list(
        struct route *ro,
        int flags,
        struct ip_moptions *imo,
-#if CONFIG_FORCE_OUT_IFP
-       struct ifnet *pdp_ifp
-#else
-       __unused struct ifnet *unused_ifp
-#endif
+       struct ip_out_args *ipoa
        )
 {
        struct ip *ip, *mhip;
@@ -256,9 +260,11 @@ ip_output_list(
        ipfilter_t inject_filter_ref = 0;
        struct m_tag    *tag;
        struct route    saved_route;
+       struct ip_out_args saved_ipoa;
        struct mbuf * packetlist;
        int pktcnt = 0;
-       
+       unsigned int ifscope;
+       boolean_t select_srcif;
 
        KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
 
@@ -268,6 +274,7 @@ ip_output_list(
        args.eh = NULL;
        args.rule = NULL;
        args.divert_rule = 0;                   /* divert cookie */
+       args.ipoa = NULL;
        
        /* Grab info from mtags prepended to the chain */
 #if DUMMYNET
@@ -284,6 +291,8 @@ ip_output_list(
                dst = dn_tag->dn_dst;
                ifp = dn_tag->ifp;
                flags = dn_tag->flags;
+               saved_ipoa = dn_tag->ipoa;
+               ipoa = &saved_ipoa;
                
                m_tag_delete(m0, tag);
        }
@@ -320,6 +329,20 @@ ip_output_list(
                      mtod(m, struct ip *)->ip_p);
 #endif
 
+       /*
+        * Do not perform source interface selection when forwarding.
+        * At present the IP_OUTARGS flag implies a request for IP to
+        * perform source interface selection.
+        */
+       if (ip_doscopedroute &&
+           (flags & (IP_OUTARGS | IP_FORWARDING)) == IP_OUTARGS) {
+               select_srcif = TRUE;
+               ifscope = ipoa->ipoa_ifscope;
+       } else {
+               select_srcif = FALSE;
+               ifscope = IFSCOPE_NONE;
+       }
+
 #if IPFIREWALL
        if (args.rule != NULL) {        /* dummynet already saw us */
             ip = mtod(m, struct ip *);
@@ -419,7 +442,13 @@ loopit:
                        rtfree_locked(ro->ro_rt);
                        ro->ro_rt = NULL;
                }
-               if (ro->ro_rt && ro->ro_rt->generation_id != route_generation)
+               /*
+                * If we're doing source interface selection, we may not
+                * want to use this route; only synch up the generation
+                * count otherwise.
+                */
+               if (!select_srcif && ro->ro_rt != NULL &&
+                   ro->ro_rt->generation_id != route_generation)
                        ro->ro_rt->generation_id = route_generation;
        }
        if (ro->ro_rt == NULL) {
@@ -448,22 +477,81 @@ loopit:
                ifp = ia->ia_ifp;
                ip->ip_ttl = 1;
                isbroadcast = in_broadcast(dst->sin_addr, ifp);
+       } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) &&
+           imo != NULL && imo->imo_multicast_ifp != NULL) {
+               /*
+                * Bypass the normal routing lookup for multicast
+                * packets if the interface is specified.
+                */
+               ifp = imo->imo_multicast_ifp;
+               isbroadcast = 0;
+               if (ia != NULL)
+                       ifafree(&ia->ia_ifa);
+
+               /* Could use IFP_TO_IA instead but rt_mtx is already held */
+               for (ia = TAILQ_FIRST(&in_ifaddrhead);
+                   ia != NULL && ia->ia_ifp != ifp;
+                   ia = TAILQ_NEXT(ia, ia_link))
+                       continue;
+
+               if (ia != NULL)
+                       ifaref(&ia->ia_ifa);
        } else {
+               boolean_t cloneok = FALSE;
+               /*
+                * Perform source interface selection; the source IP address
+                * must belong to one of the addresses of the interface used
+                * by the route.  For performance reasons, do this only if
+                * there is no route, or if the routing table has changed,
+                * or if we haven't done source interface selection on this
+                * route (for this PCB instance) before.
+                */
+               if (select_srcif && ip->ip_src.s_addr != INADDR_ANY &&
+                   (ro->ro_rt == NULL ||
+                   ro->ro_rt->generation_id != route_generation ||
+                   !(ro->ro_flags & ROF_SRCIF_SELECTED))) {
+                       struct ifaddr *ifa;
 
-#if CONFIG_FORCE_OUT_IFP
-               /* Check if this packet should be forced out a specific interface */
-               if (ro->ro_rt == 0 && pdp_ifp != NULL) {
-                       pdp_context_route_locked(pdp_ifp, ro);
-                       
-                       if (ro->ro_rt == NULL) {
-                               OSAddAtomic(1, (UInt32*)&ipstat.ips_noroute);
-                               error = EHOSTUNREACH;
+                       /* Find the source interface */
+                       ifa = in_selectsrcif(ip, ro, ifscope);
+
+                       /*
+                        * If the source address is spoofed (in the case
+                        * of IP_RAWOUTPUT), or if this is destined for
+                        * local/loopback, just let it go out using the
+                        * interface of the route.  Otherwise, there's no
+                        * interface having such an address, so bail out.
+                        */
+                       if (ifa == NULL && !(flags & IP_RAWOUTPUT) &&
+                           ifscope != lo_ifp->if_index) {
+                               error = EADDRNOTAVAIL;
                                lck_mtx_unlock(rt_mtx);
                                goto bad;
                        }
+
+                       /*
+                        * If the caller didn't explicitly specify the scope,
+                        * pick it up from the source interface.  If the cached
+                        * route was wrong and was blown away as part of source
+                        * interface selection, don't mask out RTF_PRCLONING
+                        * since that route may have been allocated by the ULP,
+                        * unless the IP header was created by the caller or
+                        * the destination is IPv4 LLA.  The check for the
+                        * latter is needed because IPv4 LLAs are never scoped
+                        * in the current implementation, and we don't want to
+                        * replace the resolved IPv4 LLA route with one whose
+                        * gateway points to that of the default gateway on
+                        * the primary interface of the system.
+                        */
+                       if (ifa != NULL) {
+                               if (ifscope == IFSCOPE_NONE)
+                                       ifscope = ifa->ifa_ifp->if_index;
+                               ifafree(ifa);
+                               cloneok = (!(flags & IP_RAWOUTPUT) &&
+                                   !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))));
+                       }
                }
-#endif
-               
+
                /*
                 * If this is the case, we probably don't want to allocate
                 * a protocol-cloned route since we didn't get one from the
@@ -473,8 +561,7 @@ loopit:
                 * the link layer, as this is probably required in all cases
                 * for correct operation (as it is for ARP).
                 */
-               
-               if (ro->ro_rt == 0) {
+               if (ro->ro_rt == NULL) {
                        unsigned long ign = RTF_PRCLONING;
                        /*
                         * We make an exception here: if the destination
@@ -487,23 +574,26 @@ loopit:
                         * that allocate a route and those that don't.  The
                         * RTF_BROADCAST route is important since we'd want
                         * to send out undirected IP broadcast packets using
-                        * link-level broadcast address.
+                        * link-level broadcast address. Another exception
+                        * is for ULP-created routes that got blown away by
+                        * source interface selection (see above).
                         *
-                        * This exception will no longer be necessary when
+                        * These exceptions will no longer be necessary when
                         * the RTF_PRCLONING scheme is no longer present.
                         */
-                       if (dst->sin_addr.s_addr == INADDR_BROADCAST)
+                       if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST)
                                ign &= ~RTF_PRCLONING;
 
-                       rtalloc_ign_locked(ro, ign);
+                       rtalloc_scoped_ign_locked(ro, ign, ifscope);
                }
-               if (ro->ro_rt == 0) {
+
+               if (ro->ro_rt == NULL) {
                        OSAddAtomic(1, (SInt32*)&ipstat.ips_noroute);
                        error = EHOSTUNREACH;
                        lck_mtx_unlock(rt_mtx);
                        goto bad;
                }
-               
+
                if (ia)
                        ifafree(&ia->ia_ifa);
                ia = ifatoia(ro->ro_rt->rt_ifa);
@@ -1025,22 +1115,24 @@ skip_ipsec:
                }
 #if DUMMYNET
                 if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) {
-                    /*
-                     * pass the pkt to dummynet. Need to include
-                     * pipe number, m, ifp, ro, dst because these are
-                     * not recomputed in the next pass.
-                     * All other parameters have been already used and
-                     * so they are not needed anymore. 
-                     * XXX note: if the ifp or ro entry are deleted
-                     * while a pkt is in dummynet, we are in trouble!
-                     */ 
-                   args.ro = ro;
-                   args.dst = dst;
-                   args.flags = flags;
-
-                   error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT,
-                               &args);
-                   goto done;
+                       /*
+                        * pass the pkt to dummynet. Need to include
+                        * pipe number, m, ifp, ro, dst because these are
+                        * not recomputed in the next pass.
+                        * All other parameters have been already used and
+                        * so they are not needed anymore.
+                        * XXX note: if the ifp or ro entry are deleted
+                        * while a pkt is in dummynet, we are in trouble!
+                        */
+                       args.ro = ro;
+                       args.dst = dst;
+                       args.flags = flags;
+                       if (flags & IP_OUTARGS)
+                               args.ipoa = ipoa;
+
+                       error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT,
+                           &args);
+                       goto done;
                }
 #endif /* DUMMYNET */
 #if IPDIVERT
@@ -1941,58 +2033,66 @@ ip_ctloutput(so, sopt)
                        break;
 #undef OPTSET
 
-#if CONFIG_FORCE_OUT_IFP               
+#if CONFIG_FORCE_OUT_IFP
+               /*
+                * Apple private interface, similar to IP_BOUND_IF, except
+                * that the parameter is a NULL-terminated string containing
+                * the name of the network interface; an empty string means
+                * unbind.  Applications are encouraged to use IP_BOUND_IF
+                * instead, as that is the current "official" API.
+                */
                case IP_FORCE_OUT_IFP: {
-                       char    ifname[IFNAMSIZ];
-                       ifnet_t ifp;
-                       
+                       char ifname[IFNAMSIZ];
+                       unsigned int ifscope;
+
+                       /* This option is settable only for IPv4 */
+                       if (!(inp->inp_vflag & INP_IPV4)) {
+                               error = EINVAL;
+                               break;
+                       }
+
                        /* Verify interface name parameter is sane */
                        if (sopt->sopt_valsize > sizeof(ifname)) {
                                error = EINVAL;
                                break;
                        }
-                       
+
                        /* Copy the interface name */
                        if (sopt->sopt_valsize != 0) {
-                               error = sooptcopyin(sopt, ifname, sizeof(ifname), sopt->sopt_valsize);
+                               error = sooptcopyin(sopt, ifname,
+                                   sizeof (ifname), sopt->sopt_valsize);
                                if (error)
                                        break;
                        }
-                       
-                       if (sopt->sopt_valsize == 0 || ifname[0] == 0) {
-                               // Set pdp_ifp to NULL
-                               inp->pdp_ifp = NULL;
-                               
-                               // Flush the route
-                               if (inp->inp_route.ro_rt) {
-                                       rtfree(inp->inp_route.ro_rt);
-                                       inp->inp_route.ro_rt = NULL;
+
+                       if (sopt->sopt_valsize == 0 || ifname[0] == '\0') {
+                               /* Unbind this socket from any interface */
+                               ifscope = IFSCOPE_NONE;
+                       } else {
+                               ifnet_t ifp;
+
+                               /* Verify the name is NUL-terminated */
+                               if (ifname[sopt->sopt_valsize - 1] != '\0') {
+                                       error = EINVAL;
+                                       break;
                                }
-                               
-                               break;
-                       }
-                       
-                       /* Verify name is NULL terminated */
-                       if (ifname[sopt->sopt_valsize - 1] != 0) {
-                               error = EINVAL;
-                               break;
-                       }
-                       
-                       if (ifnet_find_by_name(ifname, &ifp) != 0) {
-                               error = ENXIO;
-                               break;
-                       }
-                       
-                       /* Won't actually free. Since we don't release this later, we should do it now. */
-                       ifnet_release(ifp);
-                       
-                       /* This only works for point-to-point interfaces */
-                       if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
-                               error = ENOTSUP;
-                               break;
+
+                               /* Bail out if given bogus interface name */
+                               if (ifnet_find_by_name(ifname, &ifp) != 0) {
+                                       error = ENXIO;
+                                       break;
+                               }
+
+                               /* Bind this socket to this interface */
+                               ifscope = ifp->if_index;
+
+                               /*
+                                * Won't actually free; since we don't release
+                                * this later, we should do it now.
+                                */
+                               ifnet_release(ifp);
                        }
-                       
-                       inp->pdp_ifp = ifp;
+                       ip_bindif(inp, ifscope);
                }
                break;
 #endif
@@ -2080,6 +2180,40 @@ ip_ctloutput(so, sopt)
                }
 #endif /* TRAFFIC_MGT */
 
+               /*
+                * On a multihomed system, scoped routing can be used to
+                * restrict the source interface used for sending packets.
+                * The socket option IP_BOUND_IF binds a particular AF_INET
+                * socket to an interface such that data sent on the socket
+                * is restricted to that interface.  This is unlike the
+                * SO_DONTROUTE option where the routing table is bypassed;
+                * therefore it allows for greater flexibility and control
+                * over the system behavior, and does not place any restriction
+                * on the destination address type (e.g. unicast, multicast,
+                * or broadcast if applicable) or whether or not the host is
+                * directly reachable.  Note that in the multicast transmit
+                * case, IP_MULTICAST_IF takes precedence over IP_BOUND_IF,
+                * since the former practically bypasses the routing table;
+                * in this case, IP_BOUND_IF sets the default interface used
+                * for sending multicast packets in the absence of an explicit
+                * transmit interface set via IP_MULTICAST_IF.
+                */
+               case IP_BOUND_IF:
+                       /* This option is settable only for IPv4 */
+                       if (!(inp->inp_vflag & INP_IPV4)) {
+                               error = EINVAL;
+                               break;
+                       }
+
+                       error = sooptcopyin(sopt, &optval, sizeof (optval),
+                           sizeof (optval));
+
+                       if (error)
+                               break;
+
+                       ip_bindif(inp, optval);
+                       break;
+
                default:
                        error = ENOPROTOOPT;
                        break;
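
From an application, IP_BOUND_IF takes a plain int interface index at level IPPROTO_IP, with 0 meaning unbind. A hypothetical userland sketch, assuming the program can see the IP_BOUND_IF definition (a private option in this release) and that the named interface exists:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <net/if.h>
    #include <stdio.h>

    int
    bound_udp_socket(const char *ifname)
    {
            int s, idx;

            if ((s = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
                    return (-1);
            if ((idx = if_nametoindex(ifname)) == 0 ||
                setsockopt(s, IPPROTO_IP, IP_BOUND_IF, &idx,
                sizeof (idx)) == -1) {
                    perror("IP_BOUND_IF");
                    /* socket remains usable, just unbound */
            }
            return (s);
    }
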
@@ -2198,6 +2332,12 @@ ip_ctloutput(so, sopt)
                }
 #endif /* TRAFFIC_MGT */
 
+               case IP_BOUND_IF:
+                       if (inp->inp_flags & INP_BOUND_IF)
+                               optval = inp->inp_boundif;
+                       error = sooptcopyout(sopt, &optval, sizeof (optval));
+                       break;
+
                default:
                        error = ENOPROTOOPT;
                        break;
@@ -2870,3 +3010,189 @@ ip_mloopback(ifp, m, dst, hlen)
                m_freem(copym);
        }
 }
+
+/*
+ * Given a source IP address (and route, if available), determine the best
+ * interface to send the packet from.
+ */
+static struct ifaddr *
+in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope)
+{
+       struct ifaddr *ifa = NULL;
+       struct sockaddr src = { sizeof (struct sockaddr_in), AF_INET, { 0, } };
+       struct ifnet *rt_ifp;
+       char ip_src[16], ip_dst[16];
+
+       if (ip_select_srcif_debug) {
+               (void) inet_ntop(AF_INET, &ip->ip_src.s_addr, ip_src,
+                   sizeof (ip_src));
+               (void) inet_ntop(AF_INET, &ip->ip_dst.s_addr, ip_dst,
+                   sizeof (ip_dst));
+       }
+
+       lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);
+
+       ((struct sockaddr_in *)&src)->sin_addr.s_addr = ip->ip_src.s_addr;
+       rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL;
+
+       /*
+        * Given the source IP address, find a suitable source interface
+        * to use for transmission; if the caller has specified a scope,
+        * optimize the search by looking at the addresses only for that
+        * interface.  This is still suboptimal, however, as we need to
+        * traverse the per-interface list.
+        */
+       if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) {
+               unsigned int scope = ifscope;
+
+               /*
+                * If no scope is specified and the route is stale (pointing
+                * to a defunct interface) use the current primary interface;
+                * this happens when switching between interfaces configured
+                * with the same IP address.  Otherwise pick up the scope
+                * information from the route; the ULP may have looked up a
+                * correct route and we just need to verify it here and mark
+                * it with the ROF_SRCIF_SELECTED flag below.
+                */
+               if (scope == IFSCOPE_NONE) {
+                       scope = rt_ifp->if_index;
+                       if (scope != get_primary_ifscope() &&
+                           ro->ro_rt->generation_id != route_generation)
+                               scope = get_primary_ifscope();
+               }
+
+               ifa = ifa_ifwithaddr_scoped(&src, scope);
+
+               if (ip_select_srcif_debug && ifa != NULL) {
+                       if (ro->ro_rt != NULL) {
+                               printf("%s->%s ifscope %d->%d ifa_if %s%d "
+                                   "ro_if %s%d\n", ip_src, ip_dst, ifscope,
+                                   scope, ifa->ifa_ifp->if_name,
+                                   ifa->ifa_ifp->if_unit, rt_ifp->if_name,
+                                   rt_ifp->if_unit);
+                       } else {
+                               printf("%s->%s ifscope %d->%d ifa_if %s%d\n",
+                                   ip_src, ip_dst, ifscope, scope,
+                                   ifa->ifa_ifp->if_name,
+                                   ifa->ifa_ifp->if_unit);
+                       }
+               }
+       }
+
+       /*
+        * Slow path; search for an interface having the corresponding source
+        * IP address if the scope was not specified by the caller, and:
+        *
+        *   1) There currently isn't any route, or,
+        *   2) The interface used by the route does not own that source
+        *      IP address; in this case, the route will get blown away
+        *      and we'll do a more specific scoped search using the newly
+        *      found interface.
+        */
+       if (ifa == NULL && ifscope == IFSCOPE_NONE) {
+               ifa = ifa_ifwithaddr(&src);
+
+               if (ip_select_srcif_debug && ifa != NULL) {
+                       printf("%s->%s ifscope %d ifa_if %s%d\n",
+                           ip_src, ip_dst, ifscope, ifa->ifa_ifp->if_name,
+                           ifa->ifa_ifp->if_unit);
+               }
+       }
+
+       /*
+        * If there is a non-loopback route with the wrong interface, or if
+        * there is no interface configured with such an address, blow it
+        * away.  Except for local/loopback, we look for one with a matching
+        * interface scope/index.
+        */
+       if (ro->ro_rt != NULL &&
+           (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) ||
+           !(ro->ro_rt->rt_flags & RTF_UP))) {
+               if (ip_select_srcif_debug) {
+                       if (ifa != NULL) {
+                               printf("%s->%s ifscope %d ro_if %s%d != "
+                                   "ifa_if %s%d (cached route cleared)\n",
+                                   ip_src, ip_dst, ifscope, rt_ifp->if_name,
+                                   rt_ifp->if_unit, ifa->ifa_ifp->if_name,
+                                   ifa->ifa_ifp->if_unit);
+                       } else {
+                               printf("%s->%s ifscope %d ro_if %s%d "
+                                   "(no ifa_if found)\n",
+                                   ip_src, ip_dst, ifscope, rt_ifp->if_name,
+                                   rt_ifp->if_unit);
+                       }
+               }
+
+               rtfree_locked(ro->ro_rt);
+               ro->ro_rt = NULL;
+               ro->ro_flags &= ~ROF_SRCIF_SELECTED;
+
+               /*
+                * If the destination is IPv4 LLA and the route's interface
+                * doesn't match the source interface, then the source IP
+                * address is wrong; it most likely belongs to the primary
+                * interface associated with the IPv4 LL subnet.  Drop the
+                * packet rather than letting it go out and return an error
+                * to the ULP.  This actually applies not only to IPv4 LL
+                * but other shared subnets; for now we explicitly test only
+                * for the former case and save the latter for the future.
+                */
+               if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) &&
+                   !IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) && ifa != NULL) {
+                       ifafree(ifa);
+                       ifa = NULL;
+               }
+       }
+
+       if (ip_select_srcif_debug && ifa == NULL) {
+               printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n",
+                   ip_src, ip_dst, ifscope);
+       }
+
+       /*
+        * If there is a route, mark it accordingly.  If there isn't one,
+        * we'll get here again during the next transmit (possibly with a
+        * route) and the flag will get set at that point.  For IPv4 LLA
+        * destination, mark it only if the route has been fully resolved;
+        * otherwise we want to come back here again when the route points
+        * to the interface on which the ARP reply arrives.
+        */
+       if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
+           (ro->ro_rt->rt_gateway->sa_family == AF_LINK &&
+           SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) {
+               ro->ro_flags |= ROF_SRCIF_SELECTED;
+               ro->ro_rt->generation_id = route_generation;
+       }
+
+       return (ifa);
+}
+
+/*
+ * Handler for setting IP_FORCE_OUT_IFP or IP_BOUND_IF socket option.
+ */
+static void
+ip_bindif(struct inpcb *inp, unsigned int ifscope)
+{
+       /*
+        * A zero interface scope value indicates an "unbind".
+        * Otherwise, take in whatever value the app desires;
+        * the app may already know the scope (or force itself
+        * to such a scope) ahead of time before the interface
+        * gets attached.  It doesn't matter either way; any
+        * route lookup from this point on will require an
+        * exact match for the embedded interface scope.
+        */
+       inp->inp_boundif = ifscope;
+       if (inp->inp_boundif == IFSCOPE_NONE)
+               inp->inp_flags &= ~INP_BOUND_IF;
+       else
+               inp->inp_flags |= INP_BOUND_IF;
+
+       lck_mtx_lock(rt_mtx);
+       /* Blow away any cached route in the PCB */
+       if (inp->inp_route.ro_rt != NULL) {
+               rtfree_locked(inp->inp_route.ro_rt);
+               inp->inp_route.ro_rt = NULL;
+       }
+       lck_mtx_unlock(rt_mtx);
+}
diff --git a/bsd/netinet/ip_var.h b/bsd/netinet/ip_var.h
index 81eb6f135bacacb08edc7db4203964009a6262c2..0861cf587c246c00ee418a0aa1c218cf848646b4 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -195,12 +195,20 @@ struct ip_linklocal_stat {
 #define        IP_NOIPSEC              0x4             /* No IPSec processing */
 #define        IP_ROUTETOIF            SO_DONTROUTE    /* bypass routing tables (0x0010) */
 #define        IP_ALLOWBROADCAST       SO_BROADCAST    /* can send broadcast packets (0x0020) */
+#define        IP_OUTARGS              0x100           /* has ancillary output info */
 
 struct ip;
 struct inpcb;
 struct route;
 struct sockopt;
 
+/*
+ * Extra information passed to ip_output when IP_OUTARGS is set.
+ */
+struct ip_out_args {
+       unsigned int    ipoa_ifscope;   /* interface scope */
+};
+
 extern struct  ipstat  ipstat;
 #if !defined(RANDOM_IP_ID) || RANDOM_IP_ID == 0
 extern u_short ip_id;                          /* ip packet ctr, for ids */
@@ -214,6 +222,7 @@ extern int  (*legal_vif_num)(int);
 extern u_long  (*ip_mcast_src)(int);
 extern int rsvp_on;
 extern struct  pr_usrreqs rip_usrreqs;
+extern int     ip_doscopedroute;
 
 int     ip_ctloutput(struct socket *, struct sockopt *sopt);
 void    ip_drain(void);
@@ -221,10 +230,10 @@ void       ip_freemoptions(struct ip_moptions *);
 void    ip_init(void) __attribute__((section("__TEXT, initcode")));
 extern int      (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
                          struct ip_moptions *);
-int     ip_output(struct mbuf *,
-           struct mbuf *, struct route *, int, struct ip_moptions *, struct ifnet *);
-int     ip_output_list(struct mbuf *, int,
-           struct mbuf *, struct route *, int, struct ip_moptions *, struct ifnet *);
+extern int ip_output(struct mbuf *, struct mbuf *, struct route *, int,
+    struct ip_moptions *, struct ip_out_args *);
+extern int ip_output_list(struct mbuf *, int, struct mbuf *, struct route *,
+    int, struct ip_moptions *, struct ip_out_args *);
 struct in_ifaddr *
         ip_rtaddr(struct in_addr, struct route *);
 void    ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,
diff --git a/bsd/netinet/kpi_ipfilter.c b/bsd/netinet/kpi_ipfilter.c
index 3ade07635501b5bf388739dcf4e8b7bb667a548a..11e005de8ac986425b64fc9be32fde8c6d1358ec 100644 (file)
@@ -316,8 +316,10 @@ ipf_injectv4_out(
        error = ip_output(m, NULL, &ro, IP_ALLOWBROADCAST | IP_RAWOUTPUT, imo, NULL);
        
        /* Release the route */
-       if (ro.ro_rt)
+       if (ro.ro_rt) {
                rtfree(ro.ro_rt);
+               ro.ro_rt = NULL;
+       }
        
        return error;
 }
@@ -390,8 +392,10 @@ ipf_injectv6_out(
        error = ip6_output(m, NULL, &ro, 0, im6o, NULL, 0);
        
        /* Release the route */
-       if (ro.ro_rt)
+       if (ro.ro_rt) {
                rtfree(ro.ro_rt);
+               ro.ro_rt = NULL;
+       }
        
        return error;
 }
diff --git a/bsd/netinet/raw_ip.c b/bsd/netinet/raw_ip.c
index 533184f4ea09602a29c67169733a6214202bcc40..0cc25b61638685501fd112cfad5b371cd976b3d8 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -320,6 +320,12 @@ rip_output(m, so, dst)
        register struct ip *ip;
        register struct inpcb *inp = sotoinpcb(so);
        int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST;
+       struct ip_out_args ipoa;
+
+       /* If socket was bound to an ifindex, tell ip_output about it */
+       ipoa.ipoa_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
+           inp->inp_boundif : IFSCOPE_NONE;
+       flags |= IP_OUTARGS;
 
        /*
         * If the user handed us a complete IP packet, use it.
@@ -384,14 +390,8 @@ rip_output(m, so, dst)
 #if CONFIG_IP_EDGEHOLE
        ip_edgehole_mbuf_tag(inp, m);
 #endif
-
-#if CONFIG_FORCE_OUT_IFP
-       return (ip_output_list(m, 0, inp->inp_options, &inp->inp_route, flags,
-                         inp->inp_moptions, inp->pdp_ifp));
-#else
-       return (ip_output_list(m, 0, inp->inp_options, &inp->inp_route, flags,
-                         inp->inp_moptions, NULL));
-#endif
+       return (ip_output(m, inp->inp_options, &inp->inp_route, flags,
+           inp->inp_moptions, &ipoa));
 }
 
 #if IPFIREWALL
diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c
index 36756785d677bc1892e5208b0efaa1c9bf9626e2..138bcb3c7e438b2a050611653ee5eb3a7a62a8b2 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -210,7 +210,7 @@ struct inpcbhead tcb;
 struct inpcbinfo tcbinfo;
 
 static void     tcp_dooptions(struct tcpcb *,
-           u_char *, int, struct tcphdr *, struct tcpopt *);
+           u_char *, int, struct tcphdr *, struct tcpopt *, unsigned int);
 static void     tcp_pulloutofband(struct socket *,
            struct tcphdr *, struct mbuf *, int);
 static int      tcp_reass(struct tcpcb *, struct tcphdr *, int *,
@@ -552,6 +552,19 @@ tcp_input(m, off0)
 #endif
        struct m_tag *fwd_tag;
        u_char ip_ecn = IPTOS_ECN_NOTECT;
+       unsigned int ifscope;
+
+       /*
+        * Record the interface on which this segment arrived; this does
+        * not affect normal data output (for non-detached TCP), but it
+        * provides a hint about which route and interface to use for
+        * sending in the absence of a PCB, when scoped routing (and thus
+        * source interface selection) is enabled.
+        */
+       if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL)
+               ifscope = m->m_pkthdr.rcvif->if_index;
+       else
+               ifscope = IFSCOPE_NONE;
 
        /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */
        fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL);
@@ -821,6 +834,14 @@ findpcb:
            ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
       }
 
+       /*
+        * Use the interface scope information from the PCB for outbound
+        * segments.  If the PCB isn't present and if scoped routing is
+        * enabled, tcp_respond will use the scope of the interface on
+        * which the segment arrived.
+        */
+       if (inp != NULL && (inp->inp_flags & INP_BOUND_IF))
+               ifscope = inp->inp_boundif;
 #if IPSEC
        if (ipsec_bypass == 0)  {
 #if INET6
@@ -981,6 +1002,11 @@ findpcb:
                        struct inpcb *oinp = sotoinpcb(so);
 #endif /* INET6 */
                        int ogencnt = so->so_gencnt;
+                       unsigned int head_ifscope;
+
+                       /* Get listener's bound-to-interface, if any */
+                       head_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
+                           inp->inp_boundif : IFSCOPE_NONE;
 
 #if !IPSEC
                        /*
@@ -1107,6 +1133,21 @@ findpcb:
                         */
                        dropsocket++;
                        inp = (struct inpcb *)so->so_pcb;
+
+                       /*
+                        * Inherit INP_BOUND_IF from listener; testing if
+                        * head_ifscope is non-zero is sufficient, since it
+                        * can only be set to a non-zero value earlier if
+                        * the listener has such a flag set.
+                        */
+#if INET6
+                       if (head_ifscope != IFSCOPE_NONE && !isipv6) {
+#else
+                       if (head_ifscope != IFSCOPE_NONE) {
+#endif /* INET6 */
+                               inp->inp_flags |= INP_BOUND_IF;
+                               inp->inp_boundif = head_ifscope;
+                       }
 #if INET6
                        if (isipv6)
                                inp->in6p_laddr = ip6->ip6_dst;
@@ -1344,7 +1385,7 @@ findpcb:
         * else do it below (after getting remote address).
         */
        if (tp->t_state != TCPS_LISTEN && optp)
-               tcp_dooptions(tp, optp, optlen, th, &to);
+               tcp_dooptions(tp, optp, optlen, th, &to, ifscope);
 
        if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
                if (to.to_flags & TOF_SCALE) {
@@ -1359,7 +1400,7 @@ findpcb:
                        tp->ts_recent_age = tcp_now;
                }
                if (to.to_flags & TOF_MSS)
-                       tcp_mss(tp, to.to_mss);
+                       tcp_mss(tp, to.to_mss, ifscope);
                if (tp->sack_enable) {
                        if (!(to.to_flags & TOF_SACK))
                                tp->sack_enable = 0;
@@ -1406,6 +1447,11 @@ findpcb:
                        tp->ts_recent = to.to_tsval;
                }
 
+               /* Force acknowledgment if we received a FIN */
+
+               if (thflags & TH_FIN)
+                       tp->t_flags |= TF_ACKNOW;
+
                if (tlen == 0) {
                        if (SEQ_GT(th->th_ack, tp->snd_una) &&
                            SEQ_LEQ(th->th_ack, tp->snd_max) &&
@@ -1700,7 +1746,7 @@ findpcb:
                        FREE(sin, M_SONAME);
                }
 
-               tcp_dooptions(tp, optp, optlen, th, &to);
+               tcp_dooptions(tp, optp, optlen, th, &to, ifscope);
 
                if (tp->sack_enable) {
                        if (!(to.to_flags & TOF_SACK))
@@ -2667,8 +2713,9 @@ process_ACK:
                                        soisdisconnected(so);
                                }
                                tp->t_state = TCPS_FIN_WAIT_2;
-                               goto drop;
+                               /* fall through and make sure we also recognize data ACKed with the FIN */
                        }
+                       tp->t_flags |= TF_ACKNOW;
                        break;
 
                /*
@@ -2691,6 +2738,7 @@ process_ACK:
                                add_to_time_wait(tp);
                                soisdisconnected(so);
                        }
+                       tp->t_flags |= TF_ACKNOW;
                        break;
 
                /*
@@ -2811,7 +2859,7 @@ dodata:                                                   /* XXX */
         * case PRU_RCVD).  If a FIN has already been received on this
         * connection then we just ignore the text.
         */
-       if ((tlen || (thflags&TH_FIN)) &&
+       if ((tlen || (thflags & TH_FIN)) &&
            TCPS_HAVERCVDFIN(tp->t_state) == 0) {
                tcp_seq save_start = th->th_seq;
                tcp_seq save_end = th->th_seq + tlen;
@@ -3056,13 +3104,13 @@ dropwithreset:
        if (thflags & TH_ACK)
                /* mtod() below is safe as long as hdr dropping is delayed */
                tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
-                           TH_RST, m->m_pkthdr.rcvif);
+                   TH_RST, ifscope);
        else {
                if (thflags & TH_SYN)
                        tlen++;
                /* mtod() below is safe as long as hdr dropping is delayed */
                tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
-                           (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.rcvif);
+                   (tcp_seq)0, TH_RST|TH_ACK, ifscope);
        }
        /* destroy temporarily created socket */
        if (dropsocket) {
@@ -3099,7 +3147,7 @@ drop:
 }
 
 static void
-tcp_dooptions(tp, cp, cnt, th, to)
+tcp_dooptions(tp, cp, cnt, th, to, input_ifscope)
 /*
  * Parse TCP options and place in tcpopt.
  */
@@ -3108,6 +3156,7 @@ tcp_dooptions(tp, cp, cnt, th, to)
        int cnt;
        struct tcphdr *th;
        struct tcpopt *to;
+       unsigned int input_ifscope;
 {
        u_short mss = 0;
        int opt, optlen;
@@ -3187,7 +3236,7 @@ tcp_dooptions(tp, cp, cnt, th, to)
                }
        }
        if (th->th_flags & TH_SYN)
-               tcp_mss(tp, mss);       /* sets t_maxseg */
+               tcp_mss(tp, mss, input_ifscope);        /* sets t_maxseg */
 }
 
 /*
@@ -3361,9 +3410,10 @@ tcp_maxmtu6(struct rtentry *rt)
  *
  */
 void
-tcp_mss(tp, offer)
+tcp_mss(tp, offer, input_ifscope)
        struct tcpcb *tp;
        int offer;
+       unsigned int input_ifscope;
 {
        register struct rtentry *rt;
        struct ifnet *ifp;
@@ -3398,7 +3448,7 @@ tcp_mss(tp, offer)
        else
 #endif /* INET6 */
        {
-               rt = tcp_rtlookup(inp);
+               rt = tcp_rtlookup(inp, input_ifscope);
                if (rt && (rt->rt_gateway->sa_family == AF_LINK ||
                        rt->rt_ifp->if_flags & IFF_LOOPBACK)) 
                         isnetlocal = TRUE;
@@ -3620,7 +3670,7 @@ tcp_mssopt(tp)
                rt = tcp_rtlookup6(tp->t_inpcb);
        else
 #endif /* INET6 */
-       rt = tcp_rtlookup(tp->t_inpcb);
+       rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE);
        if (rt == NULL) {
                lck_mtx_unlock(rt_mtx);
                return (
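
Pulling the tcp_input() hunks above together, the scope handed down to
tcp_dooptions(), tcp_mss() and tcp_respond() is derived in two steps; a
condensed sketch using the names as they appear above:

    unsigned int ifscope = IFSCOPE_NONE;

    /* 1. Default to the interface the segment arrived on, if known. */
    if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL)
        ifscope = m->m_pkthdr.rcvif->if_index;

    /* 2. Once a PCB is found, its bound-to-interface takes precedence. */
    if (inp != NULL && (inp->inp_flags & INP_BOUND_IF))
        ifscope = inp->inp_boundif;
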
index e22e04993f01589eb77a4b9e95e8ce7d995a326d..af687347820dd80942f5314c58f9197d28116f11 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -1638,6 +1638,13 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
        int error = 0;
        boolean_t chain;
        boolean_t unlocked = FALSE;
+       struct inpcb *inp = tp->t_inpcb;
+       struct ip_out_args ipoa;
+
+       /* If socket was bound to an ifindex, tell ip_output about it */
+       ipoa.ipoa_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
+           inp->inp_boundif : IFSCOPE_NONE;
+       flags |= IP_OUTARGS;
 
        /* Make sure ACK/DELACK conditions are cleared before
         * we unlock the socket.
@@ -1691,13 +1698,8 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt,
                         */
                        cnt = 0;
                }
-#if CONFIG_FORCE_OUT_IFP
-               error = ip_output_list(pkt, cnt, opt, &tp->t_inpcb->inp_route,
-                   flags, 0, tp->t_inpcb->pdp_ifp);
-#else
-               error = ip_output_list(pkt, cnt, opt, &tp->t_inpcb->inp_route,
-                   flags, 0, NULL);
-#endif
+               error = ip_output_list(pkt, cnt, opt, &inp->inp_route,
+                   flags, 0, &ipoa);
                if (chain || error) {
                        /*
                         * If we sent down a chain then we are done since
index a94f8ad2a7f0e8f4cf6f10f7059919f952d3468f..f0d78d7b8caf7a51c9b49fa57517a3e668842572 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -477,11 +477,7 @@ tcp_respond(
        tcp_seq ack,
        tcp_seq seq,
        int flags,
-#if CONFIG_FORCE_OUT_IFP
-       ifnet_t ifp
-#else
-       __unused ifnet_t ifp
-#endif
+       unsigned int ifscope
        )
 {
        register int tlen;
@@ -496,7 +492,6 @@ tcp_respond(
        struct ip6_hdr *ip6;
        int isipv6;
 #endif /* INET6 */
-       int ipflags = 0;
 
 #if INET6
        isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
@@ -654,7 +649,7 @@ tcp_respond(
 #endif
 #if INET6
        if (isipv6) {
-               (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL, 0);
+               (void)ip6_output(m, NULL, ro6, 0, NULL, NULL, 0);
                if (ro6 == &sro6 && ro6->ro_rt) {
                        rtfree(ro6->ro_rt);
                        ro6->ro_rt = NULL;
@@ -662,11 +657,10 @@ tcp_respond(
        } else
 #endif /* INET6 */
        {
-#if CONFIG_FORCE_OUT_IFP
-               ifp = (tp && tp->t_inpcb) ? tp->t_inpcb->pdp_ifp :
-                         (ifp && (ifp->if_flags & IFF_POINTOPOINT) != 0) ? ifp : NULL;
-#endif
-               (void) ip_output_list(m, 0, NULL, ro, ipflags, NULL, ifp);
+               struct ip_out_args ipoa = { ifscope };
+
+               (void) ip_output(m, NULL, ro, IP_OUTARGS, NULL, &ipoa);
+
                if (ro == &sro && ro->ro_rt) {
                        rtfree(ro->ro_rt);
                        ro->ro_rt = NULL;
@@ -1561,7 +1555,7 @@ tcp_mtudisc(
                        rt = tcp_rtlookup6(inp);
                else
 #endif /* INET6 */
-               rt = tcp_rtlookup(inp);
+               rt = tcp_rtlookup(inp, IFSCOPE_NONE);
                if (!rt || !rt->rt_rmx.rmx_mtu) {
                        tp->t_maxopd = tp->t_maxseg =
 #if INET6
@@ -1631,8 +1625,9 @@ tcp_mtudisc(
  * to get the interface MTU.
  */
 struct rtentry *
-tcp_rtlookup(inp)
+tcp_rtlookup(inp, input_ifscope)
        struct inpcb *inp;
+       unsigned int input_ifscope;
 {
        struct route *ro;
        struct rtentry *rt;
@@ -1648,11 +1643,24 @@ tcp_rtlookup(inp)
        if (rt == NULL || !(rt->rt_flags & RTF_UP) || rt->generation_id != route_generation) {
                /* No route yet, so try to acquire one */
                if (inp->inp_faddr.s_addr != INADDR_ANY) {
+                       unsigned int ifscope;
+
                        ro->ro_dst.sa_family = AF_INET;
                        ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
                        ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
                                inp->inp_faddr;
-                       rtalloc_ign_locked(ro, 0UL);
+
+                       /*
+                        * If the socket was bound to an interface, then
+                        * the bound-to-interface takes precedence over
+                        * the inbound interface passed in by the caller
+                        * (if we get here as part of the output path then
+                        * input_ifscope is IFSCOPE_NONE).
+                        */
+                       ifscope = (inp->inp_flags & INP_BOUND_IF) ?
+                           inp->inp_boundif : input_ifscope;
+
+                       rtalloc_scoped_ign_locked(ro, 0UL, ifscope);
                        rt = ro->ro_rt;
                }
        }
@@ -1807,7 +1815,7 @@ tcp_gettaocache(inp)
                rt = tcp_rtlookup6(inp);
        else
 #endif /* INET6 */
-       rt = tcp_rtlookup(inp);
+       rt = tcp_rtlookup(inp, IFSCOPE_NONE);
 
        /* Make sure this is a host route and is up. */
        if (rt == NULL ||
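
Inside tcp_rtlookup() itself the same precedence applies to the route
lookup; a condensed sketch (output-path callers pass IFSCOPE_NONE, so a
socket binding, when present, always wins):

    /* Bound-to-interface overrides the inbound interface hint. */
    ifscope = (inp->inp_flags & INP_BOUND_IF) ?
        inp->inp_boundif : input_ifscope;

    rtalloc_scoped_ign_locked(ro, 0UL, ifscope);
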
index e8de99b71f73b2e6167fd787d3793c638234f177..9ad7badac8f52abba51519510da13665eadeddf0 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -800,9 +800,16 @@ tcp_timers(tp, timer)
                        tcpstat.tcps_keepprobe++;
                        t_template = tcp_maketemplate(tp);
                        if (t_template) {
+                               unsigned int ifscope;
+
+                               if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
+                                       ifscope = tp->t_inpcb->inp_boundif;
+                               else
+                                       ifscope = IFSCOPE_NONE;
+
                                tcp_respond(tp, t_template->tt_ipgen,
                                    &t_template->tt_t, (struct mbuf *)NULL,
-                                   tp->rcv_nxt, tp->snd_una - 1, 0, NULL);
+                                   tp->rcv_nxt, tp->snd_una - 1, 0, ifscope);
                                (void) m_free(dtom(t_template));
                        }
                        tp->t_timer[TCPT_KEEP] = tcp_keepintvl;
index d194a867feb498ce3a8b1d2ccb54ace82b73b556..9fcfa87e4b4f937a1f238d2ed7b6bbe9222cfaed 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -710,7 +710,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
                        if (error)
                                goto out;
                        tp->snd_wnd = TTCP_CLIENT_SND_WND;
-                       tcp_mss(tp, -1);
+                       tcp_mss(tp, -1, IFSCOPE_NONE);
                }
 
                if (flags & PRUS_EOF) {
@@ -759,7 +759,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
                        if (error)
                                goto out;
                        tp->snd_wnd = TTCP_CLIENT_SND_WND;
-                       tcp_mss(tp, -1);
+                       tcp_mss(tp, -1, IFSCOPE_NONE);
                }
                tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
                tp->t_force = 1;
index 281e250ffd174235d1fb366e6b4753e619d980e5..618fc7fed0a2a7fab978277d38a1704735e6ddb6 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -688,7 +688,7 @@ struct rmxp_tao *
         tcp_gettaocache(struct inpcb *);
 void    tcp_init(void) __attribute__((section("__TEXT, initcode")));
 void    tcp_input(struct mbuf *, int);
-void    tcp_mss(struct tcpcb *, int);
+void    tcp_mss(struct tcpcb *, int, unsigned int);
 int     tcp_mssopt(struct tcpcb *);
 void    tcp_drop_syn_sent(struct inpcb *, int);
 void    tcp_mtudisc(struct inpcb *, int);
@@ -697,9 +697,9 @@ struct tcpcb *
 int     tcp_output(struct tcpcb *);
 void    tcp_quench(struct inpcb *, int);
 void    tcp_respond(struct tcpcb *, void *,
-           struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int, ifnet_t);
-struct rtentry *
-        tcp_rtlookup(struct inpcb *);
+           struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int,
+           unsigned int);
+struct rtentry *tcp_rtlookup(struct inpcb *, unsigned int);
 void    tcp_setpersist(struct tcpcb *);
 void    tcp_slowtimo(void);
 struct tcptemp *
index 88e5413f5b9eca9dec7c3570077de27dcbb4d66c..ec3ff435e8669a2cdf5525aad706e118f2cba5c5 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -1008,10 +1008,11 @@ udp_output(inp, m, addr, control, p)
        struct sockaddr_in *ifaddr;
        int error = 0, udp_dodisconnect = 0;
        struct socket *so = inp->inp_socket;
-       int soopts;
+       int soopts = 0;
        struct mbuf *inpopts;
        struct ip_moptions *mopts;
        struct route ro;
+       struct ip_out_args ipoa;
 
        KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
 
@@ -1027,15 +1028,17 @@ udp_output(inp, m, addr, control, p)
                goto release;
        }
 
+        lck_mtx_assert(inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
+
+       /* If socket was bound to an ifindex, tell ip_output about it */
+       ipoa.ipoa_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
+           inp->inp_boundif : IFSCOPE_NONE;
+       soopts |= IP_OUTARGS;
+
        /* If there was a routing change, discard cached route and check
         * that we have a valid source address. 
         * Reacquire a new source address if INADDR_ANY was specified
         */
-
-#if 1
-        lck_mtx_assert(inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
-#endif
-
        if (inp->inp_route.ro_rt && inp->inp_route.ro_rt->generation_id != route_generation) {
                if (ifa_foraddr(inp->inp_laddr.s_addr) == 0) { /* src address is gone */
                        if (inp->inp_flags & INP_INADDR_ANY)
@@ -1158,7 +1161,7 @@ udp_output(inp, m, addr, control, p)
        m->m_pkthdr.socket_id = get_socket_id(inp->inp_socket);
 
        inpopts = inp->inp_options;
-       soopts = (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST));
+       soopts |= (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST));
        mopts = inp->inp_moptions;
 
        /* We don't want to cache the route for non-connected UDP */
@@ -1170,13 +1173,15 @@ udp_output(inp, m, addr, control, p)
        socket_unlock(so, 0);
        /* XXX jgraessley please look at XXX */
        error = ip_output_list(m, 0, inpopts,
-           udp_dodisconnect ? &ro : &inp->inp_route, soopts, mopts, NULL);
+           udp_dodisconnect ? &ro : &inp->inp_route, soopts, mopts, &ipoa);
        socket_lock(so, 0);
 
        if (udp_dodisconnect) {
                /* Discard the cached route, if there is one */
-               if (ro.ro_rt != NULL)
+               if (ro.ro_rt != NULL) {
                        rtfree(ro.ro_rt);
+                       ro.ro_rt = NULL;
+               }
                in_pcbdisconnect(inp);
                inp->inp_laddr = origladdr;     /* XXX rehash? */
        }
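
Note the ordering detail in udp_output() above: soopts is now initialized
to 0 and accumulated with |=, so the IP_OUTARGS bit set early in the
function survives the later merge of the socket options. A sketch of the
accumulation:

    int soopts = 0;

    soopts |= IP_OUTARGS;   /* set early, alongside ipoa_ifscope */

    /* ... much later; the old plain assignment here would have
     * silently discarded the IP_OUTARGS bit: */
    soopts |= (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST));
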
index 11b39041314d81dd9dc3b8e8b024799f57f64f52..e847a3319315948156885609d7214df9598e5644 100644 (file)
@@ -1040,6 +1040,16 @@ icmp6_mtudisc_update(ip6cp, validated)
        if (!validated)
                return;
 
+       /*
+        * If the suggested MTU is less than IPV6_MMTU, we only need
+        * to remember that it was for the above-mentioned "alwaysfrag"
+        * case.
+        * Try to be as close to the spec as possible.
+        */
+       if (mtu < IPV6_MMTU)
+               mtu = IPV6_MMTU - 8;
+
        bzero(&sin6, sizeof(sin6));
        sin6.sin6_family = PF_INET6;
        sin6.sin6_len = sizeof(struct sockaddr_in6);
@@ -2061,8 +2071,10 @@ icmp6_reflect(m, off)
                 */
                bzero(&ro, sizeof(ro));
                src = in6_selectsrc(&sa6_src, NULL, NULL, &ro, NULL, &src_storage, &e);
-               if (ro.ro_rt)
+               if (ro.ro_rt) {
                        rtfree(ro.ro_rt); /* XXX: we could use this */
+                       ro.ro_rt = NULL;
+               }
                if (src == NULL) {
                        nd6log((LOG_DEBUG,
                            "icmp6_reflect: source can't be determined: "
@@ -2307,10 +2319,9 @@ icmp6_redirect_input(m, off)
                bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr));
                bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr));
                bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr));
-               rtredirect((struct sockaddr *)&sdst, (struct sockaddr *)&sgw,
-                          (struct sockaddr *)NULL, RTF_GATEWAY | RTF_HOST,
-                          (struct sockaddr *)&ssrc,
-                          (struct rtentry **)NULL);
+               rtredirect(ifp, (struct sockaddr *)&sdst,
+                   (struct sockaddr *)&sgw, NULL, RTF_GATEWAY | RTF_HOST,
+                   (struct sockaddr *)&ssrc, NULL);
        }
        /* finally update cached route in each socket via pfctlinput */
     {
index f98b3d35fa048a4ffb07fc119e34789ef97d2b88..1b481b21fa1b32d59d2f8b364d4a51c35bdbc2c4 100644 (file)
@@ -758,8 +758,10 @@ in6_pcbdetach(inp)
                        m_freem(inp->in6p_options);
                ip6_freepcbopts(inp->in6p_outputopts);
                ip6_freemoptions(inp->in6p_moptions);
-               if (inp->in6p_route.ro_rt)
+               if (inp->in6p_route.ro_rt) {
                        rtfree(inp->in6p_route.ro_rt);
+                       inp->in6p_route.ro_rt = NULL;
+               }
                /* Check and free IPv4 related resources in case of mapped addr */
                if (inp->inp_options)
                        (void)m_free(inp->inp_options);
index fdaf9143ff78cb599ab272a4a8aa9d6c7fd16da0..178dd14d164b4efb822619f67eb0aa3981c2edd2 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -133,6 +133,9 @@ static void in6_rtqtimo(void *rock);
 static void in6_mtutimo(void *rock);
 extern int tvtohz(struct timeval *);
 
+static struct radix_node *in6_matroute_args(void *, struct radix_node_head *,
+    rn_matchf_t *, void *);
+
 #define RTPRF_OURS             RTF_PROTO3      /* set on routes we manage */
 
 /*
@@ -236,15 +239,25 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
        return ret;
 }
 
+/*
+ * Similar to in6_matroute_args except without the leaf-matching parameters.
+ */
+static struct radix_node *
+in6_matroute(void *v_arg, struct radix_node_head *head)
+{
+       return (in6_matroute_args(v_arg, head, NULL, NULL));
+}
+
 /*
  * This code is the inverse of in6_clsroute: on first reference, if we
  * were managing the route, stop doing so and set the expiration timer
  * back off again.
  */
 static struct radix_node *
-in6_matroute(void *v_arg, struct radix_node_head *head)
+in6_matroute_args(void *v_arg, struct radix_node_head *head,
+    rn_matchf_t *f, void *w)
 {
-       struct radix_node *rn = rn_match(v_arg, head);
+       struct radix_node *rn = rn_match_args(v_arg, head, f, w);
        struct rtentry *rt = (struct rtentry *)rn;
 
        if (rt && rt->rt_refcnt == 0) { /* this is first reference */
@@ -253,7 +266,7 @@ in6_matroute(void *v_arg, struct radix_node_head *head)
                        rt->rt_rmx.rmx_expire = 0;
                }
        }
-       return rn;
+       return (rn);
 }
 
 SYSCTL_DECL(_net_inet6_ip6);
@@ -527,6 +540,7 @@ in6_inithead(void **head, int off)
        rnh = *head;
        rnh->rnh_addaddr = in6_addroute;
        rnh->rnh_matchaddr = in6_matroute;
+       rnh->rnh_matchaddr_args = in6_matroute_args;
        rnh->rnh_close = in6_clsroute;
        in6_rtqtimo(rnh);       /* kick off timeout first time */
        in6_mtutimo(rnh);       /* kick off timeout first time */
index 7d5aa9b72ce861fd5bf1378e9ac3f54aace37615..6abcb4cf2afcba22aaf140c160c29b6f50df526c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -866,7 +866,7 @@ got_match:
                        }
                        bcopy(&ti, ip6, sizeof(ti));
                        tcp_respond(NULL, ip6, (struct tcphdr *)(ip6 + 1),
-                               *m, ack, seq, flags, NULL);
+                               *m, ack, seq, flags, IFSCOPE_NONE);
                        *m = NULL;
                        break;
                  }
index 9e41d205fa820bbf10d87130070c2b20d2c0b190..b4c6491deeaf86bac3ea70e0a20763f4d76d264b 100644 (file)
@@ -1154,8 +1154,10 @@ done:
                lck_mtx_unlock(ip6_mutex);
        if (ro == &ip6route && ro->ro_rt) { /* brace necessary for rtfree */
                rtfree(ro->ro_rt);
+               ro->ro_rt = NULL;
        } else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) {
                rtfree(ro_pmtu->ro_rt);
+               ro_pmtu->ro_rt = NULL;
        }
 
 #if IPSEC
@@ -2174,6 +2176,7 @@ ip6_setmoptions(
                                }
                                ifp = ro.ro_rt->rt_ifp;
                                rtfree(ro.ro_rt);
+                               ro.ro_rt = NULL;
                        }
                } else
                        ifp = ifindex2ifnet[mreq->ipv6mr_interface];
index 879dcb22b0438c0049303bb025e3ae1f9e6b55ae..3482531079655b9d455928ce0c75ed6b4aee3bd5 100644 (file)
@@ -50,6 +50,7 @@
  * DKIOCISFORMATTED                      is media formatted?
  * DKIOCISWRITABLE                       is media writable?
  *
+ * DKIOCREQUESTIDLE                      idle media
  * DKIOCDISCARD                          delete unused data
  *
  * DKIOCGETMAXBLOCKCOUNTREAD             get maximum block count for reads
@@ -114,6 +115,7 @@ typedef struct
 #define DKIOCISFORMATTED                      _IOR('d', 23, uint32_t)
 #define DKIOCISWRITABLE                       _IOR('d', 29, uint32_t)
 
+#define DKIOCREQUESTIDLE                      _IO('d', 30)
 #define DKIOCDISCARD                          _IOW('d', 31, dk_discard_t)
 
 #define DKIOCGETMAXBLOCKCOUNTREAD             _IOR('d', 64, uint64_t)
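
DKIOCREQUESTIDLE is declared _IO('d', 30), so it carries no argument
payload. A hedged user-space sketch of the calling convention (the device
path is hypothetical, and whether a given driver honors the request is
device-specific):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/disk.h>

    int fd = open("/dev/rdisk1", O_RDONLY);    /* hypothetical device */
    if (fd >= 0 && ioctl(fd, DKIOCREQUESTIDLE) == -1)
        perror("DKIOCREQUESTIDLE");
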
index 9be90489bcc4435606cc5fef3048abb403de1b74..ebe0e4f49943b4a83fa3a30b47ab47600a0a5c11 100644 (file)
@@ -2385,6 +2385,8 @@ extern void dtrace_invop_remove(int (*)(uintptr_t, uintptr_t *, uintptr_t));
 #define DTRACE_INVOP_BCTR               6
 #define DTRACE_INVOP_TAILJUMP           7
 #endif
+
+
 #endif /* __APPLE__ */
 
 #ifdef  __cplusplus
index 200b3e7ac5c698ae0a53095e206f0ed5df4f60d4..98b5c7c42176d5ef1f306ee6fd28a2ae7884908b 100644 (file)
@@ -258,6 +258,7 @@ typedef __darwin_pid_t      pid_t;
 #define        F_FLOCK         0x020           /* Use flock(2) semantics for lock */
 #define        F_POSIX         0x040           /* Use POSIX semantics for lock */
 #define        F_PROV          0x080           /* Non-coalesced provisional lock */
+#define F_WAKE1_SAFE    0x100           /* it's safe to wake only one waiter */
 #endif
 
 /*
index f5f8ad03a0d3030380aa668cf115f012c8f45c53..df4dec9c1874ee09fa54f8eb46e0881ba52a6a99 100644 (file)
@@ -90,6 +90,7 @@ struct lockf {
        off_t   lf_start;           /* Byte # of the start of the lock */
        off_t   lf_end;             /* Byte # of the end of the lock (-1=EOF) */
        caddr_t lf_id;              /* Id of the resource holding the lock */
+        uint32_t lf_waiters;        /* count of waiters on this lock */
        struct  lockf **lf_head;    /* Back pointer to the head of the locf list */
        struct vnode *lf_vnode;     /* Back pointer to the inode */
        struct  lockf *lf_next;     /* Pointer to the next lock on this inode */
index 72d3ca1c8b57f08118bb28daae9b8f2a5cba1305..a7c659970db85e9f7841e7b09246d7dcc4a96fce 100644 (file)
@@ -187,7 +187,6 @@ extern void (*lockstat_probe)(dtrace_id_t, uint64_t, uint64_t,
 #if    CONFIG_DTRACE
 extern int lockstat_depth(void);
 extern void lockstat_hot_patch(boolean_t);
-extern void dtrace_membar_producer(void);
 
 /*
  * Macros to record lockstat probes.
index ec7c2733e51aba3d269e1f43ea0853379565115e..779dbde507e44c32bae3470971b3152ca068bb4a 100644 (file)
@@ -399,6 +399,8 @@ union m16kcluster {
 /* compatibility with 4.3 */
 #define  m_copy(m, o, l)       m_copym((m), (o), (l), M_DONTWAIT)
 
+#define MBSHIFT         20                              /* 1MB */
+
 #endif /* KERNEL_PRIVATE */
 
 /*
index 7aef5e9e89370b1bfeca13d8dc5249ad95506f68..5814b6eeab99f85c2e085867af7b8b81add0f7dd 100644 (file)
@@ -524,6 +524,7 @@ int vnode_ischr(vnode_t);
 
 #ifdef __APPLE_API_UNSTABLE
 int    vnode_isnamedstream(vnode_t);
+int     vnode_isshadow(vnode_t);
 #endif
 
 enum vtype     vnode_iftovt(int);
index 66e32d7c303c1cb9332a83272df25346879fe048..8948d8310d6d416b74ed68600567831bb7ced378 100644 (file)
@@ -227,7 +227,9 @@ struct vnode {
 #define        VAGE            0x001000        /* Insert vnode at head of free list */
 #define        VRAOFF          0x002000        /* read ahead disabled */
 #define        VNCACHEABLE     0x004000        /* vnode is allowed to be put back in name cache */
-#define        VUINACTIVE      0x008000        /* UBC vnode is on inactive list */
+#if NAMEDSTREAMS
+#define VISSHADOW       0x008000        /* vnode is a shadow file */
+#endif
 #define        VSWAP           0x010000        /* vnode is being used as swapfile */
 #define        VTHROTTLED      0x020000        /* writes or pageouts have been throttled */
                /* wakeup tasks waiting when count falls below threshold */
index 79e526ef21e206a13dda1b7f37a35a6b32f7b092..44c482c8f425ea44bf807579708bc64bc08e93e8 100644 (file)
@@ -1599,6 +1599,22 @@ vnode_isnamedstream(
 #endif
 }
 
+int
+vnode_isshadow(
+#if NAMEDSTREAMS
+       vnode_t vp
+#else
+       __unused vnode_t vp
+#endif
+       )
+{
+#if NAMEDSTREAMS
+       return ((vp->v_flag & VISSHADOW) ? 1 : 0);
+#else
+       return (0);
+#endif
+}
+
 /* TBD:  set vnode_t to not cache data after it is consumed once; used for quota */
 void 
 vnode_setnocache(vnode_t vp)
@@ -4366,7 +4382,7 @@ VNOP_INACTIVE(struct vnode *vp, vfs_context_t ctx)
         */
        if (vnode_isnamedstream(vp) &&
                        (vp->v_parent != NULLVP) &&
-                       ((vp->v_parent->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0) &&
+                       (vnode_isshadow(vp)) &&
                        ((vp->v_lflag & VL_TERMINATE) == 0)) {
                vnode_recycle(vp);
        }
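
The VNOP_INACTIVE hunk above is one of several call sites (vfs_subr.c,
vfs_syscalls.c, vfs_vnops.c below) that swap the open-coded mnt_kern_flag
test for the new predicate. The recurring guard, sketched once:

    /* Act only on a named-stream vnode whose data lives in a shadow
     * file, i.e. the filesystem lacks native named-stream support. */
    if (vnode_isnamedstream(vp) &&
        (vp->v_parent != NULLVP) &&
        vnode_isshadow(vp)) {
            vnode_recycle(vp);
    }
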
index eb070de33b22916d53780eae0d7908598f355487..6e53e0169da9f3afdffdf263cce96ba9da60b53d 100644 (file)
@@ -1704,8 +1704,28 @@ journal_open(struct vnode *jvp,
        }
 
     if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
-               printf("jnl: %s: open: phys_blksz %lu does not match journal header size %d\n",
-                   jdev_name, phys_blksz, jnl->jhdr->jhdr_size);
+       /*
+        * The volume has probably been resized (such that we had to adjust the
+        * logical sector size), or copied to media with a different logical
+        * sector size.  If the journal is empty, then just switch to the
+        * current logical sector size.  If the journal is not empty, then
+        * fail to open the journal.
+        */
+        
+       if (jnl->jhdr->start == jnl->jhdr->end) {
+           int err;
+           printf("jnl: %s: open: changing journal header size from %d to %lu\n",
+               jdev_name, jnl->jhdr->jhdr_size, phys_blksz);
+           jnl->jhdr->jhdr_size = phys_blksz;
+           if (write_journal_header(jnl)) {
+               printf("jnl: %s: open: failed to update journal header size\n", jdev_name);
+               goto bad_journal;
+           }
+       } else {
+           printf("jnl: %s: open: phys_blksz %lu does not match journal header size %d, and journal is not empty!\n",
+               jdev_name, phys_blksz, jnl->jhdr->jhdr_size);
+           goto bad_journal;
+       }
     }
 
     if (   jnl->jhdr->start <= 0
index bb8c5dd2bec6624db0d7cd283560a261a9f42b42..0c5299ae6c2fed84a5e05b03d37b30a562851abe 100644 (file)
@@ -935,10 +935,22 @@ nextname:
                }
                switch (cnp->cn_nameiop) {
                case DELETE:
-                       nsop = NS_DELETE;
+                       if (cnp->cn_flags & CN_ALLOWRSRCFORK) {
+                               nsop = NS_DELETE;
+                       }
+                       else {
+                               error = EPERM;
+                               goto bad;
+                       }
                        break;
                case CREATE:
-                       nsop = NS_CREATE;
+                       if (cnp->cn_flags & CN_ALLOWRSRCFORK) {
+                               nsop = NS_CREATE;
+                       }
+                       else {
+                               error = EPERM;
+                               goto bad;
+                       }
                        break;
                case LOOKUP:
                        /* Make sure our lookup of "/..namedfork/rsrc" is allowed. */
index c5d91125ddf9186e380222993c8dbb73cf89ab74..bfee0d8b4c4eb21d2dcf877a196ec88a5d9b9817 100644 (file)
@@ -1195,8 +1195,6 @@ insmntque(vnode_t vp, mount_t mp)
                        TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
                if (vp->v_lflag & VNAMED_MOUNT)
                        panic("insmntque: vp already in mount vnode list");
-               if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb))
-                       panic("insmntque: vp on the free list\n");
                vp->v_lflag |= VNAMED_MOUNT;
                mount_ref(mp, 1);
                mount_unlock(mp);
@@ -1976,7 +1974,7 @@ vclean(vnode_t vp, int flags)
        /* Delete the shadow stream file before we reclaim its vnode */
        if ((is_namedstream != 0) &&
                        (vp->v_parent != NULLVP) &&
-                       ((vp->v_parent->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0)) {
+                       (vnode_isshadow(vp))) {
                vnode_relenamedstream(vp->v_parent, vp, ctx);
        }
 #endif
@@ -4019,6 +4017,9 @@ vnode_create(int flavor, size_t size, void *data, vnode_t *vpp)
                                        if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL)
                                                vp->v_flag |= VLOCKLOCAL;
                                if (insert) {
+                                       if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb))
+                                               panic("insmntque: vp on the free list\n");
+
                                        /*
                                         * enter in mount vnode list
                                         */
index be9bfe17a6c3fbd5aac65ca2c40b23d265b6115b..869f3f5b3d1ccb5da7ed2a4766d5918e5df9bd46 100644 (file)
@@ -2938,6 +2938,11 @@ unlink1(vfs_context_t ctx, struct nameidata *ndp, int nodelbusy)
        int need_event = 0;
        int has_listeners = 0;
 
+#if NAMEDRSRCFORK
+       /* unlink or delete is allowed on rsrc forks and named streams */
+       ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK;
+#endif
+
        ndp->ni_cnd.cn_flags |= LOCKPARENT;
        cnp = &ndp->ni_cnd;
 
@@ -3051,6 +3056,15 @@ unlink1(vfs_context_t ctx, struct nameidata *ndp, int nodelbusy)
         * since it may need to release the fs_nodelock on the dvp
         */
 out:
+#if NAMEDRSRCFORK
+       /* recycle deleted rsrc fork to force reclaim on shadow file if necessary */
+       if ((vnode_isnamedstream(ndp->ni_vp)) &&
+                       (ndp->ni_vp->v_parent != NULLVP) &&
+                       (vnode_isshadow(ndp->ni_vp))) {
+               vnode_recycle(ndp->ni_vp);
+       }       
+#endif
+
        nameidone(ndp);
        vnode_put(dvp);
        vnode_put(vp);
@@ -3540,7 +3554,7 @@ access(__unused proc_t p, struct access_args *uap, __unused register_t *retval)
         */
        if (vnode_isnamedstream(nd.ni_vp) &&
                        (nd.ni_vp->v_parent != NULLVP) &&
-                       ((nd.ni_vp->v_parent->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0)) {
+                       (vnode_isshadow(nd.ni_vp))) {
                is_namedstream = 1;
                vnode_ref(nd.ni_vp);
        }
@@ -3606,7 +3620,7 @@ stat2(vfs_context_t ctx, struct nameidata *ndp, user_addr_t ub, user_addr_t xsec
         */
        if (vnode_isnamedstream(ndp->ni_vp) &&
                        (ndp->ni_vp->v_parent != NULLVP) &&
-                       ((ndp->ni_vp->v_parent->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0)) {
+                       (vnode_isshadow(ndp->ni_vp))) {
                is_namedstream = 1;
                vnode_ref (ndp->ni_vp);
        }
@@ -4593,7 +4607,7 @@ fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused register_t *r
        if ((error == 0) &&
            (vp->v_flag & VISNAMEDSTREAM) && 
            (vp->v_parent != NULLVP) &&
-           !(vp->v_parent->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) &&
+           (vnode_isshadow(vp)) &&
            (fp->f_flags & FP_WRITTEN)) {
                (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
        }
index a8fc43f7c406b28937756011c947eeaf61b849ae..0eb1036ade45c86b423757895466f188b838da06 100644 (file)
@@ -424,7 +424,13 @@ bad2:
 bad:
        ndp->ni_vp = NULL;
        if (vp) {
-               vnode_put(vp);
+#if NAMEDRSRCFORK
+               if ((vnode_isnamedstream(vp)) && (vp->v_parent != NULLVP) &&
+                   (vnode_isshadow(vp))) {
+                       vnode_recycle(vp);
+               }
+#endif
+               vnode_put(vp);
                /*
                 * Check for a race against unlink.  We had a vnode
                 * but according to vnode_authorize or VNOP_OPEN it
@@ -489,7 +495,7 @@ vn_close(struct vnode *vp, int flags, vfs_context_t ctx)
        /* Sync data from resource fork shadow file if needed. */
        if ((vp->v_flag & VISNAMEDSTREAM) && 
            (vp->v_parent != NULLVP) &&
-           !(vp->v_parent->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS)) {
+           (vnode_isshadow(vp))) {
                if (flags & FWASWRITTEN) {
                        (void) vnode_flushnamedstream(vp->v_parent, vp, ctx);
                }
index 5c59cfbc8bfc331fa47710865799ca4c40fe7ddb..43f8991d8c85d6e68478e3fd23bf02cf1fb17af8 100644 (file)
@@ -394,11 +394,16 @@ vnode_getnamedstream(vnode_t vp, vnode_t *svpp, const char *name, enum nsoperati
                error = default_getnamedstream(vp, svpp, name, op, context);
 
        if (error == 0) {
+               uint32_t streamflags = VISNAMEDSTREAM;
                vnode_t svp = *svpp;
-               
+
+               if ((vp->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0) { 
+                       streamflags |= VISSHADOW;
+               }    
+
                /* Tag the vnode. */
-               vnode_lock(svp);
-               svp->v_flag |= VISNAMEDSTREAM;
+               vnode_lock_spin(svp);
+               svp->v_flag |= streamflags;
                vnode_unlock(svp);
                /* Make the file its parent. 
                 * Note: This parent link helps us distinguish vnodes for 
@@ -427,12 +432,19 @@ vnode_makenamedstream(vnode_t vp, vnode_t *svpp, const char *name, int flags, vf
                error = default_makenamedstream(vp, svpp, name, context);
 
        if (error == 0) {
+               uint32_t streamflags = VISNAMEDSTREAM;
                vnode_t svp = *svpp;
 
                /* Tag the vnode. */
-               vnode_lock(svp);
-               svp->v_flag |= VISNAMEDSTREAM;
-               vnode_unlock(svp);              
+               if ((vp->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0) {
+                       streamflags |= VISSHADOW;
+               }
+
+               vnode_lock_spin(svp);
+               svp->v_flag |= streamflags;
+               vnode_unlock(svp);
+
                /* Make the file its parent. 
                 * Note: This parent link helps us distinguish vnodes for 
                 * shadow stream files from vnodes for resource fork on file
index cb7928a5a2ea11fbdd80cc481e4b014be8152022..8438edd9c3c6da07042f07e076cdcb7fff607989 100644 (file)
@@ -809,13 +809,10 @@ _sha1_init:_SHA1Init
 _sha1_loop:_SHA1Update
 _sha1_result:_SHA1Final_r
 _snprintf
-_sprintf
 _sscanf
 _strcasecmp
-_strcat
 _strchr
 _strcmp
-_strcpy
 _STRDUP
 _strlen
 _strncasecmp
index 3624885939abcdfda34d7dbdfd247de8238d7a38..d7f49b7993e3b29d531c4d144d533ef3b5a249c3 100644 (file)
@@ -1,2 +1,5 @@
 _OSCompareAndSwap64
 _OSAddAtomic64
+_strcpy
+_strcat
+_sprintf
index df175fdcce0aee9162db2945c5268cfd94663859..4531e84346cae8d5f9cfd2151054ae9c5ba5b5f3 100644 (file)
@@ -18,4 +18,6 @@ __ZN8OSObject19_RESERVEDOSObject30Ev
 __ZN8OSObject19_RESERVEDOSObject31Ev
 _bcopy_nc
 _bzero_nc
-
+_strcpy
+_strcat
+_sprintf
index 78fb43d8588ee12cdb8175962b3cb60993ff2fe2..db72f8acd7e7951e97c95a5efade83de8d2d4a6b 100644 (file)
@@ -93,7 +93,7 @@ $(OBJPATH)/allsymbols: $(OBJPATH)/mach_kernel
 
 $(SYMBOL_SET_BUILD): $(OBJPATH)/%.symbolset :  %.exports %.$(ARCH_CONFIG_LC).exports $(OBJPATH)/allsymbols
        $(_v)$(KEXT_CREATE_SYMBOL_SET) \
-               $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) \
+               $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_ALL_)) \
                -import $(OBJPATH)/allsymbols \
                -export $*.exports \
                -export $*.$(ARCH_CONFIG_LC).exports \
@@ -109,7 +109,7 @@ endif
 
 build_symbol_sets:     $(SYMBOL_SET_BUILD)
        $(_v)$(KEXT_CREATE_SYMBOL_SET) \
-               $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) \
+               $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_ALL_)) \
                -import $(OBJPATH)/allsymbols \
                -export $(SRCROOT)/$(COMPONENT)/Libkern.exports \
                -export $(SRCROOT)/$(COMPONENT)/Libkern.$(ARCH_CONFIG_LC).exports \
index 38648f07c11f6399c50e9fc9958688882e12aa9a..58454343efb5b2fedc8ff9886fc5b0d7b20433dd 100644 (file)
@@ -1,4 +1,4 @@
-9.6.0
+9.7.0
 
 # The first line of this file contains the master version number for the kernel.
 # All other instances of the kernel version in xnu are derived from this file.
index efaa7c6053caa82caeda15e425e997f7c4059e21..d4882c538086f1df520d2f62a5f3960703dd9394 100644 (file)
@@ -3266,13 +3266,10 @@ _splsoftclock
 _spltty
 _splvm
 _splx
-_sprintf
 _sscanf
 _stack_privilege
-_strcat
 _strchr
 _strcmp
-_strcpy
 _strlen
 _strncat
 _strncmp
index e558b7286003ad903e664aeee0fcc38352bdec71..e876d829f55ae3cc2c10b2eb185e29f2c63bdfb8 100644 (file)
@@ -22,3 +22,6 @@ _rtc_clock_stepping
 _smp_initialized
 __ZN24IOBufferMemoryDescriptor20initWithPhysicalMaskEP4taskmyyy
 __ZN24IOBufferMemoryDescriptor22inTaskWithPhysicalMaskEP4taskmyy
+_strcat
+_strcpy
+_sprintf
index 66b5ad2e0510053a5635dc63672c910f9545bccd..000f473b0e3e34d174a8f3e354d0d9ad70585dee 100644 (file)
@@ -248,3 +248,6 @@ _pmsRunLocal
 _scc
 _rc4_crypt
 _rc4_init
+_strcat
+_strcpy
+_sprintf
index f5922d5a5e4f92ef419f1c1a7d026b414eb1d5a4..f42f66489733d327bc891fd03a8a7539a9be835e 100644 (file)
@@ -36,7 +36,13 @@ enum {
     kIOMemoryPurgeable                 = 0x00000040,
     kIOMemorySharingTypeMask           = 0x000f0000,
     kIOMemoryUnshared                  = 0x00000000,
-    kIOMemoryKernelUserShared          = 0x00010000
+    kIOMemoryKernelUserShared          = 0x00010000,
+    // shared IOMemoryDescriptor options for IOBufferMemoryDescriptor:
+    kIOBufferDescriptorMemoryFlags     = kIOMemoryDirectionMask 
+#ifdef XNU_KERNEL_PRIVATE
+                                       | kIOMemoryAutoPrepare
+#endif
+                                       | kIOMemoryThreadSafe
 };
 
 #define _IOBUFFERMEMORYDESCRIPTOR_INTASKWITHOPTIONS_   1
index 080f692f59228bb5626aae5419aca557618ff62d..ea04c67e2a73ab571fdea3b729803660ed9d8a12 100644 (file)
@@ -78,8 +78,15 @@ enum {
     kIOMemoryAsReference       = 0x00000100,
     kIOMemoryBufferPageable    = 0x00000400,
     kIOMemoryDontMap           = 0x00000800,
+#ifdef XNU_KERNEL_PRIVATE
+    kIOMemoryRedirected                = 0x00004000,
+    kIOMemoryPreparedReadOnly  = 0x00008000,
+#endif
     kIOMemoryPersistent                = 0x00010000,
-    kIOMemoryThreadSafe                = 0x00020000
+#ifdef XNU_KERNEL_PRIVATE
+    kIOMemoryReserved6156215   = 0x00020000,
+#endif
+    kIOMemoryThreadSafe                = 0x00100000,   // Shared with Buffer MD
 };
 
 #define kIOMapperNone  ((IOMapper *) -1)
@@ -742,13 +749,6 @@ public:
 // might be created by IOMemoryDescriptor::withAddress(), but there should be 
 // no need to reference as anything but a generic IOMemoryDescriptor *.
 
-// Also these flags should not overlap with the options to
-//     IOMemoryDescriptor::initWithRanges(... IOOptionsBits options);
-
-enum {
-    kIOMemoryPreparedReadOnly  = 0x00008000,
-};
-
 class IOGeneralMemoryDescriptor : public IOMemoryDescriptor
 {
     OSDeclareDefaultStructors(IOGeneralMemoryDescriptor);
index 5fbfc6715ce16a1f8bb984f31e48bac41f2e7e6d..8358a95373f13a5e2ef6ef040772298e34925a7b 100644 (file)
@@ -31,6 +31,7 @@
 #include <IOKit/IOLib.h>
 #include <IOKit/IOMapper.h>
 #include <IOKit/IOBufferMemoryDescriptor.h>
+#include <libkern/OSDebug.h>
 
 #include "IOKitKernelInternal.h"
 #include "IOCopyMapper.h"
@@ -132,8 +133,8 @@ bool IOBufferMemoryDescriptor::initWithPhysicalMask(
     range.length  = 0;
     _ranges.v64   = &range;
 
-    // Grab the direction and the Auto Prepare bits from the Buffer MD options
-    iomdOptions  |= options & (kIOMemoryDirectionMask | kIOMemoryAutoPrepare);
+    // Grab IOMD bits from the Buffer MD options
+    iomdOptions  |= (options & kIOBufferDescriptorMemoryFlags);
 
     if ((options & (kIOMemorySharingTypeMask | kIOMapCacheMask)) && (alignment < page_size))
        alignment = page_size;
index 0b5f54a2560bc69d8e1b210d39f05529b812fe7a..7ca1e8c466a410f28d6c16ed0da5eefaa7719375 100644 (file)
@@ -1251,6 +1251,7 @@ IOReturn IOCatalogue::unloadModule( OSString * moduleName ) const
         name = moduleName->getCStringNoCopy();
         k_info = kmod_lookupbyname_locked((char *)name);
         if ( k_info && (k_info->reference_count < 1) ) {
+            record_kext_unload(k_info->id);
             if ( k_info->stop &&
                  !((ret = k_info->stop(k_info, 0)) == kIOReturnSuccess) ) {
 
index ae66cb9b8a91f1deb1f1ca66c46fdd48ad61896d..9105cdbb9f362b67c627f0349fe9b335f810fc6a 100644 (file)
@@ -2145,6 +2145,9 @@ hibernate_write_image(void)
                uncompressedSize ? ((int) ((compressedSize * 100ULL) / uncompressedSize)) : 0,
                sum1, sum2);
 
+    if (vars->fileVars->io)
+        (void) IOHibernatePollerIODone(vars->fileVars, false);
+
     if (pollerOpen)
         IOHibernatePollerClose(vars->fileVars, kIOPolledBeforeSleepState);
 
index b86f7f65164c1496db20cbdb8298a0942185e78e..58e7190e60e75ca954bd7ff216885e9b31260400 100644 (file)
@@ -813,6 +813,12 @@ IOGeneralMemoryDescriptor::initWithOptions(void *  buffers,
         gIOSystemMapper = mapper = IOMapper::gSystem;
     }
 
+    // Temp binary compatibility for kIOMemoryThreadSafe
+    if (kIOMemoryReserved6156215 & options)
+    {
+       options &= ~kIOMemoryReserved6156215;
+       options |= kIOMemoryThreadSafe;
+    }
     // Remove the dynamic internal use flags from the initial setting
     options              &= ~(kIOMemoryPreparedReadOnly);
     _flags                = options;
@@ -2566,10 +2572,6 @@ IOReturn IOMemoryDescriptor::doMap(
     return (err);
 }
 
-enum {
-    kIOMemoryRedirected        = 0x00010000
-};
-
 IOReturn IOMemoryDescriptor::handleFault(
         void *                 _pager,
        vm_map_t                addressMap,
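
The shim above exists because kIOMemoryThreadSafe moved from 0x00020000 to
0x00100000 (see the IOMemoryDescriptor.h hunk earlier): a kext built
against the old header still passes the old bit, which the new kernel
receives as kIOMemoryReserved6156215 and rewrites. The translation,
restated:

    /* Old binaries encode thread safety as 0x00020000 (the previous
     * kIOMemoryThreadSafe value, now kIOMemoryReserved6156215). */
    if (kIOMemoryReserved6156215 & options) {
        options &= ~kIOMemoryReserved6156215;
        options |= kIOMemoryThreadSafe;     /* relocated to 0x00100000 */
    }
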
index aca7f3fbcfb23f3f3ba9c5e40b57aae349b69cf7..d32a178875b1928ed4d6c3be964e9388a72a7dcd 100644 (file)
@@ -299,7 +299,7 @@ IOReturn IOTimerEventSource::wakeAtTime(AbsoluteTime inAbstime)
         return kIOReturnNoResources;
 
     abstime = inAbstime;
-    if ( enabled && AbsoluteTime_to_scalar(&abstime) && workLoop )
+    if ( enabled && AbsoluteTime_to_scalar(&inAbstime) && AbsoluteTime_to_scalar(&abstime) && workLoop )
     {
         if (reserved)
         {
@@ -308,14 +308,14 @@ IOReturn IOTimerEventSource::wakeAtTime(AbsoluteTime inAbstime)
             reserved->workLoop = workLoop;
             reserved->calloutGeneration++;
             if (thread_call_enter1_delayed((thread_call_t) calloutEntry, 
-                    (void *) reserved->calloutGeneration, abstime))
+                    (void *) reserved->calloutGeneration, inAbstime))
             {
                 release();
                 workLoop->release();
             }
         }
         else
-            thread_call_enter_delayed((thread_call_t) calloutEntry, abstime);
+            thread_call_enter_delayed((thread_call_t) calloutEntry, inAbstime);
     }
 
     return kIOReturnSuccess;
index bb451f8c44ea54780394ce60aa3413f9f2d6a184..940197e342f5ff1746b15f92e5c2498cf18eb312 100644 (file)
@@ -210,7 +210,7 @@ bool IOMachPort::noMoreSendersForObject( OSObject * obj,
 
        machPort = (IOMachPort *) dict->getObject( (const OSSymbol *) obj );
        if( machPort) {
-           destroyed = (machPort->mscount == *mscount);
+           destroyed = (machPort->mscount <= *mscount);
            if( destroyed)
                dict->removeObject( (const OSSymbol *) obj );
            else
index 1fa767c152d951f65211f6e16da32e6858975310..7623092e2b8ac6dd14c19a926a184c2d09b58685 100644 (file)
--- a/kgmacros
+++ b/kgmacros
@@ -159,6 +159,11 @@ document kgm
 |     kdp-reenter      Schedule reentry into the debugger and continue.
 |     kdp-reboot       Restart remote target
 |
+|     zstack           Print zalloc caller stack (zone leak debugging)
+|     findoldest       Find oldest zone leak debugging record
+|     countpcs         Print how often a pc occurs in the zone leak log
+|     findelem         Print zone log records that refer to an element (zone corruption debugging)
+|
 | Type "help <macro>" for more specific help on a particular macro.
 | Type "show user <macro>" to see what the macro is really doing.
 end
@@ -454,6 +459,7 @@ end
 define showcurrentthreads
 set $kgm_prp = (struct processor *)processor_list
     while $kgm_prp != 0
+       printf "Processor 0x%08x State %d (cpu_id %x)\n", $kgm_prp, ($kgm_prp)->state, ($kgm_prp)->cpu_num
        if ($kgm_prp)->active_thread != 0
            set $kgm_actp = ($kgm_prp)->active_thread
            showtaskheader
@@ -504,6 +510,7 @@ end
 define showcurrentstacks
 set $kgm_prp = processor_list
     while $kgm_prp != 0
+       printf "Processor 0x%08x State %d (cpu_id %x)\n", $kgm_prp, ($kgm_prp)->state, ($kgm_prp)->cpu_num
        if ($kgm_prp)->active_thread != 0
            set $kgm_actp = ($kgm_prp)->active_thread
            showtaskheader
@@ -2539,6 +2546,10 @@ define showobjectint
     set $kgm_obj = (OSObject *) $arg1
     set $kgm_vt = *((void **) $arg1)
 
+    if ($kgm_mtype == 12)
+        set $kgm_vt = $kgm_vt - 2 * sizeof(void *)
+    end
+
     if ($kgm_show_object_addrs)
        printf "`object %p, vt ", $arg1
        output /a (unsigned) $kgm_vt
@@ -2668,6 +2679,9 @@ define showregistryentryrecurse
     printf "  <object %p, ", $kgm_re
     printf "vtable "
     set $kgm_vt = (unsigned) *(void**) $kgm_re
+    if ($kgm_mtype == 12)
+        set $kgm_vt = $kgm_vt - 2 * sizeof(void *)
+    end
     output /a $kgm_vt
 
     if ($kgm_vt != _ZTV15IORegistryEntry)
@@ -5563,3 +5577,173 @@ Syntax: (gdb) pmap_vtop <pmap> <virtual_address>
 | For page-tables in <pmap> translate <virtual_address> to physical address.
 end
 
+define zstack
+       set $index = $arg0
+
+       if (log_records == 0)
+               set $count = 0
+               printf "Zone logging not enabled.  Add 'zlog=<zone name>' to boot-args.\n"
+       else 
+               if ($argc == 2)
+                       set $count = $arg1
+               else
+                       set $count = 1
+               end
+       end
+
+       while ($count)
+               printf "\n--------------- "
+
+               if (zrecords[$index].z_opcode == 1)
+                       printf "ALLOC "
+               else
+                       printf "FREE "
+               end
+
+               printf " 0x%x : index %d  :  ztime %d -------------\n", zrecords[$index].z_element, $index, zrecords[$index].z_time
+
+               set $frame = 0
+
+               while ($frame < 15)
+                       set $frame_pc = zrecords[$index].z_pc[$frame]
+
+                       if ($frame_pc == 0)
+                               loop_break
+                       end
+
+                       x/i $frame_pc
+                       set $frame = $frame + 1
+               end
+
+               set $index = $index + 1
+               set $count = $count - 1
+       end
+end
+
+document zstack
+Syntax: (gdb) zstack <index> [<count>]
+| Zone leak debugging: print the stack trace of log element at <index>.
+| If a <count> is supplied, it prints <count> log elements starting at <index>.
+|
+| The suggested usage is to look at indexes below zcurrent and look for common stack traces.
+| The stack trace that occurs the most is probably the cause of the leak.  Find the pc of the
+| function calling into zalloc and use the countpcs kgmacro to find out how often that pc occurs in the log.
+| The pc occurring in a high percentage of records is most likely the source of the leak.
+|
+| The findoldest kgmacro is also useful for leak debugging since it identifies the oldest record
+| in the log, which may indicate the leaker.
+end
+
+define findoldest
+       set $index = 0
+       set $count = log_records
+       set $cur_min = 2000000000
+       set $cur_index = 0
+
+       if (log_records == 0)
+               printf "Zone logging not enabled.  Add 'zlog=<zone name>' to boot-args.\n"
+       else
+
+               while ($count)
+                       if (zrecords[$index].z_element && zrecords[$index].z_time < $cur_min)
+                               set $cur_index = $index
+                               set $cur_min = zrecords[$index].z_time
+                       end
+       
+                       set $count = $count - 1
+                       set $index = $index + 1
+               end
+       
+               printf "oldest record is at log index %d:\n", $cur_index
+               zstack $cur_index
+       end
+end
+
+document findoldest
+Syntax: (gdb) findoldest
+| Zone leak debugging: find and print the oldest record in the log.  Note that this command
+| can take several minutes to run since it uses linear search.
+|
+| Once it prints a stack trace, find the pc of the caller above all the zalloc, kalloc and
+| IOKit layers.  Then use the countpcs kgmacro to see how often this caller has allocated
+| memory.  A caller with a high percentage of records in the log is probably the leaker.
+end
+
+define countpcs
+       set $target_pc = $arg0
+       set $index = 0
+       set $count = log_records
+       set $found = 0
+
+       if (log_records == 0)
+               printf "Zone logging not enabled.  Add 'zlog=<zone name>' to boot-args.\n"
+       else
+
+               while ($count)
+                       set $frame = 0
+       
+                       if (zrecords[$index].z_element != 0)
+                               while ($frame < 15)
+                                       if (zrecords[$index].z_pc[$frame] == $target_pc)
+                                               set $found = $found + 1
+                                               set $frame = 15
+                                       end
+               
+                                       set $frame = $frame + 1
+                               end
+                       end
+       
+                       set $index = $index + 1
+                       set $count = $count - 1
+               end
+       
+               printf "occurred %d times in log (%d%c of records)\n", $found, ($found * 100) / zrecorded, '%'
+       end
+end
+
+document countpcs
+Syntax: (gdb) countpcs <pc>
+| Zone leak debugging: search the log and print a count of all log entries that contain the given <pc>
+| in the stack trace.  This is useful for verifying a suspected <pc> as being the source of
+| the leak.  If a high percentage of the log entries contain the given <pc>, then it's most
+| likely the source of the leak.  Note that this command can take several minutes to run.
+end
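
For instance, with a suspect pc lifted from a zstack trace (all values
illustrative):

    (gdb) countpcs 0x2a4f30
    occurred 3642 times in log (91% of records)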
+
+define findelem
+       set $fe_index = zcurrent
+       set $fe_count = log_records
+       set $fe_elem = $arg0
+       set $fe_prev_op = -1
+
+       if (log_records == 0)
+               printf "Zone logging not enabled.  Add 'zlog=<zone name>' to boot-args.\n"
+       end
+
+       while ($fe_count)
+               if (zrecords[$fe_index].z_element == $fe_elem)
+                       zstack $fe_index
+
+                       if (zrecords[$fe_index].z_opcode == $fe_prev_op)
+                               printf "***************   DOUBLE OP!   *********************\n
+                       end
+
+                       set $fe_prev_op = zrecords[$fe_index].z_opcode
+               end
+
+               set $fe_count = $fe_count - 1
+               set $fe_index = $fe_index + 1
+
+               if ($fe_index >= log_records)
+                       set $fe_index = 0
+               end
+       end
+end
+
+document findelem
+Syntax: (gdb) findelem <elem addr>
+| Zone corruption debugging: search the log and print out the stack traces for all log entries that
+| refer to the given zone element.  When the kernel panics due to a corrupted zone element, get the
+| element address and use this macro.  This will show you the stack traces of all logged zalloc and
+| zfree operations, which tells you who touched the element in the recent past.  This also makes
+| double-frees readily apparent.
+end
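
For example, if a panic reports a corrupted element at 0x4ebf680 (an
illustrative address), the macro walks the circular log starting at zcurrent,
prints a zstack for each matching record, and flags two consecutive operations
of the same kind as a suspected double alloc/free:

    (gdb) findelem 0x4ebf680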
index 3f1d09b7af00d1a875892d4a54e9feaa319f37b2..ccff380d63d71db71c6e0b703c29e458657c382e 100644 (file)
@@ -16,7 +16,6 @@ INSTINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS}
 
 INSTINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS}
 
-
 EXPINC_SUBDIRS = \
        libkern \
        uuid
@@ -29,7 +28,7 @@ EXPINC_SUBDIRS_ARM = ${EXPINC_SUBDIRS}
 
 SETUP_SUBDIRS = conf
 
-COMP_SUBDIRS = conf
+COMP_SUBDIRS = conf kmod
 
 INST_SUBDIRS = kmod
 
index f8ac932d7c5b0c35a74d05e77b0d19013f83b424..f7f1e0a3be61d4e3d0ff7f1bdb0cb316b36a22e0 100644 (file)
@@ -652,6 +652,7 @@ static void _OSMetaClassConsiderUnloads(__unused thread_call_param_t p0,
             classes->release();
 
             if (0 == checkClass) {
+                record_kext_unload(ki->id);
                 OSRuntimeUnloadCPP(ki, 0);     // call destructors
                 ret = kmod_destroy(host_priv_self(), ki->id);
                 didUnload = true;
index fcc97f211f2d1179e3858f84b924d2c1c7f8392a..bab4de5750d221eec22a5085ea4acc18863d77d8 100644 (file)
@@ -7,7 +7,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
 export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
 export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
 
-
 include $(MakeInc_cmd)
 include $(MakeInc_def)
 
@@ -28,7 +27,6 @@ COMPOBJROOT = $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT)/kmod
 INSTOBJROOT = $(OBJROOT)/$(INSTALL_TYPE)_$(ARCH_CONFIG)/$(COMPONENT)/kmod
 endif
 
-
 KMOD_CFILES    = c_start.c c_stop.c
 KMODCPP_CFILES = cplus_start.c cplus_stop.c
 
@@ -38,36 +36,62 @@ KMODCPP_OFILES = $(KMODCPP_CFILES:.c=.o)
 ALL_OFILES = $(KMOD_OFILES) $(KMODCPP_OFILES)
 
 $(ALL_OFILES): %.o : %.c
-       ${KCC} -c ${CFLAGS} ${${join $@,_CFLAGS}} ${INCFLAGS} ${${join $@,_INCFLAGS}} -o $(COMPOBJROOT)/$(*F).o $<
+       @echo CC $@
+       $(_v)${KCC} -c ${CFLAGS} ${${join $@,_CFLAGS}} ${INCFLAGS} ${${join $@,_INCFLAGS}} -o $(COMPOBJROOT)/$(*F).o $<
 
 $(COMPOBJROOT)/$(KMOD_NAME).a: $(KMOD_OFILES)
-       libtool -static -o $@ $^
+       @echo LIBTOOL $@
+       $(_v)libtool -static -o $@ $^
 
 $(COMPOBJROOT)/$(KMODCPP_NAME).a: $(KMODCPP_OFILES)
-       libtool -static -o $@ $^
+       @echo LIBTOOL $@
+       $(_v)libtool -static -o $@ $^
 
 do_build_all: $(COMPOBJROOT)/$(KMOD_NAME).a $(COMPOBJROOT)/$(KMODCPP_NAME).a
 
 $(INSTALL_DIR)/%.a: $(INSTOBJROOT)/%.a
-       @allarchs="";                                                   \
-       for onearch in $(INSTALL_ARCHS); do                             \
-               if [ $(MACHINE_CONFIG) = DEFAULT ] ; then       \
-                       archdir=$(OBJROOT)/$(KERNEL_CONFIG)_$${onearch}/$(COMPONENT);   \
-               else            \
-                       archdir=$(OBJROOT)/$(KERNEL_CONFIG)_$${onearch}_$(MACHINE_CONFIG)/$(COMPONENT); \
-               fi;             \
-           if [ -e $${archdir}/kmod/$(*F).a ]; then                    \
-               allarchs="$${allarchs} $${archdir}/kmod/$(*F).a";               \
-           fi;                                                         \
-       done;                                                           \
+       @echo Installing $< in $@;
        $(RM) $@ || true;                                               \
        ${MKDIR} $(INSTALL_DIR) $(SYMROOT);                             \
-        cmd="lipo $${allarchs} -create -output $(SYMROOT)/$(*F).a";    \
-       echo $$cmd; eval $$cmd;                                         \
-       cmd="install $(LIB_INSTALL_FLAGS) $(SYMROOT)/$(*F).a $@";       \
+       $(_v)if [ $(MACHINE_CONFIG) = DEFAULT ] ; then                  \
+               allarchs="";                                            \
+               for onearch in $(INSTALL_ARCHS); do                     \
+                       archdir=$(OBJROOT)/$(KERNEL_CONFIG)_$${onearch}/$(COMPONENT); \
+                       if [ -e $${archdir}/kmod/$(*F).a ]; then        \
+                               allarchs="$${allarchs} $${archdir}/kmod/$(*F).a"; \
+                       fi;                                             \
+               done;                                                   \
+               cmd="$(LIPO) $${allarchs} -create -output $(SYMROOT)/$(*F).a"; \
+               echo $$cmd; eval $$cmd;                                 \
+       else                                                            \
+               my_counter=1;                                           \
+               my_innercounter=1;                                      \
+               outputfile=$(SYMROOT)/$(*F).a;                          \
+               for my_config in $(TARGET_CONFIGS_UC); do               \
+                       if [ $${my_counter} -eq 1 ]; then               \
+                               my_counter=2;                           \
+                               my_kconfig=$${my_config};               \
+                       elif [ $${my_counter} -eq 2 ]; then             \
+                               my_counter=3;                           \
+                               my_aconfig=$${my_config};               \
+                       else                                            \
+                               my_counter=1;                           \
+                               inputfile=$(OBJROOT)/$${my_kconfig}_$${my_aconfig}_$${my_config}/$(COMPONENT)/kmod/$(*F).a; \
+                               if [ -e $${inputfile} ]; then           \
+                                       if [ $${my_innercounter} -eq 1 ]; then \
+                                               my_innercounter=2;      \
+                                               cmd="$(LIPO) -create $${inputfile} -o $${outputfile}"; \
+                                       else                            \
+                                               cmd="$(LIPO) -create $${outputfile} $${inputfile} -o $${outputfile} || true"; \
+                                       fi;                             \
+                                       echo $$cmd; eval $$cmd;         \
+                               fi;                                     \
+                       fi;                                             \
+               done;                                                   \
+       fi;                                                             \
+       cmd="$(INSTALL) $(LIB_INSTALL_FLAGS) $(SYMROOT)/$(*F).a $@";    \
        echo $$cmd; eval $$cmd
 
-
 do_build_install: $(INSTALL_DIR)/$(KMOD_NAME).a $(INSTALL_DIR)/$(KMODCPP_NAME).a
 
 # include $(MakeInc_rule)
index ab7ce249aba877d95fa0bf7f2f1772ed90382553..6b650c0bd65c1361e134087ac6a2132429884f9f 100644 (file)
@@ -502,30 +502,6 @@ OSDictionary * compareExtensionVersions(
         goto finish;
      }
   
-    if (0 == strcmp("com.apple.driver.AppleIntelCPUPowerManagement",
-                    incumbentName->getCStringNoCopy())) {
-      /* Special rules. Always favor version 51.0.0 exactly at the
-       * expense of all other versions newer or older.
-       */
-      if(0 == strcmp(incumbentVersionString->getCStringNoCopy(), "51.0.0")) {
-       IOLog(VTYELLOW "Skipping duplicate extension \"%s\" with "
-             " version (%s -> %s).\n" VTRESET,
-             candidateName->getCStringNoCopy(),
-             candidateVersionString->getCStringNoCopy(),
-             incumbentVersionString->getCStringNoCopy());
-       winner = incumbent;
-       goto finish;
-      } else if (0 == strcmp(candidateVersionString->getCStringNoCopy(), "51.0.0")) {
-       IOLog(VTYELLOW "Skipping duplicate extension \"%s\" with "
-             " version (%s -> %s).\n" VTRESET,
-             candidateName->getCStringNoCopy(),
-             incumbentVersionString->getCStringNoCopy(),
-             candidateVersionString->getCStringNoCopy());
-       winner = candidate;
-       goto finish;
-      }
-    }
-
     if (candidate_vers > incumbent_vers) {
         IOLog(VTYELLOW "Replacing extension \"%s\" with newer version "
             "(%s -> %s).\n" VTRESET,
index 699706fb6173a318a4481ececfe435b6eea79a0a..8a6ff2a3bcc1359371c58729074cac27582fe980 100644 (file)
@@ -16,7 +16,7 @@ RC_ARCHS = $(ARCH)
 RC_$(RC_ARCHS) = 1
 .endif
 NARCHS != echo $(RC_ARCHS) | wc -w
-LIBSYS = $(NEXT_ROOT)/usr/local/lib/system
+LIBSYS = $(SDKROOT)/usr/local/lib/system
 NJOBS != perl -e '$$n = `/usr/sbin/sysctl -n hw.ncpu`; printf "%d\n", $$n < 2 ? 2 : ($$n * 1.5)'
 BSDMAKE = bsdmake -f Makefile
 BSDMAKEJ = $(BSDMAKE) -j $(NJOBS)
index ab642795b8be35c0845b16975e090bcc8bd73a05..a40b4fb5e94dc8eee418bd299cf243c05e91b387 100644 (file)
@@ -22,12 +22,16 @@ CC = gcc
 .ifdef ALTFRAMEWORKSPATH
 PRIVINC = -F${ALTFRAMEWORKSPATH} -I${ALTFRAMEWORKSPATH}/System.framework/PrivateHeaders
 .else
-PRIVINC = -I${NEXT_ROOT}/System/Library/Frameworks/System.framework/PrivateHeaders
+PRIVINC = -I${SDKROOT}/System/Library/Frameworks/System.framework/PrivateHeaders
 .endif
 CFLAGS += ${PRIVINC}
-CFLAGS += -no-cpp-precomp -force_cpusubtype_ALL
+.if empty $(MACHINE_ARCH:Marm*)
+CFLAGS += -force_cpusubtype_ALL
+AINC= -force_cpusubtype_ALL
+.endif
+CFLAGS += -no-cpp-precomp
 CFLAGS += -fno-common -pipe -Wmost -g
-AINC= -no-cpp-precomp -force_cpusubtype_ALL
+AINC+= -no-cpp-precomp
 AINC+= -arch ${MACHINE_ARCH} -g
 CLEANFILES+=tags
 INSTALL_PIC_ARCHIVE=   yes
@@ -43,7 +47,7 @@ MAKEOBJDIR ?= ${OBJROOT}
 # add version string
 SRCS += libsyscall_version.c
 libsyscall_version.c:
-       ${NEXT_ROOT}/Developer/Makefiles/bin/version.pl Libsyscall > $@
+       ${SDKROOT}/Developer/Makefiles/bin/version.pl Libsyscall > $@
 
 CFLAGS += -I${SYMROOT}
 .include "${.CURDIR}/Makefile.inc"
index 4b3d8d543e91dbb7ab11160ca6a122996776a021..8f6973e6c364b0f19bd650836349b2f8b2e55a63 100644 (file)
@@ -94,11 +94,7 @@ PRIVHDRSPPC = ${PRIVHDRS}/architecture/ppc
 KERNELFRAMEWORK = ${DESTDIR}/System/Library/Frameworks/Kernel.framework
 PRIVKERNELHDRS = ${KERNELFRAMEWORK}/Versions/A/PrivateHeaders
 
-.if ${MACHINE_ARCH} == armv6
-ARCHDIR = arm
-.else
-ARCHDIR = ${MACHINE_ARCH}
-.endif
+ARCHDIR = ${MACHINE_ARCH:C/^armv.*$/arm/}
 
 installhdrs-md: gen_md_mig_defs
        mkdir -p ${INCDIR}/mach/${ARCHDIR}
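
The :C modifier is bsdmake's regex substitution on a variable's value, so the
single ARCHDIR assignment above replaces the old armv6 special case and covers
armv7-style names as well. Roughly:

    # MACHINE_ARCH=armv6  ->  ARCHDIR=arm
    # MACHINE_ARCH=armv7  ->  ARCHDIR=arm
    # MACHINE_ARCH=i386   ->  ARCHDIR=i386   (pattern does not match; unchanged)

The same one-liner is applied to the machine-dependent mach sources makefile in
the next file's hunk.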
index 40048e71ee8b578ba8a7dd5515a304d4c28bef43..516300d2a30cf1b74b0324627813dffe3f2ec763 100644 (file)
@@ -1,9 +1,5 @@
 # machine-dependent mach sources
-.if ${MACHINE_ARCH} == armv6
-ARCHDIR = arm
-.else
-ARCHDIR = ${MACHINE_ARCH}
-.endif
+ARCHDIR = ${MACHINE_ARCH:C/^armv.*$/arm/}
 .if exists(${.CURDIR}/mach/${ARCHDIR}/Makefile.inc)
 .include "${.CURDIR}/mach/${ARCHDIR}/Makefile.inc"
 .endif
index fab9fa5244d47c538aec896d507da37b1a3c600d..c4b6c21dc26dbd9b0383fe48623e4c0fefc85344 100644 (file)
@@ -197,6 +197,10 @@ ARCH_FLAGS_PPC               = -arch ppc
 ARCH_FLAGS_I386                  = -arch i386
 ARCH_FLAGS_ARM           = $($(addsuffix $(MACHINE_CONFIG),ARCH_FLAGS_ARM_))
 
+ARCH_FLAGS_ALL_PPC             = $(ARCH_FLAGS_PPC)
+ARCH_FLAGS_ALL_I386            = $(ARCH_FLAGS_I386)
+ARCH_FLAGS_ALL_ARM             = -arch arm
+
 
 #
 # Default CFLAGS
@@ -215,35 +219,36 @@ export CFLAGS_GEN = -static $(DEBUG_CFLAGS) -nostdinc -nostdlib \
        -fno-builtin -finline -msoft-float \
        -fsigned-bitfields $(OTHER_CFLAGS)
 
+ifeq ($(BUILD_STABS),1)
+export CFLAGS_GEN += -gstabs+
+export BUILD_DWARF = 0
+export BUILD_STABS = 1
+else
+export CFLAGS_GEN += -gdwarf-2
+export BUILD_DWARF = 1
+export BUILD_STABS = 0
+endif
+
 export CFLAGS_RELEASE  = 
 export CFLAGS_DEVELOPMENT      =
 export CFLAGS_DEBUG    = 
 export CFLAGS_PROFILE  =  -pg
 
-ifeq ($(BUILD_STABS),1)
-export CFLAGS_PPC      = -Dppc -DPPC -D__PPC__ -DPAGE_SIZE_FIXED \
-                               -mno-altivec -gstabs+ -force_cpusubtype_ALL
-export CFLAGS_I386     = -Di386 -DI386 -D__I386__ \
-                               -DPAGE_SIZE_FIXED -gstabs+ -force_cpusubtype_ALL
-export CFLAGS_ARM      = -Darm -DARM -D__ARM__ -DPAGE_SIZE_FIXED \
-       -fno-strict-aliasing -gstabs+ -fno-keep-inline-functions
-export BUILD_DWARF = 0
-export BUILD_STABS = 1
-else
 export CFLAGS_PPC      = -Dppc -DPPC -D__PPC__ -DPAGE_SIZE_FIXED \
-                               -mno-altivec -gdwarf-2 -force_cpusubtype_ALL
+                               -mno-altivec -force_cpusubtype_ALL
 export CFLAGS_I386     = -Di386 -DI386 -D__I386__ \
-                               -DPAGE_SIZE_FIXED -gdwarf-2 -force_cpusubtype_ALL
+                               -DPAGE_SIZE_FIXED -force_cpusubtype_ALL
 export CFLAGS_ARM      = -Darm -DARM -D__ARM__ -DPAGE_SIZE_FIXED \
-       -fno-strict-aliasing -gdwarf-2 -fno-keep-inline-functions
-export BUILD_DWARF = 1
-export BUILD_STABS = 0
+                               -fno-strict-aliasing -fno-keep-inline-functions
+
+ifeq (-arch armv7,$(ARCH_FLAGS_ARM))
+CFLAGS_ARM             += -mthumb
 endif
 ifeq (-arch armv6,$(ARCH_FLAGS_ARM))
 CFLAGS_ARM             += -mthumb
 endif
 ifeq (-arch armv5,$(ARCH_FLAGS_ARM))
-CFLAGS_ARM             += -mthumb
+#CFLAGS_ARM            += -mthumb # <rdar://problem/6174175>
 endif
 ifeq (-arch xscale,$(ARCH_FLAGS_ARM))
 CFLAGS_ARM             += -mthumb
@@ -327,7 +332,7 @@ export LDFLAGS_COMPONENT_PROFILE =  $(COMP_LDFLAGS_COMPONENT_PROFILE)
 
 export LDFLAGS_COMPONENT_PPC   = $(COMP_LDFLAGS_COMPONENT_PPC) -force_cpusubtype_ALL
 export LDFLAGS_COMPONENT_I386  = $(COMP_LDFLAGS_COMPONENT_i386)
-export LDFLAGS_COMPONENT_ARM   = $(COMP_LDFLAGS_COMPONENT_ARM)
+export LDFLAGS_COMPONENT_ARM   = $(COMP_LDFLAGS_COMPONENT_ARM) -Wl,-new_linker
 
 export LDFLAGS_COMPONENT       = $(LDFLAGS_COMPONENT_GEN) \
                  $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) \
@@ -364,6 +369,7 @@ export LDFLAGS_KERNEL_I386     = \
        -Wl,-segaddr,__TEXT,0x111000 
 
 export LDFLAGS_KERNEL_ARM     = \
+       -Wl,-new_linker \
        -Wl,-segaddr,__HIB,0xC0000000 \
        -Wl,-segaddr,__TEXT,0xC0008000
 
index 9e62069ae4ddae18bc58a13014f43cd23e0847c9..c2f11dbb55e367108a1f1ecd2d48486a950033b8 100644 (file)
@@ -625,7 +625,7 @@ $(OBJPATH)/kgmacros: $(SRCROOT)/kgmacros
 
 $(DSTROOT)$(INSTALL_FILE_DIR)mach.$(KERNEL_CONFIG_LC).$(MACHINE_CONFIG_LC): $(TARGET)/mach_kernel force_file_install
        @echo Installing $< in $@;
-       @if [ ! -e $(DSTROOT)$(INSTALL_FILE_DIR) ]; then        \
+       $(_v)if [ ! -e $(DSTROOT)$(INSTALL_FILE_DIR) ]; then    \
                $(MKDIR) $(DSTROOT)$(INSTALL_FILE_DIR);         \
        fi;                                                     \
        if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then \
@@ -636,14 +636,38 @@ $(DSTROOT)$(INSTALL_FILE_DIR)mach.$(KERNEL_CONFIG_LC).$(MACHINE_CONFIG_LC): $(TA
                        echo >empty_file_$(notdir $@);                  \
                        lipo_arg="$(subst _empty_file, empty_file_$(notdir $@),$(foreach lipo_arch,$(INSTALL_ARCHS_LC), $(addprefix -arch , $(addsuffix _empty_file, $(lipo_arch)))))"; \
                        $(LIPO) $${lipo_arg} -create -output $@;        \
-                       $(RM) $(RMFLAGS) empty_file_$(notdir $@);               \
+                       $(RM) $(RMFLAGS) empty_file_$(notdir $@);       \
                fi;                                                     \
                $(LIPO) $@ -replace $(ARCH_CONFIG_LC)  $< -o $@;        \
+       fi;                                                             \
+       if [ $(BUILD_DWARF) -eq 1 ]; then                               \
+               if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then \
+                       $(CP) -f $< $<.ctfsys;                          \
+                       $(FIND) $(OBJPATH)/ -name \*.ctf -size 0        \
+                               -exec $(RM) -rf {} \;   ;               \
+                       $(CTFMERGE) -l xnu -o $<.ctfsys                 \
+                               $(OBJPATH)/*/$(KERNEL_CONFIG)/*.*o.ctf || true; \
+                       $(INSTALL) $(FILE_INSTALL_FLAGS) $<.ctfsys $@.ctfsys; \
+               else                                                    \
+                       if [ ! -e $@.ctfsys ]; then                     \
+                               echo >empty_file_$(notdir $@);          \
+                               lipo_arg="$(subst _empty_file, empty_file_$(notdir $@),$(foreach lipo_arch,$(INSTALL_ARCHS_LC), $(addprefix -arch , $(addsuffix _empty_file, $(lipo_arch)))))"; \
+                               $(LIPO) $${lipo_arg} -create -output $@.ctfsys;\
+                               $(RM) $(RMFLAGS) empty_file_$(notdir $@);\
+                       fi;                                             \
+                       $(FIND) $(OBJPATH)/ -name \*.ctf -size 0        \
+                               -exec $(RM) -rf {} \;   ;               \
+                       $(CP) -f $< $<.ctfsys;                          \
+                       $(CTFMERGE) -l xnu -o $<.ctfsys                 \
+                               $(OBJPATH)/*/$(KERNEL_CONFIG)/*.*o.ctf || true; \
+                       $(LIPO) $@.ctfsys -replace $(ARCH_CONFIG_LC)    \
+                               $<.ctfsys -o $@.ctfsys;                 \
+               fi;                                                     \
        fi
 
 $(SYMROOT)$(INSTALL_FILE_DIR)mach.$(KERNEL_CONFIG_LC).$(MACHINE_CONFIG_LC): $(TARGET)/mach_kernel.sys force_file_install
        @echo Installing $< in $@;
-       @if [ ! -e $(SYMROOT)$(INSTALL_FILE_DIR) ]; then        \
+       $(_v)if [ ! -e $(SYMROOT)$(INSTALL_FILE_DIR) ]; then    \
                $(MKDIR) $(SYMROOT)$(INSTALL_FILE_DIR);         \
        fi;                                                     \
        if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then \
@@ -682,7 +706,9 @@ $(INSTALL_FILE_FILES_GENERIC): $(DSTROOT)$(INSTALL_FILE_DIR)% : $(TARGET)/% forc
        fi;                                                             \
        if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then         \
                $(RM) $(RMFLAGS) $@;                                    \
-               $(INSTALL) $(FILE_INSTALL_FLAGS) $< $@;         \
+               if [ $(MACHINE_CONFIG) = DEFAULT ]; then                \
+                       $(INSTALL) $(FILE_INSTALL_FLAGS) $< $@;         \
+               fi;                                                     \
        else                                                            \
                if [ ! -e $@ ]; then                                    \
                        echo >empty_file_$(notdir $@);                  \
@@ -699,7 +725,9 @@ $(INSTALL_FILE_FILES_GENERIC): $(DSTROOT)$(INSTALL_FILE_DIR)% : $(TARGET)/% forc
                                -exec $(RM) -rf {} \;   ;               \
                        $(CTFMERGE) -l xnu -o $<.ctfsys                 \
                                $(OBJPATH)/*/$(KERNEL_CONFIG)/*.*o.ctf || true; \
-                       $(INSTALL) $(FILE_INSTALL_FLAGS) $<.ctfsys $(dir $@); \
+                       if [ $(MACHINE_CONFIG) = DEFAULT ]; then        \
+                               $(INSTALL) $(FILE_INSTALL_FLAGS) $<.ctfsys $(dir $@); \
+                       fi;                                             \
                else                                                    \
                        if [ ! -e $@.ctfsys ]; then                     \
                                echo >empty_file_$(notdir $@);          \
index a41da57da6b4b8bea47bb7307d4be37c5a66fcbe..2864ea6b9ab2489d6eb9372c86009cfc37286b8e 100644 (file)
@@ -85,9 +85,13 @@ osfmk/i386/commpage/bcopy_scalar.s   standard
 osfmk/i386/commpage/bcopy_sse2.s       standard
 osfmk/i386/commpage/bcopy_sse3x.s      standard
 osfmk/i386/commpage/bcopy_sse3x_64.s   standard
+osfmk/i386/commpage/bcopy_sse42.s      standard
+osfmk/i386/commpage/bcopy_sse42_64.s   standard
 osfmk/i386/commpage/bzero_scalar.s     standard
 osfmk/i386/commpage/bzero_sse2.s       standard
 osfmk/i386/commpage/bzero_sse2_64.s    standard
+osfmk/i386/commpage/bzero_sse42.s      standard
+osfmk/i386/commpage/bzero_sse42_64.s   standard
 osfmk/i386/commpage/memset_pattern_sse2.s      standard
 osfmk/i386/commpage/memset_pattern_sse2_64.s   standard
 osfmk/i386/commpage/longcopy_sse3x.s   standard
index 73ea3948abbb7b68894ce858bbfb04e6d3de6098..b0b961b13828a8d8dfdd9a6fc53a453471a7d5f2 100644 (file)
@@ -142,6 +142,8 @@ typedef struct _cframe_t {
 static unsigned panic_io_port;
 static unsigned        commit_paniclog_to_nvram;
 
+int debug_boot_arg;
+
 void
 machine_startup(void)
 {
@@ -157,7 +159,8 @@ machine_startup(void)
                if (boot_arg & DB_PRT) disable_debug_output=FALSE; 
                if (boot_arg & DB_SLOG) systemLogDiags=TRUE; 
                if (boot_arg & DB_NMI) panicDebugging=TRUE; 
-               if (boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE; 
+               if (boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE;
+               debug_boot_arg = boot_arg;
        }
 
        if (!PE_parse_boot_argn("nvram_paniclog", &commit_paniclog_to_nvram, sizeof (commit_paniclog_to_nvram)))
@@ -1052,6 +1055,9 @@ out:
                kmod_dump(&PC, 1);
 
        panic_display_system_configuration();
+       panic_display_zprint();
+       dump_kext_info(&kdb_log);
+
        /* Release print backtrace lock, to permit other callers in the
         * event of panics on multiple processors.
         */
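
The new debug_boot_arg global caches the raw "debug" boot-arg, a bit mask of
the DB_* flags tested above, so later code such as this panic path can consult
it. A typical setting might look like the sketch below, though the flag values
should be verified against the kernel's debug header rather than taken from
this note:

    # hypothetical: NMI debugging plus panic data on screen
    nvram boot-args="debug=0x144"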
index 418635d9b43b10a7135c7f69c72ecba84490c2a0..8e42ba0426d3dd09aa49978159eabc72d7018c4f 100644 (file)
@@ -802,4 +802,4 @@ LReverseUnalignedLoop:                  // loop over 64-byte chunks
         jmp     LReverseShort           // copy remaining 0..63 bytes and done
 
 
-       COMMPAGE_DESCRIPTOR(bcopy_sse3x,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,0)
+       COMMPAGE_DESCRIPTOR(bcopy_sse3x,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2)
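
The last two COMMPAGE_DESCRIPTOR arguments appear to be capability masks, the
features a CPU must have and must not have for the routine to be published:

    COMMPAGE_DESCRIPTOR(routine_label, commpage_address, musthave, canthave)

On that reading (inferred from usage here, not quoted from commpage.h), adding
kHasSSE4_2 to the final field keeps this sse3x variant off Nehalem-class CPUs,
where the new bcopy_sse42 routine added below is selected instead.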
index eae8b0ce8f8c7248ff348e22ba270e5968169834..53f4ed76a6d12f6137991436b07202efcc74da44 100644 (file)
@@ -146,11 +146,11 @@ LNotShort:
 //      rdi = ptr to 1st dest byte not to move (aligned)
 
 LDestAligned:
-        movl    %edx,%ecx               // copy length
+        movq    %rdx,%rcx               // copy length
        movl    %esi,%eax               // copy low half of source address
         andl    $63,%edx                // get remaining bytes for LShort
        andl    $15,%eax                // mask to low 4 bits of source address
-        andl    $-64,%ecx               // get number of bytes we will copy in inner loop
+        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
 // We'd like to use lea with rip-relative addressing, but cannot in a .code64 block.
 //     lea     LTable(%rip),%r8        // point to dispatch table
        movq    $(_COMM_PAGE_32_TO_64(_COMM_PAGE_BCOPY)),%r8 // work around 4586528
@@ -794,4 +794,4 @@ LReverseUnalignedLoop:                  // loop over 64-byte chunks
         jmp     LReverseShort           // copy remaining 0..63 bytes and done
 
 
-       COMMPAGE_DESCRIPTOR(bcopy_sse3x_64,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,0)
+       COMMPAGE_DESCRIPTOR(bcopy_sse3x_64,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2)
diff --git a/osfmk/i386/commpage/bcopy_sse42.s b/osfmk/i386/commpage/bcopy_sse42.s
new file mode 100644 (file)
index 0000000..9ddd281
--- /dev/null
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+        
+#include <machine/cpu_capabilities.h>
+#include <machine/commpage.h>
+
+/*
+ * The bcopy/memcpy loops, tuned for Nehalem.
+ *
+ * The following #defines are tightly coupled to the u-architecture:
+ */
+
+#define kShort  80                     // too short to bother with SSE (must be >=80)
+
+
+// void bcopy(const void *src, void *dst, size_t len);
+        .text
+        .align 5, 0x90
+Lbcopy_sse42:                          // void bcopy(const void *src, void *dst, size_t len)
+       pushl   %ebp                    // set up a frame for backtraces
+       movl    %esp,%ebp
+        pushl   %esi
+        pushl   %edi
+        movl    8(%ebp),%esi           // get source ptr
+        movl    12(%ebp),%edi           // get dest ptr
+        movl    16(%ebp),%ecx           // get length
+        movl    %edi,%edx
+        subl    %esi,%edx               // (dest - source)
+        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
+        jb      LReverseIsland
+        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
+        jbe     Lshort                 // no
+       jmp     LNotShort
+
+//
+// void *memcpy(void *dst, const void *src, size_t len);
+// void *memmove(void *dst, const void *src, size_t len);
+//
+// NB: These need to be 32 bytes from bcopy():
+//
+
+        .align 5, 0x90
+Lmemcpy:                               // void *memcpy(void *dst, const void *src, size_t len)
+Lmemmove:                              // void *memmove(void *dst, const void *src, size_t len)
+       pushl   %ebp                    // set up a frame for backtraces
+       movl    %esp,%ebp
+        pushl   %esi
+        pushl   %edi
+        movl    8(%ebp),%edi           // get dest ptr
+        movl    12(%ebp),%esi           // get source ptr
+        movl    16(%ebp),%ecx           // get length
+        movl    %edi,%edx
+        subl    %esi,%edx               // (dest - source)
+        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
+        jb      LReverseIsland
+        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
+        ja      LNotShort               // yes
+        
+// Handle short forward copies.  As the most common case, this is the fall-through path.
+//      ecx = length (<= kShort)
+//      esi = source ptr
+//      edi = dest ptr
+
+Lshort:
+       movl    %ecx,%edx               // copy length
+       shrl    $2,%ecx                 // get #doublewords
+       jz      3f
+2:                                     // loop copying doublewords
+       movl    (%esi),%eax
+       addl    $4,%esi
+       movl    %eax,(%edi)
+       addl    $4,%edi
+       dec     %ecx
+       jnz     2b
+3:                                     // handle leftover bytes (0..3) in last word
+       andl    $3,%edx                 // any leftover bytes?
+       jz      Lexit
+4:                                     // loop copying bytes
+       movb    (%esi),%al
+       inc     %esi
+       movb    %al,(%edi)
+       inc     %edi
+       dec     %edx
+       jnz     4b
+Lexit:
+        movl    8(%ebp),%eax           // get return value (dst ptr) for memcpy/memmove
+        popl    %edi
+        popl    %esi
+       popl    %ebp
+        ret
+
+
+LReverseIsland:                                // keep the "jb" above a short branch...
+       jmp     LReverse                // ...because reverse moves are uncommon
+
+
+// Handle forward moves that are long enough to justify use of SSE.
+// First, 16-byte align the destination.
+//      ecx = length (> kShort)
+//      esi = source ptr
+//      edi = dest ptr
+
+LNotShort:
+        movl    %edi,%edx               // copy destination
+        negl    %edx
+        andl    $15,%edx                // get #bytes to align destination
+       jz      LDestAligned            // already aligned
+        subl    %edx,%ecx               // decrement length
+1:                                     // loop copying 1..15 bytes
+       movb    (%esi),%al
+       inc     %esi
+       movb    %al,(%edi)
+       inc     %edi
+       dec     %edx
+       jnz     1b
+        
+// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
+// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
+// know there is at least one 64-byte chunk to move.
+// When we enter the copy loops, the following registers are set up:
+//      ecx = residual length (0..63)
+//     edx = -(length to move), a multiple of 64
+//      esi = ptr to 1st source byte not to move (unaligned)
+//      edi = ptr to 1st dest byte not to move (aligned)
+
+LDestAligned:
+        movl    %ecx,%edx               // copy length
+        andl    $63,%ecx                // get remaining bytes for Lshort
+        andl    $-64,%edx               // get number of bytes we will copy in inner loop
+        addl    %edx,%esi               // point to 1st byte not copied
+        addl    %edx,%edi
+        negl    %edx                    // now generate offset to 1st byte to be copied
+       testl   $15,%esi                // source also aligned?
+       jnz     LUnalignedLoop
+       jmp     LAlignedLoop
+
+
+// Forward loop for aligned operands.
+
+       .align  4,0x90                  // 16-byte align inner loops
+LAlignedLoop:                          // loop over 64-byte chunks
+        movdqa  (%esi,%edx),%xmm0
+        movdqa  16(%esi,%edx),%xmm1
+        movdqa  32(%esi,%edx),%xmm2
+        movdqa  48(%esi,%edx),%xmm3
+
+        movdqa  %xmm0,(%edi,%edx)
+        movdqa  %xmm1,16(%edi,%edx)
+        movdqa  %xmm2,32(%edi,%edx)
+        movdqa  %xmm3,48(%edi,%edx)
+        
+        addl    $64,%edx
+        jnz     LAlignedLoop
+        
+        jmp     Lshort                  // copy remaining 0..63 bytes and done
+
+
+// Forward loop for unaligned operands.
+
+       .align  4,0x90                  // 16-byte align inner loops
+LUnalignedLoop:                                // loop over 64-byte chunks
+        movdqu  (%esi,%edx),%xmm0
+        movdqu  16(%esi,%edx),%xmm1
+        movdqu  32(%esi,%edx),%xmm2
+        movdqu  48(%esi,%edx),%xmm3
+
+        movdqa  %xmm0,(%edi,%edx)
+        movdqa  %xmm1,16(%edi,%edx)
+        movdqa  %xmm2,32(%edi,%edx)
+        movdqa  %xmm3,48(%edi,%edx)
+        
+        addl    $64,%edx
+        jnz     LUnalignedLoop
+        
+        jmp     Lshort                  // copy remaining 0..63 bytes and done
+
+
+// Reverse moves.  They are only used with destructive overlap.
+//      ecx = length
+//      esi = source ptr
+//      edi = dest ptr
+
+LReverse:
+        addl    %ecx,%esi               // point to end of strings
+        addl    %ecx,%edi
+        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
+        ja      LReverseNotShort        // yes
+
+// Handle reverse short copies.
+//      ecx = length
+//      esi = one byte past end of source
+//      edi = one byte past end of dest
+
+LReverseShort:
+       movl    %ecx,%edx               // copy length
+       shrl    $2,%ecx                 // #words
+       jz      3f
+1:
+       subl    $4,%esi
+       movl    (%esi),%eax
+       subl    $4,%edi
+       movl    %eax,(%edi)
+       dec     %ecx
+       jnz     1b
+3:
+       andl    $3,%edx                 // bytes?
+       jz      5f
+4:
+       dec     %esi
+       movb    (%esi),%al
+       dec     %edi
+       movb    %al,(%edi)
+       dec     %edx
+       jnz     4b
+5:
+        movl    8(%ebp),%eax           // get return value (dst ptr) for memcpy/memmove
+        popl    %edi
+        popl    %esi
+       popl    %ebp
+        ret
+
+// Handle a reverse move long enough to justify using SSE.
+//      ecx = length
+//      esi = one byte past end of source
+//      edi = one byte past end of dest
+
+LReverseNotShort:
+        movl    %edi,%edx               // copy destination
+        andl    $15,%edx                // get #bytes to align destination
+        je      LReverseDestAligned     // already aligned
+        subl   %edx,%ecx               // adjust length
+1:                                     // loop copying 1..15 bytes
+       dec     %esi
+       movb    (%esi),%al
+       dec     %edi
+       movb    %al,(%edi)
+       dec     %edx
+       jnz     1b
+        
+// Destination is now aligned.  Prepare for reverse loops.
+
+LReverseDestAligned:
+        movl    %ecx,%edx               // copy length
+        andl    $63,%ecx                // get remaining bytes for Lshort
+        andl    $-64,%edx               // get number of bytes we will copy in inner loop
+        subl    %edx,%esi               // point to endpoint of copy
+        subl    %edx,%edi
+       testl   $15,%esi                // is source aligned too?
+        jnz     LReverseUnalignedLoop   // no
+
+LReverseAlignedLoop:                    // loop over 64-byte chunks
+        movdqa  -16(%esi,%edx),%xmm0
+        movdqa  -32(%esi,%edx),%xmm1
+        movdqa  -48(%esi,%edx),%xmm2
+        movdqa  -64(%esi,%edx),%xmm3
+
+        movdqa  %xmm0,-16(%edi,%edx)
+        movdqa  %xmm1,-32(%edi,%edx)
+        movdqa  %xmm2,-48(%edi,%edx)
+        movdqa  %xmm3,-64(%edi,%edx)
+        
+        subl    $64,%edx
+        jne     LReverseAlignedLoop
+        
+        jmp     LReverseShort           // copy remaining 0..63 bytes and done
+
+    
+// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.
+        
+LReverseUnalignedLoop:                  // loop over 64-byte chunks
+        movdqu  -16(%esi,%edx),%xmm0
+        movdqu  -32(%esi,%edx),%xmm1
+        movdqu  -48(%esi,%edx),%xmm2
+        movdqu  -64(%esi,%edx),%xmm3
+        
+        movdqa  %xmm0,-16(%edi,%edx)
+        movdqa  %xmm1,-32(%edi,%edx)
+        movdqa  %xmm2,-48(%edi,%edx)
+        movdqa  %xmm3,-64(%edi,%edx)
+        
+        subl    $64,%edx
+        jne     LReverseUnalignedLoop
+        
+        jmp     LReverseShort           // copy remaining 0..63 bytes and done
+
+
+       COMMPAGE_DESCRIPTOR(bcopy_sse42,_COMM_PAGE_BCOPY,kHasSSE4_2,0)
diff --git a/osfmk/i386/commpage/bcopy_sse42_64.s b/osfmk/i386/commpage/bcopy_sse42_64.s
new file mode 100644 (file)
index 0000000..7de0126
--- /dev/null
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+        
+#include <machine/cpu_capabilities.h>
+#include <machine/commpage.h>
+
+/*
+ * The bcopy/memcpy loops, tuned for Nehalem.  This is the 64-bit version.
+ *
+ * The following #defines are tightly coupled to the u-architecture:
+ */
+
+#define kShort  80                     // too short to bother with SSE (must be >=80)
+
+
+// void bcopy(const void *src, void *dst, size_t len);
+        .text
+       .code64
+        .align 5, 0x90
+Lbcopy_sse42_64:                               // void bcopy(const void *src, void *dst, size_t len)
+       pushq   %rbp                    // set up a frame for backtraces
+       movq    %rsp,%rbp
+       movq    %rsi,%rax               // copy dest ptr
+       movq    %rdi,%rsi               // xchange source and dest ptrs
+       movq    %rax,%rdi
+        subq    %rsi,%rax               // (dest - source)
+        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
+        jb      LReverseIsland
+        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
+        jbe     LShort                 // no
+       jmp     LNotShort
+
+//
+// void *memcpy(void *dst, const void *src, size_t len);
+// void *memmove(void *dst, const void *src, size_t len);
+//
+// NB: These need to be 32 bytes from bcopy():
+//
+
+        .align 5, 0x90
+Lmemcpy:                               // void *memcpy(void *dst, const void *src, size_t len)
+Lmemmove:                              // void *memmove(void *dst, const void *src, size_t len)
+       pushq   %rbp                    // set up a frame for backtraces
+       movq    %rsp,%rbp
+       movq    %rdi,%r11               // save return value here        
+        movq    %rdi,%rax
+        subq    %rsi,%rax               // (dest - source)
+        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
+        jb      LReverseIsland
+        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
+        ja      LNotShort               // yes
+        
+// Handle short forward copies.  As the most common case, this is the fall-through path.
+//      rdx = length (<= kShort)
+//      rsi = source ptr
+//      rdi = dest ptr
+
+LShort:
+       movl    %edx,%ecx               // copy length using 32-bit operation
+       shrl    $2,%ecx                 // get #doublewords
+       jz      3f
+2:                                     // loop copying doublewords
+       movl    (%rsi),%eax
+       addq    $4,%rsi
+       movl    %eax,(%rdi)
+       addq    $4,%rdi
+       decl    %ecx
+       jnz     2b
+3:                                     // handle leftover bytes (0..3) in last word
+       andl    $3,%edx                 // any leftover bytes?
+       jz      5f
+4:                                     // loop copying bytes
+       movb    (%rsi),%al
+       incq    %rsi
+       movb    %al,(%rdi)
+       incq    %rdi
+       decl    %edx
+       jnz     4b
+5:
+        movq   %r11,%rax               // get return value (dst ptr) for memcpy/memmove
+       popq    %rbp
+        ret
+
+
+LReverseIsland:                                // keep the "jb" above a short branch...
+       jmp     LReverse                // ...because reverse moves are uncommon
+
+
+// Handle forward moves that are long enough to justify use of SSE.
+// First, 16-byte align the destination.
+//      rdx = length (> kShort)
+//      rsi = source ptr
+//      rdi = dest ptr
+
+LNotShort:
+        movl    %edi,%ecx               // copy low half of destination ptr
+        negl    %ecx
+        andl    $15,%ecx                // get #bytes to align destination
+       jz      LDestAligned            // already aligned
+        subl    %ecx,%edx               // decrement length
+1:                                     // loop copying 1..15 bytes
+       movb    (%rsi),%al
+       inc     %rsi
+       movb    %al,(%rdi)
+       inc     %rdi
+       dec     %ecx
+       jnz     1b
+
+
+// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
+// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
+// know there is at least one 64-byte chunk to move.
+// When we enter the copy loops, the following registers are set up:
+//      rdx = residual length (0..63)
+//     rcx = -(length to move), a multiple of 64 less than 2GB
+//      rsi = ptr to 1st source byte not to move (unaligned)
+//      rdi = ptr to 1st dest byte not to move (aligned)
+
+LDestAligned:
+        movq    %rdx,%rcx               // copy length
+        andl    $63,%edx                // get remaining bytes for LShort
+        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
+        addq    %rcx,%rsi               // point to 1st byte not copied
+        addq    %rcx,%rdi
+        negq    %rcx                    // now generate offset to 1st byte to be copied
+       testl   $15,%esi                // source also aligned?
+       jnz     LUnalignedLoop
+       jmp     LAlignedLoop
+
+
+// Forward loop for aligned operands.
+
+       .align  4,0x90                  // 16-byte align inner loops
+LAlignedLoop:                          // loop over 64-byte chunks
+        movdqa  (%rsi,%rcx),%xmm0
+        movdqa  16(%rsi,%rcx),%xmm1
+        movdqa  32(%rsi,%rcx),%xmm2
+        movdqa  48(%rsi,%rcx),%xmm3
+
+        movdqa  %xmm0,(%rdi,%rcx)
+        movdqa  %xmm1,16(%rdi,%rcx)
+        movdqa  %xmm2,32(%rdi,%rcx)
+        movdqa  %xmm3,48(%rdi,%rcx)
+        
+        addq    $64,%rcx
+        jnz     LAlignedLoop
+        
+        jmp     LShort                  // copy remaining 0..63 bytes and done
+
+
+// Forward loop for unaligned operands.
+
+       .align  4,0x90                  // 16-byte align inner loops
+LUnalignedLoop:                                // loop over 64-byte chunks
+        movdqu  (%rsi,%rcx),%xmm0
+        movdqu  16(%rsi,%rcx),%xmm1
+        movdqu  32(%rsi,%rcx),%xmm2
+        movdqu  48(%rsi,%rcx),%xmm3
+
+        movdqa  %xmm0,(%rdi,%rcx)
+        movdqa  %xmm1,16(%rdi,%rcx)
+        movdqa  %xmm2,32(%rdi,%rcx)
+        movdqa  %xmm3,48(%rdi,%rcx)
+        
+        addq    $64,%rcx
+        jnz     LUnalignedLoop
+        
+        jmp     LShort                  // copy remaining 0..63 bytes and done
+       
+
+// Reverse moves.  These are only used with destructive overlap.
+//      rdx = length
+//      rsi = source ptr
+//      rdi = dest ptr
+
+LReverse:
+        addq    %rdx,%rsi               // point to end of strings
+        addq    %rdx,%rdi
+        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
+        ja      LReverseNotShort        // yes
+
+// Handle reverse short copies.
+//      edx = length (<= kShort)
+//      rsi = one byte past end of source
+//      rdi = one byte past end of dest
+
+LReverseShort:
+       movl    %edx,%ecx               // copy length
+       shrl    $3,%ecx                 // #quadwords
+       jz      3f
+1:
+       subq    $8,%rsi
+       movq    (%rsi),%rax
+       subq    $8,%rdi
+       movq    %rax,(%rdi)
+       decl    %ecx
+       jnz     1b
+3:
+       andl    $7,%edx                 // bytes?
+       jz      5f
+4:
+       decq    %rsi
+       movb    (%rsi),%al
+       decq    %rdi
+       movb    %al,(%rdi)
+       decl    %edx
+       jnz     4b
+5:
+        movq   %r11,%rax               // get return value (dst ptr) for memcpy/memmove
+       popq    %rbp
+        ret
+
+// Handle a reverse move long enough to justify using SSE.
+//      rdx = length (> kShort)
+//      rsi = one byte past end of source
+//      rdi = one byte past end of dest
+
+LReverseNotShort:
+        movl    %edi,%ecx               // copy destination
+        andl    $15,%ecx                // get #bytes to align destination
+        jz      LReverseDestAligned     // already aligned
+        subq   %rcx,%rdx               // adjust length
+1:                                     // loop copying 1..15 bytes
+       decq    %rsi
+       movb    (%rsi),%al
+       decq    %rdi
+       movb    %al,(%rdi)
+       decl    %ecx
+       jnz     1b
+        
+// Destination is now aligned.  Prepare for reverse loops.
+
+LReverseDestAligned:
+        movq    %rdx,%rcx               // copy length
+        andl    $63,%edx                // get remaining bytes for LReverseShort
+        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
+        subq    %rcx,%rsi               // point to endpoint of copy
+        subq    %rcx,%rdi
+       testl   $15,%esi                // is source aligned too?
+        jnz     LReverseUnalignedLoop   // no
+
+LReverseAlignedLoop:                    // loop over 64-byte chunks
+        movdqa  -16(%rsi,%rcx),%xmm0
+        movdqa  -32(%rsi,%rcx),%xmm1
+        movdqa  -48(%rsi,%rcx),%xmm2
+        movdqa  -64(%rsi,%rcx),%xmm3
+
+        movdqa  %xmm0,-16(%rdi,%rcx)
+        movdqa  %xmm1,-32(%rdi,%rcx)
+        movdqa  %xmm2,-48(%rdi,%rcx)
+        movdqa  %xmm3,-64(%rdi,%rcx)
+        
+        subq    $64,%rcx
+        jne     LReverseAlignedLoop
+        
+        jmp     LReverseShort           // copy remaining 0..63 bytes and done
+
+    
+// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.
+        
+LReverseUnalignedLoop:                  // loop over 64-byte chunks
+        movdqu  -16(%rsi,%rcx),%xmm0
+        movdqu  -32(%rsi,%rcx),%xmm1
+        movdqu  -48(%rsi,%rcx),%xmm2
+        movdqu  -64(%rsi,%rcx),%xmm3
+        
+        movdqa  %xmm0,-16(%rdi,%rcx)
+        movdqa  %xmm1,-32(%rdi,%rcx)
+        movdqa  %xmm2,-48(%rdi,%rcx)
+        movdqa  %xmm3,-64(%rdi,%rcx)
+        
+        subq    $64,%rcx
+        jne     LReverseUnalignedLoop
+        
+        jmp     LReverseShort           // copy remaining 0..63 bytes and done
+
+
+       COMMPAGE_DESCRIPTOR(bcopy_sse42_64,_COMM_PAGE_BCOPY,kHasSSE4_2,0)
index 49c94750dad29145af5f122f8ec31a3840b85221..a80418bd9f503f87a93d52e1a092e8710414639f 100644 (file)
@@ -161,4 +161,4 @@ LVeryLong:
        jmp     Lshort
 
 
-       COMMPAGE_DESCRIPTOR(bzero_sse2,_COMM_PAGE_BZERO,kHasSSE2,0)
+       COMMPAGE_DESCRIPTOR(bzero_sse2,_COMM_PAGE_BZERO,kHasSSE2,kHasSSE4_2)
index d82d77e6f88a3a7a8a9fd8f8e689bd148574e072..ef494cbbf70f49f7a57161e8d4bb923978bdb339 100644 (file)
@@ -161,4 +161,4 @@ LVeryLong:
        jmp     Lshort
 
 
-       COMMPAGE_DESCRIPTOR(bzero_sse2_64,_COMM_PAGE_BZERO,kHasSSE2,0)
+       COMMPAGE_DESCRIPTOR(bzero_sse2_64,_COMM_PAGE_BZERO,kHasSSE2,kHasSSE4_2)
diff --git a/osfmk/i386/commpage/bzero_sse42.s b/osfmk/i386/commpage/bzero_sse42.s
new file mode 100644 (file)
index 0000000..8db6b07
--- /dev/null
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <machine/cpu_capabilities.h>
+#include <machine/commpage.h>
+
+/*
+ * Bzero, tuned for processors with SSE4.2 and 64-byte cache lines, i.e. Nehalem.
+ * We don't actually use SSE4.2, but rather use it to identify Nehalem.
+ *
+ * We do not use nontemporal operations, but use MOVDQA in preference to REP/STOS. 
+ *
+ * This routine is also used for memset(p,0,n), which is a common case
+ * since gcc sometimes silently maps bzero() into memset().  As a result,
+ * we always load the original ptr into %eax before returning.
+ */
+
+#define kShort         80              // too short to bother with SSE (must be >=80)
+
+
+        .text
+        .align  5, 0x90
+Lbzero_sse42:                            // void       bzero(void *b, size_t len);
+       pushl   %ebp                    // set up a frame for backtraces
+       movl    %esp,%ebp
+        pushl   %edi
+        movl    8(%ebp),%edi            // get ptr
+        movl    12(%ebp),%edx           // get length
+
+        xorl    %eax,%eax               // set fill data to 0
+        cmpl    $(kShort),%edx          // long enough for SSE?
+        jg     LNotShort               // yes
+        
+// Here for short operands or the end of long ones.
+//      %edx = length
+//      %edi = ptr
+//      %eax = zero
+
+Lshort:
+       cmpl    $12,%edx                // long enough to word align?
+       jge     3f                      // yes
+       test    %edx,%edx               // length==0?
+       jz      6f
+1:
+       movb    %al,(%edi)              // zero a byte
+       inc     %edi
+       dec     %edx
+       jnz     1b
+       jmp     6f
+2:
+       movb    %al,(%edi)              // zero a byte
+       inc     %edi
+       dec     %edx
+3:
+       test    $3,%edi                 // is ptr doubleword aligned?
+       jnz     2b                      // no
+       movl    %edx,%ecx               // copy length
+       shrl    $2,%edx                 // #doublewords to store
+4:      
+       movl    %eax,(%edi)             // zero an aligned doubleword
+       addl    $4,%edi
+       dec     %edx
+       jnz     4b
+       andl    $3,%ecx                 // mask down to #bytes at end (0..3)
+       jz      6f                      // none
+5:
+       movb    %al,(%edi)              // zero a byte
+       inc     %edi
+       dec     %ecx
+       jnz     5b
+6:
+       movl    8(%ebp),%eax            // get return value in case this was a call of memset()
+        popl    %edi
+       popl    %ebp
+        ret
+
+        
+// We will be using SSE, so align ptr.
+//      %edx = length
+//      %edi = ptr
+//      %eax = zero
+
+LNotShort:
+       testl   $3,%edi                 // 4-byte aligned?
+       jz      2f                      // yes
+       movb    %al,(%edi)              // zero another byte
+       incl    %edi
+       decl    %edx
+       jmp     LNotShort
+1:                                     // zero doublewords until 16-byte aligned
+       movl    %eax,(%edi)
+       addl    $4,%edi
+       subl    $4,%edx
+2:
+       testl   $15,%edi                // 16-byte aligned?
+       jnz     1b                      // no
+
+       
+// Destination is now 16-byte aligned.  Prepare to loop over 64-byte chunks.
+//      %edx = length
+//      %edi = ptr
+//      %eax = zero
+
+LDestAligned:
+        movl    %edx,%ecx
+        andl    $63,%edx                // mask down to residual length (0..63)
+        andl    $-64,%ecx               // get #bytes we will zero in this loop
+        pxor    %xmm0,%xmm0             // zero an SSE register
+        addl    %ecx,%edi               // increment ptr by length to move
+        negl    %ecx                   // negate length to move
+       jmp     1f
+       
+// Loop over 64-byte chunks, storing into cache.
+
+       .align  4,0x90                  // keep inner loops 16-byte aligned
+1:
+        movdqa  %xmm0,(%edi,%ecx)
+        movdqa  %xmm0,16(%edi,%ecx)
+        movdqa  %xmm0,32(%edi,%ecx)
+        movdqa  %xmm0,48(%edi,%ecx)
+        addl    $64,%ecx
+        jne     1b
+       
+       jmp     Lshort
+       
+
+
+       COMMPAGE_DESCRIPTOR(bzero_sse42,_COMM_PAGE_BZERO,kHasSSE4_2,0)
diff --git a/osfmk/i386/commpage/bzero_sse42_64.s b/osfmk/i386/commpage/bzero_sse42_64.s
new file mode 100644 (file)
index 0000000..5f86939
--- /dev/null
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <machine/cpu_capabilities.h>
+#include <machine/commpage.h>
+
+/*
+ * Bzero, tuned for processors with SSE4.2 and 64-byte cache lines, i.e. Nehalem.
+ * We don't actually use SSE4.2, but rather use it to identify Nehalem.
+ * This is the 64-bit version.
+ *
+ * We do not use nontemporal operations, but use MOVDQA in preference to REP/STOS. 
+ *
+ * This routine is also used for memset(p,0,n), which is a common case
+ * since gcc sometimes silently maps bzero() into memset().  As a result,
+ * we always load the original ptr into %rax before returning.
+ */
+
+#define kShort         80              // too short to bother with SSE (must be >=80)
+
+
+        .text
+       .code64
+        .align  5, 0x90
+Lbzero_sse42_64:                         // void       bzero(void *b, size_t len);
+       pushq   %rbp                    // set up a frame for backtraces
+       movq    %rsp,%rbp
+        xorl    %eax,%eax               // set fill data to 0
+       movq    %rdi,%r11               // save original ptr as return value
+        cmpq    $(kShort),%rsi          // long enough for SSE?
+        jg     LNotShort               // yes
+        
+// Here for short operands or the end of long ones.
+//      %esi = length (<= kShort)
+//      %rdi = ptr
+//      %eax = zero
+
+Lshort:
+       cmpl    $12,%esi                // long enough to word align?
+       jge     3f                      // yes
+       test    %esi,%esi               // length==0?
+       jz      6f
+1:
+       movb    %al,(%rdi)              // zero a byte
+       incq    %rdi
+       decl    %esi
+       jnz     1b
+       jmp     6f
+2:
+       movb    %al,(%rdi)              // zero a byte
+       incq    %rdi
+       decl    %esi
+3:
+       testl   $3,%edi                 // is ptr doubleword aligned?
+       jnz     2b                      // no
+       movl    %esi,%ecx               // copy length
+       shrl    $2,%esi                 // #doublewords to store
+4:      
+       movl    %eax,(%rdi)             // zero an aligned doubleword
+       addq    $4,%rdi
+       decl    %esi
+       jnz     4b
+       andl    $3,%ecx                 // mask down to #bytes at end (0..3)
+       jz      6f                      // none
+5:
+       movb    %al,(%rdi)              // zero a byte
+       incq    %rdi
+       decl    %ecx
+       jnz     5b
+6:
+       movq    %r11,%rax               // set return value in case this was a call of memset()
+       popq    %rbp
+        ret
+               
+        
+// We will be using SSE, so align ptr.
+//      %rsi = length (> kShort)
+//      %rdi = ptr
+//      %eax = zero
+
+LNotShort:
+       testl   $3,%edi                 // 4-byte aligned?
+       jz      2f                      // yes
+       movb    %al,(%rdi)              // zero another byte
+       incq    %rdi
+       decq    %rsi
+       jmp     LNotShort
+1:                                     // zero doublewords until 16-byte aligned
+       movl    %eax,(%rdi)
+       addq    $4,%rdi
+       subq    $4,%rsi
+2:
+       testl   $15,%edi                // 16-byte aligned?
+       jnz     1b                      // no
+       
+// Destination is now 16-byte aligned.  Prepare to loop over 64-byte chunks.
+//      %rsi = length (> (kShort-15))
+//      %rdi = ptr (aligned)
+//      %eax = zero
+
+LDestAligned:
+        movq    %rsi,%rcx
+        andl    $63,%esi                // mask down to residual length (0..63)
+        andq    $-64,%rcx               // get #bytes we will zero in this loop
+        pxor    %xmm0,%xmm0             // zero an SSE register
+        addq    %rcx,%rdi               // increment ptr by length to move
+        negq    %rcx                   // negate length to move
+       jmp     1f
+       
+// Loop over 64-byte chunks, storing into cache.
+
+       .align  4,0x90                  // keep inner loops 16-byte aligned
+1:
+        movdqa  %xmm0,(%rdi,%rcx)
+        movdqa  %xmm0,16(%rdi,%rcx)
+        movdqa  %xmm0,32(%rdi,%rcx)
+        movdqa  %xmm0,48(%rdi,%rcx)
+        addq    $64,%rcx
+        jne     1b
+       
+       jmp     Lshort
+
+
+       COMMPAGE_DESCRIPTOR(bzero_sse42_64,_COMM_PAGE_BZERO,kHasSSE4_2,0)
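Both bzero_sse42 variants above share the same shape: zero byte-by-byte until the pointer is 16-byte aligned, clear whole 64-byte cache lines with four MOVDQA stores driven by a negative index that counts up to zero, then finish the 0..63 residual bytes on the short path. A minimal C sketch of that control flow follows; it is illustrative only (the assembly aligns in two stages and never calls memset), and bzero_sketch is an invented name:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Illustrative sketch only -- not Apple's code.  The assembly aligns
     * in two stages (bytes to a 4-byte boundary, doublewords to 16) and
     * uses four MOVDQA stores where this uses memset(). */
    static void bzero_sketch(void *b, size_t len)
    {
            unsigned char *p = b;

            if (len > 80) {                             /* kShort */
                    while (((uintptr_t)p & 15) != 0) {  /* align ptr to 16 */
                            *p++ = 0;
                            len--;
                    }
                    size_t chunks = len & ~(size_t)63;  /* whole 64-byte lines */
                    len &= 63;                          /* residual 0..63 */
                    p += chunks;                        /* point past the chunks... */
                    for (ptrdiff_t i = -(ptrdiff_t)chunks; i != 0; i += 64)
                            memset(p + i, 0, 64);       /* ...and index negatively */
            }
            while (len > 0) {                           /* short/residual path */
                    *p++ = 0;
                    len--;
            }
    }

Indexing from the end with a negative offset lets the loop use a single ADD as both the increment and the termination test, which is what the `addl/addq $64,%ecx / jne 1b` pairs above do.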
index e9604430c854a57d7d193de2e51c167dd2ff6aba..6f69fa7b2929f894b8c7bb3a721a82d17695b396 100644 (file)
@@ -95,9 +95,11 @@ _commpage_32_routines:
        .long   CPN(bit_test_and_clear_up)
        .long   CPN(bzero_scalar)
        .long   CPN(bzero_sse2)
+       .long   CPN(bzero_sse42)
        .long   CPN(bcopy_scalar)
        .long   CPN(bcopy_sse2)
        .long   CPN(bcopy_sse3x)
+       .long   CPN(bcopy_sse42)
        .long   CPN(memset_pattern_sse2)
        .long   CPN(longcopy_sse3x)
        .long   CPN(nanotime)
@@ -138,7 +140,9 @@ _commpage_64_routines:
        .long   CPN(bit_test_and_clear_mp_64)
        .long   CPN(bit_test_and_clear_up_64)
        .long   CPN(bzero_sse2_64)
+       .long   CPN(bzero_sse42_64)
        .long   CPN(bcopy_sse3x_64)
+       .long   CPN(bcopy_sse42_64)
        .long   CPN(memset_pattern_sse2_64)
        .long   CPN(longcopy_sse3x_64)
        .long   CPN(nanotime_64)
index 60baed63c14eb3f45977385213867bb807b7b8a3..f10baef8b9128d49e3b3a751c11c6f3611137a7e 100644 (file)
@@ -56,6 +56,7 @@ Lnanotime:
        testl   %esi,%esi                       /* if being updated, loop until stable */
        jz      0b
 
+       lfence
        rdtsc                                   /* get TSC in %edx:%eax */
        lfence
 
@@ -99,7 +100,9 @@ Lnanotime_slow:
        testl   %esi,%esi                       /* if generation is 0, data being changed */
        jz      0b                              /* so loop until stable */
 
+       lfence
        rdtsc                                   /* get TSC in %edx:%eax */
+       lfence
        subl    _COMM_PAGE_NT_TSC_BASE,%eax
        sbbl    _COMM_PAGE_NT_TSC_BASE+4,%edx
 
@@ -161,6 +164,7 @@ Lnanotime_64:                                       // NB: must preserve r9, r10, and r11
        movl    _NT_GENERATION(%rsi),%r8d       // get generation
        testl   %r8d,%r8d                       // if 0, data is being changed...
        jz      1b                              // ...so loop until stable
+       lfence
        rdtsc                                   // edx:eax := tsc
        lfence
        shlq    $32,%rdx                        // rax := ((edx << 32) | eax), ie 64-bit tsc
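The change in these hunks is the same everywhere RDTSC appears: bracket it with LFENCE. RDTSC is not a serializing instruction, so without the leading fence the timestamp can be taken speculatively before the preceding generation-count load completes, pairing a stale generation with a fresh TSC. A user-space sketch of the idiom, assuming GCC or Clang on x86-64 (not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    /* Read the TSC bracketed by LFENCEs so the read can neither drift
     * ahead of earlier loads nor be overtaken by later ones. */
    static inline uint64_t rdtsc_fenced(void)
    {
            uint32_t lo, hi;
            __asm__ __volatile__("lfence\n\t"
                                 "rdtsc\n\t"
                                 "lfence"
                                 : "=a" (lo), "=d" (hi) : : "memory");
            return ((uint64_t)hi << 32) | lo;
    }

    int main(void)
    {
            uint64_t t0 = rdtsc_fenced();
            uint64_t t1 = rdtsc_fenced();
            printf("delta: %llu cycles\n", (unsigned long long)(t1 - t0));
            return 0;
    }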
index ae05b1767f87ec6f33a676286f0f461ca6130b06..e41f6b8cd9e936d18870a1eb620ea04fef88f2a9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -61,9 +61,10 @@ struct mca_state;
  * Data structures embedded in per-cpu data:
  */
 typedef struct rtclock_timer {
-       uint64_t        deadline;
-       boolean_t       is_set;
-       boolean_t       has_expired;
+       queue_head_t    queue;
+       uint64_t                deadline;
+       boolean_t               is_set;
+       boolean_t               has_expired;
 } rtclock_timer_t;
 
 
@@ -130,7 +131,6 @@ typedef struct cpu_data
        int                     cpu_subtype;
        int                     cpu_threadtype;
        int                     cpu_running;
-       uint64_t                rtclock_intr_deadline;
        rtclock_timer_t         rtclock_timer;
        boolean_t               cpu_is64bit;
        task_map_t              cpu_task_map;
index 7727eb7eac654bb9bf8482457ce7c4c6bba88bf6..6d539ffb1942f426c8986858d09f65c864bb7274 100644 (file)
@@ -43,6 +43,8 @@ void debug_topology_print(void);
 #define DBG(x...)
 #endif /* TOPO_DEBUG */
 
+void validate_topology(void);
+
 #define bitmask(h,l)   ((bit(h)|(bit(h)-1)) & ~(bit(l)-1))
 #define bitfield(x,h,l)        (((x) & bitmask(h,l)) >> l)
 
@@ -187,10 +189,6 @@ x86_LLC_info(void)
        topoParms.nCoresSharingLLC = cpuinfo->core_count;
     if (nCPUsSharing > cpuinfo->thread_count)
        topoParms.nLCPUsSharingLLC = cpuinfo->thread_count;
-
-
-    if (nCPUsSharing > cpuinfo->thread_count)
-       topoParms.maxSharingLLC = cpuinfo->thread_count;
 }
 
 static void
@@ -1039,6 +1037,180 @@ cpu_thread_halt(void)
     /* NOT REACHED */
 }
 
+/*
+ * Validates that the topology was built correctly.  Must be called only
+ * after the complete topology is built and no other changes are being made.
+ */
+void
+validate_topology(void)
+{
+    x86_pkg_t          *pkg;
+    x86_die_t          *die;
+    x86_core_t         *core;
+    x86_lcpu_t         *lcpu;
+    uint32_t           nDies;
+    uint32_t           nCores;
+    uint32_t           nCPUs;
+
+    /*
+     * XXX
+     *
+     * Right now this only works if the number of CPUs started is the total
+     * number of CPUs.  However, when specifying cpus=n the topology is only
+     * partially constructed and the checks below will fail.
+     *
+     * We should *always* build the complete topology and only start the CPUs
+     * indicated by cpus=n.  Until that happens, this code will not check the
+     * topology if the number of cpus defined is < that described the the
+     * topology parameters.
+     */
+    nCPUs = topoParms.nPackages * topoParms.nLThreadsPerPackage;
+    if (nCPUs > real_ncpus)
+       return;
+
+    pkg = x86_pkgs;
+    while (pkg != NULL) {
+       /*
+        * Make sure that the package has the correct number of dies.
+        */
+       nDies = 0;
+       die = pkg->dies;
+       while (die != NULL) {
+           if (die->package == NULL)
+               panic("Die(%d)->package is NULL",
+                     die->pdie_num);
+           if (die->package != pkg)
+               panic("Die %d points to package %d, should be %d",
+                     die->pdie_num, die->package->lpkg_num, pkg->lpkg_num);
+
+           DBG("Die(%d)->package %d\n",
+               die->pdie_num, pkg->lpkg_num);
+
+           /*
+            * Make sure that the die has the correct number of cores.
+            */
+           DBG("Die(%d)->cores: ");
+           nCores = 0;
+           core = die->cores;
+           while (core != NULL) {
+               if (core->die == NULL)
+                   panic("Core(%d)->die is NULL",
+                         core->pcore_num);
+               if (core->die != die)
+                   panic("Core %d points to die %d, should be %d",
+                         core->pcore_num, core->die->pdie_num, die->pdie_num);
+               nCores += 1;
+               DBG("%d ", core->pcore_num);
+               core = core->next_in_die;
+           }
+           DBG("\n");
+
+           if (nCores != topoParms.nLCoresPerDie)
+               panic("Should have %d Cores, but only found %d for Die %d",
+                     topoParms.nLCoresPerDie, nCores, die->pdie_num);
+
+           /*
+            * Make sure that the die has the correct number of CPUs.
+            */
+           DBG("Die(%d)->lcpus: ", die->pdie_num);
+           nCPUs = 0;
+           lcpu = die->lcpus;
+           while (lcpu != NULL) {
+               if (lcpu->die == NULL)
+                   panic("CPU(%d)->die is NULL",
+                         lcpu->cpu_num);
+               if (lcpu->die != die)
+                   panic("CPU %d points to die %d, should be %d",
+                         lcpu->cpu_num, lcpu->die->pdie_num, die->pdie_num);
+               nCPUs += 1;
+               DBG("%d ", lcpu->cpu_num);
+               lcpu = lcpu->next_in_die;
+           }
+           DBG("\n");
+
+           if (nCPUs != topoParms.nLThreadsPerDie)
+               panic("Should have %d Threads, but only found %d for Die %d",
+                     topoParms.nLThreadsPerDie, nCPUs, die->pdie_num);
+
+           nDies += 1;
+           die = die->next_in_pkg;
+       }
+
+       if (nDies != topoParms.nLDiesPerPackage)
+           panic("Should have %d Dies, but only found %d for package %d",
+                 topoParms.nLDiesPerPackage, nDies, pkg->lpkg_num);
+
+       /*
+        * Make sure that the package has the correct number of cores.
+        */
+       nCores = 0;
+       core = pkg->cores;
+       while (core != NULL) {
+           if (core->package == NULL)
+               panic("Core(%d)->package is NULL",
+                     core->pcore_num);
+           if (core->package != pkg)
+               panic("Core %d points to package %d, should be %d",
+                     core->pcore_num, core->package->lpkg_num, pkg->lpkg_num);
+           DBG("Core(%d)->package %d\n",
+               core->pcore_num, pkg->lpkg_num);
+
+           /*
+            * Make sure that the core has the correct number of CPUs.
+            */
+           nCPUs = 0;
+           lcpu = core->lcpus;
+           DBG("Core(%d)->lcpus: ");
+           while (lcpu != NULL) {
+               if (lcpu->core == NULL)
+                   panic("CPU(%d)->core is NULL",
+                         lcpu->cpu_num);
+               if (lcpu->core != core)
+                   panic("CPU %d points to core %d, should be %d",
+                         lcpu->cpu_num, lcpu->core->pcore_num, core->pcore_num);
+               DBG("%d ", lcpu->cpu_num);
+               nCPUs += 1;
+               lcpu = lcpu->next_in_core;
+           }
+           DBG("\n");
+
+           if (nCPUs != topoParms.nLThreadsPerCore)
+               panic("Should have %d Threads, but only found %d for Core %d",
+                     topoParms.nLThreadsPerCore, nCPUs, core->pcore_num);
+           nCores += 1;
+           core = core->next_in_pkg;
+       }
+
+       if (nCores != topoParms.nLCoresPerPackage)
+           panic("Should have %d Cores, but only found %d for package %d",
+                 topoParms.nLCoresPerPackage, nCores, pkg->lpkg_num);
+
+       /*
+        * Make sure that the package has the correct number of CPUs.
+        */
+       nCPUs = 0;
+       lcpu = pkg->lcpus;
+       while (lcpu != NULL) {
+           if (lcpu->package == NULL)
+               panic("CPU(%d)->package is NULL",
+                     lcpu->cpu_num);
+           if (lcpu->package != pkg)
+               panic("CPU %d points to package %d, should be %d",
+                     lcpu->cpu_num, lcpu->package->lpkg_num, pkg->lpkg_num);
+           DBG("CPU(%d)->package %d\n",
+               lcpu->cpu_num, pkg->lpkg_num);
+           nCPUs += 1;
+           lcpu = lcpu->next_in_pkg;
+       }
+
+       if (nCPUs != topoParms.nLThreadsPerPackage)
+           panic("Should have %d Threads, but only found %d for package %d",
+                 topoParms.nLThreadsPerPackage, nCPUs, pkg->lpkg_num);
+
+       pkg = pkg->next;
+    }
+}
+
 #if TOPO_DEBUG
 /*
  * Prints out the topology
index 6e823c98057f97aaa1f9c5cc902c76cb878445d9..58b15e9135870b1778565e597329e4e469256022 100644 (file)
@@ -45,6 +45,7 @@
 #define DBG(x...)
 #endif
 void debug_topology_print(void);
+void validate_topology(void);
 
 __private_extern__ void qsort(
     void * array,
@@ -144,6 +145,7 @@ cpu_topology_start(void)
 #if TOPO_DEBUG
        debug_topology_print();
 #endif /* TOPO_DEBUG */
+       validate_topology();
 
        ml_set_interrupts_enabled(istate);
        DBG("cpu_topology_start() LLC is L%d\n", topoParms.LLCDepth + 1);
index f5cbbefb4d556082175467c2f59faca779686399..d4351e6b87d5822b31849abd4297882df8a86716 100644 (file)
@@ -133,6 +133,7 @@ typedef struct x86_lcpu
     struct x86_die     *die;           /* die containing the logical cpu */
     struct x86_pkg     *package;       /* package containing the logical cpu */
     struct cpu_data    *cpu;           /* cpu_data structure */
+    uint32_t           flags;
     uint32_t           cpu_num;        /* cpu number */
     uint32_t           lnum;           /* logical cpu number (within core) */
     uint32_t           pnum;           /* physical cpu number */
@@ -150,8 +151,10 @@ typedef struct x86_lcpu
 
 #define X86CORE_FL_PRESENT     0x80000000      /* core is present */
 #define X86CORE_FL_READY       0x40000000      /* core struct is init'd */
+#define X86CORE_FL_HAS_HPET    0x10000000      /* core has HPET assigned */
 #define X86CORE_FL_HALTED      0x00008000      /* core is halted */
 #define X86CORE_FL_IDLE                0x00004000      /* core is idle */
+#define X86CORE_FL_WAKEUP      0x00002000      /* wakeup is pending */
 
 typedef struct x86_core
 {
index f9c58c5bb02a41ff41b18671a4514d3bb8f21697..23a27ef2935d6f3f8b048ae184657cb65fdedc4b 100644 (file)
@@ -376,6 +376,27 @@ cpuid_set_generic_info(i386_cpu_info_t *info_p)
                                quad(cpuid_reg[ecx], cpuid_reg[edx]);
        }
 
+       /* Fold in the Invariant TSC feature bit, if present */
+       if (max_extid >= 0x80000007) {
+               do_cpuid(0x80000007, cpuid_reg);  
+               info_p->cpuid_extfeatures |=
+                               cpuid_reg[edx] & CPUID_EXTFEATURE_TSCI;
+       }
+
+       /* Find the microcode version number a.k.a. signature a.k.a. BIOS ID */
+        info_p->cpuid_microcode_version =
+                (uint32_t) (rdmsr64(MSR_IA32_BIOS_SIGN_ID) >> 32);
+
+       if (info_p->cpuid_model == CPUID_MODEL_NEHALEM) {
+               /*
+                * For Nehalem, find the number of enabled cores and threads
+                * (which determines whether SMT/Hyperthreading is active).
+                */
+               uint64_t msr_core_thread_count = rdmsr64(MSR_CORE_THREAD_COUNT);
+               info_p->core_count   = bitfield(msr_core_thread_count, 31, 16);
+               info_p->thread_count = bitfield(msr_core_thread_count, 15,  0);
+       }
+       
        if (info_p->cpuid_features & CPUID_FEATURE_MONITOR) {
                /*
                 * Extract the Monitor/Mwait Leaf info:
@@ -508,6 +529,8 @@ extfeature_map[] = {
        {CPUID_EXTFEATURE_XD,      "XD"},
        {CPUID_EXTFEATURE_EM64T,   "EM64T"},
        {CPUID_EXTFEATURE_LAHF,    "LAHF"},
+       {CPUID_EXTFEATURE_RDTSCP,  "RDTSCP"},
+       {CPUID_EXTFEATURE_TSCI,    "TSCI"},
        {0, 0}
 };
 
index 34eed7b4df0b210f8d530a2ae35e654a17de4812..8e690a71f9616522f04d51d3605071cc6e7b598d 100644 (file)
  */
 #define CPUID_EXTFEATURE_SYSCALL   _Bit(11)    /* SYSCALL/sysret */
 #define CPUID_EXTFEATURE_XD       _Bit(20)     /* eXecute Disable */
+#define CPUID_EXTFEATURE_RDTSCP           _Bit(27)     /* RDTSCP */
 #define CPUID_EXTFEATURE_EM64T    _Bit(29)     /* Extended Mem 64 Technology */
 
 #define CPUID_EXTFEATURE_LAHF     _HBit(20)    /* LAFH/SAHF instructions */
 
+/*
+ * The CPUID_EXTFEATURE_XXX values define 64-bit values
+ * returned in %ecx:%edx to a CPUID request with %eax of 0x80000007: 
+ */
+#define CPUID_EXTFEATURE_TSCI      _Bit(8)     /* TSC Invariant */
+
+
 #define        CPUID_CACHE_SIZE        16      /* Number of descriptor values */
 
 #define        CPUID_CACHE_NULL           0x00 /* NULL */
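CPUID_EXTFEATURE_TSCI above is EDX bit 8 of CPUID leaf 0x80000007, which is exactly what cpuid_set_generic_info folds into cpuid_extfeatures. A minimal user-space probe for the same bit, assuming GCC/Clang's <cpuid.h> wrapper:

    #include <stdio.h>
    #include <cpuid.h>      /* GCC/Clang wrapper; checks the max extended leaf */

    int main(void)
    {
            unsigned int a, b, c, d;

            /* Leaf 0x80000007 (Advanced Power Management): EDX bit 8 is
             * the Invariant TSC bit, i.e. CPUID_EXTFEATURE_TSCI above. */
            if (__get_cpuid(0x80000007, &a, &b, &c, &d))
                    printf("Invariant TSC: %s\n", (d & (1u << 8)) ? "yes" : "no");
            else
                    printf("leaf 0x80000007 not supported\n");
            return 0;
    }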
index ddbd77ffcca26ef009c486ddac63f9b48d83772d..aacc02ebc12baf2502f6975d48267849ec39917f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -39,6 +39,7 @@
 
 #include <mach/mach_types.h>
 
+#include <kern/timer_queue.h>
 #include <kern/clock.h>
 #include <kern/thread.h>
 #include <kern/processor.h>
@@ -55,9 +56,6 @@
 #include <i386/cpu_topology.h>
 #include <i386/cpu_threads.h>
 
-/* XXX from <arch>/rtclock.c */
-clock_timer_func_t             rtclock_timer_expire;
-
 /*
  *     Event timer interrupt.
  *
@@ -94,8 +92,7 @@ __unused uint64_t iaddr)
        /* has a pending clock timer expired? */
        if (mytimer->deadline <= abstime) {                     /* Have we expired the deadline? */
                mytimer->has_expired = TRUE;                    /* Remember that we popped */
-               mytimer->deadline = EndOfAllTime;               /* Set timer request to the end of all time in case we have no more events */
-               (*rtclock_timer_expire)(abstime);               /* Process pop */
+               mytimer->deadline = timer_queue_expire(&mytimer->queue, abstime);
                mytimer->has_expired = FALSE;
        }
 
@@ -105,7 +102,7 @@ __unused uint64_t iaddr)
 }
 
 /*
- * Set the clock deadline; called by the thread scheduler.
+ * Set the clock deadline.
  */
 void etimer_set_deadline(uint64_t deadline)
 {
@@ -178,3 +175,59 @@ etimer_resync_deadlines(void)
        }
        splx(s);
 }
+
+void etimer_timer_expire(void  *arg);
+
+void
+etimer_timer_expire(
+__unused void                  *arg)
+{
+       rtclock_timer_t         *mytimer;
+       uint64_t                        abstime;
+       cpu_data_t                      *pp;
+       x86_lcpu_t                      *lcpu;
+
+       pp = current_cpu_datap();
+       lcpu = x86_lcpu();
+
+       mytimer = &pp->rtclock_timer;
+       abstime = mach_absolute_time();
+
+       mytimer->has_expired = TRUE;
+       mytimer->deadline = timer_queue_expire(&mytimer->queue, abstime);
+       mytimer->has_expired = FALSE;
+
+       lcpu->rtcPop = EndOfAllTime;
+       etimer_resync_deadlines();
+}
+
+queue_t
+timer_queue_assign(
+    uint64_t        deadline)
+{
+       cpu_data_t                      *cdp = current_cpu_datap();
+       rtclock_timer_t         *timer;
+
+       if (cdp->cpu_running) {
+               timer = &cdp->rtclock_timer;
+
+               if (deadline < timer->deadline)
+                       etimer_set_deadline(deadline);
+       }
+       else
+               timer = &cpu_datap(master_cpu)->rtclock_timer;
+
+    return (&timer->queue);
+}
+
+void
+timer_queue_cancel(
+    queue_t         queue,
+    uint64_t        deadline,
+    uint64_t        new_deadline)
+{
+    if (queue == &current_cpu_datap()->rtclock_timer.queue) {
+        if (deadline < new_deadline)
+            etimer_set_deadline(new_deadline);
+    }
+}
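timer_queue_assign() and timer_queue_cancel() are the x86 half of a new per-cpu timer-queue SPI: a caller asks which queue a deadline belongs on (the current cpu's if it is running, else the master's), and notifies the layer when an early deadline is removed so the hardware pop can be reprogrammed. A hedged sketch of a caller, where my_timer_t, my_enqueue_sorted() and my_dequeue() are invented stand-ins, not xnu functions:

    /* Hypothetical kernel-internal caller, sketching the contract of
     * the per-cpu timer-queue SPI above. */
    static void example_arm(my_timer_t *t, uint64_t deadline)
    {
            /* Pick the queue, and let the etimer layer pull in the
             * hardware deadline if ours is earlier than what is
             * currently programmed. */
            queue_t q = timer_queue_assign(deadline);
            my_enqueue_sorted(q, t, deadline);          /* invented helper */
    }

    static void example_cancel(my_timer_t *t, queue_t q,
                               uint64_t old_deadline, uint64_t next_deadline)
    {
            my_dequeue(q, t);                           /* invented helper */
            /* If we removed the earliest entry on our own cpu's queue,
             * report the new head so etimer can reprogram the pop. */
            timer_queue_cancel(q, old_deadline, next_deadline);
    }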
index b5153c73dbfd25907f0c42464f59ef4f513c727e..e4d7f7c1e466f83c423a3a49a750644622610660 100644 (file)
@@ -420,6 +420,7 @@ LEAF_ENTRY(hw_lock_to)
        push    %ebx
        mov     %edx,%edi
 
+       lfence
        rdtsc                           /* read cyclecount into %edx:%eax */
        lfence
        addl    %ecx,%eax               /* fetch and timeout */
@@ -442,6 +443,7 @@ LEAF_ENTRY(hw_lock_to)
        /*
         * Here after spinning INNER_LOOP_COUNT times, check for timeout
         */
+       lfence
        rdtsc                           /* cyclecount into %edx:%eax */
        lfence
        cmpl    %ecx,%edx               /* compare high-order 32-bits */
index 797022979c4aa14157de2c5699fb738940344257..40086ffd1087a3b3ee3c854a43ea6957b335a041 100644 (file)
@@ -382,7 +382,8 @@ i386_vm_init(uint64_t       maxmem,
        avail_start = first_avail;
        mem_actual = sane_size;
 
-#define MEG            (1024*1024)
+#define MEG            (1024*1024ULL)
+#define GIG            (1024*MEG)
 
        /*
         * For user visible memory size, round up to 128 MB - accounting for the various stolen memory
@@ -391,6 +392,19 @@ i386_vm_init(uint64_t      maxmem,
 
        sane_size = (sane_size + 128 * MEG - 1) & ~((uint64_t)(128 * MEG - 1));
 
+#if defined(__i386__)
+#define K32_MAXMEM     (32*GIG)
+       /*
+        * For K32 we cap at K32_MAXMEM (currently 32GB), unless overridden
+        * by the maxmem= boot-arg -- which arrives here as a non-zero
+        * maxmem argument to this function.
+        */
+       if (maxmem == 0 && sane_size > K32_MAXMEM) {
+               maxmem = K32_MAXMEM;
+               printf("Physical memory %lld bytes capped at %dGB for 32-bit kernel\n",
+                       sane_size, (uint32_t) (K32_MAXMEM/GIG));
+       }
+#endif
        /*
         * if user set maxmem, reduce memory sizes
         */
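The masking expression used for sane_size is the standard round-up-to-a-power-of-two-multiple idiom: add (multiple - 1), then clear the low bits. A standalone check of the arithmetic, with an illustrative input value:

    #include <assert.h>
    #include <stdint.h>

    #define MEG     (1024*1024ULL)

    int main(void)
    {
            /* Round sane_size up to the next multiple of 128 MB, exactly
             * as i386_vm_init() does; 1000 MB is an illustrative input. */
            uint64_t sane_size = 1000 * MEG;
            sane_size = (sane_size + 128 * MEG - 1) & ~((uint64_t)(128 * MEG - 1));
            assert(sane_size == 1024 * MEG);    /* 1000 MB rounds up to 1 GB */
            return 0;
    }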
index 1dd1212dbac1e824b4d1a8fdb8fc1e77bc1abdc3..ef37b72fa7dd7e9f3bc4f757c56d97383bc6c56a 100644 (file)
@@ -78,6 +78,15 @@ static i386_intr_func_t      lapic_intr_func[LAPIC_FUNC_TABLE_SIZE];
 /* TRUE if local APIC was enabled by the OS not by the BIOS */
 static boolean_t lapic_os_enabled = FALSE;
 
+static boolean_t lapic_errors_masked = FALSE;
+static uint64_t lapic_last_master_error = 0;
+static uint64_t lapic_error_time_threshold = 0;
+static unsigned lapic_master_error_count = 0;
+static unsigned lapic_error_count_threshold = 5;
+static boolean_t lapic_dont_panic = FALSE;
+
+extern int     debug_boot_arg;
+
 /* Base vector for local APIC interrupt sources */
 int lapic_interrupt_base = LAPIC_DEFAULT_INTERRUPT_BASE;
 
@@ -255,6 +264,12 @@ lapic_dump(void)
                BOOL(LAPIC_READ(SVR)&LAPIC_SVR_ENABLE),
                BOOL(!(LAPIC_READ(SVR)&LAPIC_SVR_FOCUS_OFF)),
                LAPIC_READ(SVR) & LAPIC_SVR_MASK);
+       if (mca_is_cmci_present())
+               kprintf("LVT_CMCI:    Vector 0x%02x [%s] %s %cmasked\n",
+                       VEC(LVT_CMCI),
+                       DM(LVT_CMCI),
+                       DS(LVT_CMCI),
+                       MASK(LVT_CMCI));
        kprintf("LVT_TIMER:   Vector 0x%02x %s %cmasked %s\n",
                VEC(LVT_TIMER),
                DS(LVT_TIMER),
@@ -386,15 +401,15 @@ lapic_shutdown(void)
                LAPIC_WRITE(LVT_LINT0, value);
        }
 
+       /* Error: masked */
+       LAPIC_WRITE(LVT_ERROR, LAPIC_READ(LVT_ERROR) | LAPIC_LVT_MASKED);
+
        /* Timer: masked */
        LAPIC_WRITE(LVT_TIMER, LAPIC_READ(LVT_TIMER) | LAPIC_LVT_MASKED);
 
        /* Perfmon: masked */
        LAPIC_WRITE(LVT_PERFCNT, LAPIC_READ(LVT_PERFCNT) | LAPIC_LVT_MASKED);
 
-       /* Error: masked */
-       LAPIC_WRITE(LVT_ERROR, LAPIC_READ(LVT_ERROR) | LAPIC_LVT_MASKED);
-
        /* APIC software disabled */
        LAPIC_WRITE(SVR, LAPIC_READ(SVR) & ~LAPIC_SVR_ENABLE);
 
@@ -412,6 +427,13 @@ lapic_configure(void)
 {
        int     value;
 
+       if (lapic_error_time_threshold == 0 && cpu_number() == 0) {
+               nanoseconds_to_absolutetime(NSEC_PER_SEC >> 2, &lapic_error_time_threshold);
+               if (!PE_parse_boot_argn("lapic_dont_panic", &lapic_dont_panic, sizeof(lapic_dont_panic))) {
+                       lapic_dont_panic = FALSE;
+               }
+       }
+
        /* Set flat delivery model, logical processor id */
        LAPIC_WRITE(DFR, LAPIC_DFR_FLAT);
        LAPIC_WRITE(LDR, (get_cpu_number()) << LAPIC_LDR_SHIFT);
@@ -438,9 +460,15 @@ lapic_configure(void)
        /* Thermal: unmasked */
        LAPIC_WRITE(LVT_THERMAL, LAPIC_VECTOR(THERMAL));
 
-       lapic_esr_clear();
+       /* CMCI, if available */
+       if (mca_is_cmci_present())
+               LAPIC_WRITE(LVT_CMCI, LAPIC_VECTOR(CMCI));
 
-       LAPIC_WRITE(LVT_ERROR, LAPIC_VECTOR(ERROR));
+       if (((cpu_number() == master_cpu) && lapic_errors_masked == FALSE) ||
+               (cpu_number() != master_cpu)) {
+               lapic_esr_clear();
+               LAPIC_WRITE(LVT_ERROR, LAPIC_VECTOR(ERROR));
+       }
 }
 
 void
@@ -510,6 +538,7 @@ lapic_set_intr_func(int vector, i386_intr_func_t func)
        case LAPIC_TIMER_INTERRUPT:
        case LAPIC_THERMAL_INTERRUPT:
        case LAPIC_PERFCNT_INTERRUPT:
+       case LAPIC_CMCI_INTERRUPT:
                lapic_intr_func[vector] = func;
                break;
        default:
@@ -522,6 +551,7 @@ int
 lapic_interrupt(int interrupt, x86_saved_state_t *state)
 {
        int     retval = 0;
+       int     esr = -1;
 
        interrupt -= lapic_interrupt_base;
        if (interrupt < 0) {
@@ -538,17 +568,64 @@ lapic_interrupt(int interrupt, x86_saved_state_t *state)
        switch(interrupt) {
        case LAPIC_TIMER_INTERRUPT:
        case LAPIC_THERMAL_INTERRUPT:
+       case LAPIC_PERFCNT_INTERRUPT:
        case LAPIC_INTERPROCESSOR_INTERRUPT:
                if (lapic_intr_func[interrupt] != NULL)
                        (void) (*lapic_intr_func[interrupt])(state);
                if (interrupt == LAPIC_PERFCNT_INTERRUPT)
+                       /* Clear interrupt masked */
                        LAPIC_WRITE(LVT_PERFCNT, LAPIC_VECTOR(PERFCNT));
                _lapic_end_of_interrupt();
                retval = 1;
                break;
+       case LAPIC_CMCI_INTERRUPT:
+               if (lapic_intr_func[interrupt] != NULL)
+                       (void) (*lapic_intr_func[interrupt])(state);
+               /* return 0 for platform expert to handle */
+               break;
        case LAPIC_ERROR_INTERRUPT:
+               /* We treat error interrupts on APs as fatal.
+                * The current interrupt steering scheme directs most
+                * external interrupts to the BSP (HPET interrupts being
+                * a notable exception); hence, such an error
+                * on an AP may signify LVT corruption (with "may" being
+                * the operative word). On the BSP, we adopt a more
+                * lenient approach, in the interests of enhancing
+                * debuggability and reducing fragility.
+                * If "lapic_error_count_threshold" error interrupts
+                * occur within "lapic_error_time_threshold" absolute
+                * time units, we mask the error vector and log. The
+                * error interrupts themselves are likely
+                * side effects of issues which are beyond the purview of
+                * the local APIC interrupt handler, however. The Error
+                * Status Register value (the illegal destination
+                * vector code is one observed in practice) indicates
+                * the immediate cause of the error.
+                */
+               esr = lapic_esr_read();
                lapic_dump();
-               panic("Local APIC error\n");
+
+               if ((debug_boot_arg && (lapic_dont_panic == FALSE)) ||
+                       cpu_number() != master_cpu) {
+                       panic("Local APIC error, ESR: %d\n", esr);
+               }
+
+               if (cpu_number() == master_cpu) {
+                       uint64_t abstime = mach_absolute_time();
+                       if ((abstime - lapic_last_master_error) < lapic_error_time_threshold) {
+                               if (lapic_master_error_count++ > lapic_error_count_threshold) {
+                                       lapic_errors_masked = TRUE;
+                                       LAPIC_WRITE(LVT_ERROR, LAPIC_READ(LVT_ERROR) | LAPIC_LVT_MASKED);
+                                       printf("Local APIC: errors masked\n");
+                               }
+                       }
+                       else {
+                               lapic_last_master_error = abstime;
+                               lapic_master_error_count = 0;
+                       }
+                       printf("Local APIC error on master CPU, ESR: %d, error count this run: %d\n", esr, lapic_master_error_count);
+               }
+
                _lapic_end_of_interrupt();
                retval = 1;
                break;
index 4fa855676a159d5e3a33e6e5077b8f39ade83fc9..b37b3a78933d3b7a846ac10fcb22dbe5b0c64b2b 100644 (file)
@@ -62,6 +62,7 @@
 #define LAPIC_TMR_BASE                 0x00000180
 #define LAPIC_IRR_BASE                 0x00000200
 #define LAPIC_ERROR_STATUS             0x00000280
+#define LAPIC_LVT_CMCI                 0x000002F0
 #define LAPIC_ICR                      0x00000300
 #define                LAPIC_ICR_VECTOR_MASK   0x000FF
 #define                LAPIC_ICR_DM_MASK       0x00700
@@ -238,6 +239,10 @@ static inline void lapic_set_thermal_func(i386_intr_func_t func)
 {
        lapic_set_intr_func(LAPIC_VECTOR(THERMAL), func);
 }
+static inline void     lapic_set_cmci_func(i386_intr_func_t func)
+{
+       lapic_set_intr_func(LAPIC_VECTOR(CMCI), func);
+}
 
 #ifdef MP_DEBUG
 #define LAPIC_CPU_MAP_DUMP()   lapic_cpu_map_dump()
index 23f26fc50db41d21f45853763782b42f16681c8b..6eaff9d8fd9332d3537c7d7c640a65a65bf1b74a 100644 (file)
@@ -47,6 +47,7 @@ static boolean_t      mca_control_MSR_present = FALSE;
 static boolean_t       mca_threshold_status_present = FALSE;
 static boolean_t       mca_extended_MSRs_present = FALSE;
 static unsigned int    mca_extended_MSRs_count = 0;
+static boolean_t       mca_cmci_present = FALSE;
 static ia32_mcg_cap_t  ia32_mcg_cap;
 decl_simple_lock_data(static, mca_lock);
 
@@ -88,6 +89,7 @@ mca_get_availability(void)
                mca_error_bank_count = ia32_mcg_cap.bits.count;
                mca_control_MSR_present = ia32_mcg_cap.bits.mcg_ctl_p;
                mca_threshold_status_present = ia32_mcg_cap.bits.mcg_tes_p;
+               mca_cmci_present = ia32_mcg_cap.bits.mcg_ext_corr_err_p;
                if (family == 0x0F) {
                        mca_extended_MSRs_present = ia32_mcg_cap.bits.mcg_ext_p;
                        mca_extended_MSRs_count = ia32_mcg_cap.bits.mcg_ext_cnt;
@@ -144,6 +146,14 @@ mca_cpu_init(void)
        }
 }
 
+boolean_t
+mca_is_cmci_present(void)
+{
+       if (!mca_initialized)
+               mca_cpu_init();
+       return mca_cmci_present;
+}
+
 void
 mca_cpu_alloc(cpu_data_t       *cdp)
 {
@@ -195,6 +205,13 @@ mca_save_state(mca_state_t *mca_state)
                bank->mca_mci_addr = (bank->mca_mci_status.bits.addrv)?
                                        rdmsr64(IA32_MCi_ADDR(i)) : 0ULL;       
        } 
+
+       /*
+        * If we're the first thread with MCA state, point our package to it
+        * and don't care about races
+        */
+       if (x86_package()->mca_state == NULL)
+               x86_package()->mca_state = mca_state;
 }
 
 void
@@ -265,6 +282,78 @@ mca_report_cpu_info(void)
        kdb_printf(" %s\n", infop->cpuid_brand_string);
 }
 
+static const char *mc8_memory_operation[] = {
+       [MC8_MMM_GENERIC]               "generic",
+       [MC8_MMM_READ]                  "read",
+       [MC8_MMM_WRITE]                 "write",
+       [MC8_MMM_ADDRESS_COMMAND]       "address/command",
+       [MC8_MMM_RESERVED]              "reserved"
+};
+
+static void
+mca_dump_bank_mc8(mca_state_t *state, int i)
+{
+       mca_mci_bank_t                  *bank;
+       ia32_mci_status_t               status;
+       struct ia32_mc8_specific        mc8;
+       int                             mmm;
+
+       bank = &state->mca_error_bank[i];
+       status = bank->mca_mci_status;
+       mc8 = status.bits_mc8;
+       mmm = MIN(mc8.memory_operation, MC8_MMM_RESERVED);
+
+       kdb_printf(
+               " IA32_MC%d_STATUS(0x%x): 0x%016qx %svalid\n",
+               i, IA32_MCi_STATUS(i), status.u64, IF(!status.bits.val, "in"));
+       if (!status.bits.val)
+               return;
+
+       kdb_printf(
+               "  Channel number:         %d%s\n"
+               "  Memory Operation:       %s\n"
+               "  Machine-specific error: %s%s%s%s%s%s%s%s\n"
+               "  COR_ERR_CNT:            %d\n",
+               mc8.channel_number,
+               IF(mc8.channel_number == 15, " (unknown)"),
+               mc8_memory_operation[mmm],
+               IF(mc8.read_ecc,            "Read ECC"),
+               IF(mc8.ecc_on_a_scrub,      "ECC on scrub"),
+               IF(mc8.write_parity,        "Write parity"),
+               IF(mc8.redundant_memory,    "Redundant memory"),
+               IF(mc8.sparing,             "Sparing/Resilvering"),
+               IF(mc8.access_out_of_range, "Access out of Range"),
+               IF(mc8.address_parity,      "Address Parity"),
+               IF(mc8.byte_enable_parity,  "Byte Enable Parity"),
+               mc8.cor_err_cnt);
+       kdb_printf(
+               "  Status bits:\n%s%s%s%s%s%s",
+               IF(status.bits.pcc,         "   Processor context corrupt\n"),
+               IF(status.bits.addrv,       "   ADDR register valid\n"),
+               IF(status.bits.miscv,       "   MISC register valid\n"),
+               IF(status.bits.en,          "   Error enabled\n"),
+               IF(status.bits.uc,          "   Uncorrected error\n"),
+               IF(status.bits.over,        "   Error overflow\n"));
+       if (status.bits.addrv)
+               kdb_printf(
+                       " IA32_MC%d_ADDR(0x%x): 0x%016qx\n",
+                       i, IA32_MCi_ADDR(i), bank->mca_mci_addr);
+       if (status.bits.miscv) {
+               ia32_mc8_misc_t mc8_misc;
+
+               mc8_misc.u64 = bank->mca_mci_misc;
+               kdb_printf(
+                       " IA32_MC%d_MISC(0x%x): 0x%016qx\n"
+                       "  DIMM:     %d\n"
+                       "  Channel:  %d\n"
+                       "  Syndrome: 0x%x\n",
+                       i, IA32_MCi_MISC(i), mc8_misc.u64,
+                       mc8_misc.bits.dimm,
+                       mc8_misc.bits.channel,
+                       (int) mc8_misc.bits.syndrome);
+       }
+}
+
 static const char *mca_threshold_status[] = {
        [THRESHOLD_STATUS_NO_TRACKING]  "No tracking",
        [THRESHOLD_STATUS_GREEN]        "Green",
@@ -331,6 +420,37 @@ mca_dump_error_banks(mca_state_t *state)
 
        kdb_printf("MCA error-reporting registers:\n");
        for (i = 0; i < mca_error_bank_count; i++ ) {
+               if (i == 8) {
+                       /*
+                        * Fatal Memory Error
+                        */
+
+                       /* Dump MC8 for local package */
+                       kdb_printf(" Package %d logged:\n",
+                                  x86_package()->ppkg_num);
+                       mca_dump_bank_mc8(state, 8);
+
+                       /* If there are other packages, report their MC8s */
+                       x86_pkg_t       *pkg;
+                       uint64_t        deadline;
+                       for (pkg = x86_pkgs; pkg != NULL; pkg = pkg->next) {
+                               if (pkg == x86_package())
+                                       continue;
+                               deadline = mach_absolute_time() + LockTimeOut;
+                               while  (pkg->mca_state == NULL &&
+                                       mach_absolute_time() < deadline)
+                                       cpu_pause();
+                               if (pkg->mca_state) {
+                                       kdb_printf(" Package %d logged:\n",
+                                                  pkg->ppkg_num);
+                                       mca_dump_bank_mc8(pkg->mca_state, 8);
+                               } else {
+                                       kdb_printf(" Package %d timed out!\n",
+                                                  pkg->ppkg_num);
+                               }
+                       }
+                       continue;
+               }
                mca_dump_bank(state, i);
        }
 }
@@ -376,7 +496,8 @@ mca_dump(void)
                   " control MSR present\n"),
                IF(mca_threshold_status_present,
                   " threshold-based error status present\n"),
-               "");
+               IF(mca_cmci_present,
+                  " extended corrected memory error handling present\n"));
        if (mca_extended_MSRs_present)
                kdb_printf(
                        " %d extended MSRs present\n", mca_extended_MSRs_count);
index 233e78e2c9929b7907c9e1a95561ca11deeb2f85..7ecf69403999300e6014f9482f20d6afe90f5c10 100644 (file)
@@ -49,9 +49,10 @@ typedef union {
        uint64_t        count                   :BITS(7,0);
        uint64_t        mcg_ctl_p               :BIT1(8);
        uint64_t        mcg_ext_p               :BIT1(9);
-       uint64_t        mcg_reserved1           :BIT1(10);
+       uint64_t        mcg_ext_corr_err_p      :BIT1(10);
        uint64_t        mcg_tes_p               :BIT1(11);
-       uint64_t        mcg_reserved2           :BITS(15,12);
+       uint64_t        mcg_ecms                :BIT1(12);
+       uint64_t        mcg_reserved2           :BITS(15,13);
        uint64_t        mcg_ext_cnt             :BITS(23,16);
      }         bits;
      uint64_t  u64;
@@ -123,7 +124,7 @@ typedef union {
        uint64_t        over                    :BIT1(62);
        uint64_t        val                     :BIT1(63);
     }          bits;
-    struct {           /* Variant if threshold-based error status present: */
+     struct {          /* Variant if threshold-based error status present: */
        uint64_t        mca_error               :BITS(15,0);
        uint64_t        model_specific_error    :BITS(31,16);
        uint64_t        other_information       :BITS(52,32);
@@ -136,6 +137,21 @@ typedef union {
        uint64_t        over                    :BIT1(62);
        uint64_t        val                     :BIT1(63);
     }          bits_tes_p;
+    struct ia32_mc8_specific {
+       uint64_t        channel_number          :BITS(3,0);
+       uint64_t        memory_operation        :BITS(6,4);
+       uint64_t        unused                  :BITS(15,7);
+       uint64_t        read_ecc                :BIT1(16);
+       uint64_t        ecc_on_a_scrub          :BIT1(17);
+       uint64_t        write_parity            :BIT1(18);
+       uint64_t        redundant_memory        :BIT1(19);
+       uint64_t        sparing                 :BIT1(20);
+       uint64_t        access_out_of_range     :BIT1(21);
+       uint64_t        address_parity          :BIT1(23);
+       uint64_t        byte_enable_parity      :BIT1(24);
+       uint64_t        reserved                :BITS(37,25);
+       uint64_t        cor_err_cnt             :BITS(52,38);
+    }          bits_mc8;
     uint64_t   u64;
 } ia32_mci_status_t;
 
@@ -145,6 +161,24 @@ typedef union {
 #define THRESHOLD_STATUS_YELLOW                2
 #define THRESHOLD_STATUS_RESERVED      3
 
+/* MC8 memory operations encoding: */
+#define        MC8_MMM_GENERIC                 0
+#define        MC8_MMM_READ                    1
+#define        MC8_MMM_WRITE                   2
+#define        MC8_MMM_ADDRESS_COMMAND         3
+#define        MC8_MMM_RESERVED                4
+
+typedef union {
+    struct {
+       uint64_t        reserved1               :BITS(15,0);
+       uint64_t        dimm                    :BITS(17,16);
+       uint64_t        channel                 :BITS(19,18);
+       uint64_t        reserved2               :BITS(31,20);
+       uint64_t        syndrome                :BITS(63,32);
+    }          bits;
+    uint64_t   u64;
+} ia32_mc8_misc_t;
+
 typedef uint64_t       ia32_mci_addr_t;
 typedef uint64_t       ia32_mci_misc_t;
 
@@ -189,6 +223,7 @@ extern void         mca_cpu_alloc(cpu_data_t *cdp);
 extern void            mca_cpu_init(void);
 extern void            mca_dump(void);
 extern void            mca_check_save(void);
+extern boolean_t       mca_is_cmci_present(void);
 
 #endif /* _I386_MACHINE_CHECK_H_ */
 #endif /* KERNEL_PRIVATE */
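The bits_mc8 layout above can be cross-checked against a raw IA32_MC8_STATUS value with plain shifts and masks. A small sketch, using an invented sample value:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Invented sample: VAL (bit 63), read ECC (bit 16), memory
             * operation "read" (bits 6:4 = 1), channel 1 (bits 3:0). */
            uint64_t status = 0x8000000000010011ULL;

            unsigned channel  = (unsigned)(status & 0xF);        /* bits 3:0 */
            unsigned mem_op   = (unsigned)((status >> 4) & 0x7); /* bits 6:4 */
            int      read_ecc = (int)((status >> 16) & 1);       /* bit 16 */
            int      valid    = (int)((status >> 63) & 1);       /* bit 63 */

            printf("valid=%d channel=%u op=%u read_ecc=%d\n",
                   valid, channel, mem_op, read_ecc);
            return 0;
    }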
index d42f6d2f1f9aaf71361ac24d07b510a6b865b8b2..019d7f82f901adb95791aa448cab8580389c090a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -296,7 +296,7 @@ void
 machine_signal_idle(
         processor_t processor)
 {
-       cpu_interrupt(PROCESSOR_DATA(processor, slot_num));
+       cpu_interrupt(processor->cpu_num);
 }
 
 thread_t        
index f68b813768f7dee0ad029763b78baaa603fdf16c..bb4095af3a4eb0d09ae8ce125e43ec4dfe9e9689 100644 (file)
@@ -47,6 +47,7 @@ ENTRY(ml_get_timebase)
 
                        movl    S_ARG0, %ecx
                        
+                       lfence
                        rdtsc
                        lfence
                        
@@ -235,7 +236,9 @@ Lslow:
                pushl           %esi                                    /* save generation */
                pushl           RNT_SHIFT(%edi)                         /* save low 32 bits of tscFreq */
 
-               rdtsc                                                   /* get TSC in %edx:%eax */
+               lfence
+               rdtsc                                                   /* get TSC in %edx:%eax */
+               lfence
                subl            RNT_TSC_BASE(%edi),%eax
                sbbl            RNT_TSC_BASE+4(%edi),%edx
 
index 5073997835e1fe7d83e6e90389b7edbb99839800..00db14a6696e1a4186ede911ccee8dcdd5e802eb 100644 (file)
@@ -40,6 +40,7 @@
 
 #include <kern/kern_types.h>
 #include <kern/startup.h>
+#include <kern/timer_queue.h>
 #include <kern/processor.h>
 #include <kern/cpu_number.h>
 #include <kern/cpu_data.h>
@@ -145,6 +146,7 @@ decl_mutex_data(static, mp_bc_lock);
 static volatile int    debugger_cpu = -1;
 
 static void    mp_cpus_call_action(void); 
+static void    mp_call_PM(void);
 
 #if GPROF
 /*
@@ -208,13 +210,51 @@ mp_wait_for_cpu_up(int slot_num, unsigned int iters, unsigned int usecdelay)
        }
 }
 
+typedef struct {
+       int             target_cpu;
+       int             target_lapic;
+       int             starter_cpu;
+       boolean_t       is_nehalem;
+} processor_start_info_t;
+
+static processor_start_info_t start_info;
+
+static void
+start_cpu(void *arg)
+{
+       int                     i = 1000;
+       processor_start_info_t  *psip = (processor_start_info_t *) arg;
+
+       /* Ignore this if the current processor is not the starter */
+       if (cpu_number() != psip->starter_cpu)
+               return;
+
+       LAPIC_WRITE(ICRD, psip->target_lapic << LAPIC_ICRD_DEST_SHIFT);
+       LAPIC_WRITE(ICR, LAPIC_ICR_DM_INIT);
+       delay(psip->is_nehalem ? 100 : 10000);
+
+       LAPIC_WRITE(ICRD, psip->target_lapic << LAPIC_ICRD_DEST_SHIFT);
+       LAPIC_WRITE(ICR, LAPIC_ICR_DM_STARTUP|(MP_BOOT>>12));
+
+       if (!psip->is_nehalem) {
+               delay(200);
+               LAPIC_WRITE(ICRD, psip->target_lapic << LAPIC_ICRD_DEST_SHIFT);
+               LAPIC_WRITE(ICR, LAPIC_ICR_DM_STARTUP|(MP_BOOT>>12));
+       }
+
+#ifdef POSTCODE_DELAY
+       /* Wait much longer if postcodes are displayed for a delay period. */
+       i *= 10000;
+#endif
+       mp_wait_for_cpu_up(psip->target_cpu, i*100, 100);
+}
+
 kern_return_t
 intel_startCPU(
        int     slot_num)
 {
-
-       int     i = 1000;
-       int     lapic = cpu_to_lapic[slot_num];
+       int             lapic = cpu_to_lapic[slot_num];
+       boolean_t       istate;
 
        assert(lapic != -1);
 
@@ -232,35 +272,33 @@ intel_startCPU(
        else
                cpu_desc_init(cpu_datap(slot_num), FALSE);
 
-       /* Serialize use of the slave boot stack. */
+       /* Serialize use of the slave boot stack, etc. */
        mutex_lock(&mp_cpu_boot_lock);
 
-       mp_disable_preemption();
+       istate = ml_set_interrupts_enabled(FALSE);
        if (slot_num == get_cpu_number()) {
-               mp_enable_preemption();
+               ml_set_interrupts_enabled(istate);
                mutex_unlock(&mp_cpu_boot_lock);
                return KERN_SUCCESS;
        }
 
-       LAPIC_WRITE(ICRD, lapic << LAPIC_ICRD_DEST_SHIFT);
-       LAPIC_WRITE(ICR, LAPIC_ICR_DM_INIT);
-       delay(10000);
-
-       LAPIC_WRITE(ICRD, lapic << LAPIC_ICRD_DEST_SHIFT);
-       LAPIC_WRITE(ICR, LAPIC_ICR_DM_STARTUP|(MP_BOOT>>12));
-       delay(200);
+       start_info.starter_cpu = cpu_number();
+       start_info.is_nehalem = (cpuid_info()->cpuid_model
+                                       == CPUID_MODEL_NEHALEM);
+       start_info.target_cpu = slot_num;
+       start_info.target_lapic = lapic;
 
-       LAPIC_WRITE(ICRD, lapic << LAPIC_ICRD_DEST_SHIFT);
-       LAPIC_WRITE(ICR, LAPIC_ICR_DM_STARTUP|(MP_BOOT>>12));
-       delay(200);
-
-#ifdef POSTCODE_DELAY
-       /* Wait much longer if postcodes are displayed for a delay period. */
-       i *= 10000;
-#endif
-       mp_wait_for_cpu_up(slot_num, i, 10000);
+       /*
+        * For Nehalem, perform the processor startup with all running
+        * processors rendezvous'ed. This is required during periods when
+        * the cache-disable bit is set for MTRR/PAT initialization.
+        */
+       if (start_info.is_nehalem)
+               mp_rendezvous_no_intrs(start_cpu, (void *) &start_info);
+       else
+               start_cpu((void *) &start_info);
 
-       mp_enable_preemption();
+       ml_set_interrupts_enabled(istate);
        mutex_unlock(&mp_cpu_boot_lock);
 
        if (!cpu_datap(slot_num)->cpu_running) {
@@ -432,6 +470,10 @@ cpu_signal_handler(x86_saved_state_t *regs)
                        DBGLOG(cpu_handle,my_cpu,MP_CALL);
                        i_bit_clear(MP_CALL, my_word);
                        mp_cpus_call_action();
+               } else if (i_bit(MP_CALL_PM, my_word)) {
+                       DBGLOG(cpu_handle,my_cpu,MP_CALL_PM);
+                       i_bit_clear(MP_CALL_PM, my_word);
+                       mp_call_PM();
                }
        } while (*my_word);
 
@@ -548,6 +590,36 @@ cpu_NMI_interrupt(int cpu)
        }
 }
 
+static volatile void   (*mp_PM_func)(void) = NULL;
+
+static void
+mp_call_PM(void)
+{
+       assert(!ml_get_interrupts_enabled());
+
+       if (mp_PM_func != NULL)
+               mp_PM_func();
+}
+
+void
+cpu_PM_interrupt(int cpu)
+{
+       assert(!ml_get_interrupts_enabled());
+
+       if (mp_PM_func != NULL) {
+               if (cpu == cpu_number())
+                       mp_PM_func();
+               else
+                       i386_signal_cpu(cpu, MP_CALL_PM, ASYNC);
+       }
+}
+
+void
+PM_interrupt_register(void (*fn)(void))
+{
+       mp_PM_func = fn;
+}
+
 void
 i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode)
 {
@@ -977,6 +1049,8 @@ i386_activate_cpu(void)
        simple_unlock(&x86_topo_lock);
 }
 
+extern void etimer_timer_expire(void   *arg);
+
 void
 i386_deactivate_cpu(void)
 {
@@ -988,6 +1062,10 @@ i386_deactivate_cpu(void)
        cdp->cpu_running = FALSE;
        simple_unlock(&x86_topo_lock);
 
+       timer_queue_shutdown(&cdp->rtclock_timer.queue);
+       cdp->rtclock_timer.deadline = EndOfAllTime;
+       mp_cpus_call(cpu_to_cpumask(master_cpu), ASYNC, etimer_timer_expire, NULL);
+
        /*
         * In case a rendezvous/broadcast/call was initiated to this cpu
         * before we cleared cpu_running, we must perform any actions due.
@@ -1188,7 +1266,7 @@ void
 cause_ast_check(
        processor_t     processor)
 {
-       int     cpu = PROCESSOR_DATA(processor, slot_num);
+       int     cpu = processor->cpu_num;
 
        if (cpu != cpu_number()) {
                i386_signal_cpu(cpu, MP_AST, ASYNC);
index 0fac0fbd5ff84a0fda7896014442a2b58264f0df..d4b3551e76268b05d5de6a22846d38da2d9a688f 100644 (file)
@@ -164,6 +164,15 @@ extern cpu_t mp_cpus_call(
                void            (*action_func)(void *),
                void            *arg);
 
+/*
+ * Power-management-specific SPI to:
+ *  - register a callout function, and
+ *  - request the callout (if registered) on a given cpu.
+ */
+extern void PM_interrupt_register(void (*fn)(void));
+extern void cpu_PM_interrupt(int cpu);
+
+
 __END_DECLS
 
 #if MP_DEBUG
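PM_interrupt_register() and cpu_PM_interrupt() form a register-then-trigger pair: the power-management kext installs one callout, and cpu_PM_interrupt() either invokes it directly on the current cpu or sends an MP_CALL_PM IPI. A hedged sketch of a client, where pm_callout and pm_init_example are invented names:

    /* Hypothetical PM-kext client of the SPI declared above. */
    static void pm_callout(void)
    {
            /* Runs with interrupts disabled, per the assertions in mp.c. */
    }

    static void pm_init_example(void)
    {
            PM_interrupt_register(&pm_callout);
            cpu_PM_interrupt(1);    /* run the callout on cpu 1, or call
                                     * it directly if cpu 1 is current */
    }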
index 9e2df152ef445b3e77e0e2135d75d76dead27e9e..798a191df378420506f42bb5a0776167be26148c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -488,13 +488,15 @@ cpu_data_alloc(boolean_t is_boot_cpu)
 
        if (is_boot_cpu) {
                assert(real_ncpus == 1);
-               simple_lock_init(&cpu_lock, 0);
                cdp = &cpu_data_master;
                if (cdp->cpu_processor == NULL) {
+                       simple_lock_init(&cpu_lock, 0);
                        cdp->cpu_processor = cpu_processor_alloc(TRUE);
                        cdp->cpu_pmap = pmap_cpu_alloc(TRUE);
                        cpu_desc_init(cdp, TRUE);
                        fast_syscall_init();
+                       queue_init(&cdp->rtclock_timer.queue);
+                       cdp->rtclock_timer.deadline = EndOfAllTime;
                }
                return cdp;
        }
@@ -569,6 +571,8 @@ cpu_data_alloc(boolean_t is_boot_cpu)
        simple_unlock(&cpu_lock);
 
        cdp->cpu_nanotime = &rtc_nanotime_info;
+       queue_init(&cdp->rtclock_timer.queue);
+       cdp->rtclock_timer.deadline = EndOfAllTime;
 
        kprintf("cpu_data_alloc(%d) %p desc_table: %p "
                "ldt: %p "
@@ -673,22 +677,25 @@ cpu_physwindow_init(int cpu)
 {
        cpu_data_t              *cdp = cpu_data_ptr[cpu];
        cpu_desc_index_t        *cdi = &cdp->cpu_desc_index;
-        vm_offset_t            phys_window;
+        vm_offset_t            phys_window = cdp->cpu_physwindow_base;
 
-       if (vm_allocate(kernel_map, &phys_window,
-                       PAGE_SIZE, VM_FLAGS_ANYWHERE)
+       if (phys_window == 0) {
+               if (vm_allocate(kernel_map, &phys_window,
+                               PAGE_SIZE, VM_FLAGS_ANYWHERE)
                                != KERN_SUCCESS)
-               panic("cpu_physwindow_init: couldn't allocate phys map window");
+                       panic("cpu_physwindow_init: "
+                               "couldn't allocate phys map window");
 
-        /*
-         * make sure the page that encompasses the
-         * pte pointer we're interested in actually
-         * exists in the page table
-         */
-       pmap_expand(kernel_pmap, phys_window);
+               /*
+                * make sure the page that encompasses the
+                * pte pointer we're interested in actually
+                * exists in the page table
+                */
+               pmap_expand(kernel_pmap, phys_window);
 
-       cdp->cpu_physwindow_base = phys_window;
-       cdp->cpu_physwindow_ptep = vtopte(phys_window);
+               cdp->cpu_physwindow_base = phys_window;
+               cdp->cpu_physwindow_ptep = vtopte(phys_window);
+       }
 
        cdi->cdi_gdt[sel_idx(PHYS_WINDOW_SEL)] = physwindow_desc_pattern;
        cdi->cdi_gdt[sel_idx(PHYS_WINDOW_SEL)].offset = phys_window;
index 0da1d98c02b3f195fe97602c6f1017329ad04374..e870b0d0364f749d4450a87b23959a82f7c95d1b 100644 (file)
@@ -45,6 +45,7 @@ typedef enum {
        MP_CHUD,
        MP_BROADCAST,
        MP_CALL,
+       MP_CALL_PM,
        MP_LAST
 } mp_event_t;
 
@@ -60,6 +61,7 @@ const char *mp_event_name[] = {       \
        "MP_CHUD",              \
        "MP_BROADCAST",         \
        "MP_CALL",              \
+       "MP_CALL_PM",           \
        "MP_LAST"               \
 }
 
index 04f1504530affb5028d85247aa00200ef80fb4e7..70d007f52233aca3f892e237d525968bf71b9063 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -1789,9 +1789,11 @@ machine_thread_switch_addrmode(thread_t thread)
        machine_thread_create(thread, thread->task);
 
        /* If we're switching ourselves, reset the pcb addresses etc. */
-       if (thread == current_thread())
-               act_machine_switch_pcb(thread);
-
+       if (thread == current_thread()) {
+               if (current_cpu_datap()->cpu_active_cr3 != kernel_pmap->pm_cr3)
+                       pmap_load_kernel_cr3();
+               act_machine_switch_pcb(thread);
+       }
        enable_preemption();
 }
 
index d2efc8bc9bf03cf7b08296da4861342d5c10e7d2..88aa0f87b1189d9be69ac33ba1a13b077d4e2ef0 100644 (file)
@@ -106,6 +106,7 @@ machine_idle(void)
        goto out;
 
     my_cpu->lcpu.state = LCPU_IDLE;
+    my_cpu->lcpu.flags |= X86CORE_FL_IDLE;
     DBGLOG(cpu_handle, cpu_number(), MP_IDLE);
     MARK_CPU_IDLE(cpu_number());
 
@@ -129,6 +130,7 @@ machine_idle(void)
      */
     MARK_CPU_ACTIVE(cpu_number());
     DBGLOG(cpu_handle, cpu_number(), MP_UNIDLE);
+    my_cpu->lcpu.flags &= ~(X86CORE_FL_IDLE | X86CORE_FL_WAKEUP);
     my_cpu->lcpu.state = LCPU_RUN;
 
     /*
@@ -325,6 +327,7 @@ pmCPUExitIdle(cpu_data_t *cpu)
 {
     boolean_t          do_ipi;
 
+    cpu->lcpu.flags |= X86CORE_FL_WAKEUP;
     if (pmInitDone
        && pmDispatch != NULL
        && pmDispatch->exitIdle != NULL)
@@ -332,6 +335,9 @@ pmCPUExitIdle(cpu_data_t *cpu)
     else
        do_ipi = TRUE;
 
+    if (do_ipi)
+       cpu->lcpu.flags &= ~X86CORE_FL_WAKEUP;
+
     return(do_ipi);
 }
 
@@ -534,6 +540,34 @@ pmSafeMode(x86_lcpu_t *lcpu, uint32_t flags)
     }
 }
 
+static uint32_t                saved_run_count = 0;
+
+void
+machine_run_count(uint32_t count)
+{
+    if (pmDispatch != NULL
+       && pmDispatch->pmSetRunCount != NULL)
+       pmDispatch->pmSetRunCount(count);
+    else
+       saved_run_count = count;
+}
+
+boolean_t
+machine_cpu_is_inactive(int cpu)
+{
+    if (pmDispatch != NULL
+       && pmDispatch->pmIsCPUUnAvailable != NULL)
+       return(pmDispatch->pmIsCPUUnAvailable(cpu_to_lcpu(cpu)));
+    else
+       return(FALSE);
+}
+
+static uint32_t
+pmGetSavedRunCount(void)
+{
+    return(saved_run_count);
+}
+
 /*
  * Returns the root of the package tree.
  */
@@ -555,6 +589,22 @@ pmLCPUtoProcessor(int lcpu)
     return(cpu_datap(lcpu)->cpu_processor);
 }
 
+static void
+pmReSyncDeadlines(int cpu)
+{
+    static boolean_t   registered      = FALSE;
+
+    if (!registered) {
+       PM_interrupt_register(&etimer_resync_deadlines);
+       registered = TRUE;
+    }
+
+    if ((uint32_t)cpu == current_cpu_datap()->lcpu.cpu_num)
+       etimer_resync_deadlines();
+    else
+       cpu_PM_interrupt(cpu);
+}
+
 /*
  * Called by the power management kext to register itself and to get the
  * callbacks it might need into other kernel functions.  This interface
@@ -566,23 +616,26 @@ pmKextRegister(uint32_t version, pmDispatch_t *cpuFuncs,
               pmCallBacks_t *callbacks)
 {
     if (callbacks != NULL && version == PM_DISPATCH_VERSION) {
-       callbacks->setRTCPop   = setPop;
-       callbacks->resyncDeadlines = etimer_resync_deadlines;
-       callbacks->initComplete= pmInitComplete;
-       callbacks->GetLCPU     = pmGetLogicalCPU;
-       callbacks->GetCore     = pmGetCore;
-       callbacks->GetDie      = pmGetDie;
-       callbacks->GetPackage  = pmGetPackage;
-       callbacks->GetMyLCPU   = pmGetMyLogicalCPU;
-       callbacks->GetMyCore   = pmGetMyCore;
-       callbacks->GetMyDie    = pmGetMyDie;
-       callbacks->GetMyPackage= pmGetMyPackage;
-       callbacks->GetPkgRoot  = pmGetPkgRoot;
-       callbacks->LockCPUTopology = pmLockCPUTopology;
-       callbacks->GetHibernate    = pmCPUGetHibernate;
-       callbacks->LCPUtoProcessor = pmLCPUtoProcessor;
-       callbacks->ThreadBind      = thread_bind;
-       callbacks->topoParms       = &topoParms;
+       callbacks->setRTCPop            = setPop;
+       callbacks->resyncDeadlines      = pmReSyncDeadlines;
+       callbacks->initComplete         = pmInitComplete;
+       callbacks->GetLCPU              = pmGetLogicalCPU;
+       callbacks->GetCore              = pmGetCore;
+       callbacks->GetDie               = pmGetDie;
+       callbacks->GetPackage           = pmGetPackage;
+       callbacks->GetMyLCPU            = pmGetMyLogicalCPU;
+       callbacks->GetMyCore            = pmGetMyCore;
+       callbacks->GetMyDie             = pmGetMyDie;
+       callbacks->GetMyPackage         = pmGetMyPackage;
+       callbacks->GetPkgRoot           = pmGetPkgRoot;
+       callbacks->LockCPUTopology      = pmLockCPUTopology;
+       callbacks->GetHibernate         = pmCPUGetHibernate;
+       callbacks->LCPUtoProcessor      = pmLCPUtoProcessor;
+       callbacks->ThreadBind           = thread_bind;
+       callbacks->GetSavedRunCount     = pmGetSavedRunCount;
+       callbacks->topoParms            = &topoParms;
+    } else {
+       panic("Version mis-match between Kernel and CPU PM");
     }
 
     if (cpuFuncs != NULL) {
index ca3072b2a301e5493713d0feed7c80cb34111e68..6026f5ed6ff5878c9c2b837b6431a54cef74ddce 100644 (file)
@@ -37,7 +37,7 @@
  * This value should be changed each time that pmDispatch_t or pmCallBacks_t
  * changes.
  */
-#define PM_DISPATCH_VERSION    12
+#define PM_DISPATCH_VERSION    15
 
 /*
  * Dispatch table for functions that get installed when the power
@@ -69,11 +69,13 @@ typedef struct
     void               (*pmTimerStateRestore)(void);
     kern_return_t      (*exitHalt)(x86_lcpu_t *lcpu);
     void               (*markAllCPUsOff)(void);
+    void               (*pmSetRunCount)(uint32_t count);
+    boolean_t          (*pmIsCPUUnAvailable)(x86_lcpu_t *lcpu);
 } pmDispatch_t;
 
 typedef struct {
     int                        (*setRTCPop)(uint64_t time);
-    void               (*resyncDeadlines)(void);
+    void               (*resyncDeadlines)(int cpu);
     void               (*initComplete)(void);
     x86_lcpu_t         *(*GetLCPU)(int cpu);
     x86_core_t         *(*GetCore)(int cpu);
@@ -88,6 +90,7 @@ typedef struct {
     boolean_t          (*GetHibernate)(int cpu);
     processor_t                (*LCPUtoProcessor)(int lcpu);
     processor_t                (*ThreadBind)(processor_t proc);
+    uint32_t           (*GetSavedRunCount)(void);
     x86_topology_parameters_t  *topoParms;
 } pmCallBacks_t;
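
PM_DISPATCH_VERSION moves from 12 to 15 because both pmDispatch_t and pmCallBacks_t grew. The kernel and the power-management kext are built separately, so this version number is the only layout check, and the panic added above turns a stale kext into a loud failure instead of a corrupted callback table. A hypothetical kext-side registration, assuming only the declarations shown in this header (the handler bodies are placeholders):

/* Sketch: how a PM kext might call pmKextRegister(). Requires the
 * kernel-private pmCPU declarations above; not standalone. */
static void my_set_run_count(uint32_t count) { (void)count; }
static boolean_t my_cpu_unavailable(x86_lcpu_t *lcpu) { (void)lcpu; return FALSE; }

static pmDispatch_t my_dispatch = {
    /* ... the other handlers would be filled in here ... */
    .pmSetRunCount      = my_set_run_count,
    .pmIsCPUUnAvailable = my_cpu_unavailable,
};
static pmCallBacks_t kernel_callbacks;

void my_kext_start(void)
{
    /* Must be built against matching headers: a mismatch now panics. */
    pmKextRegister(PM_DISPATCH_VERSION, &my_dispatch, &kernel_callbacks);
}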
 
index a424d7e11e08ea1bd38f4966a234bd758cd984a7..75685d57ddbe0533a5378391feb6d37936a10070 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -3581,12 +3581,6 @@ phys_attribute_clear(
                    vm_map_offset_t va;
 
                    va = pv_e->va;
-                   /*
-                    * first make sure any processor actively
-                    * using this pmap, flushes its TLB state
-                    */
-
-                   PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
 
                    /*
                     * Clear modify and/or reference bits.
@@ -3594,7 +3588,13 @@ phys_attribute_clear(
 
                    pte = pmap_pte(pmap, va);
                    pmap_update_pte(pte, *pte, (*pte & ~bits));
-
+                   /* Ensure all processors using this translation
+                    * invalidate this TLB entry. The invalidation *must* follow
+                    * the PTE update, to ensure that the TLB shadow of the
+                    * 'D' bit (in particular) is synchronized with the
+                    * updated PTE.
+                    */
+                   PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
                }
 
                pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
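
The reordering above is the substance of this hunk: the shootdown used to run before the PTE update, leaving a window in which another CPU's TLB could write a stale dirty bit back after it had been cleared in memory; invalidating after the update closes it. The required ordering, reduced to a stub:

#include <stdint.h>

static void tlb_shootdown_stub(void)
{
    /* Stand-in for PMAP_UPDATE_TLBS(): IPI + invlpg on real hardware. */
}

/* The invalidation must FOLLOW the PTE update. If the TLBs were
 * flushed first, a racing CPU could refault, recreate its TLB entry,
 * and write a stale D bit back after we cleared it in memory. */
static void clear_phys_attribute_sketch(volatile uint64_t *pte, uint64_t bits)
{
    *pte &= ~bits;         /* 1. clear modify/reference bits in the PTE */
    tlb_shootdown_stub();  /* 2. then make every CPU drop its stale entry */
}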
index a74ca7548c8ac314fbac6f0aca297f2fbf03763d..a8eefb5b143adb04d2f51c5b59e1d46fed3a7299 100644 (file)
@@ -275,7 +275,7 @@ static inline void invlpg(unsigned long addr)
        __asm__ volatile("wrmsr" : : "c" (msr), "a" (lo), "d" (hi))
 
 #define rdtsc(lo,hi) \
-       __asm__ volatile("rdtsc; lfence" : "=a" (lo), "=d" (hi))
+       __asm__ volatile("lfence; rdtsc; lfence" : "=a" (lo), "=d" (hi))
 
 #define write_tsc(lo,hi) wrmsr(0x10, lo, hi)
 
@@ -297,7 +297,17 @@ static inline void wrmsr64(uint32_t msr, uint64_t val)
 static inline uint64_t rdtsc64(void)
 {
        uint64_t ret;
-       __asm__ volatile("rdtsc; lfence" : "=A" (ret));
+       __asm__ volatile("lfence; rdtsc; lfence" : "=A" (ret));
+       return ret;
+}
+
+static inline uint64_t rdtscp64(uint32_t *aux)
+{
+       uint64_t ret;
+       __asm__ volatile("rdtscp; mov %%ecx, %1"
+                               : "=A" (ret), "=m" (*aux)
+                               :
+                               : "ecx");
        return ret;
 }
 
@@ -412,4 +422,8 @@ __END_DECLS
 
 #define MSR_IA32_BIOS_SIGN_ID  0x08B
 
+#define MSR_FLEX_RATIO         0x194
+#define MSR_PLATFORM_INFO      0x0ce
+#define MSR_CORE_THREAD_COUNT  0x035
+
 #endif /* _I386_PROC_REG_H_ */
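
rdtsc is not a serializing instruction, so the added leading lfence keeps earlier loads from drifting past the timestamp read; the TSC read is now fenced on both sides. A user-space sketch of the same read (GCC inline assembly; the two-register form is used because the "=A" constraint only packs edx:eax on 32-bit):

#include <stdint.h>
#include <stdio.h>

/* Serialized TSC read, matching the lfence; rdtsc; lfence change above. */
static inline uint64_t rdtsc_serialized(void)
{
    uint32_t lo, hi;
    __asm__ volatile("lfence; rdtsc; lfence" : "=a" (lo), "=d" (hi));
    return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
    uint64_t t0 = rdtsc_serialized();
    /* ... region being timed ... */
    uint64_t t1 = rdtsc_serialized();
    printf("elapsed cycles: %llu\n", (unsigned long long)(t1 - t0));
    return 0;
}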
index 982c160f4a2749674dce28ede90c26fa0079d72f..4b06c8a1ec7430eee5504d51de082f119b287330 100644 (file)
@@ -86,9 +86,6 @@ uint64_t      rtc_decrementer_min;
 void                   rtclock_intr(x86_saved_state_t *regs);
 static uint64_t                maxDec;                 /* longest interval our hardware timer can handle (nsec) */
 
-/* XXX this should really be in a header somewhere */
-extern clock_timer_func_t      rtclock_timer_expire;
-
 static void    rtc_set_timescale(uint64_t cycles);
 static uint64_t        rtc_export_speed(uint64_t cycles);
 
@@ -461,14 +458,6 @@ clock_timebase_info(
        info->numer = info->denom =  1;
 }      
 
-void
-clock_set_timer_func(
-       clock_timer_func_t              func)
-{
-       if (rtclock_timer_expire == NULL)
-               rtclock_timer_expire = func;
-}
-
 /*
  * Real-time clock device interrupt.
  */
index e3ea716d46e088327768ee2308dd9e78d26c8ae1..6f3406a8cff4cdca4e355e650c0503781452f059 100644 (file)
@@ -83,6 +83,7 @@ extern rtc_nanotime_t rtc_nanotime_info;
 0:     movl    RNT_GENERATION(%edi),%esi       /* being updated? */    ; \
        testl   %esi,%esi                                               ; \
        jz      0b                              /* wait until done */   ; \
+       lfence                                                          ; \
        rdtsc                                                           ; \
        lfence                                                          ; \
        subl    RNT_TSC_BASE(%edi),%eax                                 ; \
@@ -111,6 +112,7 @@ extern rtc_nanotime_t       rtc_nanotime_info;
 0:     movl    RNT_GENERATION(%rdi),%esi                               ; \
        test    %esi,%esi                       /* info updating? */    ; \
        jz      0b                              /* - wait if so */      ; \
+       lfence                                                          ; \
        rdtsc                                                           ; \
        lfence                                                          ; \
        shlq    $32,%rdx                                                ; \
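
Both commpage variants spin while RNT_GENERATION is zero (the updater's in-progress marker) before reading the TSC, and now fence before rdtsc as well as after. A C sketch of the reader side of that generation protocol; a complete seqlock-style reader also re-checks the generation after the reads, as below (TSC-to-nanosecond scaling is omitted):

#include <stdint.h>

typedef struct {
    volatile uint32_t generation;  /* 0 while the record is being rewritten */
    uint64_t tsc_base;             /* RNT_TSC_BASE */
    uint64_t ns_base;              /* nanoseconds at tsc_base */
} rtc_nanotime_sketch_t;

static inline uint64_t rdtsc_fenced(void)
{
    uint32_t lo, hi;
    __asm__ volatile("lfence; rdtsc; lfence" : "=a" (lo), "=d" (hi));
    return ((uint64_t)hi << 32) | lo;
}

static uint64_t nanotime_sketch(const rtc_nanotime_sketch_t *rnt)
{
    uint32_t gen;
    uint64_t delta;
    do {
        while ((gen = rnt->generation) == 0)
            ;                              /* updater mid-rewrite: wait */
        delta = rdtsc_fenced() - rnt->tsc_base;
    } while (rnt->generation != gen);      /* record changed: retry */
    return rnt->ns_base + delta;           /* scaling omitted for brevity */
}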
index 624e5d431197bb8f781eabb4d074a7a056e6f613..669bc401f1ae7a7dc7fec6127f974d70d4ec6546 100644 (file)
@@ -148,12 +148,47 @@ tsc_init(void)
                        cpuid_info()->cpuid_family);
        }
 
-       {
+       switch (cpuid_info()->cpuid_model) {
+       case CPUID_MODEL_NEHALEM: {
+               uint64_t cpu_mhz;
+               uint64_t msr_flex_ratio;
+               uint64_t msr_platform_info;
+
+               /* See if FLEX_RATIO is being used */
+               msr_flex_ratio = rdmsr64(MSR_FLEX_RATIO);
+               msr_platform_info = rdmsr64(MSR_PLATFORM_INFO);
+               flex_ratio_min = (uint32_t)bitfield(msr_platform_info, 47, 40);
+               flex_ratio_max = (uint32_t)bitfield(msr_platform_info, 15, 8);
+               /* No BIOS-programmed flex ratio. Use hardware max as default */
+               tscGranularity = flex_ratio_max;
+               if (msr_flex_ratio & bit(16)) {
+                       /* Flex Enabled: Use this MSR if less than max */
+                       flex_ratio = (uint32_t)bitfield(msr_flex_ratio, 15, 8);
+                       if (flex_ratio < flex_ratio_max)
+                               tscGranularity = flex_ratio;
+               }
+
+               /* If EFI isn't configured correctly, use a constant 
+                * value. See 6036811.
+                */
+               if (busFreq == 0)
+                       busFreq = BASE_NHM_CLOCK_SOURCE;
+
+               cpu_mhz = tscGranularity * BASE_NHM_CLOCK_SOURCE;
+
+               kprintf("[NHM] Maximum Non-Turbo Ratio = [%d]\n",
+                       (uint32_t)tscGranularity);
+               kprintf("[NHM] CPU: Frequency          = %6d.%04dMhz\n", 
+                       (uint32_t)(cpu_mhz / Mega), (uint32_t)(cpu_mhz % Mega));
+               break;
+            }
+       default: {
                uint64_t        prfsts;
 
                prfsts = rdmsr64(IA32_PERF_STS);
                tscGranularity = (uint32_t)bitfield(prfsts, 44, 40);
                N_by_2_bus_ratio = (prfsts & bit(46)) != 0;
+           }
        }
 
        if (busFreq != 0) {
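
On Nehalem the TSC runs at a fixed multiple of the base clock, so tsc_init() now derives the frequency from MSRs instead of IA32_PERF_STS: bits 15:8 of MSR_PLATFORM_INFO give the maximum non-turbo ratio (bits 47:40 the minimum), and MSR_FLEX_RATIO bits 15:8 override it when the flex-enable bit 16 is set and the value is lower. The same arithmetic as a standalone program, with hypothetical MSR values:

#include <stdint.h>
#include <stdio.h>

#define BASE_NHM_CLOCK_SOURCE 139806638ULL  /* Hz, from the tsc.h hunk below */

/* bitfield(x, hi, lo): extract bits hi..lo inclusive, as in XNU. */
static uint64_t bitfield(uint64_t x, int hi, int lo)
{
    return (x >> lo) & ((1ULL << (hi - lo + 1)) - 1);
}

int main(void)
{
    /* Hypothetical register contents, for illustration only. */
    uint64_t msr_platform_info = 20ULL << 8;                  /* max = 20 */
    uint64_t msr_flex_ratio    = (1ULL << 16) | (18ULL << 8); /* flex = 18 */

    uint64_t ratio = bitfield(msr_platform_info, 15, 8);
    if (msr_flex_ratio & (1ULL << 16)) {        /* flex ratio enabled */
        uint64_t flex = bitfield(msr_flex_ratio, 15, 8);
        if (flex < ratio)
            ratio = flex;
    }
    uint64_t cpu_hz = ratio * BASE_NHM_CLOCK_SOURCE;
    printf("ratio %llu -> %llu Hz\n",
           (unsigned long long)ratio, (unsigned long long)cpu_hz);
    return 0;
}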
index 1b6589de7805b087b947c03980769a28dcf63878..e702ec234b818f0815cb7d23619046d7640f1072 100644 (file)
@@ -40,6 +40,7 @@
 #ifndef _I386_TSC_H_
 #define _I386_TSC_H_
 
+#define BASE_NHM_CLOCK_SOURCE  139806638ULL
 #define IA32_PERF_STS          0x198
 
 extern uint64_t        busFCvtt2n;
index 31c16ca9f6c85b1a0d9e7210601c9737d50c0580..c86af004ec450c1d0540ecc5bdf4422ec64ddbef 100644 (file)
@@ -368,6 +368,7 @@ void
 vmx_resume()
 {
        VMX_KPRINTF("vmx_resume\n");
+       vmx_init(); /* init VMX on CPU #0 */
        if (vmx_use_count)
                vmx_on();
 }
index 0c07ed8ea8ab262ab8737ccd8711dc9769ee82a1..db7e6acf49cb22331581f618e6ae0f16f7064151 100644 (file)
@@ -241,30 +241,9 @@ MACRO_END
 /*
  *     extern void
  *     ipc_kmsg_send_always(ipc_kmsg_t);
- *
- *     Unfortunately, to avoid warnings/lint about unused variables
- *     when assertions are turned off, we need two versions of this.
  */
-#if    MACH_ASSERT
-
 #define        ipc_kmsg_send_always(kmsg)                                      \
-MACRO_BEGIN                                                            \
-       mach_msg_return_t mr2;                                          \
-                                                                       \
-       mr2 = ipc_kmsg_send((kmsg), MACH_SEND_ALWAYS,                   \
-                            MACH_MSG_TIMEOUT_NONE);                    \
-       assert(mr == MACH_MSG_SUCCESS);                                 \
-MACRO_END
-
-#else  /* MACH_ASSERT */
-
-#define        ipc_kmsg_send_always(kmsg)                                      \
-MACRO_BEGIN                                                            \
-       (void) ipc_kmsg_send((kmsg), MACH_SEND_ALWAYS,                  \
-                              MACH_MSG_TIMEOUT_NONE);                  \
-MACRO_END
-
-#endif /* MACH_ASSERT */
+       ipc_kmsg_send((kmsg), MACH_SEND_ALWAYS, MACH_MSG_TIMEOUT_NONE)
 
 
 /* Allocate a kernel message */
index 66c3db6f1ae221637a6ed3998865c86e835b4b55..316babd8df542f751b13695291d0643b02d962ad 100644 (file)
@@ -361,9 +361,10 @@ ipc_mqueue_send(
        imq_lock(mqueue);
 
        if (!imq_full(mqueue) ||
-               (option & MACH_SEND_ALWAYS) ||
-               (MACH_MSGH_BITS_REMOTE(kmsg->ikm_header->msgh_bits) ==
-                MACH_MSG_TYPE_PORT_SEND_ONCE)) {
+           (!imq_full_kernel(mqueue) && 
+            ((option & MACH_SEND_ALWAYS) ||
+             (MACH_MSGH_BITS_REMOTE(kmsg->ikm_header->msgh_bits) ==
+              MACH_MSG_TYPE_PORT_SEND_ONCE)))) {
                mqueue->imq_msgcount++;
                assert(mqueue->imq_msgcount > 0);
                imq_unlock(mqueue);
@@ -380,6 +381,11 @@ ipc_mqueue_send(
                        splx(s);
                        return MACH_SEND_TIMED_OUT;
                }
+               if (imq_full_kernel(mqueue)) {
+                       imq_unlock(mqueue);
+                       splx(s);
+                       return MACH_SEND_NO_BUFFER;
+               }
                mqueue->imq_fullwaiters = TRUE;
                thread_lock(cur_thread);
                if (option & MACH_SEND_TIMEOUT)
index 1fa4294f8a5b1da6ee36d43e9cc1a9175f448321..4ef47d9692227e5ab18d1fb5e4cf9f5ae6a1e055 100644 (file)
@@ -112,6 +112,7 @@ typedef struct ipc_mqueue {
 #define imq_held(mq)           wait_queue_held(&(mq)->imq_wait_queue)
 
 #define imq_full(mq)           ((mq)->imq_msgcount >= (mq)->imq_qlimit)
+#define imq_full_kernel(mq)    ((mq)->imq_msgcount >= MACH_PORT_QLIMIT_KERNEL)
 
 extern int ipc_mqueue_full;
 extern int ipc_mqueue_rcv;
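
imq_full() still enforces the user-visible qlimit; the new imq_full_kernel() caps how far MACH_SEND_ALWAYS and send-once sends may overrun it, at MACH_PORT_QLIMIT_KERNEL, and ipc_mqueue_send above now answers MACH_SEND_NO_BUFFER once even that ceiling is hit. The admission test, restated as a sketch (the 65534 limit is a placeholder, not the real constant's value):

#include <stdbool.h>

#define QLIMIT_KERNEL_SKETCH 65534  /* placeholder for MACH_PORT_QLIMIT_KERNEL */

typedef struct { int msgcount; int qlimit; } mqueue_sketch_t;

static bool can_enqueue(const mqueue_sketch_t *mq,
                        bool send_always, bool send_once)
{
    bool full        = mq->msgcount >= mq->qlimit;
    bool full_kernel = mq->msgcount >= QLIMIT_KERNEL_SKETCH;

    /* Ordinary sends respect qlimit; privileged sends may exceed it,
     * but never the kernel-wide hard ceiling. */
    return !full || (!full_kernel && (send_always || send_once));
}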
index bbffba8bc174003ba89dd453f5bc9ae1723b0741..25a26aa63956d194316e3e434ffdb6f0977135a7 100644 (file)
@@ -84,13 +84,8 @@ ipc_notify_port_deleted(
        ipc_port_t              port,
        mach_port_name_t        name)
 {
-       kern_return_t kr;
-
-       kr = mach_notify_port_deleted(port, name);
-       if (kr != KERN_SUCCESS) {
-               printf("dropped port-deleted (%p, 0x%x)\n", port, name);
-               ipc_port_release_sonce(port);
-       }
+       (void)mach_notify_port_deleted(port, name);
+       /* send-once right consumed */
 }
 
 /*
@@ -110,15 +105,8 @@ ipc_notify_port_destroyed(
        ipc_port_t      port,
        ipc_port_t      right)
 {
-       kern_return_t kr;
-
-       kr = mach_notify_port_destroyed(port, right);
-       if (kr != KERN_SUCCESS) {
-               printf("dropped port-destroyed (%p, %p)\n",
-                      port, right);
-               ipc_port_release_sonce(port);
-               ipc_port_release_receive(right);
-       }
+       mach_notify_port_destroyed(port, right);
+       /* send-once and receive rights consumed */
 }
 
 /*
@@ -135,13 +123,8 @@ ipc_notify_no_senders(
        ipc_port_t              port,
        mach_port_mscount_t     mscount)
 {
-       kern_return_t kr;
-
-       kr = mach_notify_no_senders(port, mscount);
-       if (kr != KERN_SUCCESS) {
-               printf("dropped no-senders (%p, %u)\n", port, mscount);
-               ipc_port_release_sonce(port);
-       }
+       (void)mach_notify_no_senders(port, mscount);
+       /* send-once right consumed */
 }
 
 /*
@@ -157,13 +140,8 @@ void
 ipc_notify_send_once(
        ipc_port_t      port)
 {
-       kern_return_t kr;
-
-       kr = mach_notify_send_once(port);
-       if (kr != KERN_SUCCESS) {
-               printf("dropped send-once (%p)\n", port);
-               ipc_port_release_sonce(port);
-       }
+       (void)mach_notify_send_once(port);
+       /* send-once right consumed */
 }
 
 /*
@@ -180,11 +158,6 @@ ipc_notify_dead_name(
        ipc_port_t              port,
        mach_port_name_t        name)
 {
-       kern_return_t kr;
-
-       kr = mach_notify_dead_name(port, name);
-       if (kr != KERN_SUCCESS) {
-               printf("dropped dead-name (%p, 0x%x)\n", port, name);
-               ipc_port_release_sonce(port);
-       }
+       (void)mach_notify_dead_name(port, name);
+       /* send-once right consumed */
 }
index 1d9d165964ff31eb0a9e66d2e7e4575c0310bc32..e7ffd94ecb1633964b000a246c463574b89384b2 100644 (file)
@@ -625,7 +625,7 @@ ipc_right_clean(
            }
 
            default:
-               panic("ipc_right_clean: strange type");
+               panic("ipc_right_clean: strange type - 0x%x", type);
        }
 }
 
index 065f3b40fcff2b1c5875fe1c703558157cdc83e6..8d3f17e4b204d4ffa12505955233386c0a75be9e 100644 (file)
@@ -1652,8 +1652,12 @@ mach_msg_overwrite_trap(
                    (reply_port->ip_receiver_name != rcv_name) ||
                    (reply_port->ip_pset_count != 0))
                {
+                       /* try to enqueue by sending with an immediate timeout */
                        ip_unlock(reply_port);
-                       ipc_kmsg_send_always(kmsg);
+                       mr = ipc_kmsg_send(kmsg, MACH_SEND_TIMEOUT, 0);
+                       if (mr != MACH_MSG_SUCCESS) {
+                               ipc_kmsg_destroy(kmsg);
+                       }
                        HOT(c_mmot_cold_052++);
                        goto slow_get_rcv_port;
                }
@@ -1668,6 +1672,8 @@ mach_msg_overwrite_trap(
                 * If there are messages on the port
                 * or other threads waiting for a message,
                 * we cannot directly receive the reply.
+                * Try to enqueue it by sending with an
+                * immediate timeout.
                 */
                if (!wait_queue_empty(&rcv_mqueue->imq_wait_queue) ||
                    (ipc_kmsg_queue_first(&rcv_mqueue->imq_messages) != IKM_NULL))
@@ -1675,7 +1681,10 @@ mach_msg_overwrite_trap(
                        imq_unlock(rcv_mqueue);
                        splx(s);
                        ip_unlock(reply_port);
-                       ipc_kmsg_send_always(kmsg);
+                       mr = ipc_kmsg_send(kmsg, MACH_SEND_TIMEOUT, 0);
+                       if (mr != MACH_MSG_SUCCESS) {
+                               ipc_kmsg_destroy(kmsg);
+                       }
                        HOT(c_mmot_cold_053++);
                        goto slow_get_rcv_port;
                }
index f0b75b59a1ee4020e9028af80e2b3a47c6e5dcd5..9a0e95ec60dde20055d9d28232b4bb52a9cb1b6e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -164,14 +164,9 @@ ast_taken(
                        /* 
                         * Check for preemption.
                         */
-                       if (reasons & AST_PREEMPT) {
-                               processor_t             myprocessor = current_processor();
+                       if (reasons & AST_PREEMPT)
+                               reasons = csw_check(current_processor());
 
-                               if (csw_needed(thread, myprocessor))
-                                       reasons = AST_PREEMPT;
-                               else
-                                       reasons = AST_NONE;
-                       }
                        if (    (reasons & AST_PREEMPT)                         &&
                                        wait_queue_assert_possible(thread)              ) {             
                                counter(c_ast_taken_block++);
@@ -205,7 +200,7 @@ ast_check(
                /*
                 *      Context switch check.
                 */
-               if ((preempt = csw_check(thread, processor)) != AST_NONE)
+               if ((preempt = csw_check(processor)) != AST_NONE)
                        ast_on(preempt);
        }
 }
index 0990ad1428e9b78b0b3af439b6721998defaf82f..57ab51d5e016558a681458c0041c231143b638fe 100644 (file)
@@ -1,6 +1,5 @@
 /*
- * Copyright (c) 1993-1995, 1999-2000 Apple Computer, Inc.
- * All rights reserved.
+ * Copyright (c) 1993-1995, 1999-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
- * Private declarations for thread-based callouts.
- *
- * HISTORY
- *
- * 10 July 1999 (debo)
- *  Pulled into Mac OS X (microkernel).
- *
- * 3 July 1993 (debo)
- *     Created.
+ * Declarations for generic call outs.
  */
 
 #ifndef _KERN_CALL_ENTRY_H_
@@ -51,21 +42,32 @@ typedef void                        (*call_entry_func_t)(
 
 typedef struct call_entry {
     queue_chain_t              q_link;
+       queue_t                         queue;
     call_entry_func_t  func;
     call_entry_param_t param0;
     call_entry_param_t param1;
     uint64_t                   deadline;
-    enum {
-         IDLE,
-         PENDING,
-         DELAYED }                     state;
 } call_entry_data_t;
 
+typedef struct call_entry              *call_entry_t;
+
+extern queue_t         call_entry_enqueue_deadline(
+                                                       call_entry_t            entry,
+                                                       queue_t                         queue,
+                                                       uint64_t                        deadline);
+
+extern queue_t         call_entry_enqueue_tail(
+                                                       call_entry_t    entry,
+                                                       queue_t                 queue);
+
+extern queue_t         call_entry_dequeue(
+                                                       call_entry_t    entry);
+
 #define        call_entry_setup(entry, pfun, p0)                               \
 MACRO_BEGIN                                                                                            \
        (entry)->func           = (call_entry_func_t)(pfun);    \
        (entry)->param0         = (call_entry_param_t)(p0);             \
-       (entry)->state          = IDLE;                                                 \
+       (entry)->queue          = NULL;                                                 \
 MACRO_END
 
 #endif /* MACH_KERNEL_PRIVATE */
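
The three-state enum is gone: a callout's state is now implied by which queue it sits on, with entry->queue == NULL meaning idle (call_entry_setup above initializes it that way), so state and queue membership can no longer disagree. A sketch of that invariant; the queue plumbing is simplified, and the assumption that the helpers return the entry's previous queue is mine, not the header's:

#include <stddef.h>
#include <stdbool.h>

typedef struct queue_sketch {
    struct call_entry_sketch *head;
} queue_sketch_t;

typedef struct call_entry_sketch {
    struct call_entry_sketch *next;
    queue_sketch_t *queue;   /* NULL <=> idle; otherwise the owning queue */
} call_entry_sketch_t;

static bool call_entry_is_idle(const call_entry_sketch_t *e)
{
    return e->queue == NULL; /* replaces the old IDLE enum state */
}

static queue_sketch_t *enqueue_tail_sketch(call_entry_sketch_t *e,
                                           queue_sketch_t *q)
{
    queue_sketch_t *old = e->queue;
    e->next = q->head;       /* naive LIFO push; the kernel keeps order */
    q->head = e;
    e->queue = q;            /* membership and state updated together */
    return old;
}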
index de80864066a98f8873225b513d5b552f3117504f..5ca49ea7457db29998fa6bda339063b5eb93ab2a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -46,6 +46,8 @@
 
 #ifdef MACH_KERNEL_PRIVATE
 
+#include <kern/queue.h>
+
 /*
  * Clock operations list structure. Contains vectors to machine
  * dependent clock routines.
@@ -96,15 +98,6 @@ extern void          clock_timebase_init(void);
  */
 extern void            clock_service_create(void);
 
-typedef void           (*clock_timer_func_t)(
-                                               uint64_t                        timestamp);
-
-extern void                    clock_set_timer_func(
-                                               clock_timer_func_t      func);
-
-extern void                    clock_set_timer_deadline(
-                                               uint64_t                        deadline);
-
 extern void                    clock_gettimeofday_set_commpage(
                                                uint64_t                                abstime,
                                                uint64_t                                epoch,
index 679e1779c7df0a63c50dd581f57b06e4db95aab8..599d1670ea0bb220d2dafb7b3d5ff8db69b1026d 100644 (file)
@@ -66,6 +66,7 @@
 #include <kern/assert.h>
 #include <kern/sched_prim.h>
 #include <kern/misc_protos.h>
+#include <kern/clock.h>
 #include <vm/vm_kern.h>
 #include <vm/pmap.h>
 #include <stdarg.h>
@@ -106,10 +107,15 @@ unsigned int              panic_is_inited = 0;
 unsigned int           return_on_panic = 0;
 unsigned long          panic_caller;
 
-char debug_buf[PAGE_SIZE];
-ppnum_t debug_buf_page;
-char *debug_buf_ptr;
-unsigned int debug_buf_size;
+#if CONFIG_EMBEDDED
+#define DEBUG_BUF_SIZE (PAGE_SIZE)
+#else
+#define DEBUG_BUF_SIZE (3 * PAGE_SIZE)
+#endif
+
+char debug_buf[DEBUG_BUF_SIZE];
+char *debug_buf_ptr = debug_buf;
+unsigned int debug_buf_size = sizeof(debug_buf);
 
 static char model_name[64];
 
@@ -184,9 +190,7 @@ debug_log_init(void)
        if (debug_buf_size != 0)
                return;
        debug_buf_ptr = debug_buf;
-       debug_buf_size = PAGE_SIZE;
-        debug_buf_page = pmap_find_phys(kernel_pmap,
-                                       (addr64_t)(uintptr_t)debug_buf_ptr);
+       debug_buf_size = sizeof(debug_buf);
 }
 
 #if __i386__
@@ -397,6 +401,13 @@ static void panic_display_model_name(void) {
                kdb_printf("System model name: %s\n", model_name);
 }
 
+static void panic_display_uptime(void) {
+       uint64_t        uptime;
+       absolutetime_to_nanoseconds(mach_absolute_time(), &uptime);
+
+       kdb_printf("\nSystem uptime in nanoseconds: %llu\n", uptime);
+}
+
 extern const char version[];
 extern char osversion[];
 
@@ -409,10 +420,54 @@ __private_extern__ void panic_display_system_configuration(void) {
                    (osversion[0] != 0) ? osversion : "Not yet set");
                kdb_printf("\nKernel version:\n%s\n",version);
                panic_display_model_name();
+               panic_display_uptime();
                config_displayed = TRUE;
        }
 }
 
+extern zone_t          first_zone;
+extern unsigned int    num_zones, stack_total;
+
+#if defined(__i386__)
+extern unsigned int    inuse_ptepages_count;
+#endif
+
+extern boolean_t       panic_include_zprint;
+extern vm_size_t       kalloc_large_total;
+
+__private_extern__ void panic_display_zprint()
+{
+       if(panic_include_zprint == TRUE) {
+
+               unsigned int    i;
+               struct zone     zone_copy;
+
+               if(first_zone!=NULL) {
+                       if(ml_nofault_copy((vm_offset_t)first_zone, (vm_offset_t)&zone_copy, sizeof(struct zone)) == sizeof(struct zone)) {
+                               for (i = 0; i < num_zones; i++) {
+                                       if(zone_copy.cur_size > (1024*1024)) {
+                                               kdb_printf("%.20s:%lu\n",zone_copy.zone_name,(uintptr_t)zone_copy.cur_size);
+                                       }       
+                                       
+                                       if(zone_copy.next_zone == NULL) {
+                                               break;
+                                       }
+
+                                       if(ml_nofault_copy((vm_offset_t)zone_copy.next_zone, (vm_offset_t)&zone_copy, sizeof(struct zone)) != sizeof(struct zone)) {
+                                               break;
+                                       }
+                               }
+                       }
+               }
+
+               kdb_printf("Kernel Stacks:%lu\n",(uintptr_t)(KERNEL_STACK_SIZE * stack_total));
+#if defined(__i386__)
+               kdb_printf("PageTables:%lu\n",(uintptr_t)(PAGE_SIZE * inuse_ptepages_count));
+#endif
+               kdb_printf("Kalloc.Large:%lu\n",(uintptr_t)kalloc_large_total);
+       }
+}
+
 #if !MACH_KDP
 static struct ether_addr kdp_current_mac_address = {{0, 0, 0, 0, 0, 0}};
 unsigned int not_in_kdp = 1;
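
panic_display_zprint() walks the zone list at panic time, when any pointer may already be garbage, so every dereference goes through ml_nofault_copy() and the walk simply stops on a short copy rather than faulting inside the panic path. The pattern, reduced to a standalone sketch (the stub pretends every copy is safe):

#include <stdint.h>
#include <string.h>

typedef uintptr_t vm_offset_sketch_t;

/* Stand-in for ml_nofault_copy(): returns the number of bytes copied
 * without faulting, so a short return flags a bad pointer. */
static size_t nofault_copy_stub(vm_offset_sketch_t src,
                                vm_offset_sketch_t dst, size_t size)
{
    memcpy((void *)dst, (const void *)src, size);
    return size;
}

struct zone_sketch { struct zone_sketch *next; unsigned long cur_size; };

static void walk_zones_sketch(struct zone_sketch *first, unsigned int nzones)
{
    struct zone_sketch copy;
    if (first == NULL ||
        nofault_copy_stub((vm_offset_sketch_t)first,
                          (vm_offset_sketch_t)&copy,
                          sizeof copy) != sizeof copy)
        return;                        /* bad head pointer: give up */
    for (unsigned int i = 0; i < nzones; i++) {
        /* ... report copy.cur_size for this zone here ... */
        if (copy.next == NULL)
            break;                     /* end of list */
        if (nofault_copy_stub((vm_offset_sketch_t)copy.next,
                              (vm_offset_sketch_t)&copy,
                              sizeof copy) != sizeof copy)
            break;                     /* list went bad mid-walk */
    }
}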
index e861592e6ef7175fc9c2e877b29eec42842e86a4..cdf94989f89aea716054fef8e25baef587962001 100644 (file)
@@ -84,6 +84,7 @@ int   packA(char *inbuf, uint32_t length, uint32_t buflen);
 void   unpackA(char *inbuf, uint32_t length);
 
 void   panic_display_system_configuration(void);
+void   panic_display_zprint(void);
 
 #endif /* MACH_KERNEL_PRIVATE */
 
index 3decaefe2306f6c2f16f80c00052f22ad69991da..27a089239338367b714b15b768cdf2619264958b 100644 (file)
@@ -189,11 +189,13 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list,
     uint64_t start, end, nsec;
     vm_page_t m;
     uint32_t pages = page_list->page_count;
-    uint32_t count_zf = 0, count_throttled = 0, count_inactive = 0, count_active = 0;
+    uint32_t count_zf = 0, count_throttled = 0;
+    uint32_t count_inactive = 0, count_active = 0, count_speculative = 0;
     uint32_t count_wire = pages;
     uint32_t count_discard_active    = 0;
     uint32_t count_discard_inactive  = 0;
     uint32_t count_discard_purgeable = 0;
+    uint32_t count_discard_speculative = 0;
     uint32_t i;
     uint32_t             bank;
     hibernate_bitmap_t * bitmap;
@@ -262,7 +264,7 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list,
     queue_iterate( &vm_page_queue_zf,
                     m,
                     vm_page_t,
-                    pageq )
+                   pageq )
     {
         if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) 
          && consider_discard(m))
@@ -299,6 +301,26 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list,
        hibernate_page_bitset(page_list_wired, TRUE, m->phys_page);
     }
 
+    for( i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++ )
+    {
+       queue_iterate(&vm_page_queue_speculative[i].age_q,
+                     m,
+                     vm_page_t,
+                     pageq)
+       {
+           if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) 
+            && consider_discard(m))
+           {
+               hibernate_page_bitset(page_list, TRUE, m->phys_page);
+               count_discard_speculative++;
+           }
+           else
+               count_speculative++;
+           count_wire--;
+           hibernate_page_bitset(page_list_wired, TRUE, m->phys_page);
+       }
+    }
+
     queue_iterate( &vm_page_queue_active,
                     m,
                     vm_page_t,
@@ -338,11 +360,11 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list,
     absolutetime_to_nanoseconds(end - start, &nsec);
     HIBLOG("hibernate_page_list_setall time: %qd ms\n", nsec / 1000000ULL);
 
-    HIBLOG("pages %d, wire %d, act %d, inact %d, zf %d, throt %d, could discard act %d inact %d purgeable %d\n", 
-                pages, count_wire, count_active, count_inactive, count_zf, count_throttled,
-                count_discard_active, count_discard_inactive, count_discard_purgeable);
+    HIBLOG("pages %d, wire %d, act %d, inact %d, spec %d, zf %d, throt %d, could discard act %d inact %d purgeable %d spec %d\n", 
+                pages, count_wire, count_active, count_inactive, count_speculative, count_zf, count_throttled,
+                count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative);
 
-    *pagesOut = pages - count_discard_active - count_discard_inactive - count_discard_purgeable;
+    *pagesOut = pages - count_discard_active - count_discard_inactive - count_discard_purgeable - count_discard_speculative;
 }
 
 void
@@ -351,9 +373,11 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list)
     uint64_t  start, end, nsec;
     vm_page_t m;
     vm_page_t next;
+    uint32_t  i;
     uint32_t  count_discard_active    = 0;
     uint32_t  count_discard_inactive  = 0;
     uint32_t  count_discard_purgeable = 0;
+    uint32_t  count_discard_speculative = 0;
 
     clock_get_uptime(&start);
 
@@ -372,6 +396,21 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list)
         m = next;
     }
 
+    for( i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++ )
+    {
+       m = (vm_page_t) queue_first(&vm_page_queue_speculative[i].age_q);
+       while (m && !queue_end(&vm_page_queue_speculative[i].age_q, (queue_entry_t)m))
+       {
+           next = (vm_page_t) m->pageq.next;
+           if (hibernate_page_bittst(page_list, m->phys_page))
+           {
+               count_discard_speculative++;
+               discard_page(m);
+           }
+           m = next;
+       }
+    }
+
     m = (vm_page_t) queue_first(&vm_page_queue_inactive);
     while (m && !queue_end(&vm_page_queue_inactive, (queue_entry_t)m))
     {
@@ -404,9 +443,9 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list)
 
     clock_get_uptime(&end);
     absolutetime_to_nanoseconds(end - start, &nsec);
-    HIBLOG("hibernate_page_list_discard time: %qd ms, discarded act %d inact %d purgeable %d\n",
+    HIBLOG("hibernate_page_list_discard time: %qd ms, discarded act %d inact %d purgeable %d spec %d\n",
                 nsec / 1000000ULL,
-                count_discard_active, count_discard_inactive, count_discard_purgeable);
+                count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative);
 }
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
index 010ac288767fd2b06e2cc3906e8dd859624b0a9d..0f8de2841a9d3138f600d756e8f55a01723a6cfa 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -153,7 +153,7 @@ host_info(
        case HOST_BASIC_INFO:
        {
                register host_basic_info_t      basic_info;
-               register int                            master_slot;
+               register int                            master_num;
 
                /*
                 *      Basic information about this host.
@@ -166,12 +166,12 @@ host_info(
                basic_info->memory_size = machine_info.memory_size;
                basic_info->max_cpus = machine_info.max_cpus;
                basic_info->avail_cpus = processor_avail_count;
-               master_slot = PROCESSOR_DATA(master_processor, slot_num);
-               basic_info->cpu_type = slot_type(master_slot);
-               basic_info->cpu_subtype = slot_subtype(master_slot);
+               master_num = master_processor->cpu_num;
+               basic_info->cpu_type = slot_type(master_num);
+               basic_info->cpu_subtype = slot_subtype(master_num);
 
                if (*count >= HOST_BASIC_INFO_COUNT) {
-                       basic_info->cpu_threadtype = slot_threadtype(master_slot);
+                       basic_info->cpu_threadtype = slot_threadtype(master_num);
                        basic_info->physical_cpu = machine_info.physical_cpu;
                        basic_info->physical_cpu_max = machine_info.physical_cpu_max;
                        basic_info->logical_cpu = machine_info.logical_cpu;
index 3b2ae194b53a1e55f69f1ce2c5ffc350350a568a..0dbe02f087c5fc79c6f50540309f181cf36b9a9a 100644 (file)
@@ -93,8 +93,9 @@
  *             Nothing locked.
  *     Returns:
  *             MACH_MSG_SUCCESS        Sent the message.
- *             MACH_MSG_SEND_NO_BUFFER Destination port had inuse fixed bufer
  *             MACH_SEND_INVALID_DEST  Bad destination port.
+ *             MACH_SEND_NO_BUFFER     Destination port had an in-use fixed buffer
+ *                                     or destination is above kernel limit
  */
 
 mach_msg_return_t
@@ -113,9 +114,13 @@ mach_msg_send_from_kernel(
                return mr;
 
        ipc_kmsg_copyin_from_kernel(kmsg);
-       ipc_kmsg_send_always(kmsg);
 
-       return MACH_MSG_SUCCESS;
+       mr = ipc_kmsg_send_always(kmsg);
+       if (mr != MACH_MSG_SUCCESS) {
+               ipc_kmsg_destroy(kmsg);
+       }
+
+       return mr;
 }
 
 mach_msg_return_t
@@ -138,7 +143,7 @@ mach_msg_send_from_kernel_with_options(
        ipc_kmsg_copyin_from_kernel(kmsg);
        mr = ipc_kmsg_send(kmsg, option, timeout_val);
        if (mr != MACH_MSG_SUCCESS) {
-               ipc_kmsg_free(kmsg);
+               ipc_kmsg_destroy(kmsg);
        }
        
        return mr;
@@ -196,7 +201,11 @@ mach_msg_rpc_from_kernel(
 
        ipc_kmsg_copyin_from_kernel(kmsg);
 
-       ipc_kmsg_send_always(kmsg);
+       mr = ipc_kmsg_send_always(kmsg);
+       if (mr != MACH_MSG_SUCCESS) {
+               ipc_kmsg_destroy(kmsg);
+               return mr;
+       }
 
        for (;;) {
                ipc_mqueue_t mqueue;
index f30d897e271666aba80f99b8d2bb90955234258c..1feb3688eb06e976218c6e8428b3a105e1811da2 100644 (file)
@@ -45,6 +45,7 @@
 #include <mach/host_priv_server.h>
 #include <mach/vm_map.h>
 
+#include <kern/clock.h>
 #include <kern/kalloc.h>
 #include <kern/kern_types.h>
 #include <kern/thread.h>
@@ -55,6 +56,8 @@
 #include <mach-o/loader.h>
 #include <mach-o/nlist.h>
 
+#include <mach/kext_panic_report.h>
+
 /*
  * XXX headers for which prototypes should be in a common include file;
  * XXX see libsa/kext.cpp for why.
@@ -99,6 +102,377 @@ typedef struct cmd_queue_entry {
 
 queue_head_t kmod_cmd_queue;
 
+/*******************************************************************************
+*******************************************************************************/
+#define KMOD_PANICLIST_SIZE  (2 * PAGE_SIZE)
+
+char     * unloaded_kext_paniclist        = NULL;
+uint32_t   unloaded_kext_paniclist_size   = 0;
+uint32_t   unloaded_kext_paniclist_length = 0;
+uint64_t   last_loaded_timestamp          = 0;
+
+char     * loaded_kext_paniclist          = NULL;
+uint32_t   loaded_kext_paniclist_size     = 0;
+uint32_t   loaded_kext_paniclist_length   = 0;
+uint64_t   last_unloaded_timestamp        = 0;
+
+int substitute(
+    const char * scan_string,
+    char       * string_out,
+    uint32_t   * to_index,
+    uint32_t   * from_index,
+    const char * substring,
+    char         marker,
+    char         substitution);
+
+/* identifier_out must be at least KMOD_MAX_NAME bytes.
+ */
+int substitute(
+    const char * scan_string,
+    char       * string_out,
+    uint32_t   * to_index,
+    uint32_t   * from_index,
+    const char * substring,
+    char         marker,
+    char         substitution)
+{
+    uint32_t substring_length = strnlen(substring, KMOD_MAX_NAME - 1);
+
+    if (!strncmp(scan_string, substring, substring_length)) {
+        if (marker) {
+            string_out[(*to_index)++] = marker;
+        }
+        string_out[(*to_index)++] = substitution;
+        (*from_index) += substring_length;
+        return 1;
+    }
+    return 0;
+}
+
+void compactIdentifier(
+    const char * identifier,
+    char       * identifier_out,
+    char      ** identifier_out_end);
+
+void compactIdentifier(
+    const char * identifier,
+    char       * identifier_out,
+    char      ** identifier_out_end)
+{
+    uint32_t       from_index, to_index;
+    uint32_t       scan_from_index = 0;
+    uint32_t       scan_to_index   = 0;
+    subs_entry_t * subs_entry    = NULL;
+    int            did_sub       = 0;
+
+    from_index = to_index = 0;
+    identifier_out[0] = '\0';
+
+   /* Replace certain identifier prefixes with shorter @+character sequences.
+    */
+    for (subs_entry = &kext_identifier_prefix_subs[0];
+         subs_entry->substring && !did_sub;
+         subs_entry++) {
+
+        did_sub = substitute(identifier, identifier_out,
+            &scan_to_index, &scan_from_index,
+            subs_entry->substring, /* marker */ '\0', subs_entry->substitute);
+    }
+    did_sub = 0;
+
+   /* Now scan through the identifier looking for the common substrings
+    * and replacing them with shorter !+character sequences.
+    */
+    for (/* see above */;
+         scan_from_index < KMOD_MAX_NAME - 1 && identifier[scan_from_index];
+         /* see loop */) {
+         
+        const char   * scan_string = &identifier[scan_from_index];
+
+        did_sub = 0;
+
+        if (scan_from_index) {
+            for (subs_entry = &kext_identifier_substring_subs[0];
+                 subs_entry->substring && !did_sub;
+                 subs_entry++) {
+
+                did_sub = substitute(scan_string, identifier_out,
+                    &scan_to_index, &scan_from_index,
+                    subs_entry->substring, '!', subs_entry->substitute);
+            }
+        }
+
+        if (!did_sub) {
+            identifier_out[scan_to_index++] = identifier[scan_from_index++];
+        }
+    }
+    
+    identifier_out[scan_to_index] = '\0';
+    if (identifier_out_end) {
+        *identifier_out_end = &identifier_out[scan_to_index];
+    }
+    
+    return;
+}
+
+/* identPlusVers must be at least 2*KMOD_MAX_NAME in length.
+ */
+int assemble_identifier_and_version(
+    kmod_info_t * kmod_info, 
+    char        * identPlusVers);
+int assemble_identifier_and_version(
+    kmod_info_t * kmod_info, 
+    char        * identPlusVers)
+{
+    int result = 0;
+
+    compactIdentifier(kmod_info->name, identPlusVers, NULL);
+    result = strnlen(identPlusVers, KMOD_MAX_NAME - 1);
+    identPlusVers[result++] = '\t';  // increment for real char
+    identPlusVers[result] = '\0';    // don't increment for nul char
+    result = strlcat(identPlusVers, kmod_info->version, KMOD_MAX_NAME);
+
+    return result;
+}
+
+#define LAST_LOADED " - last loaded "
+#define LAST_LOADED_TS_WIDTH  (16)
+
+uint32_t save_loaded_kext_paniclist_typed(
+    const char * prefix,
+    int          invertFlag,
+    int          libsFlag,
+    char       * paniclist,
+    uint32_t     list_size,
+    uint32_t   * list_length_ptr,
+    int         (*printf_func)(const char *fmt, ...));
+uint32_t save_loaded_kext_paniclist_typed(
+    const char * prefix,
+    int          invertFlag,
+    int          libsFlag,
+    char       * paniclist,
+    uint32_t     list_size,
+    uint32_t   * list_length_ptr,
+    int         (*printf_func)(const char *fmt, ...))
+{
+    uint32_t      result = 0;
+    int           error  = 0;
+    kmod_info_t * kmod_info;
+
+    for (kmod_info = kmod;
+         kmod_info && (*list_length_ptr + 1 < list_size);
+         kmod_info = kmod_info->next) {
+
+        int      match;
+        char     identPlusVers[2*KMOD_MAX_NAME];
+        uint32_t identPlusVersLength;
+        char     timestampBuffer[17]; // enough for a uint64_t
+
+        if (!pmap_find_phys(kernel_pmap, (addr64_t)((uintptr_t)kmod_info))) {
+            (*printf_func)("kmod scan stopped due to missing kmod page: %p\n",
+                kmod_info);
+            error = 1;
+            goto finish;
+        }
+
+       /* Skip all built-in/fake entries.
+        */
+        if (!kmod_info->address) {
+            continue;
+        }
+
+       /* Filter for kmod name (bundle identifier).
+        */
+        match = !strncmp(kmod_info->name, prefix, strnlen(prefix, KMOD_MAX_NAME));
+        if ((match && invertFlag) || (!match && !invertFlag)) {
+            continue;
+        }
+
+       /* Filter for libraries. This isn't a strictly correct check,
+        * but any kext that does have references to it has to be a library.
+        * A kext w/o references may or may not be a library.
+        */
+        if ((libsFlag == 0 && kmod_info->reference_count) ||
+            (libsFlag == 1 && !kmod_info->reference_count)) {
+
+            continue;
+        }
+
+        identPlusVersLength = assemble_identifier_and_version(kmod_info,
+            identPlusVers);
+        if (!identPlusVersLength) {
+            printf_func("error saving loaded kext info\n");
+            goto finish;
+        }
+
+       /* We're going to note the last-loaded kext in the list.
+        */
+        if (kmod_info == kmod) {
+            snprintf(timestampBuffer, sizeof(timestampBuffer), "%llu",
+                last_loaded_timestamp);
+            identPlusVersLength += sizeof(LAST_LOADED) - 1 +
+                strnlen(timestampBuffer, sizeof(timestampBuffer));
+        }
+
+       /* Adding 1 for the newline.
+        */
+        if (*list_length_ptr + identPlusVersLength + 1 >= list_size) {
+            goto finish;
+        }
+        
+        *list_length_ptr = strlcat(paniclist, identPlusVers, list_size);
+        if (kmod_info == kmod) {
+            *list_length_ptr = strlcat(paniclist, LAST_LOADED, list_size);
+            *list_length_ptr = strlcat(paniclist, timestampBuffer, list_size);
+        }
+        *list_length_ptr = strlcat(paniclist, "\n", list_size);
+    }
+    
+finish:
+    if (!error) {
+        if (*list_length_ptr + 1 <= list_size) {
+            result = list_size - (*list_length_ptr + 1);
+        }
+    }
+
+    return result;
+}
+
+void save_loaded_kext_paniclist(
+    int         (*printf_func)(const char *fmt, ...));
+
+void save_loaded_kext_paniclist(
+    int         (*printf_func)(const char *fmt, ...))
+{
+    char     * newlist        = NULL;
+    uint32_t   newlist_size   = 0;
+    uint32_t   newlist_length = 0;
+
+    newlist_length = 0;
+    newlist_size = KMOD_PANICLIST_SIZE;
+    newlist = (char *)kalloc(newlist_size);
+    
+    if (!newlist) {
+        printf_func("couldn't allocate kext panic log buffer\n");
+        goto finish;
+    }
+    
+    newlist[0] = '\0';
+
+    // non-"com.apple." kexts
+    if (!save_loaded_kext_paniclist_typed("com.apple.", /* invert? */ 1,
+        /* libs? */ -1, newlist, newlist_size, &newlist_length,
+        printf_func)) {
+        
+        goto finish;
+    }
+    // "com.apple." nonlibrary kexts
+    if (!save_loaded_kext_paniclist_typed("com.apple.", /* invert? */ 0,
+        /* libs? */ 0, newlist, newlist_size, &newlist_length,
+        printf_func)) {
+        
+        goto finish;
+    }
+    // "com.apple." library kexts
+    if (!save_loaded_kext_paniclist_typed("com.apple.", /* invert? */ 0,
+        /* libs? */ 1, newlist, newlist_size, &newlist_length,
+        printf_func)) {
+        
+        goto finish;
+    }
+
+    if (loaded_kext_paniclist) {
+        kfree(loaded_kext_paniclist, loaded_kext_paniclist_size);
+    }
+    loaded_kext_paniclist = newlist;
+    loaded_kext_paniclist_size = newlist_size;
+    loaded_kext_paniclist_length = newlist_length;
+
+finish:
+    return;
+}
+
+void save_unloaded_kext_paniclist(
+    kmod_info_t * kmod_info,
+    int         (*printf_func)(const char *fmt, ...));
+void save_unloaded_kext_paniclist(
+    kmod_info_t * kmod_info,
+    int         (*printf_func)(const char *fmt, ...))
+{
+    char     * newlist        = NULL;
+    uint32_t   newlist_size   = 0;
+    uint32_t   newlist_length = 0;
+    char       identPlusVers[2*KMOD_MAX_NAME];
+    uint32_t   identPlusVersLength;
+
+    identPlusVersLength = assemble_identifier_and_version(kmod_info,
+        identPlusVers);
+    if (!identPlusVersLength) {
+        printf_func("error saving unloaded kext info\n");
+        goto finish;
+    }
+
+    newlist_length = identPlusVersLength;
+    newlist_size = newlist_length + 1;
+    newlist = (char *)kalloc(newlist_size);
+    
+    if (!newlist) {
+        printf_func("couldn't allocate kext panic log buffer\n");
+        goto finish;
+    }
+    
+    newlist[0] = '\0';
+
+    strlcpy(newlist, identPlusVers, newlist_size);
+
+    if (unloaded_kext_paniclist) {
+        kfree(unloaded_kext_paniclist, unloaded_kext_paniclist_size);
+    }
+    unloaded_kext_paniclist = newlist;
+    unloaded_kext_paniclist_size = newlist_size;
+    unloaded_kext_paniclist_length = newlist_length;
+
+finish:
+    return;
+}
+
+// proto is in header
+void record_kext_unload(kmod_t kmod_id)
+{
+    kmod_info_t * kmod_info = NULL;
+
+    mutex_lock(kmod_lock);
+    
+    kmod_info = kmod_lookupbyid(kmod_id);
+    if (kmod_info) {
+        clock_get_uptime(&last_unloaded_timestamp);
+        save_unloaded_kext_paniclist(kmod_info, &printf);
+    }
+    mutex_unlock(kmod_lock);
+    return;
+}
+
+void dump_kext_info(int (*printf_func)(const char *fmt, ...))
+{
+    printf_func("unloaded kexts:\n");
+    if (unloaded_kext_paniclist && (pmap_find_phys(kernel_pmap, (addr64_t) (uintptr_t) unloaded_kext_paniclist))) {
+        printf_func("%.*s - last unloaded %llu\n",
+            unloaded_kext_paniclist_length, unloaded_kext_paniclist,
+            last_unloaded_timestamp);
+    } else {
+        printf_func("(none)\n");
+    }
+    printf_func("loaded kexts:\n");
+    if (loaded_kext_paniclist && (pmap_find_phys(kernel_pmap, (addr64_t) (uintptr_t) loaded_kext_paniclist)) && loaded_kext_paniclist[0]) {
+        printf_func("%.*s", loaded_kext_paniclist_length, loaded_kext_paniclist);
+    } else {
+        printf_func("(none)\n");
+    }
+    return;
+}
+
+/*******************************************************************************
+*******************************************************************************/
 void
 kmod_init(void)
 {
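
Everything added above builds two preformatted panic strings at kext load/unload time, so that dump_kext_info() only has to check that the buffers are still mapped (via pmap_find_phys) and print them; nothing is allocated or walked during the panic itself. Each list line is "compacted-identifier<TAB>version". A standalone sketch of the line assembly, with the @/! compaction skipped, since the substitution tables live in mach/kext_panic_report.h and are not shown here:

#include <stdio.h>
#include <string.h>

#define KMOD_MAX_NAME_SKETCH 64

typedef struct {
    char name[KMOD_MAX_NAME_SKETCH];
    char version[KMOD_MAX_NAME_SKETCH];
} kmod_info_sketch_t;

/* Minimal stand-in for assemble_identifier_and_version() above. */
static int assemble_sketch(const kmod_info_sketch_t *ki,
                           char *out, size_t outsize)
{
    snprintf(out, outsize, "%s\t%s", ki->name, ki->version);
    return (int)strlen(out);
}

int main(void)
{
    /* Hypothetical kext, for illustration only. */
    kmod_info_sketch_t ki = { "com.example.driver.Widget", "1.0.2d3" };
    char line[2 * KMOD_MAX_NAME_SKETCH];
    assemble_sketch(&ki, line, sizeof line);
    printf("%s\n", line);   /* one paniclist line: identifier<TAB>version */
    return 0;
}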
@@ -141,27 +515,27 @@ kmod_lookupbyname(const char * name)
 int kmod_lookupidbyaddress_locked(vm_address_t addr)
 {
     kmod_info_t *k = 0;
-       
+    
     mutex_lock(kmod_queue_lock);
     k = kmod;
-       if(NULL != k) {
-               while (k) {
-                       if ((k->address <= addr) && ((k->address + k->size) > addr)) {
-                               break;
-                       }
-                       k = k->next;
-               }
-               mutex_unlock(kmod_queue_lock);
-       } else {
-               mutex_unlock(kmod_queue_lock);
-               return -1;
-       }
-       
-       if(NULL == k) {
-               return -1;
-       } else {
-               return k->id;
-       }
+    if(NULL != k) {
+        while (k) {
+            if ((k->address <= addr) && ((k->address + k->size) > addr)) {
+                break;
+            }
+            k = k->next;
+        }
+        mutex_unlock(kmod_queue_lock);
+    } else {
+        mutex_unlock(kmod_queue_lock);
+        return -1;
+    }
+    
+    if(NULL == k) {
+        return -1;
+    } else {
+        return k->id;
+    }
 }
 
 kmod_info_t *
@@ -387,6 +761,9 @@ kmod_create_internal(kmod_info_t *info, kmod_t *id)
 
     *id = info->id;
 
+    clock_get_uptime(&last_loaded_timestamp);
+    save_loaded_kext_paniclist(&printf);
+
     mutex_unlock(kmod_lock);
 
 #if DEBUG
@@ -542,6 +919,10 @@ _kmod_destroy_internal(kmod_t id, boolean_t fake)
         k = k->next;
     }
 
+    if (!fake) {
+        save_loaded_kext_paniclist(&printf);
+    }
+
     mutex_unlock(kmod_lock);
 
     return KERN_INVALID_ARGUMENT;
index 0fa0930c7a4d726209bd4d34d6bb9d68cd6a8543..898dd3bfac2e67f0acb152c64c48b830b89527c2 100644 (file)
@@ -121,7 +121,7 @@ processor_up(
        pset = processor->processor_set;
        pset_lock(pset);
        if (++pset->processor_count == 1)
-               pset->low_pri = processor;
+               pset->low_pri = pset->low_count = processor;
        enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
        processor->state = PROCESSOR_RUNNING;
        (void)hw_atomic_add(&processor_avail_count, 1);
@@ -213,15 +213,11 @@ processor_shutdown(
                return (KERN_SUCCESS);
        }
 
-       if (processor->state == PROCESSOR_IDLE) {
+       if (processor->state == PROCESSOR_IDLE)
                remqueue(&pset->idle_queue, (queue_entry_t)processor);
-               pset->idle_count--;
-       }
        else
        if (processor->state == PROCESSOR_RUNNING)
                remqueue(&pset->active_queue, (queue_entry_t)processor);
-       else
-               panic("processor_shutdown");
 
        processor->state = PROCESSOR_SHUTDOWN;
 
@@ -230,7 +226,7 @@ processor_shutdown(
        processor_doshutdown(processor);
        splx(s);
 
-       cpu_exit_wait(PROCESSOR_DATA(processor, slot_num));
+       cpu_exit_wait(processor->cpu_num);
 
        return (KERN_SUCCESS);
 }
@@ -270,24 +266,6 @@ processor_doshutdown(
        old_thread = machine_processor_shutdown(self, processor_offline, processor);
 
        thread_dispatch(old_thread, self);
-
-       /*
-        * If we just shutdown another processor, move any
-        * threads and timer call outs to the current processor.
-        */
-       if (processor != current_processor()) {
-               processor_set_t         pset = processor->processor_set;
-
-               pset_lock(pset);
-
-               if (processor->state == PROCESSOR_OFF_LINE || processor->state == PROCESSOR_SHUTDOWN) {
-                       timer_call_shutdown(processor);
-                       processor_queue_shutdown(processor);
-                       return;
-               }
-
-               pset_unlock(pset);
-       }
 }
 
 /*
@@ -315,15 +293,17 @@ processor_offline(
 
        thread_dispatch(old_thread, new_thread);
 
-       PMAP_DEACTIVATE_KERNEL(PROCESSOR_DATA(processor, slot_num));
+       PMAP_DEACTIVATE_KERNEL(processor->cpu_num);
 
        pset = processor->processor_set;
        pset_lock(pset);
        processor->state = PROCESSOR_OFF_LINE;
        if (--pset->processor_count == 0)
-               pset->low_pri = PROCESSOR_NULL;
+               pset->low_pri = pset->low_count = PROCESSOR_NULL;
        (void)hw_atomic_sub(&processor_avail_count, 1);
-       pset_unlock(pset);
+       processor_queue_shutdown(processor);
+       /* pset lock dropped */
+
        ml_cpu_down();
 
        cpu_sleep();
index 6d2ceb8989d087beded146f818008c5bcba6ee13..3590d3c452faa95059ac6d7f695eaba356fcecbe 100644 (file)
@@ -122,6 +122,8 @@ extern void dbugprintf(const char *format, ...) __printflike(1,2);
 
 extern int kdb_printf(const char *format, ...) __printflike(1,2);
 
+extern int kdb_log(const char *format, ...) __printflike(1,2);
+
 extern void printf_init(void);
 
 extern int snprintf(char *, size_t, const char *, ...) __printflike(3,4);
@@ -152,6 +154,8 @@ extern void conslog_putc(char);
 
 extern void consdebug_putc(char);
 
+extern void consdebug_log(char);
+
 extern void cnputc(char);
 
 extern int cngetc(void);
index 1f7015c87b0e8e7d236961f7dc453b122d78e4df..f8376b4197ff4302da10ee7041a7470a0624af4a 100644 (file)
@@ -796,6 +796,13 @@ consdebug_putc(char c)
                        PE_kputc(c);
 }
 
+
+void
+consdebug_log(char c)
+{
+       debug_putc(c);
+}
+
 int
 kdb_printf(const char *fmt, ...)
 {
@@ -807,6 +814,17 @@ kdb_printf(const char *fmt, ...)
        return 0;
 }
 
+int
+kdb_log(const char *fmt, ...)
+{
+       va_list listp;
+
+       va_start(listp, fmt);
+       _doprnt(fmt, &listp, consdebug_log, 16);
+       va_end(listp);
+       return 0;
+}
+
 static void
 copybyte(int c, void *arg)
 {
index 8ee162fa3158210c25ae0a7b9b54be02ab597b19..bc0e89a5c3bf8b5c35da24a1c65586af6cce5ed8 100644 (file)
@@ -165,7 +165,7 @@ thread_quantum_expire(
        /*
         *      Context switch check.
         */
-       if ((preempt = csw_check(thread, processor)) != AST_NONE)
+       if ((preempt = csw_check(processor)) != AST_NONE)
                ast_on(preempt);
        else {
                processor_set_t         pset = processor->processor_set;
@@ -173,6 +173,7 @@ thread_quantum_expire(
                pset_lock(pset);
 
                pset_pri_hint(pset, processor, processor->current_pri);
+               pset_count_hint(pset, processor, processor->runq.count);
 
                pset_unlock(pset);
        }
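
thread_quantum_expire() now refreshes two pset hints instead of one: low_pri (lowest current priority) gains a low_count sibling fed from processor->runq.count, giving the scheduler a cheap least-loaded candidate without scanning the whole set. pset_count_hint is not shown in this diff; presumably it mirrors pset_pri_hint, roughly:

#include <stddef.h>

/* Assumption: pset_count_hint tracks the processor with the shallowest
 * run queue, parallel to pset_pri_hint. Sketch only. */
typedef struct processor_sketch {
    int runq_count;
} processor_sketch_t;

typedef struct pset_sketch {
    processor_sketch_t *low_pri;    /* lowest-priority processor seen */
    processor_sketch_t *low_count;  /* shallowest run queue seen */
} pset_sketch_t;

static void count_hint_sketch(pset_sketch_t *ps, processor_sketch_t *p,
                              int count)
{
    p->runq_count = count;
    if (ps->low_count == NULL || count < ps->low_count->runq_count)
        ps->low_count = p;          /* cheapest target for placement */
}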
index 0a413bcf1f51b92ee08f72b2f0bdbb6c9be70d7b..9436505b341d4e763cfa214f7a56735c9ddf3322 100644 (file)
@@ -132,36 +132,36 @@ processor_bootstrap(void)
 
 /*
  *     Initialize the given processor for the cpu
- *     indicated by slot_num, and assign to the
+ *     indicated by cpu_num, and assign to the
  *     specified processor set.
  */
 void
 processor_init(
-       processor_t             p,
-       int                             slot_num,
-       processor_set_t pset)
+       processor_t                     processor,
+       int                                     cpu_num,
+       processor_set_t         pset)
 {
-       run_queue_init(&p->runq);
-
-       p->state = PROCESSOR_OFF_LINE;
-       p->active_thread = p->next_thread = p->idle_thread = THREAD_NULL;
-       p->processor_set = pset;
-       p->current_pri = MINPRI;
-       timer_call_setup(&p->quantum_timer, thread_quantum_expire, p);
-       p->deadline = UINT64_MAX;
-       p->timeslice = 0;
-       p->processor_self = IP_NULL;
-       simple_lock_init(&p->lock, 0);
-       processor_data_init(p);
-       PROCESSOR_DATA(p, slot_num) = slot_num;
-       p->processor_list = NULL;
+       run_queue_init(&processor->runq);
+
+       processor->state = PROCESSOR_OFF_LINE;
+       processor->active_thread = processor->next_thread = processor->idle_thread = THREAD_NULL;
+       processor->processor_set = pset;
+       processor->current_pri = MINPRI;
+       processor->cpu_num = cpu_num;
+       timer_call_setup(&processor->quantum_timer, thread_quantum_expire, processor);
+       processor->deadline = UINT64_MAX;
+       processor->timeslice = 0;
+       processor->processor_self = IP_NULL;
+       simple_lock_init(&processor->lock, 0);
+       processor_data_init(processor);
+       processor->processor_list = NULL;
 
        simple_lock(&processor_list_lock);
        if (processor_list == NULL)
-               processor_list = p;
+               processor_list = processor;
        else
-               processor_list_tail->processor_list = p;
-       processor_list_tail = p;
+               processor_list_tail->processor_list = processor;
+       processor_list_tail = processor;
        processor_count++;
        simple_unlock(&processor_list_lock);
 }
@@ -212,9 +212,8 @@ pset_init(
 {
        queue_init(&pset->active_queue);
        queue_init(&pset->idle_queue);
-       pset->idle_count = 0;
        pset->processor_count = 0;
-       pset->low_pri = PROCESSOR_NULL;
+       pset->low_pri = pset->low_count = PROCESSOR_NULL;
        pset_lock_init(pset);
        pset->pset_self = IP_NULL;
        pset->pset_name_self = IP_NULL;
@@ -253,13 +252,13 @@ processor_info(
        processor_info_t                info,
        mach_msg_type_number_t  *count)
 {
-       register int    slot_num, state;
+       register int    cpu_num, state;
        kern_return_t   result;
 
        if (processor == PROCESSOR_NULL)
                return (KERN_INVALID_ARGUMENT);
 
-       slot_num = PROCESSOR_DATA(processor, slot_num);
+       cpu_num = processor->cpu_num;
 
        switch (flavor) {
 
@@ -271,14 +270,14 @@ processor_info(
                        return (KERN_FAILURE);
 
                basic_info = (processor_basic_info_t) info;
-               basic_info->cpu_type = slot_type(slot_num);
-               basic_info->cpu_subtype = slot_subtype(slot_num);
+               basic_info->cpu_type = slot_type(cpu_num);
+               basic_info->cpu_subtype = slot_subtype(cpu_num);
                state = processor->state;
                if (state == PROCESSOR_OFF_LINE)
                        basic_info->running = FALSE;
                else
                        basic_info->running = TRUE;
-               basic_info->slot_num = slot_num;
+               basic_info->slot_num = cpu_num;
                if (processor == master_processor) 
                        basic_info->is_master = TRUE;
                else
@@ -313,7 +312,7 @@ processor_info(
        }
 
        default:
-           result = cpu_info(flavor, slot_num, info, count);
+           result = cpu_info(flavor, cpu_num, info, count);
            if (result == KERN_SUCCESS)
                        *host = &realhost;                 
 
@@ -339,7 +338,7 @@ processor_start(
                prev = thread_bind(processor);
                thread_block(THREAD_CONTINUE_NULL);
 
-               result = cpu_start(PROCESSOR_DATA(processor, slot_num));
+               result = cpu_start(processor->cpu_num);
 
                thread_bind(prev);
 
@@ -408,12 +407,11 @@ processor_start(
        if (processor->processor_self == IP_NULL)
                ipc_processor_init(processor);
 
-       result = cpu_start(PROCESSOR_DATA(processor, slot_num));
+       result = cpu_start(processor->cpu_num);
        if (result != KERN_SUCCESS) {
                s = splsched();
                pset_lock(pset);
                processor->state = PROCESSOR_OFF_LINE;
-               timer_call_shutdown(processor);
                pset_unlock(pset);
                splx(s);
 
@@ -444,7 +442,7 @@ processor_control(
        if (processor == PROCESSOR_NULL)
                return(KERN_INVALID_ARGUMENT);
 
-       return(cpu_control(PROCESSOR_DATA(processor, slot_num), info, count));
+       return(cpu_control(processor->cpu_num, info, count));
 }
            
 kern_return_t
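
Aside from renaming slot_num to cpu_num (and moving it out of per-processor data into struct processor itself), processor_init() above registers each processor with a classic tail-append on an intrusive singly linked list, serialized by processor_list_lock. A user-space sketch of that idiom, with a pthread mutex standing in for the kernel simple lock; all names below are illustrative:

    #include <pthread.h>
    #include <stddef.h>

    struct proc_entry {
            int                     cpu_num;
            struct proc_entry       *next;  /* plays the processor_list role */
    };

    static struct proc_entry *list_head, *list_tail;
    static unsigned int list_count;
    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    void
    register_entry(struct proc_entry *e)
    {
            e->next = NULL;
            pthread_mutex_lock(&list_lock);
            if (list_head == NULL)
                    list_head = e;          /* first processor */
            else
                    list_tail->next = e;    /* append after the current tail */
            list_tail = e;
            list_count++;
            pthread_mutex_unlock(&list_lock);
    }
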
index 5cb479cd1a3bf87b0b5a116ee4708d424aa258fc..24603cc11be29ec97dac02c797282218f0202cdc 100644
@@ -84,9 +84,8 @@
 struct processor_set {
        queue_head_t            active_queue;   /* active processors */
        queue_head_t            idle_queue;             /* idle processors */
-       int                                     idle_count;
 
-       processor_t                     low_pri;
+       processor_t                     low_pri, low_count;
 
        int                                     processor_count;
 
@@ -128,6 +127,7 @@ struct processor {
        processor_set_t         processor_set;  /* assigned set */
 
        int                                     current_pri;    /* priority of current thread */
+       int                                     cpu_num;                /* platform numeric id */
 
        timer_call_data_t       quantum_timer;  /* timer for quantum expiration */
        uint64_t                        quantum_end;    /* time when current quantum ends */
@@ -149,7 +149,9 @@ extern processor_t          processor_list;
 extern unsigned int            processor_count;
 decl_simple_lock_data(extern,processor_list_lock)
 
-extern processor_t     master_processor;
+extern uint32_t                        processor_avail_count;
+
+extern processor_t             master_processor;
 
 /*
  *     Processor state is accessed by locking the scheduling lock
@@ -158,9 +160,10 @@ extern processor_t master_processor;
 #define PROCESSOR_OFF_LINE             0       /* Not available */
 #define PROCESSOR_SHUTDOWN             1       /* Going off-line */
 #define PROCESSOR_START                        2       /* Being started */
-#define        PROCESSOR_IDLE                  3       /* Idle */
-#define PROCESSOR_DISPATCHING  4       /* Dispatching (idle -> running) */
-#define        PROCESSOR_RUNNING               5       /* Normal execution */
+#define PROCESSOR_INACTIVE             3       /* Inactive (unavailable) */
+#define        PROCESSOR_IDLE                  4       /* Idle (available) */
+#define PROCESSOR_DISPATCHING  5       /* Dispatching (idle -> active) */
+#define        PROCESSOR_RUNNING               6       /* Normal execution */
 
 extern processor_t     current_processor(void);
 
@@ -184,6 +187,20 @@ MACRO_BEGIN                                                                                                \
        if ((p) != (ps)->low_pri) {                                                     \
                if ((pri) < (ps)->low_pri->current_pri)                 \
                        (ps)->low_pri = (p);                                            \
+               else                                                                                    \
+               if ((ps)->low_pri->state < PROCESSOR_IDLE)              \
+                       (ps)->low_pri = (p);                                            \
+       }                                                                                                       \
+MACRO_END
+
+#define pset_count_hint(ps, p, cnt)            \
+MACRO_BEGIN                                                                                            \
+       if ((p) != (ps)->low_count) {                                           \
+               if ((cnt) < (ps)->low_count->runq.count)                \
+                       (ps)->low_count = (p);                                          \
+               else                                                                                    \
+               if ((ps)->low_count->state < PROCESSOR_IDLE)    \
+                       (ps)->low_count = (p);                                          \
        }                                                                                                       \
 MACRO_END
 
@@ -191,7 +208,7 @@ extern void         processor_bootstrap(void) __attribute__((section("__TEXT, initcode"
 
 extern void            processor_init(
                                        processor_t             processor,
-                                       int                             slot_num,
+                                       int                             cpu_num,
                                        processor_set_t processor_set) __attribute__((section("__TEXT, initcode")));
 
 extern kern_return_t   processor_shutdown(
@@ -219,6 +236,12 @@ extern kern_return_t       processor_info_count(
 #define pset_deallocate(x)
 #define pset_reference(x)
 
+extern void            machine_run_count(
+                                       uint32_t        count);
+
+extern boolean_t       machine_cpu_is_inactive(
+                                               int                             num);
+
 #else  /* MACH_KERNEL_PRIVATE */
 
 __BEGIN_DECLS
@@ -233,9 +256,4 @@ __END_DECLS
 
 #endif /* MACH_KERNEL_PRIVATE */
 
-#ifdef XNU_KERNEL_PRIVATE
-
-extern uint32_t                processor_avail_count;
-
-#endif
 #endif /* _KERN_PROCESSOR_H_ */
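
The new pset_count_hint() above mirrors the existing pset_pri_hint(): each processor set now caches, alongside the lowest-priority processor (low_pri), a processor believed to have the shallowest run queue (low_count), which choose_processor() consults when placing threads. Both macros also replace a stale hint whose holder has fallen below PROCESSOR_IDLE in the reordered state ladder. A small sketch of the same update rule, with toy types and an explicit NULL seed (the kernel assigns the hint when a processor first idles):

    #include <stddef.h>

    #define P_IDLE  4                       /* mirrors PROCESSOR_IDLE's position */

    struct proc { int state; int runq_count; };
    struct pset { struct proc *low_count; };

    void
    count_hint(struct pset *ps, struct proc *p, int cnt)
    {
            if (ps->low_count == NULL) {    /* seed (kernel does this at idle) */
                    ps->low_count = p;
                    return;
            }
            if (p == ps->low_count)
                    return;                 /* already the hint */
            if (cnt < ps->low_count->runq_count)
                    ps->low_count = p;      /* strictly shallower run queue */
            else if (ps->low_count->state < P_IDLE)
                    ps->low_count = p;      /* holder no longer available: replace */
    }
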
index 41031a8feee28e4df5ddeb8f6720a9c09a1b0fc3..9f81a2d18080ba4aa6d560449f8423f85896b21f 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -43,6 +43,4 @@ processor_data_init(
        timer_init(&PROCESSOR_DATA(processor, idle_state));
        timer_init(&PROCESSOR_DATA(processor, system_state));
        timer_init(&PROCESSOR_DATA(processor, user_state));
-
-       queue_init(&PROCESSOR_DATA(processor, timer_call_queue));
 }
index 4debe720d69cfe91819793e6af56aba045e0ccdf..0e3f64705e02d155ef54124caa858ae053734ef4 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -59,9 +59,6 @@ struct processor_data {
                unsigned int                    count;
        }                                               stack_cache;
 
-       /* Pending timer callouts */
-       queue_head_t                    timer_call_queue;
-
        /* VM event counters */
        vm_statistics_data_t    vm_stat;
 
@@ -72,8 +69,6 @@ struct processor_data {
                unsigned int                    avail;
        }                                               ikm_cache;
 
-       int                                             slot_num;
-
        unsigned long                   page_grab_count;
        int                                             start_color;
        void                                    *free_pages;
index 2e7018ef9e6b3f783d2e13033e8076ca881b1474..088e84c3deb21a26ab70e107844919f55f4fd6a0 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -185,15 +185,6 @@ MACRO_BEGIN                                                                                                        \
                        (thread)->realtime.computation: std_quantum;    \
 MACRO_END
 
-/* Invoked at splsched by a thread on itself */
-#define csw_needed(thread, processor) (                                                                                \
-       ((thread)->state & TH_SUSP)                                                                             ||              \
-       (first_timeslice(processor)?                                                                                    \
-        ((processor)->runq.highq > (thread)->sched_pri                         ||                      \
-         rt_runq.highq > (thread)->sched_pri) :                                                                \
-        ((processor)->runq.highq >= (thread)->sched_pri                        ||                      \
-         rt_runq.highq >= (thread)->sched_pri))                                                                )
-
 extern struct run_queue                rt_runq;
 
 /*
@@ -209,10 +200,8 @@ extern void                thread_quantum_expire(
                                        timer_call_param_t      processor,
                                        timer_call_param_t      thread);
 
-/* Called at splsched by a thread on itself */
-extern ast_t   csw_check(
-                                       thread_t                thread,
-                                       processor_t             processor);
+/* Context switch check for current processor */
+extern ast_t   csw_check(processor_t           processor);
 
 extern uint32_t        std_quantum, min_std_quantum;
 extern uint32_t        std_quantum_us;
@@ -258,16 +247,24 @@ extern uint64_t           max_unsafe_computation;
 extern uint64_t                max_poll_computation;
 
 #define sched_run_incr()                       \
-       (void)hw_atomic_add(&sched_run_count, 1)
+MACRO_BEGIN                                                                                                    \
+       machine_run_count(hw_atomic_add(&sched_run_count, 1));  \
+MACRO_END
 
 #define sched_run_decr()                       \
-       (void)hw_atomic_sub(&sched_run_count, 1)
+MACRO_BEGIN                                                                                                    \
+       machine_run_count(hw_atomic_sub(&sched_run_count, 1));  \
+MACRO_END
 
 #define sched_share_incr()                     \
-       (void)hw_atomic_add(&sched_share_count, 1)
+MACRO_BEGIN                                                                                    \
+       (void)hw_atomic_add(&sched_share_count, 1);             \
+MACRO_END
 
 #define sched_share_decr()                     \
-       (void)hw_atomic_sub(&sched_share_count, 1)
+MACRO_BEGIN                                                                                    \
+       (void)hw_atomic_sub(&sched_share_count, 1);             \
+MACRO_END
 
 /*
  *     thread_timer_delta macro takes care of both thread timers.
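
sched_run_incr()/sched_run_decr() above become statement macros so the updated runnable-thread count (hw_atomic_add() and hw_atomic_sub() return the new value) is reported to the platform layer through the new machine_run_count() hook, presumably so it can judge how many cores need to stay active. A C11-atomics sketch of the same shape; the machine_run_count() body below is a hypothetical stub, and note that atomic_fetch_add() returns the old value, hence the +1/-1:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static _Atomic uint32_t sched_run_count;

    static void
    machine_run_count(uint32_t count)       /* platform hook: stub for illustration */
    {
            printf("runnable threads now: %u\n", count);
    }

    void
    run_incr(void)
    {
            machine_run_count(atomic_fetch_add(&sched_run_count, 1) + 1);
    }

    void
    run_decr(void)
    {
            machine_run_count(atomic_fetch_sub(&sched_run_count, 1) - 1);
    }
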
index a12449033818dff09b85b05a390036ed89d99eb4..4e281607d965916f54c0699912848aeb07baf96e 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -150,6 +150,10 @@ void wait_queues_init(void) __attribute__((section("__TEXT, initcode")));
 static void load_shift_init(void) __attribute__((section("__TEXT, initcode")));
 static void preempt_pri_init(void) __attribute__((section("__TEXT, initcode")));
 
+static thread_t        run_queue_dequeue(
+                                       run_queue_t             runq,
+                                       integer_t               options);
+
 static thread_t        thread_select_idle(
                                        thread_t                        thread,
                                        processor_t                     processor);
@@ -158,9 +162,6 @@ static thread_t     processor_idle(
                                        thread_t                        thread,
                                        processor_t                     processor);
 
-static thread_t        choose_thread(
-                                       processor_t                     processor);
-
 static thread_t        steal_thread(
                                        processor_set_t         pset);
 
@@ -1170,7 +1171,7 @@ thread_select(
 {
        processor_set_t         pset = processor->processor_set;
        thread_t                        new_thread = THREAD_NULL;
-       boolean_t                       other_runnable;
+       boolean_t                       other_runnable, inactive_state;
 
        do {
                /*
@@ -1183,6 +1184,8 @@ thread_select(
 
                pset_lock(pset);
 
+               inactive_state = processor->state != PROCESSOR_SHUTDOWN && machine_cpu_is_inactive(processor->cpu_num);
+
                simple_lock(&rt_lock);
 
                /*
@@ -1233,7 +1236,8 @@ thread_select(
                                return (thread);
                        }
 
-                       if (    (!other_runnable                                                        ||
+                       if (!inactive_state &&
+                                       (!other_runnable                                                        ||
                                         (processor->runq.highq < thread->sched_pri             &&
                                          rt_runq.highq < thread->sched_pri))                           ) {
 
@@ -1243,6 +1247,8 @@ thread_select(
 
                                pset_pri_hint(pset, processor, processor->current_pri);
 
+                               pset_count_hint(pset, processor, processor->runq.count);
+
                                processor->deadline = UINT64_MAX;
 
                                pset_unlock(pset);
@@ -1251,11 +1257,51 @@ thread_select(
                        }
                }
 
-               if (other_runnable)
-                       return choose_thread(processor);
+               if (other_runnable) {
+                       if (processor->runq.count > 0 && processor->runq.highq >= rt_runq.highq) {
+                               simple_unlock(&rt_lock);
+
+                               thread = run_queue_dequeue(&processor->runq, SCHED_HEADQ);
+
+                               if (!inactive_state) {
+                                       pset_pri_hint(pset, processor, thread->sched_pri);
+
+                                       pset_count_hint(pset, processor, processor->runq.count);
+                               }
+
+                               processor->deadline = UINT64_MAX;
+                               pset_unlock(pset);
+
+                               return (thread);
+                       }
+
+                       thread = run_queue_dequeue(&rt_runq, SCHED_HEADQ);
+                       simple_unlock(&rt_lock);
+
+                       processor->deadline = thread->realtime.deadline;
+                       pset_unlock(pset);
+
+                       return (thread);
+               }
 
                simple_unlock(&rt_lock);
 
+               processor->deadline = UINT64_MAX;
+
+               if (inactive_state) {
+                       if (processor->state == PROCESSOR_RUNNING)
+                               remqueue(&pset->active_queue, (queue_entry_t)processor);
+                       else
+                       if (processor->state == PROCESSOR_IDLE)
+                               remqueue(&pset->idle_queue, (queue_entry_t)processor);
+
+                       processor->state = PROCESSOR_INACTIVE;
+
+                       pset_unlock(pset);
+
+                       return (processor->idle_thread);
+               }
+
                /*
                 *      No runnable threads, attempt to steal
                 *      from other processors.
@@ -1282,12 +1328,9 @@ thread_select(
                        processor->state = PROCESSOR_IDLE;
 
                        enqueue_head(&pset->idle_queue, (queue_entry_t)processor);
-                       pset->low_pri = processor;
-                       pset->idle_count++;
+                       pset->low_pri = pset->low_count = processor;
                }
 
-               processor->deadline = UINT64_MAX;
-
                pset_unlock(pset);
 
                /*
@@ -2019,7 +2062,6 @@ realtime_setrun(
         */
        if (processor->state == PROCESSOR_IDLE) {
                remqueue(&pset->idle_queue, (queue_entry_t)processor);
-               pset->idle_count--;
                enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
 
                processor->next_thread = thread;
@@ -2110,7 +2152,6 @@ processor_setrun(
         */
        if (processor->state == PROCESSOR_IDLE) {
                remqueue(&pset->idle_queue, (queue_entry_t)processor);
-               pset->idle_count--;
                enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
 
                processor->next_thread = thread;
@@ -2129,7 +2170,7 @@ processor_setrun(
        if (testbit(thread->sched_pri, sched_preempt_pri))
                preempt = (AST_PREEMPT | AST_URGENT);
        else
-       if (thread->sched_mode & TH_MODE_TIMESHARE && thread->priority < BASEPRI_BACKGROUND)
+       if (thread->sched_mode & TH_MODE_TIMESHARE && thread->sched_pri < thread->priority)
                preempt = AST_NONE;
        else
                preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
@@ -2139,9 +2180,7 @@ processor_setrun(
 
        if (preempt != AST_NONE) {
                if (processor == current_processor()) {
-                       thread_t        self = processor->active_thread;
-
-                       if (csw_needed(self, processor))
+                       if (csw_check(processor) != AST_NONE)
                                ast_on(preempt);
                }
                else
@@ -2207,11 +2246,11 @@ choose_processor(
         *      Prefer the last processor, when appropriate.
         */
        if (processor != PROCESSOR_NULL) {
-               if (processor->processor_set != pset ||
+               if (processor->processor_set != pset || processor->state == PROCESSOR_INACTIVE ||
                                processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)
                        processor = PROCESSOR_NULL;
                else
-               if (processor->state == PROCESSOR_IDLE || processor->current_pri < thread->sched_pri)
+               if (processor->state == PROCESSOR_IDLE || ( thread->sched_pri > BASEPRI_DEFAULT && processor->current_pri < thread->sched_pri))
                        return (processor);
        }
 
@@ -2243,12 +2282,20 @@ choose_processor(
                }
                else {
                        /*
-                        *      Check the low hint processor in the processor set if available.
+                        *      Check any hinted processors in the processor set if available.
                         */
-                       if (cset->low_pri != PROCESSOR_NULL &&
-                                               cset->low_pri->state != PROCESSOR_SHUTDOWN && cset->low_pri->state != PROCESSOR_OFF_LINE) {
-                               if (processor == PROCESSOR_NULL || cset->low_pri->current_pri < thread->sched_pri)
-                                       processor = cset->low_pri;
+                       if (cset->low_pri != PROCESSOR_NULL && cset->low_pri->state != PROCESSOR_INACTIVE &&
+                                       cset->low_pri->state != PROCESSOR_SHUTDOWN && cset->low_pri->state != PROCESSOR_OFF_LINE &&
+                                               (processor == PROCESSOR_NULL ||
+                                                       (thread->sched_pri > BASEPRI_DEFAULT && cset->low_pri->current_pri < thread->sched_pri))) {
+                               processor = cset->low_pri;
+                       }
+                       else
+                       if (cset->low_count != PROCESSOR_NULL && cset->low_count->state != PROCESSOR_INACTIVE &&
+                                       cset->low_count->state != PROCESSOR_SHUTDOWN && cset->low_count->state != PROCESSOR_OFF_LINE &&
+                                               (processor == PROCESSOR_NULL || 
+                                                ( thread->sched_pri <= BASEPRI_DEFAULT && cset->low_count->runq.count < processor->runq.count))) {
+                               processor = cset->low_count;
                        }
 
                        /*
@@ -2281,10 +2328,10 @@ choose_processor(
        do {
                /*
                 *      If we haven't been able to choose a processor,
-                *      pick the current one and return it.
+                *      pick the boot processor and return it.
                 */
                if (processor == PROCESSOR_NULL) {
-                       processor = current_processor();
+                       processor = master_processor;
 
                        /*
                         *      Check that the correct processor set is
@@ -2314,7 +2361,8 @@ choose_processor(
                /*
                 *      We must verify that the chosen processor is still available.
                 */
-               if (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)
+               if (processor->state == PROCESSOR_INACTIVE ||
+                                       processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)
                        processor = PROCESSOR_NULL;
        } while (processor == PROCESSOR_NULL);
 
@@ -2430,8 +2478,8 @@ thread_setrun(
 /*
  *     processor_queue_shutdown:
  *
- *     Shutdown a processor run queue by moving
- *     non-bound threads to the current processor.
+ *     Shutdown a processor run queue by
+ *     re-dispatching non-bound threads.
  *
  *     Associated pset must be locked, and is
  *     returned unlocked.
@@ -2480,35 +2528,25 @@ processor_queue_shutdown(
 
        pset_unlock(pset);
 
-       processor = current_processor();
-       pset = processor->processor_set;
-
        while ((thread = (thread_t)dequeue_head(&tqueue)) != THREAD_NULL) {
                thread_lock(thread);
-               thread->last_processor = PROCESSOR_NULL;
 
-               pset_lock(pset);
-
-               processor_enqueue(processor, thread, SCHED_TAILQ);
-
-               pset_unlock(pset);
+               thread_setrun(thread, SCHED_TAILQ);
 
                thread_unlock(thread);
        }
 }
 
 /*
- *     Check for a possible preemption point in
- *     the (current) thread.
+ *     Check for a preemption point in
+ *     the current context.
  *
  *     Called at splsched.
  */
 ast_t
 csw_check(
-       thread_t                thread,
        processor_t             processor)
 {
-       int                             current_pri = thread->sched_pri;
        ast_t                   result = AST_NONE;
        run_queue_t             runq;
 
@@ -2517,7 +2555,7 @@ csw_check(
                if (runq->highq >= BASEPRI_RTQUEUES)
                        return (AST_PREEMPT | AST_URGENT);
 
-               if (runq->highq > current_pri) {
+               if (runq->highq > processor->current_pri) {
                        if (runq->urgency > 0)
                                return (AST_PREEMPT | AST_URGENT);
 
@@ -2525,7 +2563,7 @@ csw_check(
                }
 
                runq = &processor->runq;
-               if (runq->highq > current_pri) {
+               if (runq->highq > processor->current_pri) {
                        if (runq->urgency > 0)
                                return (AST_PREEMPT | AST_URGENT);
 
@@ -2534,7 +2572,7 @@ csw_check(
        }
        else {
                runq = &rt_runq;
-               if (runq->highq >= current_pri) {
+               if (runq->highq >= processor->current_pri) {
                        if (runq->urgency > 0)
                                return (AST_PREEMPT | AST_URGENT);
 
@@ -2542,7 +2580,7 @@ csw_check(
                }
 
                runq = &processor->runq;
-               if (runq->highq >= current_pri) {
+               if (runq->highq >= processor->current_pri) {
                        if (runq->urgency > 0)
                                return (AST_PREEMPT | AST_URGENT);
 
@@ -2553,10 +2591,13 @@ csw_check(
        if (result != AST_NONE)
                return (result);
 
-       if (thread->state & TH_SUSP)
-               result |= AST_PREEMPT;
+       if (machine_cpu_is_inactive(processor->cpu_num))
+               return (AST_PREEMPT);
 
-       return (result);
+       if (processor->active_thread->state & TH_SUSP)
+               return (AST_PREEMPT);
+
+       return (AST_NONE);
 }
 
 /*
@@ -2583,11 +2624,11 @@ set_sched_pri(
                processor_t             processor = thread->last_processor;
 
                if (thread == current_thread()) {
-                       ast_t           preempt = csw_check(thread, processor);
+                       ast_t                   preempt;
 
-                       if (preempt != AST_NONE)
-                               ast_on(preempt);
                        processor->current_pri = priority;
+                       if ((preempt = csw_check(processor)) != AST_NONE)
+                               ast_on(preempt);
                }
                else
                if (    processor != PROCESSOR_NULL                                             &&
@@ -2700,44 +2741,6 @@ run_queue_remove(
        return (processor != PROCESSOR_NULL);
 }
 
-/*
- *     choose_thread:
- *
- *     Choose a thread to execute from the run queues
- *     and return it.
- *
- *     Called with pset scheduling lock and rt lock held,
- *     released on return.
- */
-static thread_t
-choose_thread(
-       processor_t                     processor)
-{
-       processor_set_t         pset = processor->processor_set;
-       thread_t                        thread;
-
-       if (processor->runq.count > 0 && processor->runq.highq >= rt_runq.highq) {
-               simple_unlock(&rt_lock);
-
-               thread = run_queue_dequeue(&processor->runq, SCHED_HEADQ);
-
-               pset_pri_hint(pset, processor, thread->sched_pri);
-
-               processor->deadline = UINT64_MAX;
-               pset_unlock(pset);
-
-               return (thread);
-       }
-
-       thread = run_queue_dequeue(&rt_runq, SCHED_HEADQ);
-       simple_unlock(&rt_lock);
-
-       processor->deadline = thread->realtime.deadline;
-       pset_unlock(pset);
-
-       return (thread);
-}
-
 /*
  *     steal_processor_thread:
  *
@@ -2813,7 +2816,6 @@ steal_thread(
                                        remqueue(&cset->active_queue, (queue_entry_t)processor);
                                        enqueue_tail(&cset->active_queue, (queue_entry_t)processor);
 
-                                       processor->deadline = UINT64_MAX;
                                        pset_unlock(cset);
 
                                        return (thread);
@@ -2872,6 +2874,9 @@ processor_idle(
                machine_idle();
 
                (void)splsched();
+
+               if (processor->state == PROCESSOR_INACTIVE && !machine_cpu_is_inactive(processor->cpu_num))
+                       break;
        }
 
        timer_switch(&PROCESSOR_DATA(processor, idle_state),
@@ -2919,12 +2924,16 @@ processor_idle(
        else
        if (state == PROCESSOR_IDLE) {
                remqueue(&pset->idle_queue, (queue_entry_t)processor);
-               pset->idle_count--;
 
                processor->state = PROCESSOR_RUNNING;
                enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
        }
        else
+       if (state == PROCESSOR_INACTIVE) {
+               processor->state = PROCESSOR_RUNNING;
+               enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
+       }
+       else
        if (state == PROCESSOR_SHUTDOWN) {
                /*
                 *      Going off-line.  Force a
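
Taken together, the changes above thread the new PROCESSOR_INACTIVE state through the scheduler: when machine_cpu_is_inactive() reports a CPU as unavailable, thread_select() refuses to keep work on it, pulls it off the active or idle queue, and parks it on its idle thread; processor_idle() then watches for reactivation and rejoins the active queue. A reduced model of that state handling; the enum values and the platform table are stand-ins, not the kernel's:

    #include <stdbool.h>

    enum pstate { OFF_LINE, SHUTDOWN, START, INACTIVE, IDLE, DISPATCHING, RUNNING };

    struct cpu { enum pstate state; int cpu_num; };

    static bool inactive_mask[8];           /* toy stand-in for the platform's view */

    static bool
    cpu_is_inactive(int num)                /* models machine_cpu_is_inactive() */
    {
            return inactive_mask[num];
    }

    /* Models the thread_select() branch: park an unwanted CPU. */
    void
    park_if_inactive(struct cpu *p)
    {
            if (p->state != SHUTDOWN && cpu_is_inactive(p->cpu_num))
                    p->state = INACTIVE;    /* run only its idle thread from here */
    }

    /* Models the wake-up test in processor_idle(): leave the idle loop
     * once the platform marks the CPU available again. */
    bool
    should_leave_idle(struct cpu *p)
    {
            return p->state == INACTIVE && !cpu_is_inactive(p->cpu_num);
    }
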
index 68cdcabc0fd54b111f3e4e5defec347eb069a776..fe792f9979ac4f257c262166705a13111664eb91 100644
@@ -61,7 +61,8 @@ decl_simple_lock_data(static,stack_lock_data)
 static vm_offset_t             stack_free_list;
 
 static unsigned int            stack_free_count, stack_free_hiwat;             /* free list count */
-static unsigned int            stack_total, stack_hiwat;                               /* current total count */
+static unsigned int            stack_hiwat;
+unsigned int                   stack_total;                            /* current total count */
 
 static unsigned int            stack_free_target;
 static int                             stack_free_delta;
index d0a496a07227015bb5e487d1b1c57221582c8ba9..a4f1eebd1fe6d8c5fdb672aecb1550f318497744 100644
@@ -407,7 +407,7 @@ load_context(
        load_context_kprintf("calling processor_up\n");
        processor_up(processor);
 
-       PMAP_ACTIVATE_KERNEL(PROCESSOR_DATA(processor, slot_num));
+       PMAP_ACTIVATE_KERNEL(processor->cpu_num);
 
        /*
         * Acquire a stack if none attached.  The panic
@@ -441,7 +441,7 @@ load_context(
        timer_start(&PROCESSOR_DATA(processor, system_state), processor->last_dispatch);
        PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
 
-       PMAP_ACTIVATE_USER(thread, PROCESSOR_DATA(processor, slot_num));
+       PMAP_ACTIVATE_USER(thread, processor->cpu_num);
 
        load_context_kprintf("calling machine_load_context\n");
        machine_load_context(thread);
index 311e96c7dad5ebe338ebb03312c87574385ab9c1..15af1fa7e03aebd2c754100423de7d9c4ed8070a 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -436,7 +436,7 @@ thread_poll_yield(
                                self->depress_timer_active++;
                        thread_unlock(self);
 
-                       if ((preempt = csw_check(self, myprocessor)) != AST_NONE)
+                       if ((preempt = csw_check(myprocessor)) != AST_NONE)
                                ast_on(preempt);
                }
        }
index d10c7b4bbffe6678e89b4efd851f9ab5d415e66c..7ae31523c600e427db88d68e10d3083403288c97 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1993-1995, 1999-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 1993-1995, 1999-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -30,7 +30,7 @@
 #include <mach/thread_act.h>
 
 #include <kern/kern_types.h>
-#include <kern/kalloc.h>
+#include <kern/zalloc.h>
 #include <kern/sched_prim.h>
 #include <kern/clock.h>
 #include <kern/task.h>
 
 #include <sys/kdebug.h>
 
-#define internal_call_num      768
+decl_simple_lock_data(static,thread_call_lock)
 
-#define thread_call_thread_min 4
+static zone_t          thread_call_zone;
 
-static
-thread_call_data_t
-       internal_call_storage[internal_call_num];
+struct thread_call_group {
+       queue_head_t            pending_queue;
+       uint32_t                        pending_count;
 
-decl_simple_lock_data(static,thread_call_lock)
+       queue_head_t            delayed_queue;
 
-static
-timer_call_data_t
-       thread_call_delaytimer;
+       timer_call_data_t       delayed_timer;
 
-static
-queue_head_t
-       thread_call_xxx_queue,
-       thread_call_pending_queue, thread_call_delayed_queue;
+       struct wait_queue       idle_wqueue;
+       uint32_t                        idle_count, active_count;
+};
 
-static
-struct wait_queue
-       call_thread_waitqueue;
+typedef struct thread_call_group       *thread_call_group_t;
 
-static
-boolean_t
-       activate_thread_awake;
-
-static struct {
-       int             pending_num,
-                       pending_hiwat;
-       int             active_num,
-                       active_hiwat,
-                       active_lowat;
-       int             delayed_num,
-                       delayed_hiwat;
-       int             idle_thread_num;
-       int             thread_num,
-                       thread_hiwat,
-                       thread_lowat;
-} thread_call_vars;
+static struct thread_call_group                thread_call_group0;
 
-static __inline__ thread_call_t
-       _internal_call_allocate(void);
+static boolean_t                       thread_call_daemon_awake;
 
-static __inline__ void
-_internal_call_release(
-       thread_call_t           call
-);
+#define thread_call_thread_min 4
 
-static __inline__ void
-_pending_call_enqueue(
-       thread_call_t           call
-),
-_pending_call_dequeue(
-       thread_call_t           call
-),
-_delayed_call_enqueue(
-       thread_call_t           call
-),
-_delayed_call_dequeue(
-       thread_call_t           call
-);
+#define internal_call_count    768
 
-static __inline__ void
-_set_delayed_call_timer(
-       thread_call_t           call
-);
-                                       
-static boolean_t
-_remove_from_pending_queue(
-       thread_call_func_t      func,
-       thread_call_param_t     param0,
-       boolean_t                       remove_all
-),
-_remove_from_delayed_queue(
-       thread_call_func_t      func,
-       thread_call_param_t     param0,
-       boolean_t                       remove_all
-);
+static thread_call_data_t      internal_call_storage[internal_call_count];
+static queue_head_t                    thread_call_internal_queue;
 
-static inline void
-       _call_thread_wake(void);
+static __inline__ thread_call_t                _internal_call_allocate(void);
 
-static void
-       _call_thread(void),
-       _activate_thread(void);
+static __inline__ void _internal_call_release(
+                                                       thread_call_t           call);
 
-static void
-_delayed_call_timer(
-       timer_call_param_t              p0,
-       timer_call_param_t              p1
-);
+static __inline__ boolean_t    _pending_call_enqueue(
+                                                               thread_call_t           call,
+                                                               thread_call_group_t     group),
+                                                       _delayed_call_enqueue(
+                                                               thread_call_t           call,
+                                                               thread_call_group_t     group,
+                                                               uint64_t                        deadline),
+                                                       _call_dequeue(
+                                                               thread_call_t           call,
+                                                               thread_call_group_t     group);
+
+static __inline__ void thread_call_wake(
+                                                       thread_call_group_t     group);
+
+static __inline__ void _set_delayed_call_timer(
+                                                       thread_call_t           call,
+                                                       thread_call_group_t     group);
+                                       
+static boolean_t       _remove_from_pending_queue(
+                                               thread_call_func_t              func,
+                                               thread_call_param_t             param0,
+                                               boolean_t                               remove_all),
+                                       _remove_from_delayed_queue(
+                                               thread_call_func_t              func,
+                                               thread_call_param_t             param0,
+                                               boolean_t                               remove_all);
+
+static void            thread_call_daemon(
+                                       thread_call_group_t             group),
+                               thread_call_thread(
+                                       thread_call_group_t             group);
+
+static void            thread_call_delayed_timer(
+                                       timer_call_param_t              p0,
+                                       timer_call_param_t              p1);
 
 #define qe(x)          ((queue_entry_t)(x))
 #define TC(x)          ((thread_call_t)(x))
 
 /*
- * Routine:    thread_call_initialize [public]
- *
- * Description:        Initialize this module, called
- *             early during system initialization.
+ *     thread_call_initialize:
  *
- * Preconditions:      None.
- *
- * Postconditions:     None.
+ *     Initialize this module, called
+ *     early during system initialization.
  */
-
 void
 thread_call_initialize(void)
 {
-       kern_return_t   result;
-       thread_t                thread;
-    thread_call_t      call;
-       spl_t                   s;
+    thread_call_t                      call;
+       thread_call_group_t             group = &thread_call_group0;
+       kern_return_t                   result;
+       thread_t                                thread;
+       int                                             i;
+       spl_t                                   s;
+
+       i = sizeof (thread_call_data_t);
+       thread_call_zone = zinit(i, 4096 * i, 16 * i, "thread_call");
 
     simple_lock_init(&thread_call_lock, 0);
 
        s = splsched();
        simple_lock(&thread_call_lock);
 
-    queue_init(&thread_call_pending_queue);
-    queue_init(&thread_call_delayed_queue);
+    queue_init(&group->pending_queue);
+    queue_init(&group->delayed_queue);
+
+       timer_call_setup(&group->delayed_timer, thread_call_delayed_timer, group);
 
-    queue_init(&thread_call_xxx_queue);
+       wait_queue_init(&group->idle_wqueue, SYNC_POLICY_FIFO);
+
+    queue_init(&thread_call_internal_queue);
     for (
                call = internal_call_storage;
-                       call < &internal_call_storage[internal_call_num];
+                       call < &internal_call_storage[internal_call_count];
                        call++) {
 
-               enqueue_tail(&thread_call_xxx_queue, qe(call));
+               enqueue_tail(&thread_call_internal_queue, qe(call));
     }
 
-       timer_call_setup(&thread_call_delaytimer, _delayed_call_timer, NULL);
-
-       wait_queue_init(&call_thread_waitqueue, SYNC_POLICY_FIFO);
-       thread_call_vars.thread_lowat = thread_call_thread_min;
-
-       activate_thread_awake = TRUE;
+       thread_call_daemon_awake = TRUE;
 
        simple_unlock(&thread_call_lock);
        splx(s);
 
-       result = kernel_thread_start_priority((thread_continue_t)_activate_thread, NULL, MAXPRI_KERNEL - 2, &thread);
+       result = kernel_thread_start_priority((thread_continue_t)thread_call_daemon, group, BASEPRI_PREEMPT + 1, &thread);
        if (result != KERN_SUCCESS)
                panic("thread_call_initialize");
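
A note on the zinit() call above: its arguments are (element size, maximum zone size in bytes, growth increment in bytes, name), so thread_call entries now come from a dedicated zone capped at 4096 entries and grown 16 entries at a time, replacing the kalloc()/kfree() pair that thread_call_allocate()/thread_call_free() used, as the later hunks show. A toy free-list pool with the same alloc/free shape (an illustration only, not zalloc itself, and growing one element at a time where real zones grow in batches):

    #include <stdlib.h>

    struct pool {
            size_t  elem_size;
            void    *free_list;     /* first word of a free element links to the next */
    };

    void *
    pool_alloc(struct pool *z)
    {
            void *e = z->free_list;
            if (e != NULL)
                    z->free_list = *(void **)e;     /* pop the free list */
            else
                    e = malloc(z->elem_size);       /* grow the pool */
            return e;
    }

    void
    pool_free(struct pool *z, void *e)
    {
            *(void **)e = z->free_list;             /* push back on the free list */
            z->free_list = e;
    }
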
 
@@ -199,218 +175,170 @@ void
 thread_call_setup(
        thread_call_t                   call,
        thread_call_func_t              func,
-       thread_call_param_t             param0
-)
+       thread_call_param_t             param0)
 {
        call_entry_setup(call, func, param0);
 }
 
 /*
- * Routine:    _internal_call_allocate [private, inline]
- *
- * Purpose:    Allocate an internal callout entry.
+ *     _internal_call_allocate:
  *
- * Preconditions:      thread_call_lock held.
+ *     Allocate an internal callout entry.
  *
- * Postconditions:     None.
+ *     Called with thread_call_lock held.
  */
-
 static __inline__ thread_call_t
 _internal_call_allocate(void)
 {
     thread_call_t              call;
     
-    if (queue_empty(&thread_call_xxx_queue))
+    if (queue_empty(&thread_call_internal_queue))
        panic("_internal_call_allocate");
        
-    call = TC(dequeue_head(&thread_call_xxx_queue));
+    call = TC(dequeue_head(&thread_call_internal_queue));
     
     return (call);
 }
 
 /*
- * Routine:    _internal_call_release [private, inline]
+ *     _internal_call_release:
  *
- * Purpose:    Release an internal callout entry which
- *             is no longer pending (or delayed).
+ *     Release an internal callout entry which
+ *     is no longer pending (or delayed).
  *
- * Preconditions:      thread_call_lock held.
- *
- * Postconditions:     None.
+ *     Called with thread_call_lock held.
  */
-
-static __inline__
-void
+static __inline__ void
 _internal_call_release(
-    thread_call_t              call
-)
+    thread_call_t              call)
 {
     if (    call >= internal_call_storage                                              &&
-                   call < &internal_call_storage[internal_call_num]            )
-               enqueue_head(&thread_call_xxx_queue, qe(call));
+                   call < &internal_call_storage[internal_call_count]          )
+               enqueue_head(&thread_call_internal_queue, qe(call));
 }
 
 /*
- * Routine:    _pending_call_enqueue [private, inline]
+ *     _pending_call_enqueue:
  *
- * Purpose:    Place an entry at the end of the
- *             pending queue, to be executed soon.
+ *     Place an entry at the end of the
+ *     pending queue, to be executed soon.
  *
- * Preconditions:      thread_call_lock held.
+ *     Returns TRUE if the entry was already
+ *     on a queue.
  *
- * Postconditions:     None.
+ *     Called with thread_call_lock held.
  */
-
-static __inline__
-void
+static __inline__ boolean_t
 _pending_call_enqueue(
-    thread_call_t              call
-)
+    thread_call_t              call,
+       thread_call_group_t     group)
 {
-    enqueue_tail(&thread_call_pending_queue, qe(call));
-       if (++thread_call_vars.pending_num > thread_call_vars.pending_hiwat)
-               thread_call_vars.pending_hiwat = thread_call_vars.pending_num;
+       queue_t         old_queue;
 
-    call->state = PENDING;
-}
+       old_queue = call_entry_enqueue_tail(call, &group->pending_queue);
 
-/*
- * Routine:    _pending_call_dequeue [private, inline]
- *
- * Purpose:    Remove an entry from the pending queue,
- *             effectively unscheduling it.
- *
- * Preconditions:      thread_call_lock held.
- *
- * Postconditions:     None.
- */
+       group->pending_count++;
 
-static __inline__
-void
-_pending_call_dequeue(
-    thread_call_t              call
-)
-{
-    (void)remque(qe(call));
-       thread_call_vars.pending_num--;
-    
-    call->state = IDLE;
+       return (old_queue != NULL);
 }
 
 /*
- * Routine:    _delayed_call_enqueue [private, inline]
+ *     _delayed_call_enqueue:
  *
- * Purpose:    Place an entry on the delayed queue,
- *             after existing entries with an earlier
- *             (or identical) deadline.
+ *     Place an entry on the delayed queue,
+ *     after existing entries with an earlier
+ *     (or identical) deadline.
  *
- * Preconditions:      thread_call_lock held.
+ *     Returns TRUE if the entry was already
+ *     on a queue.
  *
- * Postconditions:     None.
+ *     Called with thread_call_lock held.
  */
-
-static __inline__
-void
+static __inline__ boolean_t
 _delayed_call_enqueue(
-    thread_call_t              call
-)
+    thread_call_t              call,
+       thread_call_group_t     group,
+       uint64_t                        deadline)
 {
-    thread_call_t              current;
-    
-    current = TC(queue_first(&thread_call_delayed_queue));
-    
-    while (TRUE) {
-       if (    queue_end(&thread_call_delayed_queue, qe(current))              ||
-                                       call->deadline < current->deadline                      ) {
-                       current = TC(queue_prev(qe(current)));
-                       break;
-               }
-           
-               current = TC(queue_next(qe(current)));
-    }
+       queue_t                 old_queue;
 
-    insque(qe(call), qe(current));
-       if (++thread_call_vars.delayed_num > thread_call_vars.delayed_hiwat)
-               thread_call_vars.delayed_hiwat = thread_call_vars.delayed_num;
-    
-    call->state = DELAYED;
+       old_queue = call_entry_enqueue_deadline(call, &group->delayed_queue, deadline);
+
+       if (old_queue == &group->pending_queue)
+               group->pending_count--;
+
+       return (old_queue != NULL);
 }
 
 /*
- * Routine:    _delayed_call_dequeue [private, inline]
+ *     _call_dequeue:
  *
- * Purpose:    Remove an entry from the delayed queue,
- *             effectively unscheduling it.
+ *     Remove an entry from a queue.
  *
- * Preconditions:      thread_call_lock held.
+ *     Returns TRUE if the entry was on a queue.
  *
- * Postconditions:     None.
+ *     Called with thread_call_lock held.
  */
-
-static __inline__
-void
-_delayed_call_dequeue(
-    thread_call_t              call
-)
+static __inline__ boolean_t
+_call_dequeue(
+       thread_call_t           call,
+       thread_call_group_t     group)
 {
-    (void)remque(qe(call));
-       thread_call_vars.delayed_num--;
-    
-    call->state = IDLE;
+       queue_t                 old_queue;
+
+       old_queue = call_entry_dequeue(call);
+
+       if (old_queue == &group->pending_queue)
+               group->pending_count--;
+
+       return (old_queue != NULL);
 }
 
 /*
- * Routine:    _set_delayed_call_timer [private]
+ *     _set_delayed_call_timer:
  *
- * Purpose:    Reset the timer so that it
- *             next expires when the entry is due.
+ *     Reset the timer so that it
+ *     next expires when the entry is due.
  *
- * Preconditions:      thread_call_lock held.
- *
- * Postconditions:     None.
+ *     Called with thread_call_lock held.
  */
-
 static __inline__ void
 _set_delayed_call_timer(
-    thread_call_t              call
-)
+    thread_call_t              call,
+       thread_call_group_t     group)
 {
-    timer_call_enter(&thread_call_delaytimer, call->deadline);
+    timer_call_enter(&group->delayed_timer, call->deadline);
 }
 
 /*
- * Routine:    _remove_from_pending_queue [private]
+ *     _remove_from_pending_queue:
  *
- * Purpose:    Remove the first (or all) matching
- *             entries from the pending queue,
- *             effectively unscheduling them.
- *             Returns whether any matching entries
- *             were found.
+ *     Remove the first (or all) matching
+ *     entries from the pending queue.
  *
- * Preconditions:      thread_call_lock held.
+ *     Returns TRUE if any matching entries
+ *     were found.
  *
- * Postconditions:     None.
+ *     Called with thread_call_lock held.
  */
-
-static
-boolean_t
+static boolean_t
 _remove_from_pending_queue(
     thread_call_func_t         func,
     thread_call_param_t                param0,
-    boolean_t                          remove_all
-)
+    boolean_t                          remove_all)
 {
-       boolean_t                       call_removed = FALSE;
-       thread_call_t           call;
+       boolean_t                               call_removed = FALSE;
+       thread_call_t                   call;
+       thread_call_group_t             group = &thread_call_group0;
     
-    call = TC(queue_first(&thread_call_pending_queue));
+    call = TC(queue_first(&group->pending_queue));
     
-    while (!queue_end(&thread_call_pending_queue, qe(call))) {
+    while (!queue_end(&group->pending_queue, qe(call))) {
        if (    call->func == func                      &&
                                call->param0 == param0                  ) {
                        thread_call_t   next = TC(queue_next(qe(call)));
                
-                       _pending_call_dequeue(call);
+                       _call_dequeue(call, group);
 
                        _internal_call_release(call);
            
@@ -428,38 +356,34 @@ _remove_from_pending_queue(
 }
 
 /*
- * Routine:    _remove_from_delayed_queue [private]
+ *     _remove_from_delayed_queue:
  *
- * Purpose:    Remove the first (or all) matching
- *             entries from the delayed queue,
- *             effectively unscheduling them.
- *             Returns whether any matching entries
- *             were found.
+ *     Remove the first (or all) matching
+ *     entries from the delayed queue.
  *
- * Preconditions:      thread_call_lock held.
+ *     Returns TRUE if any matching entries
+ *     were found.
  *
- * Postconditions:     None.
+ *     Called with thread_call_lock held.
  */
-
-static
-boolean_t
+static boolean_t
 _remove_from_delayed_queue(
     thread_call_func_t         func,
     thread_call_param_t                param0,
-    boolean_t                          remove_all
-)
+    boolean_t                          remove_all)
 {
-    boolean_t                  call_removed = FALSE;
-    thread_call_t              call;
+    boolean_t                          call_removed = FALSE;
+    thread_call_t                      call;
+       thread_call_group_t             group = &thread_call_group0;
     
-    call = TC(queue_first(&thread_call_delayed_queue));
+    call = TC(queue_first(&group->delayed_queue));
     
-    while (!queue_end(&thread_call_delayed_queue, qe(call))) {
+    while (!queue_end(&group->delayed_queue, qe(call))) {
        if (    call->func == func                      &&
                                call->param0 == param0                  ) {
                        thread_call_t   next = TC(queue_next(qe(call)));
                
-                       _delayed_call_dequeue(call);
+                       _call_dequeue(call, group);
            
                        _internal_call_release(call);
            
@@ -477,34 +401,29 @@ _remove_from_delayed_queue(
 }
 
 /*
- * Routine:    thread_call_func [public]
- *
- * Purpose:    Schedule a function callout.
- *             Guarantees { function, argument }
- *             uniqueness if unique_call is TRUE.
+ *     thread_call_func:
  *
- * Preconditions:      Callable from an interrupt context
- *                                     below splsched.
+ *     Enqueue a function callout.
  *
- * Postconditions:     None.
+ *     Guarantees { function, argument }
+ *     uniqueness if unique_call is TRUE.
  */
-
 void
 thread_call_func(
     thread_call_func_t         func,
     thread_call_param_t                param,
-    boolean_t                          unique_call
-)
+    boolean_t                          unique_call)
 {
-    thread_call_t              call;
-    spl_t                              s;
+    thread_call_t                      call;
+       thread_call_group_t             group = &thread_call_group0;
+    spl_t                                      s;
     
     s = splsched();
     simple_lock(&thread_call_lock);
     
-    call = TC(queue_first(&thread_call_pending_queue));
+    call = TC(queue_first(&group->pending_queue));
     
-       while (unique_call && !queue_end(&thread_call_pending_queue, qe(call))) {
+       while (unique_call && !queue_end(&group->pending_queue, qe(call))) {
        if (    call->func == func                      &&
                                call->param0 == param                   ) {
                        break;
@@ -513,16 +432,16 @@ thread_call_func(
                call = TC(queue_next(qe(call)));
     }
     
-    if (!unique_call || queue_end(&thread_call_pending_queue, qe(call))) {
+    if (!unique_call || queue_end(&group->pending_queue, qe(call))) {
                call = _internal_call_allocate();
                call->func                      = func;
                call->param0            = param;
                call->param1            = NULL;
        
-               _pending_call_enqueue(call);
+               _pending_call_enqueue(call, group);
                
-               if (thread_call_vars.active_num <= 0)
-                       _call_thread_wake();
+               if (group->active_count == 0)
+                       thread_call_wake(group);
     }
 
        simple_unlock(&thread_call_lock);
@@ -530,26 +449,20 @@ thread_call_func(
 }
 
 /*
- * Routine:    thread_call_func_delayed [public]
- *
- * Purpose:    Schedule a function callout to
- *             occur at the stated time.
- *
- * Preconditions:      Callable from an interrupt context
- *                                     below splsched.
+ *     thread_call_func_delayed:
  *
- * Postconditions:     None.
+ *     Enqueue a function callout to
+ *     occur at the stated time.
  */
-
 void
 thread_call_func_delayed(
     thread_call_func_t         func,
     thread_call_param_t                param,
-    uint64_t                           deadline
-)
+    uint64_t                           deadline)
 {
-    thread_call_t              call;
-    spl_t                              s;
+    thread_call_t                      call;
+       thread_call_group_t             group = &thread_call_group0;
+    spl_t                                      s;
     
     s = splsched();
     simple_lock(&thread_call_lock);
@@ -558,41 +471,33 @@ thread_call_func_delayed(
     call->func                 = func;
     call->param0               = param;
     call->param1               = 0;
-    call->deadline             = deadline;
     
-    _delayed_call_enqueue(call);
+    _delayed_call_enqueue(call, group, deadline);
     
-    if (queue_first(&thread_call_delayed_queue) == qe(call))
-       _set_delayed_call_timer(call);
+    if (queue_first(&group->delayed_queue) == qe(call))
+       _set_delayed_call_timer(call, group);
     
     simple_unlock(&thread_call_lock);
     splx(s);
 }
 
 /*
- * Routine:    thread_call_func_cancel [public]
+ *     thread_call_func_cancel:
  *
- * Purpose:    Unschedule a function callout.
- *             Removes one (or all)
- *             { function, argument }
- *             instance(s) from either (or both)
- *             the pending and the delayed queue,
- *             in that order.  Returns a boolean
- *             indicating whether any calls were
- *             cancelled.
+ *     Dequeue a function callout.
  *
- * Preconditions:      Callable from an interrupt context
- *                                     below splsched.
+ *     Removes one (or all) { function, argument }
+ *     instance(s) from either (or both)
+ *     the pending and the delayed queue,
+ *     in that order.
  *
- * Postconditions:     None.
+ *     Returns TRUE if any calls were cancelled.
  */
-
 boolean_t
 thread_call_func_cancel(
     thread_call_func_t         func,
     thread_call_param_t                param,
-    boolean_t                          cancel_all
-)
+    boolean_t                          cancel_all)
 {
        boolean_t                       result;
     spl_t                              s;
@@ -614,53 +519,37 @@ thread_call_func_cancel(
 }
 
 /*
- * Routine:    thread_call_allocate [public]
+ *     thread_call_allocate:
  *
- * Purpose:    Allocate an external callout
- *             entry.
- *
- * Preconditions:      None.
- *
- * Postconditions:     None.
+ *     Allocate a callout entry.
  */
-
 thread_call_t
 thread_call_allocate(
     thread_call_func_t         func,
-    thread_call_param_t                param0
-)
+    thread_call_param_t                param0)
 {
-    thread_call_t              call = (void *)kalloc(sizeof (thread_call_data_t));
-    
-    call->func                 = func;
-    call->param0               = param0;
-    call->state                        = IDLE;
-    
+    thread_call_t              call = zalloc(thread_call_zone);
+
+       call_entry_setup(call, func, param0);
+
     return (call);
 }
 
 /*
- * Routine:    thread_call_free [public]
- *
- * Purpose:    Free an external callout
- *             entry.
- *
- * Preconditions:      None.
+ *     thread_call_free:
  *
- * Postconditions:     None.
+ *     Free a callout entry.
  */
-
 boolean_t
 thread_call_free(
-    thread_call_t              call
-)
+    thread_call_t              call)
 {
     spl_t              s;
     
     s = splsched();
     simple_lock(&thread_call_lock);
     
-    if (call->state != IDLE) {
+    if (call->queue != NULL) {
        simple_unlock(&thread_call_lock);
                splx(s);
 
@@ -670,46 +559,35 @@ thread_call_free(
     simple_unlock(&thread_call_lock);
     splx(s);
     
-    kfree(call, sizeof (thread_call_data_t));
+       zfree(thread_call_zone, call);
 
        return (TRUE);
 }
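
A minimal lifecycle sketch for the allocate/free pair above (my_callback and my_ctx are placeholders, not part of this change):

    thread_call_t call = thread_call_allocate(my_callback, (thread_call_param_t)my_ctx);

    (void) thread_call_enter(call);        /* queue it to run "soon" */

    /* later, at teardown */
    (void) thread_call_cancel(call);       /* FALSE if it already ran */
    if (!thread_call_free(call))           /* FALSE if still on a queue */
        panic("thread call still queued");
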
 
 /*
- * Routine:    thread_call_enter [public]
+ *     thread_call_enter:
  *
- * Purpose:    Schedule an external callout 
- *             entry to occur "soon".  Returns a
- *             boolean indicating whether the call
- *             had been already scheduled.
+ *     Enqueue a callout entry to occur "soon".
  *
- * Preconditions:      Callable from an interrupt context
- *                                     below splsched.
- *
- * Postconditions:     None.
+ *     Returns TRUE if the call was
+ *     already on a queue.
  */
-
 boolean_t
 thread_call_enter(
-    thread_call_t              call
-)
+    thread_call_t              call)
 {
-       boolean_t               result = TRUE;
-    spl_t                      s;
+       boolean_t                               result = TRUE;
+       thread_call_group_t             group = &thread_call_group0;
+    spl_t                                      s;
     
     s = splsched();
     simple_lock(&thread_call_lock);
     
-    if (call->state != PENDING) {
-               if (call->state == DELAYED)
-                       _delayed_call_dequeue(call);
-               else if (call->state == IDLE)
-                       result = FALSE;
-
-       _pending_call_enqueue(call);
+    if (call->queue != &group->pending_queue) {
+       result = _pending_call_enqueue(call, group);
                
-               if (thread_call_vars.active_num <= 0)
-                       _call_thread_wake();
+               if (group->active_count == 0)
+                       thread_call_wake(group);
        }
 
        call->param1 = 0;
@@ -723,26 +601,21 @@ thread_call_enter(
 boolean_t
 thread_call_enter1(
     thread_call_t                      call,
-    thread_call_param_t                param1
-)
+    thread_call_param_t                param1)
 {
-       boolean_t                       result = TRUE;
-    spl_t                              s;
+       boolean_t                               result = TRUE;
+       thread_call_group_t             group = &thread_call_group0;
+    spl_t                                      s;
     
     s = splsched();
     simple_lock(&thread_call_lock);
     
-    if (call->state != PENDING) {
-               if (call->state == DELAYED)
-                       _delayed_call_dequeue(call);
-               else if (call->state == IDLE)
-                       result = FALSE;
-
-       _pending_call_enqueue(call);
-
-               if (thread_call_vars.active_num <= 0)
-                       _call_thread_wake();
-    }
+    if (call->queue != &group->pending_queue) {
+       result = _pending_call_enqueue(call, group);
+               
+               if (group->active_count == 0)
+                       thread_call_wake(group);
+       }
 
        call->param1 = param1;
 
@@ -753,45 +626,32 @@ thread_call_enter1(
 }
 
 /*
- * Routine:    thread_call_enter_delayed [public]
- *
- * Purpose:    Schedule an external callout 
- *             entry to occur at the stated time.
- *             Returns a boolean indicating whether
- *             the call had been already scheduled.
+ *     thread_call_enter_delayed:
  *
- * Preconditions:      Callable from an interrupt context
- *                                     below splsched.
+ *     Enqueue a callout entry to occur
+ *     at the stated time.
  *
- * Postconditions:     None.
+ *     Returns TRUE if the call was
+ *     already on a queue.
  */
-
 boolean_t
 thread_call_enter_delayed(
     thread_call_t              call,
-    uint64_t                   deadline
-)
+    uint64_t                   deadline)
 {
-       boolean_t               result = TRUE;
-    spl_t                      s;
+       boolean_t                               result = TRUE;
+       thread_call_group_t             group = &thread_call_group0;
+    spl_t                                      s;
 
     s = splsched();
     simple_lock(&thread_call_lock);
 
-       if (call->state == PENDING)
-               _pending_call_dequeue(call);
-       else if (call->state == DELAYED)
-               _delayed_call_dequeue(call);
-       else if (call->state == IDLE)
-               result = FALSE;
-
-       call->param1    = 0;
-       call->deadline  = deadline;
+       result = _delayed_call_enqueue(call, group, deadline);
 
-       _delayed_call_enqueue(call);
+       if (queue_first(&group->delayed_queue) == qe(call))
+               _set_delayed_call_timer(call, group);
 
-       if (queue_first(&thread_call_delayed_queue) == qe(call))
-               _set_delayed_call_timer(call);
+       call->param1 = 0;
 
     simple_unlock(&thread_call_lock);
     splx(s);
@@ -803,29 +663,21 @@ boolean_t
 thread_call_enter1_delayed(
     thread_call_t                      call,
     thread_call_param_t                param1,
-    uint64_t                           deadline
-)
+    uint64_t                           deadline)
 {
-       boolean_t                       result = TRUE;
-    spl_t                              s;
+       boolean_t                               result = TRUE;
+       thread_call_group_t             group = &thread_call_group0;
+    spl_t                                      s;
 
     s = splsched();
     simple_lock(&thread_call_lock);
 
-       if (call->state == PENDING)
-               _pending_call_dequeue(call);
-       else if (call->state == DELAYED)
-               _delayed_call_dequeue(call);
-       else if (call->state == IDLE)
-               result = FALSE;
+       result = _delayed_call_enqueue(call, group, deadline);
 
-       call->param1    = param1;
-       call->deadline  = deadline;
+       if (queue_first(&group->delayed_queue) == qe(call))
+               _set_delayed_call_timer(call, group);
 
-       _delayed_call_enqueue(call);
-
-       if (queue_first(&thread_call_delayed_queue) == qe(call))
-               _set_delayed_call_timer(call);
+       call->param1 = param1;
 
     simple_unlock(&thread_call_lock);
     splx(s);
@@ -834,36 +686,25 @@ thread_call_enter1_delayed(
 }
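
The delayed variants take a deadline in mach_absolute_time() units; a hedged sketch (call and arg are placeholders):

    uint64_t deadline;
    boolean_t was_queued;

    clock_interval_to_deadline(5, NSEC_PER_SEC, &deadline);    /* ~5 seconds out */
    was_queued = thread_call_enter1_delayed(call, (thread_call_param_t)arg, deadline);
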
 
 /*
- * Routine:    thread_call_cancel [public]
- *
- * Purpose:    Unschedule a callout entry.
- *             Returns a boolean indicating
- *             whether the call had actually
- *             been scheduled.
+ *     thread_call_cancel:
  *
- * Preconditions:      Callable from an interrupt context
- *                                     below splsched.
+ *     Dequeue a callout entry.
  *
- * Postconditions:     None.
+ *     Returns TRUE if the call was
+ *     on a queue.
  */
-
 boolean_t
 thread_call_cancel(
-    thread_call_t              call
-)
+    thread_call_t              call)
 {
-       boolean_t               result = TRUE;
-    spl_t                      s;
+       boolean_t                               result;
+       thread_call_group_t             group = &thread_call_group0;
+    spl_t                                      s;
     
     s = splsched();
     simple_lock(&thread_call_lock);
-    
-    if (call->state == PENDING)
-       _pending_call_dequeue(call);
-    else if (call->state == DELAYED)
-       _delayed_call_dequeue(call);
-    else
-       result = FALSE;
+
+       result = _call_dequeue(call, group);
        
     simple_unlock(&thread_call_lock);
     splx(s);
@@ -872,31 +713,26 @@ thread_call_cancel(
 }
 
 /*
- * Routine:    thread_call_is_delayed [public]
- *
- * Purpose:    Returns a boolean indicating
- *             whether a call is currently scheduled
- *             to occur at a later time.  Optionally
- *             returns the expiration time.
+ *     thread_call_is_delayed:
  *
- * Preconditions:      Callable from an interrupt context
- *                                     below splsched.
+ *     Returns TRUE if the call is
+ *     currently on a delayed queue.
  *
- * Postconditions:     None.
+ *     Optionally returns the expiration time.
  */
-
 boolean_t
 thread_call_is_delayed(
        thread_call_t           call,
        uint64_t                        *deadline)
 {
-       boolean_t               result = FALSE;
-       spl_t                   s;
+       boolean_t                               result = FALSE;
+       thread_call_group_t             group = &thread_call_group0;
+       spl_t                                   s;
 
        s = splsched();
        simple_lock(&thread_call_lock);
 
-       if (call->state == DELAYED) {
+       if (call->queue == &group->delayed_queue) {
                if (deadline != NULL)
                        *deadline = call->deadline;
                result = TRUE;
@@ -909,31 +745,26 @@ thread_call_is_delayed(
 }
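
And the (obsolete) query, showing the optional deadline out-parameter:

    uint64_t when;
    boolean_t armed = thread_call_is_delayed(call, &when);
    /* if armed, the call fires at 'when' (absolute-time units) */
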
 
 /*
- * Routine:    _call_thread_wake [private, inline]
- *
- * Purpose:    Wake a callout thread to service
- *             pending callout entries.  May wake
- *             the activate thread in order to
- *             create additional callout threads.
+ *     thread_call_wake:
  *
- * Preconditions:      thread_call_lock held.
+ *     Wake a call thread to service
+ *     pending call entries.  May wake
+ *     the daemon thread in order to
+ *     create additional call threads.
  *
- * Postconditions:     None.
+ *     Called with thread_call_lock held.
  */
-
-static inline void
-_call_thread_wake(void)
+static __inline__ void
+thread_call_wake(
+       thread_call_group_t             group)
 {
-       if (wait_queue_wakeup_one(&call_thread_waitqueue, NULL, THREAD_AWAKENED) == KERN_SUCCESS) {
-               thread_call_vars.idle_thread_num--;
-
-               if (++thread_call_vars.active_num > thread_call_vars.active_hiwat)
-                       thread_call_vars.active_hiwat = thread_call_vars.active_num;
+       if (group->idle_count > 0 && wait_queue_wakeup_one(&group->idle_wqueue, NULL, THREAD_AWAKENED) == KERN_SUCCESS) {
+               group->idle_count--; group->active_count++;
        }
        else
-       if (!activate_thread_awake) {
-               thread_wakeup_one(&activate_thread_awake);
-               activate_thread_awake = TRUE;
+       if (!thread_call_daemon_awake) {
+               thread_call_daemon_awake = TRUE;
+               thread_wakeup_one(&thread_call_daemon_awake);
        }
 }
 
@@ -942,28 +773,24 @@ _call_thread_wake(void)
  *
  *     Call out invoked by the scheduler.
  */
-
 static void
 sched_call_thread(
-                       int                     type,
-__unused       thread_t        thread)
+       int                             type,
+__unused       thread_t                thread)
 {
+       thread_call_group_t             group = &thread_call_group0;
+
        simple_lock(&thread_call_lock);
 
        switch (type) {
 
        case SCHED_CALL_BLOCK:
-               if (--thread_call_vars.active_num < thread_call_vars.active_lowat)
-                       thread_call_vars.active_lowat = thread_call_vars.active_num;
-
-               if (    thread_call_vars.active_num <= 0        &&
-                               thread_call_vars.pending_num > 0                )
-                       _call_thread_wake();
+               if (--group->active_count == 0 && group->pending_count > 0)
+                       thread_call_wake(group);
                break;
 
        case SCHED_CALL_UNBLOCK:
-               if (++thread_call_vars.active_num > thread_call_vars.active_hiwat)
-                       thread_call_vars.active_hiwat = thread_call_vars.active_num;
+               group->active_count++;
                break;
        }
 
@@ -971,18 +798,11 @@ __unused  thread_t        thread)
 }
 
 /*
- * Routine:    _call_thread [private]
- *
- * Purpose:    Executed by a callout thread.
- *
- * Preconditions:      None.
- *
- * Postconditions:     None.
+ *     thread_call_thread:
  */
-
-static
-void
-_call_thread_continue(void)
+static void
+thread_call_thread(
+       thread_call_group_t             group)
 {
        thread_t                self = current_thread();
 
@@ -991,19 +811,19 @@ _call_thread_continue(void)
 
        thread_sched_call(self, sched_call_thread);
 
-    while (thread_call_vars.pending_num > 0) {
+    while (group->pending_count > 0) {
                thread_call_t                   call;
                thread_call_func_t              func;
                thread_call_param_t             param0, param1;
 
-               call = TC(dequeue_head(&thread_call_pending_queue));
-               thread_call_vars.pending_num--;
+               call = TC(dequeue_head(&group->pending_queue));
+               group->pending_count--;
 
                func = call->func;
                param0 = call->param0;
                param1 = call->param1;
        
-               call->state = IDLE;
+               call->queue = NULL;
 
                _internal_call_release(call);
 
@@ -1016,31 +836,27 @@ _call_thread_continue(void)
 
                (*func)(param0, param1);
 
-               (void)thread_funnel_set(self->funnel_lock, FALSE);
+               (void)thread_funnel_set(self->funnel_lock, FALSE);              /* XXX */
 
                (void) splsched();
                simple_lock(&thread_call_lock);
     }
 
        thread_sched_call(self, NULL);
+       group->active_count--;
 
-       if (--thread_call_vars.active_num < thread_call_vars.active_lowat)
-               thread_call_vars.active_lowat = thread_call_vars.active_num;
-       
-    if (thread_call_vars.idle_thread_num < thread_call_vars.thread_lowat) {
-               thread_call_vars.idle_thread_num++;
+    if (group->idle_count < thread_call_thread_min) {
+               group->idle_count++;
 
-               wait_queue_assert_wait(&call_thread_waitqueue, NULL, THREAD_UNINT, 0);
+               wait_queue_assert_wait(&group->idle_wqueue, NULL, THREAD_UNINT, 0);
        
                simple_unlock(&thread_call_lock);
                (void) spllo();
 
-               thread_block((thread_continue_t)_call_thread_continue);
+               thread_block_parameter((thread_continue_t)thread_call_thread, group);
                /* NOTREACHED */
     }
-    
-    thread_call_vars.thread_num--;
-    
+
     simple_unlock(&thread_call_lock);
     (void) spllo();
     
@@ -1048,27 +864,12 @@ _call_thread_continue(void)
        /* NOTREACHED */
 }
 
-static
-void
-_call_thread(void)
-{
-    _call_thread_continue();
-    /* NOTREACHED */
-}
-
 /*
- * Routine:    _activate_thread [private]
- *
- * Purpose:    Executed by the activate thread.
- *
- * Preconditions:      None.
- *
- * Postconditions:     Never terminates.
+ *     thread_call_daemon:
  */
-
-static
-void
-_activate_thread_continue(void)
+static void
+thread_call_daemon_continue(
+       thread_call_group_t             group)
 {
        kern_return_t   result;
        thread_t                thread;
@@ -1076,89 +877,78 @@ _activate_thread_continue(void)
     (void) splsched();
     simple_lock(&thread_call_lock);
         
-       while (         thread_call_vars.active_num <= 0        &&
-                               thread_call_vars.pending_num > 0                ) {
-
-               if (++thread_call_vars.active_num > thread_call_vars.active_hiwat)
-                       thread_call_vars.active_hiwat = thread_call_vars.active_num;
-
-               if (++thread_call_vars.thread_num > thread_call_vars.thread_hiwat)
-                       thread_call_vars.thread_hiwat = thread_call_vars.thread_num;
+       while (group->active_count == 0 && group->pending_count > 0) {
+               group->active_count++;
 
                simple_unlock(&thread_call_lock);
                (void) spllo();
        
-               result = kernel_thread_start_priority((thread_continue_t)_call_thread, NULL, MAXPRI_KERNEL - 1, &thread);
+               result = kernel_thread_start_priority((thread_continue_t)thread_call_thread, group, BASEPRI_PREEMPT, &thread);
                if (result != KERN_SUCCESS)
-                       panic("activate_thread");
+                       panic("thread_call_daemon");
 
                thread_deallocate(thread);
 
                (void) splsched();
                simple_lock(&thread_call_lock);
     }
-               
-    assert_wait(&activate_thread_awake, THREAD_INTERRUPTIBLE);
-       activate_thread_awake = FALSE;
+
+       thread_call_daemon_awake = FALSE;
+    assert_wait(&thread_call_daemon_awake, THREAD_UNINT);
     
     simple_unlock(&thread_call_lock);
        (void) spllo();
     
-       thread_block((thread_continue_t)_activate_thread_continue);
+       thread_block_parameter((thread_continue_t)thread_call_daemon_continue, group);
        /* NOTREACHED */
 }
 
-static
-void
-_activate_thread(void)
+static void
+thread_call_daemon(
+       thread_call_group_t             group)
 {
        thread_t        self = current_thread();
 
        self->options |= TH_OPT_VMPRIV;
        vm_page_free_reserve(2);        /* XXX */
     
-    _activate_thread_continue();
+    thread_call_daemon_continue(group);
     /* NOTREACHED */
 }
 
-static
-void
-_delayed_call_timer(
-       __unused timer_call_param_t             p0,
+static void
+thread_call_delayed_timer(
+       timer_call_param_t                              p0,
        __unused timer_call_param_t             p1
 )
 {
-       uint64_t                        timestamp;
-    thread_call_t              call;
-       boolean_t                       new_pending = FALSE;
-    spl_t                              s;
+    thread_call_t                      call;
+       thread_call_group_t             group = p0;
+       boolean_t                               new_pending = FALSE;
+       uint64_t                                timestamp;
 
-    s = splsched();
     simple_lock(&thread_call_lock);
 
-       clock_get_uptime(&timestamp);
+       timestamp = mach_absolute_time();
     
-    call = TC(queue_first(&thread_call_delayed_queue));
+    call = TC(queue_first(&group->delayed_queue));
     
-    while (!queue_end(&thread_call_delayed_queue, qe(call))) {
+    while (!queue_end(&group->delayed_queue, qe(call))) {
        if (call->deadline <= timestamp) {
-                       _delayed_call_dequeue(call);
-
-                       _pending_call_enqueue(call);
+                       _pending_call_enqueue(call, group);
                        new_pending = TRUE;
                }
                else
                        break;
            
-               call = TC(queue_first(&thread_call_delayed_queue));
+               call = TC(queue_first(&group->delayed_queue));
     }
 
-       if (!queue_end(&thread_call_delayed_queue, qe(call)))
-               _set_delayed_call_timer(call);
+       if (!queue_end(&group->delayed_queue, qe(call)))
+               _set_delayed_call_timer(call, group);
 
-    if (new_pending && thread_call_vars.active_num <= 0)
-               _call_thread_wake();
+    if (new_pending && group->active_count == 0)
+               thread_call_wake(group);
 
     simple_unlock(&thread_call_lock);
-    splx(s);
 }
index 296c45079e99951e313fbfd35e3d7709e3596a9a..fbfa0fb2d7fe05892e694484c99afd7edef66155 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1993-1995, 1999-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 1993-1995, 1999-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -45,45 +45,31 @@ typedef void                                (*thread_call_func_t)(
                                                                        thread_call_param_t             param1);
 __BEGIN_DECLS
 
-boolean_t
-thread_call_enter(
-       thread_call_t           call
-);
-
-boolean_t
-thread_call_enter1(
-       thread_call_t                   call,
-       thread_call_param_t             param1
-);
-
-boolean_t
-thread_call_enter_delayed(
-       thread_call_t           call,
-       uint64_t                        deadline
-);
-
-boolean_t
-thread_call_enter1_delayed(
-       thread_call_t                   call,
-       thread_call_param_t             param1,
-       uint64_t                                deadline
-);
-
-boolean_t
-thread_call_cancel(
-       thread_call_t           call
-);
-
-thread_call_t
-thread_call_allocate(
-       thread_call_func_t              func,
-       thread_call_param_t             param0
-);
-
-boolean_t
-thread_call_free(
-       thread_call_t           call
-);
+extern boolean_t       thread_call_enter(
+                                               thread_call_t           call);
+
+extern boolean_t       thread_call_enter1(
+                                               thread_call_t                   call,
+                                               thread_call_param_t             param1);
+
+extern boolean_t       thread_call_enter_delayed(
+                                               thread_call_t           call,
+                                               uint64_t                        deadline);
+
+extern boolean_t       thread_call_enter1_delayed(
+                                               thread_call_t                   call,
+                                               thread_call_param_t             param1,
+                                               uint64_t                                deadline);
+
+extern boolean_t       thread_call_cancel(
+                                               thread_call_t           call);
+
+extern thread_call_t   thread_call_allocate(
+                                                       thread_call_func_t              func,
+                                                       thread_call_param_t             param0);
+
+extern boolean_t               thread_call_free(
+                                                       thread_call_t           call);
 
 __END_DECLS
 
@@ -93,15 +79,12 @@ __END_DECLS
 
 typedef struct call_entry      thread_call_data_t;
 
-void
-thread_call_initialize(void);
+extern void            thread_call_initialize(void);
 
-void
-thread_call_setup(
-       thread_call_t                   call,
-       thread_call_func_t              func,
-       thread_call_param_t             param0
-);
+extern void            thread_call_setup(
+                                       thread_call_t                   call,
+                                       thread_call_func_t              func,
+                                       thread_call_param_t             param0);
 
 #endif /* MACH_KERNEL_PRIVATE */
 
@@ -113,32 +96,24 @@ __BEGIN_DECLS
  * Obsolete interfaces.
  */
 
-boolean_t
-thread_call_is_delayed(
-       thread_call_t           call,
-       uint64_t                        *deadline
-);
-
-void
-thread_call_func(
-       thread_call_func_t              func,
-       thread_call_param_t             param,
-       boolean_t                               unique_call
-);
-
-void
-thread_call_func_delayed(
-       thread_call_func_t              func,
-       thread_call_param_t             param,
-       uint64_t                                deadline
-);
-
-boolean_t
-thread_call_func_cancel(
-       thread_call_func_t              func,
-       thread_call_param_t             param,
-       boolean_t                               cancel_all
-);
+extern boolean_t       thread_call_is_delayed(
+                                               thread_call_t           call,
+                                               uint64_t                        *deadline);
+
+extern void            thread_call_func(
+                                       thread_call_func_t              func,
+                                       thread_call_param_t             param,
+                                       boolean_t                               unique_call);
+
+extern void            thread_call_func_delayed(
+                                       thread_call_func_t              func,
+                                       thread_call_param_t             param,
+                                       uint64_t                                deadline);
+
+extern boolean_t       thread_call_func_cancel(
+                                               thread_call_func_t              func,
+                                               thread_call_param_t             param,
+                                               boolean_t                               cancel_all);
 
 #ifndef        MACH_KERNEL_PRIVATE
 
index 941061c3d30fcbac7a1fb855b2bbd12e2a340646..e091f67075f37d0d1e530171f573b56707ab3e0d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1993-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 1993-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -35,6 +35,7 @@
 #include <kern/processor.h>
 #include <kern/etimer.h>
 #include <kern/timer_call.h>
+#include <kern/timer_queue.h>
 #include <kern/call_entry.h>
 
 #include <sys/kdebug.h>
 
 decl_simple_lock_data(static,timer_call_lock)
 
-static void
-timer_call_interrupt(
-       uint64_t                        timestamp);
-
 #define qe(x)          ((queue_entry_t)(x))
 #define TC(x)          ((timer_call_t)(x))
 
 void
 timer_call_initialize(void)
 {
-       spl_t                           s;
-
        simple_lock_init(&timer_call_lock, 0);
-
-       s = splclock();
-       simple_lock(&timer_call_lock);
-
-       clock_set_timer_func((clock_timer_func_t)timer_call_interrupt);
-
-       simple_unlock(&timer_call_lock);
-       splx(s);
 }
 
 void
@@ -77,224 +64,205 @@ timer_call_setup(
        call_entry_setup(call, func, param0);
 }
 
-static __inline__
-void
-_delayed_call_enqueue(
-       queue_t                                 queue,
-       timer_call_t                    call)
+__inline__ queue_t
+call_entry_enqueue_deadline(
+       call_entry_t            entry,
+       queue_t                         queue,
+       uint64_t                        deadline)
 {
+       queue_t                 old_queue = entry->queue;
        timer_call_t    current;
 
-       current = TC(queue_first(queue));
+       if (old_queue != queue || entry->deadline < deadline) {
+               if (old_queue != queue)
+                       current = TC(queue_first(queue));
+               else
+                       current = TC(queue_next(qe(entry)));
 
-       while (TRUE) {
-               if (    queue_end(queue, qe(current))                   ||
-                               call->deadline < current->deadline              ) {
-                       current = TC(queue_prev(qe(current)));
-                       break;
+               if (old_queue != NULL)
+                       (void)remque(qe(entry));
+
+               while (TRUE) {
+                       if (    queue_end(queue, qe(current))           ||
+                                       deadline < current->deadline            ) {
+                               current = TC(queue_prev(qe(current)));
+                               break;
+                       }
+
+                       current = TC(queue_next(qe(current)));
                }
 
-               current = TC(queue_next(qe(current)));
+               insque(qe(entry), qe(current));
        }
+       else
+       if (deadline < entry->deadline) {
+               current = TC(queue_prev(qe(entry)));
 
-       insque(qe(call), qe(current));
+               (void)remque(qe(entry));
 
-       call->state = DELAYED;
-}
+               while (TRUE) {
+                       if (    queue_end(queue, qe(current))           ||
+                                       current->deadline <= deadline           ) {
+                               break;
+                       }
 
-static __inline__
-void
-_delayed_call_dequeue(
-       timer_call_t                    call)
-{
-       (void)remque(qe(call));
+                       current = TC(queue_prev(qe(current)));
+               }
 
-       call->state = IDLE;
-}
+               insque(qe(entry), qe(current));
+       }
 
-static __inline__
-void
-_set_delayed_call_timer(
-       timer_call_t                    call)
-{
-       etimer_set_deadline(call->deadline);
+       entry->queue = queue;
+       entry->deadline = deadline;
+
+       return (old_queue);
 }
 
-boolean_t
-timer_call_enter(
-       timer_call_t                    call,
-       uint64_t                                deadline)
+__inline__ queue_t
+call_entry_enqueue_tail(
+       call_entry_t            entry,
+       queue_t                         queue)
 {
-       boolean_t               result = TRUE;
-       queue_t                 queue;
-       spl_t                   s;
+       queue_t                 old_queue = entry->queue;
 
-       s = splclock();
-       simple_lock(&timer_call_lock);
+       if (old_queue != NULL)
+               (void)remque(qe(entry));
 
-       if (call->state == DELAYED)
-               _delayed_call_dequeue(call);
-       else
-               result = FALSE;
+       enqueue_tail(queue, qe(entry));
 
-       call->param1    = NULL;
-       call->deadline  = deadline;
+       entry->queue = queue;
 
-       queue = &PROCESSOR_DATA(current_processor(), timer_call_queue);
+       return (old_queue);
+}
 
-       _delayed_call_enqueue(queue, call);
+__inline__ queue_t
+call_entry_dequeue(
+       call_entry_t            entry)
+{
+       queue_t                 old_queue = entry->queue;
 
-       if (queue_first(queue) == qe(call))
-               _set_delayed_call_timer(call);
+       if (old_queue != NULL)
+               (void)remque(qe(entry));
 
-       simple_unlock(&timer_call_lock);
-       splx(s);
+       entry->queue = NULL;
 
-       return (result);
+       return (old_queue);
 }
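
The old-queue return value is what lets the public wrappers below recover the historical boolean results without a per-entry state field; a sketch of the pattern (example_rearm is a hypothetical helper mirroring timer_call_enter):

    static boolean_t
    example_rearm(call_entry_t entry, queue_t queue, uint64_t deadline)
    {
        queue_t old_queue = call_entry_enqueue_deadline(entry, queue, deadline);

        return (old_queue != NULL);    /* TRUE: entry was already scheduled */
    }
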
 
 boolean_t
-timer_call_enter1(
-       timer_call_t                    call,
-       timer_call_param_t              param1,
-       uint64_t                                deadline)
+timer_call_enter(
+       timer_call_t            call,
+       uint64_t                        deadline)
 {
-       boolean_t               result = TRUE;
-       queue_t                 queue;
+       queue_t                 queue, old_queue;
        spl_t                   s;
 
        s = splclock();
        simple_lock(&timer_call_lock);
 
-       if (call->state == DELAYED)
-               _delayed_call_dequeue(call);
-       else
-               result = FALSE;
-
-       call->param1    = param1;
-       call->deadline  = deadline;
-
-       queue = &PROCESSOR_DATA(current_processor(), timer_call_queue);
+       queue = timer_queue_assign(deadline);
 
-       _delayed_call_enqueue(queue, call);
+       old_queue = call_entry_enqueue_deadline(call, queue, deadline);
 
-       if (queue_first(queue) == qe(call))
-               _set_delayed_call_timer(call);
+       call->param1 = NULL;
 
        simple_unlock(&timer_call_lock);
        splx(s);
 
-       return (result);
+       return (old_queue != NULL);
 }
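
Client usage is unchanged by the rework; a hedged sketch (my_timer_func and the interval are placeholders):

    static timer_call_data_t my_timer;
    uint64_t deadline;

    timer_call_setup(&my_timer, my_timer_func, NULL);
    clock_interval_to_deadline(10, NSEC_PER_MSEC, &deadline);
    (void) timer_call_enter(&my_timer, deadline);    /* TRUE if it was already pending */
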
 
 boolean_t
-timer_call_cancel(
-       timer_call_t                    call)
+timer_call_enter1(
+       timer_call_t            call,
+       timer_call_param_t      param1,
+       uint64_t                        deadline)
 {
-       boolean_t               result = TRUE;
+       queue_t                 queue, old_queue;
        spl_t                   s;
 
        s = splclock();
        simple_lock(&timer_call_lock);
 
-       if (call->state == DELAYED) {
-               queue_t                 queue = &PROCESSOR_DATA(current_processor(), timer_call_queue);
+       queue = timer_queue_assign(deadline);
 
-               if (queue_first(queue) == qe(call)) {
-                       _delayed_call_dequeue(call);
+       old_queue = call_entry_enqueue_deadline(call, queue, deadline);
 
-                       if (!queue_empty(queue))
-                               _set_delayed_call_timer((timer_call_t)queue_first(queue));
-               }
-               else
-                       _delayed_call_dequeue(call);
-       }
-       else
-               result = FALSE;
+       call->param1 = param1;
 
        simple_unlock(&timer_call_lock);
        splx(s);
 
-       return (result);
+       return (old_queue != NULL);
 }
 
 boolean_t
-timer_call_is_delayed(
-       timer_call_t                    call,
-       uint64_t                                *deadline)
+timer_call_cancel(
+       timer_call_t            call)
 {
-       boolean_t               result = FALSE;
+       queue_t                 old_queue;
        spl_t                   s;
 
        s = splclock();
        simple_lock(&timer_call_lock);
 
-       if (call->state == DELAYED) {
-               if (deadline != NULL)
-                       *deadline = call->deadline;
-               result = TRUE;
+       old_queue = call_entry_dequeue(call);
+
+       if (old_queue != NULL) {
+               if (!queue_empty(old_queue))
+                       timer_queue_cancel(old_queue, call->deadline, TC(queue_first(old_queue))->deadline);
+               else
+                       timer_queue_cancel(old_queue, call->deadline, UINT64_MAX);
        }
 
        simple_unlock(&timer_call_lock);
        splx(s);
 
-       return (result);
+       return (old_queue != NULL);
 }
 
-/*
- * Called at splclock.
- */
-
 void
-timer_call_shutdown(
-       processor_t                     processor)
+timer_queue_shutdown(
+       queue_t                 queue)
 {
-       timer_call_t            call;
-       queue_t                         queue, myqueue;
-
-       assert(processor != current_processor());
-
-       queue = &PROCESSOR_DATA(processor, timer_call_queue);
-       myqueue = &PROCESSOR_DATA(current_processor(), timer_call_queue);
+       timer_call_t    call;
+       queue_t                 new_queue;
+       spl_t                   s;
 
+       s = splclock();
        simple_lock(&timer_call_lock);
 
        call = TC(queue_first(queue));
 
        while (!queue_end(queue, qe(call))) {
-               _delayed_call_dequeue(call);
+               new_queue = timer_queue_assign(call->deadline);
 
-               _delayed_call_enqueue(myqueue, call);
+               call_entry_enqueue_deadline(call, new_queue, call->deadline);
 
                call = TC(queue_first(queue));
        }
 
-       call = TC(queue_first(myqueue));
-
-       if (!queue_end(myqueue, qe(call)))
-               _set_delayed_call_timer(call);
-
        simple_unlock(&timer_call_lock);
+       splx(s);
 }
 
-static void
-timer_call_interrupt(uint64_t timestamp)
+uint64_t
+timer_queue_expire(
+       queue_t                 queue,
+       uint64_t                deadline)
 {
-       timer_call_t            call;
-       queue_t                         queue;
+       timer_call_t    call;
 
        simple_lock(&timer_call_lock);
 
-       queue = &PROCESSOR_DATA(current_processor(), timer_call_queue);
-
        call = TC(queue_first(queue));
 
        while (!queue_end(queue, qe(call))) {
-               if (call->deadline <= timestamp) {
+               if (call->deadline <= deadline) {
                        timer_call_func_t               func;
                        timer_call_param_t              param0, param1;
 
-                       _delayed_call_dequeue(call);
+                       call_entry_dequeue(call);
 
                        func = call->func;
                        param0 = call->param0;
@@ -331,14 +299,19 @@ timer_call_interrupt(uint64_t timestamp)
                                              (unsigned int)param1, 0, 0);
 
                        simple_lock(&timer_call_lock);
-               } else
+               }
+               else
                        break;
 
                call = TC(queue_first(queue));
        }
 
        if (!queue_end(queue, qe(call)))
-               _set_delayed_call_timer(call);
+               deadline = call->deadline;
+       else
+               deadline = UINT64_MAX;
 
        simple_unlock(&timer_call_lock);
+
+       return (deadline);
 }
index d3beccfc7c2510dd2c6554d88836461819257b9d..061e3d96ca2e9682889523b8ecead8c60763059a 100644 (file)
@@ -1,6 +1,5 @@
 /*
- * Copyright (c) 1993-1995, 1999-2000 Apple Computer, Inc.
- * All rights reserved.
+ * Copyright (c) 1993-1995, 1999-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
  */
 /*
  * Declarations for timer interrupt callouts.
- *
- * HISTORY
- *
- * 20 December 2000 (debo)
- *     Created.
  */
 
 #ifndef _KERN_TIMER_CALL_H_
@@ -48,42 +42,28 @@ typedef void                                (*timer_call_func_t)(
                                                                        timer_call_param_t              param0,
                                                                        timer_call_param_t              param1);
 
-boolean_t
-timer_call_enter(
-       timer_call_t                    call,
-       uint64_t                                deadline);
+extern boolean_t       timer_call_enter(
+                                               timer_call_t    call,
+                                               uint64_t                deadline);
 
-boolean_t
-timer_call_enter1(
-       timer_call_t                    call,
-       timer_call_param_t              param1,
-       uint64_t                                deadline);
+extern boolean_t       timer_call_enter1(
+                                               timer_call_t            call,
+                                               timer_call_param_t      param1,
+                                               uint64_t                        deadline);
 
-boolean_t
-timer_call_cancel(
-       timer_call_t                    call);
-
-boolean_t
-timer_call_is_delayed(
-       timer_call_t                    call,
-       uint64_t                                *deadline);
+extern boolean_t       timer_call_cancel(
+                                               timer_call_t    call);
 
 #include <kern/call_entry.h>
 
 typedef struct call_entry      timer_call_data_t;
 
-void
-timer_call_initialize(void);
-
-void
-timer_call_setup(
-       timer_call_t                    call,
-       timer_call_func_t               func,
-       timer_call_param_t              param0);
+extern void            timer_call_initialize(void);
 
-void
-timer_call_shutdown(
-       processor_t                     processor);
+extern void            timer_call_setup(
+                                       timer_call_t            call,
+                                       timer_call_func_t       func,
+                                       timer_call_param_t      param0);
 
 #endif /* MACH_KERNEL_PRIVATE */
 
diff --git a/osfmk/kern/timer_queue.h b/osfmk/kern/timer_queue.h
new file mode 100644 (file)
index 0000000..050b09a
--- /dev/null
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2008 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ * Timer queue support routines.
+ */
+
+#ifndef _KERN_TIMER_QUEUE_H_
+#define _KERN_TIMER_QUEUE_H_
+
+#include <mach/mach_types.h>
+
+#ifdef MACH_KERNEL_PRIVATE
+
+#include <kern/queue.h>
+
+/*
+ *     Invoked by kernel, implemented by platform.
+ */
+
+/* Request an expiration deadline, returns queue association */
+extern queue_t         timer_queue_assign(
+                                               uint64_t                deadline);
+
+/* Cancel an associated expiration deadline and specify new deadline */
+extern void                    timer_queue_cancel(
+                                               queue_t                 queue,
+                                               uint64_t                deadline,
+                                               uint64_t                new_deadline);
+
+/*
+ *     Invoked by platform, implemented by kernel.
+ */
+
+/* Process deadline expiration for queue, returns new deadline */
+extern uint64_t                timer_queue_expire(
+                                               queue_t                 queue,
+                                               uint64_t                deadline);
+
+/* Shutdown a timer queue and reassign existing activities */
+extern void                    timer_queue_shutdown(
+                                               queue_t                 queue);
+
+#endif /* MACH_KERNEL_PRIVATE */
+
+#endif /* _KERN_TIMER_QUEUE_H_ */
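
A sketch of how the two halves of this interface meet in a platform's timer interrupt path — hypothetical etimer-style glue, with placeholder names for the per-processor queue and the hardware reprogramming hook:

    static void
    example_timer_expire(queue_t my_queue)        /* hypothetical */
    {
        uint64_t abstime = mach_absolute_time();
        uint64_t next = timer_queue_expire(my_queue, abstime);

        if (next != UINT64_MAX)
            example_program_hw_timer(next);       /* placeholder */
    }
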
index e189edb2c0caf7eae2a5a53c096506101f30a5bd..f8ac4c12fd8bbf2a39a123dc25174a365390c688 100644 (file)
 #include <ppc/mappings.h>
 #endif
 
-int check_freed_element = 0;
 
-#if    MACH_ASSERT
-/* Detect use of zone elt after freeing it by two methods:
+/* 
+ * Zone Corruption Debugging
+ *
+ * We provide three methods to detect use of a zone element after it's been freed.  These
+ * checks are enabled by specifying "-zc" and/or "-zp" in the boot-args:
+ *
  * (1) Range-check the free-list "next" ptr for sanity.
  * (2) Store the ptr in two different words, and compare them against
- *     each other when re-using the zone elt, to detect modifications;
+ *     each other when re-using the zone element, to detect modifications.
+ * (3) Poison the freed memory by overwriting it with 0xdeadbeef.
+ *
+ * The first two checks are fairly lightweight and are enabled by specifying "-zc" 
+ * in the boot-args.  If you want more aggressive checking for use-after-free bugs
+ * and you don't mind the additional overhead, then turn on poisoning by adding
+ * "-zp" to the boot-args in addition to "-zc".  If you specify -zp without -zc,
+ * it still poisons the memory when it's freed, but doesn't check if the memory
+ * has been altered later when it's reallocated.
  */
 
-#if defined(__alpha)
-
-#define is_kernel_data_addr(a)                                         \
-  (!(a) || (IS_SYS_VA(a) && !((a) & (sizeof(long)-1))))
-
-#else /* !defined(__alpha) */
+boolean_t check_freed_element = FALSE;         /* enabled by -zc in boot-args */
+boolean_t zfree_clear = FALSE;                 /* enabled by -zp in boot-args */
 
-#define is_kernel_data_addr(a)                                         \
-  (!(a) || ((a) >= vm_min_kernel_address && !((a) & 0x3)))
-
-#endif /* defined(__alpha) */
-
-/* Should we set all words of the zone element to an illegal address
- * when it is freed, to help catch usage after freeing?  The down-side
- * is that this obscures the identity of the freed element.
- */
-boolean_t zfree_clear = FALSE;
+#define is_kernel_data_addr(a) (!(a) || ((a) >= vm_min_kernel_address && !((a) & 0x3)))
 
 #define ADD_TO_ZONE(zone, element)                                     \
 MACRO_BEGIN                                                            \
-               if (zfree_clear)                                        \
-               {   unsigned int i;                                     \
-                   for (i=1;                                           \
-                        i < zone->elem_size/sizeof(vm_offset_t) - 1;   \
-                        i++)                                           \
-                   ((vm_offset_t *)(element))[i] = 0xdeadbeef;         \
-               }                                                       \
-               ((vm_offset_t *)(element))[0] = (zone)->free_elements;  \
-               (zone)->free_elements = (vm_offset_t) (element);        \
-               (zone)->count--;                                        \
-MACRO_END
-
-#define REMOVE_FROM_ZONE(zone, ret, type)                              \
-MACRO_BEGIN                                                            \
-       (ret) = (type) (zone)->free_elements;                           \
-       if ((ret) != (type) 0) {                                        \
-           if (!is_kernel_data_addr(((vm_offset_t *)(ret))[0])) {      \
-               panic("A freed zone element has been modified.\n");     \
-           }                                                           \
-           (zone)->count++;                                            \
-           (zone)->free_elements = *((vm_offset_t *)(ret));            \
+       if (zfree_clear)                                                \
+       {   unsigned int i;                                             \
+           for (i=0;                                                   \
+                i < zone->elem_size/sizeof(uint32_t);                  \
+                i++)                                                   \
+           ((uint32_t *)(element))[i] = 0xdeadbeef;                    \
        }                                                               \
-MACRO_END
-#else  /* MACH_ASSERT */
-
-#define ADD_TO_ZONE(zone, element)                                     \
-MACRO_BEGIN                                                            \
-               *((vm_offset_t *)(element)) = (zone)->free_elements;    \
-               if (check_freed_element) {  \
-                       if ((zone)->elem_size >= (2 * sizeof(vm_offset_t)))     \
-                               ((vm_offset_t *)(element))[((zone)->elem_size/sizeof(vm_offset_t))-1] = \
-                                       (zone)->free_elements;                          \
-               }       \
-               (zone)->free_elements = (vm_offset_t) (element);        \
-               (zone)->count--;                                        \
-MACRO_END
-
-#define REMOVE_FROM_ZONE(zone, ret, type)                              \
-MACRO_BEGIN                                                            \
-       (ret) = (type) (zone)->free_elements;                           \
-       if ((ret) != (type) 0) {                                        \
-               if (check_freed_element) {              \
-               if ((zone)->elem_size >= (2 * sizeof(vm_offset_t)) &&           \
-                   ((vm_offset_t *)(ret))[((zone)->elem_size/sizeof(vm_offset_t))-1] != \
-                   ((vm_offset_t *)(ret))[0])                          \
-                       panic("a freed zone element has been modified");\
-               }               \
-               (zone)->count++;                                        \
-               (zone)->free_elements = *((vm_offset_t *)(ret));        \
+       *((vm_offset_t *)(element)) = (zone)->free_elements;            \
+       if (check_freed_element) {                                      \
+               if ((zone)->elem_size >= (2 * sizeof(vm_offset_t)))     \
+                       ((vm_offset_t *)(element))[((zone)->elem_size/sizeof(vm_offset_t))-1] = \
+                               (zone)->free_elements;                  \
        }                                                               \
+       (zone)->free_elements = (vm_offset_t) (element);                \
+       (zone)->count--;                                                \
 MACRO_END
 
-#endif /* MACH_ASSERT */
+#define REMOVE_FROM_ZONE(zone, ret, type)                                      \
+MACRO_BEGIN                                                                    \
+       (ret) = (type) (zone)->free_elements;                                   \
+       if ((ret) != (type) 0) {                                                \
+               if (check_freed_element) {                                      \
+                       if (!is_kernel_data_addr(((vm_offset_t *)(ret))[0]) ||  \
+                           ((zone)->elem_size >= (2 * sizeof(vm_offset_t)) &&  \
+                           ((vm_offset_t *)(ret))[((zone)->elem_size/sizeof(vm_offset_t))-1] != \
+                           ((vm_offset_t *)(ret))[0]))                         \
+                               panic("a freed zone element has been modified");\
+                       if (zfree_clear) {                                      \
+                               unsigned int ii;                                \
+                               for (ii = sizeof(vm_offset_t) / sizeof(uint32_t); \
+                                        ii < zone->elem_size/sizeof(uint32_t) - sizeof(vm_offset_t) / sizeof(uint32_t); \
+                                        ii++)                                  \
+                                       if (((uint32_t *)(ret))[ii] != (uint32_t)0xdeadbeef) \
+                                               panic("a freed zone element has been modified");\
+                       }                                                       \
+               }                                                               \
+               (zone)->count++;                                                \
+               (zone)->free_elements = *((vm_offset_t *)(ret));                \
+       }                                                                       \
+MACRO_END
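
Unrolled from the macros, roughly what the "-zc" checks amount to when an element is reallocated — a sketch under the same layout assumptions (free-list pointer in the element's first word, backup copy in its last word):

    static void
    example_check_element(vm_offset_t *elem, vm_size_t size)    /* sketch only */
    {
        unsigned int last = size / sizeof(vm_offset_t) - 1;

        if (!is_kernel_data_addr(elem[0]))    /* check (1): sane free-list pointer */
            panic("a freed zone element has been modified");
        if (size >= 2 * sizeof(vm_offset_t) && elem[last] != elem[0])
            panic("a freed zone element has been modified");    /* check (2) */
    }
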
 
 #if    ZONE_DEBUG
 #define zone_debug_enabled(z) z->active_zones.next
@@ -326,10 +311,146 @@ unsigned int             num_zones;
 
 boolean_t zone_gc_allowed = TRUE;
 boolean_t zone_gc_forced = FALSE;
+boolean_t panic_include_zprint = FALSE;
 unsigned zone_gc_last_tick = 0;
 unsigned zone_gc_max_rate = 0;         /* in ticks */
 
+/*
+ * Zone leak debugging code
+ *
+ * When enabled, this code keeps a log to track allocations to a particular zone that have not
+ * yet been freed.  Examining this log will reveal the source of a zone leak.  The log is allocated
+ * only when logging is enabled, so there is no effect on the system when it's turned off.  Logging is
+ * off by default.
+ *
+ * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone>
+ * is the name of the zone you wish to log.  
+ *
+ * This code only tracks one zone, so you need to identify which one is leaking first.
+ * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
+ * garbage collector.  Note that the zone name printed in the panic message is not necessarily the one
+ * containing the leak.  So do a zprint from gdb and locate the zone with the bloated size.  This
+ * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test.  The
+ * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
+ * See the help in the kgmacros for usage info.
+ *
+ *
+ * Zone corruption logging
+ *
+ * Logging can also be used to help identify the source of a zone corruption.  First, identify the zone
+ * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args.  When -zc is used in conjunction
+ * with zlog, it changes the logging style to track both allocations and frees to the zone.  So when the
+ * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
+ * and freed any particular element in the zone.  Use the findelem kgmacro with the address of the element that's been
+ * corrupted to examine its history.  This should lead to the source of the corruption.
+ */
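+ *
+ * (Illustratively, a leak hunt might boot with zlog=kalloc.512 zrecs=8000, and a
+ * corruption hunt with "-zc zlog=kalloc.512"; the zone name here is only an example.)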
+
+static int log_records;        /* size of the log, expressed in number of records */
+
+#define MAX_ZONE_NAME  32      /* max length of a zone name we can take from the boot-args */
+
+static char zone_name_to_log[MAX_ZONE_NAME] = "";      /* the zone name we're logging, if any */
+
+/*
+ * The number of records in the log is configurable via the zrecs parameter in boot-args.  Set this to 
+ * the number of records you want in the log.  For example, "zrecs=1000" sets it to 1000 records.  Note
+ * that the larger the size of the log, the slower the system will run due to linear searching in the log,
+ * but one doesn't generally care about performance when tracking down a leak.  The log is capped at 8000
+ * records since going much larger than this tends to make the system unresponsive and unbootable on small
+ * memory configurations.  The default value is 4000 records.
+ *
+ * MAX_DEPTH configures how deep a stack trace is taken on each zalloc in the zone of interest.  15
+ * levels is usually enough to get past all the layers of code in kalloc and IOKit and see who the actual
+ * caller is up above these lower levels.
+ */
+
+#define ZRECORDS_MAX           8000            /* Max records allowed in the log */
+#define ZRECORDS_DEFAULT       4000            /* default records in log if zrecs is not specified in boot-args */
+#define MAX_DEPTH              15              /* number of levels of the stack trace to record */
 
+/*
+ * Each record in the log contains a pointer to the zone element it refers to, a "time" number that allows
+ * the records to be ordered chronologically, and a small array to hold the pc's from the stack trace.  A
+ * record is added to the log each time a zalloc() is done in the zone_of_interest.  For leak debugging,
+ * the record is cleared when a zfree() is done.  For corruption debugging, the log tracks both allocs and frees.
+ * If the log fills, old records are replaced as if it were a circular buffer.
+ */
+
+struct zrecord {
+        void           *z_element;             /* the element that was zalloc'ed or zfree'ed */
+        uint32_t       z_opcode:1,             /* whether it was a zalloc or zfree */
+                       z_time:31;              /* time index when operation was done */
+        void           *z_pc[MAX_DEPTH];       /* stack trace of caller */
+};
+
+/*
+ * Opcodes for the z_opcode field:
+ */
+
+#define ZOP_ALLOC      1
+#define ZOP_FREE       0
+
+/*
+ * The allocation log and all the related variables are protected by the zone lock for the zone_of_interest
+ */
+
+static struct zrecord *zrecords;               /* the log itself, dynamically allocated when logging is enabled  */
+static int zcurrent  = 0;                      /* index of the next slot in the log to use */
+static int zrecorded = 0;                      /* number of allocations recorded in the log */
+static unsigned int ztime = 0;                 /* a timestamp of sorts */
+static zone_t  zone_of_interest = NULL;                /* the zone being watched; corresponds to zone_name_to_log */
+
+/*
+ * Decide if we want to log this zone by doing a string compare between a zone name and the name
+ * of the zone to log. Return true if the strings are equal, false otherwise.  Because it's not
+ * possible to include spaces in strings passed in via the boot-args, a period in the logname will
+ * match a space in the zone name.
+ */
+
+static int
+log_this_zone(const char *zonename, const char *logname) 
+{
+       int len;
+       const char *zc = zonename;
+       const char *lc = logname;
+
+       /*
+        * Compare the strings.  We bound the compare by MAX_ZONE_NAME.
+        */
+
+       for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
+
+               /*
+                * If the current characters don't match, check for a space in
+                * the zone name and a corresponding period in the log name.
+                * If that's not there, then the strings don't match.
+                */
+
+               if (*zc != *lc && !(*zc == ' ' && *lc == '.')) 
+                       break;
+
+               /*
+                * The strings are equal so far.  If we're at the end, then it's a match.
+                */
+
+               if (*zc == '\0')
+                       return TRUE;
+       }
+
+       return FALSE;
+}
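
To make the period-for-space rule concrete, a few illustrative calls (the zone name is only an example):

    log_this_zone("vm objects", "vm.objects");   /* TRUE:  '.' matches the space      */
    log_this_zone("vm objects", "vm objects");   /* TRUE:  exact match                */
    log_this_zone("vm objects", "vm_objects");   /* FALSE: '_' does not match a space */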
+
+
+/*
+ * Test if we want to log this zalloc/zfree event.  We log if this is the zone we're interested in and
+ * the buffer for the records has been allocated.
+ */
+
+#define DO_LOGGING(z)          (zrecords && (z) == zone_of_interest)
+
+extern boolean_t zlog_ready;
+
+       
 /*
  *     zinit initializes a new zone.  The zone data structures themselves
  *     are stored in a zone, which is initially a static structure that
@@ -435,6 +556,40 @@ use_this_allocation:
        num_zones++;
        simple_unlock(&all_zones_lock);
 
+       /*
+        * Check if we should be logging this zone.  If so, remember the zone pointer.
+        */
+
+        if (log_this_zone(z->zone_name, zone_name_to_log)) {
+               zone_of_interest = z;
+       }
+
+       /*
+        * If we want to log a zone, see if we need to allocate buffer space for the log.  Some vm related zones are
+        * zinit'ed before we can do a kmem_alloc, so we have to defer allocation in that case.  zlog_ready is set to
+        * TRUE once enough of the VM system is up and running to allow a kmem_alloc to work.  If we want to log one
+        * of the VM related zones that's set up early on, we will skip allocation of the log until zinit is called again
+        * later on some other zone.  So note we may be allocating a buffer to log a zone other than the one being initialized
+        * right now.
+        */
+
+       if (zone_of_interest != NULL && zrecords == NULL && zlog_ready) {
+               if (kmem_alloc(kernel_map, (vm_offset_t *)&zrecords, log_records * sizeof(struct zrecord)) == KERN_SUCCESS) {
+
+                       /*
+                        * We got the memory for the log.  Zero it out since the code needs this to identify unused records.
+                        * At this point, everything is set up and we're ready to start logging this zone.
+                        */
+       
+                       bzero((void *)zrecords, log_records * sizeof(struct zrecord));
+                       printf("zone: logging started for zone %s (%p)\n", zone_of_interest->zone_name, zone_of_interest);
+
+               } else {
+                       printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
+                       zone_of_interest = NULL;
+               }
+       }
+
        return(z);
 }
 
@@ -613,9 +768,40 @@ zone_bootstrap(void)
        vm_offset_t zone_zone_space;
        char temp_buf[16];
 
-       /* see if we want freed zone element checking */
+       /* see if we want freed zone element checking and/or poisoning */
        if (PE_parse_boot_argn("-zc", temp_buf, sizeof (temp_buf))) {
-               check_freed_element = 1;
+               check_freed_element = TRUE;
+       }
+
+       if (PE_parse_boot_argn("-zp", temp_buf, sizeof (temp_buf))) {
+               zfree_clear = TRUE;
+       }
+
+       /*
+        * Check for and set up zone leak detection if requested via boot-args.  We recognize two
+        * boot-args:
+        *
+        *      zlog=<zone_to_log>
+        *      zrecs=<num_records_in_log>
+        *
+        * The zlog arg is used to specify the zone name that should be logged, and zrecs is used to
+        * control the size of the log.  If zrecs is not specified, a default value is used.
+        */
+
+       if (PE_parse_boot_argn("zlog", zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) {
+               if (PE_parse_boot_argn("zrecs", &log_records, sizeof(log_records)) == TRUE) {
+
+                       /*
+                        * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
+                        * This prevents accidentally hogging too much kernel memory and making the system
+                        * unusable.
+                        */
+
+                       log_records = MIN(ZRECORDS_MAX, log_records);
+
+               } else {
+                       log_records = ZRECORDS_DEFAULT;
+               }
        }
 
        simple_lock_init(&all_zones_lock, 0);
@@ -681,9 +867,19 @@ zalloc_canblock(
 {
        vm_offset_t     addr;
        kern_return_t retval;
+       void            *bt[MAX_DEPTH];         /* only used if zone logging is enabled */
+       int             numsaved = 0;
+       int             i;
 
        assert(zone != ZONE_NULL);
 
+       /*
+        * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
+        */
+
+       if (DO_LOGGING(zone))
+               numsaved = OSBacktrace(&bt[0], MAX_DEPTH);
+
        lock_zone(zone);
 
        REMOVE_FROM_ZONE(zone, addr, vm_offset_t);
@@ -765,8 +961,10 @@ zalloc_canblock(
                                                        zone_gc();
                                                        printf("zalloc did gc\n");
                                                }
-                                               if (retry == 3)
+                                               if (retry == 3) {
+                                                       panic_include_zprint = TRUE;
                                                        panic("zalloc: \"%s\" (%d elements) retry fail %d", zone->zone_name, zone->count, retval);
+                                               }
                                        } else {
                                                break;
                                        }
@@ -826,6 +1024,76 @@ zalloc_canblock(
                        REMOVE_FROM_ZONE(zone, addr, vm_offset_t);
        }
 
+       /*
+        * See if we should be logging allocations in this zone.  Logging is rarely done except when a leak is
+        * suspected, so this code rarely executes.  It must run while we still hold the zone lock, which
+        * protects the various log related data structures.
+        */
+
+       if (DO_LOGGING(zone) && addr) {
+
+               /*
+                * Look for a place to record this new allocation.  We implement two different logging strategies
+                * depending on whether we're looking for the source of a zone leak or a zone corruption.  When looking
+                * for a leak, we want to log as many allocations as possible in order to clearly identify the leaker
+                * among all the records.  So we look for an unused slot in the log and fill that in before overwriting
+                * an old entry.  When looking for a corruption however, it's better to have a chronological log of all
+                * the allocations and frees done in the zone so that the history of operations for a specific zone 
+                * element can be inspected.  So in this case, we treat the log as a circular buffer and overwrite the
+                * oldest entry whenever a new one needs to be added.
+                *
+                * The check_freed_element flag tells us what style of logging to do.  It's set if we're supposed to be
+                * doing corruption style logging (indicated via -zc in the boot-args).
+                */
+
+               if (!check_freed_element && zrecords[zcurrent].z_element && zrecorded < log_records) {
+
+                       /*
+                        * If we get here, we're doing leak style logging and there are still some unused entries in
+                        * the log (since zrecorded is smaller than the size of the log).  Look for an unused slot
+                        * starting at zcurrent and wrap-around if we reach the end of the buffer.  If the buffer
+                        * is already full, we just fall through and overwrite the element indexed by zcurrent.
+                        */
+       
+                       for (i = zcurrent; i < log_records; i++) {
+                               if (zrecords[i].z_element == NULL) {
+                                       zcurrent = i;
+                                       goto empty_slot;
+                               }
+                       }
+
+                       for (i = 0; i < zcurrent; i++) {
+                               if (zrecords[i].z_element == NULL) {
+                                       zcurrent = i;
+                                       goto empty_slot;
+                               }
+                       }
+                }
+       
+               /*
+                * Save a record of this allocation
+                */
+       
+empty_slot:
+                 if (zrecords[zcurrent].z_element == NULL)
+                       zrecorded++;
+       
+                 zrecords[zcurrent].z_element = (void *)addr;
+                 zrecords[zcurrent].z_time = ztime++;
+                 zrecords[zcurrent].z_opcode = ZOP_ALLOC;
+                       
+                 for (i = 0; i < numsaved; i++)
+                       zrecords[zcurrent].z_pc[i] = bt[i];
+
+                 for (; i < MAX_DEPTH; i++)
+                       zrecords[zcurrent].z_pc[i] = 0;
+       
+                 zcurrent++;
+       
+                 if (zcurrent >= log_records)
+                         zcurrent = 0;
+       }
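
A worked example of the slot search above: with log_records = 8, zcurrent = 5 and the only free slot at
index 2, the first loop scans slots 5 through 7 without success, the second loop finds slot 2, and the
record lands there with zrecorded bumped.  When the log is already full (zrecorded == log_records) the
search is skipped entirely and the slot at zcurrent is simply overwritten, giving the circular-buffer
behavior described in the comment.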
+
        if ((addr == 0) && !canblock && (zone->async_pending == FALSE) && (zone->exhaustible == FALSE) && (!vm_pool_low())) {
                zone->async_pending = TRUE;
                unlock_zone(zone);
@@ -922,6 +1190,17 @@ zfree(
        void            *addr)
 {
        vm_offset_t     elem = (vm_offset_t) addr;
+       void            *bt[MAX_DEPTH];                 /* only used if zone logging is enabled via boot-args */
+       int             numsaved = 0;
+
+       assert(zone != ZONE_NULL);
+
+       /*
+        * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
+        */
+
+       if (DO_LOGGING(zone))
+               numsaved = OSBacktrace(&bt[0], MAX_DEPTH);
 
 #if MACH_ASSERT
        /* Basic sanity checks */
@@ -945,6 +1224,61 @@ zfree(
        }
 
        lock_zone(zone);
+
+       /*
+        * See if we're doing logging on this zone.  There are two styles of logging used depending on
+        * whether we're trying to catch a leak or corruption.  See comments above in zalloc for details.
+        */
+
+       if (DO_LOGGING(zone)) {
+               int  i;
+
+               if (check_freed_element) {
+
+                       /*
+                        * We're logging to catch a corruption.  Add a record of this zfree operation
+                        * to the log.
+                        */
+
+                       if (zrecords[zcurrent].z_element == NULL)
+                               zrecorded++;
+
+                       zrecords[zcurrent].z_element = (void *)addr;
+                       zrecords[zcurrent].z_time = ztime++;
+                       zrecords[zcurrent].z_opcode = ZOP_FREE;
+
+                       for (i = 0; i < numsaved; i++)
+                               zrecords[zcurrent].z_pc[i] = bt[i];
+
+                       for (; i < MAX_DEPTH; i++)
+                               zrecords[zcurrent].z_pc[i] = 0;
+
+                       zcurrent++;
+
+                       if (zcurrent >= log_records)
+                               zcurrent = 0;
+
+               } else {
+
+                       /*
+                        * We're logging to catch a leak. Remove any record we might have for this
+                        * element since it's being freed.  Note that we may not find it if the buffer
+                        * overflowed and that's OK.  Since the log is of a limited size, old records
+                        * get overwritten if there are more zallocs than zfrees.
+                        */
+       
+                       for (i = 0; i < log_records; i++) {
+                               if (zrecords[i].z_element == addr) {
+                                       zrecords[i].z_element = NULL;
+                                       zcurrent = i;
+                                       zrecorded--;
+                                       break;
+                               }
+                       }
+               }
+       }
+
+
 #if    ZONE_DEBUG
        if (zone_debug_enabled(zone)) {
                queue_t tmp_elem;
index e2f2f2de72e4e8dd1946b15cc13fe311584d4de4..946bc45e62caf4319f7ac889479029d195d533e3 100644 (file)
@@ -176,6 +176,10 @@ INSTALL_KF_MI_LCL_LIST = \
        mach_interface.h \
        $(filter-out mach_traps.h mach_syscalls.h thread_switch.h, ${DATAFILES})
 
+INSTALL_MI_LCL_LIST = kext_panic_report.h \
+       bootstrap.h \
+       ${DATAFILES}
+
 INSTALL_MI_GEN_LIST =
 
 INSTALL_MI_DIR = mach
diff --git a/osfmk/mach/kext_panic_report.h b/osfmk/mach/kext_panic_report.h
new file mode 100644 (file)
index 0000000..1eb4f38
--- /dev/null
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2009 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef        _KEXT_PANIC_REPORT_H_
+#define        _KEXT_PANIC_REPORT_H_
+
+#include <sys/cdefs.h>
+
+__BEGIN_DECLS
+
+/*******************************************************************************
+* String-compaction tables for panic reports' kext listing.
+*******************************************************************************/
+
+typedef struct subs_entry_t {
+    const char * substring;
+    char         substitute;
+} subs_entry_t;
+
+/* Prefix substitution list. Common prefixes are replaced with a single
+ * nonalphanumeric character at the beginning of the identifier.
+ *
+ * List should be in descending order of # components, and should then
+ * be in descending frequency order.
+ */
+subs_entry_t kext_identifier_prefix_subs[] = {
+    { "com.apple.driver.",   '>' },
+    { "com.apple.iokit.",    '|' },
+    { "com.apple.security.", '$' },
+    { "com.apple.",          '@' },
+
+    { (char *)0,             '\0' }
+};
+
+/* Substring substitution list. Substrings are replaced with a '!' followed
+ * by a single letter mapping to the original string.
+ * 
+ * List should be in descending frequency order, and within
+ * groups containing same prefix, in descending length order.
+ */
+subs_entry_t kext_identifier_substring_subs[] = {
+    { "AppleUSB",   'U' },
+    { "Apple",      'A' },
+    { "Family",     'F' },
+    { "Storage",    'S' },
+    { "Controller", 'C' },
+    { "Bluetooth",  'B' },
+    { "Intel",      'I' },
+    
+    // CHUD kexts, typically not on user installs
+    { "Profile",    'P' },
+    { "Action",     'a' },   // maybe K if we want to stick to all-caps
+
+    { (char *)0,    '\0' }
+};
+
+__END_DECLS
+#endif /* _KEXT_PANIC_REPORT_H_ */
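
This header only ships the tables; the panic-report writer that consumes them lives elsewhere.  A
minimal sketch of what a prefix pass could look like, written as user-space C for clarity (the function
and its use are illustrative, not the actual implementation):

    #include <stdio.h>
    #include <string.h>
    #include "kext_panic_report.h"   /* subs_entry_t and the substitution tables */

    /* Replace a matching identifier prefix with its one-character substitute. */
    static int
    compact_prefix(const char *ident, char *out, size_t outlen)
    {
        const subs_entry_t *e;

        for (e = kext_identifier_prefix_subs; e->substring != NULL; e++) {
            size_t plen = strlen(e->substring);
            if (strncmp(ident, e->substring, plen) == 0)
                return snprintf(out, outlen, "%c%s", e->substitute, ident + plen);
        }
        return snprintf(out, outlen, "%s", ident);    /* no prefix matched */
    }

So "com.apple.driver.AppleUSBHub" compacts to ">AppleUSBHub", and a second pass over
kext_identifier_substring_subs (replacing "AppleUSB" with "!U") would shrink it further to ">!UHub".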
index 4c6f34c052b5ad45e86cfb55814bca25e724a5eb..beaa45af3b63c3b4856fa391224acd486f9fb123 100644 (file)
@@ -189,6 +189,9 @@ extern kern_return_t kmod_send_generic(int type, void *data, int size);
 extern kern_return_t kmod_initialize_cpp(kmod_info_t *info);
 extern kern_return_t kmod_finalize_cpp(kmod_info_t *info);
 
+void record_kext_unload(kmod_t kmod_id);
+void dump_kext_info(int (*printf_func)(const char *fmt, ...));
+
 extern void kmod_dump(vm_offset_t *addr, unsigned int dump_cnt);
 __END_DECLS
 
index e28a2c5371b7893e0f0b102e8f0487e27db3d469..3ec6039b960f4a1ca154d8ebb401ea06bce2a22c 100644 (file)
@@ -347,6 +347,7 @@ __END_DECLS
 #define CPU_SUBTYPE_ARM_V6              ((cpu_subtype_t) 6)
 #define CPU_SUBTYPE_ARM_V5TEJ           ((cpu_subtype_t) 7)
 #define CPU_SUBTYPE_ARM_XSCALE         ((cpu_subtype_t) 8)
+#define CPU_SUBTYPE_ARM_V7             ((cpu_subtype_t) 9)
 
 /*
  *     CPU families (sysctl hw.cpufamily)
@@ -371,6 +372,7 @@ __END_DECLS
 #define CPUFAMILY_ARM_9      0xe73283ae
 #define CPUFAMILY_ARM_11     0x8ff620d8
 #define CPUFAMILY_ARM_XSCALE 0x53b005f5
+#define CPUFAMILY_ARM_13     0x0cc90e64
 
 #define CPUFAMILY_INTEL_YONAH  CPUFAMILY_INTEL_6_14
 #define CPUFAMILY_INTEL_MEROM  CPUFAMILY_INTEL_6_15
index ff3360297389be78df1097f919d980328bbbd522..5e11865401a19cfff4298ecad0c04d42eb2d7a4a 100644 (file)
@@ -308,6 +308,7 @@ typedef struct mach_port_status {
 #define MACH_PORT_QLIMIT_BASIC         ((mach_port_msgcount_t) 5)
 #define MACH_PORT_QLIMIT_SMALL         ((mach_port_msgcount_t) 16)
 #define MACH_PORT_QLIMIT_LARGE         ((mach_port_msgcount_t) 1024)
+#define MACH_PORT_QLIMIT_KERNEL                ((mach_port_msgcount_t) 65536)
 #define MACH_PORT_QLIMIT_MIN           MACH_PORT_QLIMIT_ZERO
 #define MACH_PORT_QLIMIT_DEFAULT       MACH_PORT_QLIMIT_BASIC
 #define MACH_PORT_QLIMIT_MAX           MACH_PORT_QLIMIT_LARGE
index 5ac1d203a6337b3338e2ef0d558142875f8fcc12..df6f7e01d9a6e7817ac6e32d150af0ff15948c49 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -468,7 +468,7 @@ int diagCall(struct savearea *save) {
                        prssr = (processor_t)port->ip_kobject;  /* Extract the processor */
                        is_write_unlock(current_space());               /* All done with the space now, unlock it */
                        
-                       save->save_r3 = (uint64_t)(uint32_t)PerProcTable[prssr->processor_data.slot_num].ppe_vaddr;     /* Pass back ther per proc */
+                       save->save_r3 = (uint64_t)(uint32_t)PerProcTable[prssr->cpu_num].ppe_vaddr;     /* Pass back the per proc */
                        return -1;                                                              /* Return and check asts */
 
 /*
index 3a77bccb6384ade11a6d321bc492e317f30a2381..aa6727c9019ea1ce6677b228fdff56eead1b3671 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -36,6 +36,7 @@
 #include <kern/misc_protos.h>
 #include <kern/thread.h>
 #include <kern/sched_prim.h>
+#include <kern/timer_queue.h>
 #include <kern/processor.h>
 #include <kern/pms.h>
 
@@ -241,6 +242,9 @@ cpu_per_proc_alloc(
        proc_info->debstackptr = (vm_offset_t)debugger_stack + KERNEL_STACK_SIZE - FM_SIZE;
        proc_info->debstack_top_ss = proc_info->debstackptr;
 
+       queue_init(&proc_info->rtclock_timer.queue);
+       proc_info->rtclock_timer.deadline = EndOfAllTime;
+
        return proc_info;
 
 }
@@ -427,6 +431,11 @@ cpu_sleep(
 
        proc_info->running = FALSE;
 
+       if (proc_info->cpu_number != master_cpu) {
+               timer_queue_shutdown(&proc_info->rtclock_timer.queue);
+               proc_info->rtclock_timer.deadline = EndOfAllTime;
+       }
+
        fowner = proc_info->FPU_owner;                                  /* Cache this */
        if(fowner) /* If anyone owns FPU, save it */
                fpu_save(fowner);
index 4d3bb1a5ff8914f24d69d1239d3d3c51e1cd0e5c..dca034b919e4da6a4a3836d02b5ffc08c0d01c83 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -41,6 +41,7 @@
 
 #include <kern/clock.h>
 #include <kern/thread.h>
+#include <kern/timer_queue.h>
 #include <kern/processor.h>
 #include <kern/macro_help.h>
 #include <kern/spl.h>
@@ -53,9 +54,6 @@
 #include <sys/kdebug.h>
 #include <ppc/exception.h>
 
-/* XXX from <arch>/rtclock.c */
-clock_timer_func_t             rtclock_timer_expire;
-
 /*
  *     Event timer interrupt.
  *
@@ -91,8 +89,7 @@ __unused uint64_t iaddr)
        /* has a pending clock timer expired? */
        if (mytimer->deadline <= abstime) {                     /* Have we expired the deadline? */
                mytimer->has_expired = TRUE;                    /* Remember that we popped */
-               mytimer->deadline = EndOfAllTime;               /* Set timer request to the end of all time in case we have no more events */
-               (*rtclock_timer_expire)(abstime);               /* Process pop */
+               mytimer->deadline = timer_queue_expire(&mytimer->queue, abstime);
                mytimer->has_expired = FALSE;
        }
 
@@ -102,7 +99,7 @@ __unused uint64_t iaddr)
 }
 
 /*
- * Set the clock deadline; called by the thread scheduler.
+ * Set the clock deadline.
  */
 void etimer_set_deadline(uint64_t deadline)
 {
@@ -165,3 +162,34 @@ etimer_resync_deadlines(void)
        }
        splx(s);
 }
+
+queue_t
+timer_queue_assign(
+       uint64_t                deadline)
+{
+       struct per_proc_info    *pp = getPerProc();
+       rtclock_timer_t                 *timer;
+
+       if (pp->running) {
+               timer = &pp->rtclock_timer;
+
+               if (deadline < timer->deadline)
+                       etimer_set_deadline(deadline);
+       }
+       else
+               timer = &PerProcTable[master_cpu].ppe_vaddr->rtclock_timer;
+
+       return (&timer->queue);
+}
+
+void
+timer_queue_cancel(
+       queue_t                 queue,
+       uint64_t                deadline,
+       uint64_t                new_deadline)
+{
+       if (queue == &getPerProc()->rtclock_timer.queue) {
+               if (deadline < new_deadline)
+                       etimer_set_deadline(new_deadline);
+       }
+}
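
These two routines are the PPC side of the per-processor timer queue interface pulled in through
kern/timer_queue.h; the machine-independent caller is not part of this hunk.  A hedged sketch of the
intended usage, with the caller's names assumed rather than taken from this diff:

    /* arming a timer: pick the queue for the running CPU (or fall back to the master's) */
    queue_t q = timer_queue_assign(deadline);
    /* ... the MI layer inserts the timer call into q in deadline order ... */

    /* after removing the soonest entry, let the MD layer re-arm the next pop */
    timer_queue_cancel(q, removed_deadline, next_soonest_deadline);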
index 75f7e2e28ee2d7cf526bafca69f0ab6078282388..394b884e44339dd96ace386a405830c730acd69d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -349,7 +349,6 @@ struct per_proc_info {
        /* PPC cache line boundary here - 140 */
        void *                  pp_cbfr;
        void *                  pp_chud;
-       uint64_t                rtclock_intr_deadline;
        rtclock_timer_t rtclock_timer;
        unsigned int    ppbbTaskEnv;            /* BlueBox Task Environment */
     
index 9386f8597d08af463375011e06534939afa06c40..ad4add6f0c3349ee7308e36ae3d8de796425db4d 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -815,3 +815,18 @@ void ml_mem_backoff(void) {
        return;
 }
 
+
+
+/*
+ * Stubs for CPU Stepper
+ */
+void
+machine_run_count(__unused uint32_t count)
+{
+}
+
+boolean_t
+machine_cpu_is_inactive(__unused int num)
+{
+    return(FALSE);
+}
index a6dcb65774e0e51847649430d1e8650ea0057a79..e6dc6435fc6791cd3cb4b65a6374cf6e2ac56d9d 100644 (file)
@@ -451,7 +451,8 @@ print_backtrace(struct savearea *ssp)
        while(pbtcnt);                                                  /* Wait for completion */
 pbt_exit:
        panic_display_system_configuration();
-
+       panic_display_zprint();
+        dump_kext_info(&kdb_log);
        return;
 }
 
index 35526ab2c5ffcfd5124acad07f4b79ad4046137f..ccdbb8bb93ef2709adc0e03c24157e69b8f6dc46 100644 (file)
@@ -165,6 +165,8 @@ ppc_init(
        BootProcInfo.VMX_owner = NULL;
        BootProcInfo.pp_cbfr = console_per_proc_alloc(TRUE);
        BootProcInfo.rtcPop = EndOfAllTime;
+       queue_init(&BootProcInfo.rtclock_timer.queue);
+       BootProcInfo.rtclock_timer.deadline = EndOfAllTime;
        BootProcInfo.pp2ndPage = (addr64_t)(uintptr_t)&BootProcInfo;    /* Initial physical address of the second page */
 
        BootProcInfo.pms.pmsStamp = 0;                                          /* Dummy transition time */
index 90a5754ae3cf898d877b3519e9a8c280f68ad2aa..7c1222bd0ba2975ec0a2e438ade247356210f723 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -66,9 +66,6 @@ static mach_timebase_info_data_t      rtclock_timebase_const;
 
 static boolean_t               rtclock_timebase_initialized;
 
-/* XXX this should really be in a header somewhere */
-extern clock_timer_func_t      rtclock_timer_expire;
-
 decl_simple_lock_data(static,rtclock_lock)
 
 /*
@@ -214,18 +211,6 @@ clock_timebase_info(
        UNLOCK_RTC(s);
 }      
 
-void
-clock_set_timer_func(
-       clock_timer_func_t              func)
-{
-       spl_t           s;
-
-       LOCK_RTC(s);
-       if (rtclock_timer_expire == NULL)
-               rtclock_timer_expire = func;
-       UNLOCK_RTC(s);
-}
-
 void
 clock_interval_to_absolutetime_interval(
        uint32_t                        interval,
index ed0cbb3332868c54090c0e24c2716363e27ccb79..77f287ead96bea8b5ebae7634a7b5010a5222b8a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2004-2008 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -48,6 +48,7 @@ extern void rtclock_intr(struct savearea *ssp);
 
 #pragma pack(push,4)
 struct rtclock_timer_t  {
+       queue_head_t    queue;
        uint64_t                deadline;
        uint32_t
        /*boolean_t*/   is_set:1,
index f83dd5e8871f6e11a318dd91bfdf965a20ca7205..a89aa0ef0a5f513b3dedf8437005375d056ab791 100644 (file)
@@ -636,7 +636,9 @@ vm_object_update_extent(
                            m->list_req_pending = TRUE;
                            m->cleaning = TRUE;
 
-                           if (should_flush) {
+                           if (should_flush &&
+                               /* let's not flush a wired page... */
+                               !m->wire_count) {
                                    /*
                                     * and add additional state
                                     * for the flush
index 54e618e40004537aae4c35af51a38fc0987238b5..da167635f0a77a5aaec1093892cf1bb435146ceb 100644 (file)
@@ -412,6 +412,7 @@ apple_protect_pager_data_request(
        pl_count = length / PAGE_SIZE;
        for (cur_offset = 0; cur_offset < length; cur_offset += PAGE_SIZE) {
                ppnum_t dst_pnum;
+               int     type_of_fault;
 
                if (!upl_page_present(upl_pl, cur_offset / PAGE_SIZE)) {
                        /* this page is not in the UPL: skip it */
@@ -435,7 +436,7 @@ apple_protect_pager_data_request(
                                   &prot,
                                   &src_page,
                                   &top_page,
-                                  NULL,
+                                  &type_of_fault,
                                   &error_code,
                                   FALSE,
                                   FALSE,
index 77a34c912ececf45cf55a64aa08a7c0feb5b6ec1..53ba64bee06c483e00078d5fccb37d1b7183ecff 100644 (file)
@@ -2256,6 +2256,7 @@ vm_fault(
        boolean_t               need_collapse = FALSE;
        int                     object_lock_type = 0;
        int                     cur_object_lock_type;
+       vm_object_t             top_object = VM_OBJECT_NULL;
 
 
        KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
@@ -2618,16 +2619,26 @@ RetryFault:
 
                                prot &= ~VM_PROT_WRITE;
 
-                               /*
-                                * Set up to map the page...
-                                * mark the page busy, drop
-                                * unneeded object lock
-                                */     
                                if (object != cur_object) {
-                                       /*      
-                                        * don't need the original object anymore
+                                       /*
+                                        * We still need to hold the top object
+                                        * lock here to prevent a race between
+                                        * a read fault (taking only "shared"
+                                        * locks) and a write fault (taking
+                                        * an "exclusive" lock on the top
+                                        * object).
+                                        * Otherwise, as soon as we release the
+                                        * top lock, the write fault could
+                                        * proceed and actually complete before
+                                        * the read fault, and the copied page's
+                                        * translation could then be overwritten
+                                        * by the read fault's translation for
+                                        * the original page.
+                                        *
+                                        * Let's just record what the top object
+                                        * is and we'll release it later.
                                         */
-                                       vm_object_unlock(object);
+                                       top_object = object;
 
                                        /*
                                         * switch to the object that has the new page
@@ -2668,6 +2679,20 @@ FastPmapEnter:
                                                            &type_of_fault);
                                }
 
+                               if (top_object != VM_OBJECT_NULL) {
+                                       /*
+                                        * It's safe to drop the top object
+                                        * now that we've done our
+                                        * vm_fault_enter().  Any other fault
+                                        * in progress for that virtual
+                                        * address will either find our page
+                                        * and translation or put in a new page
+                                        * and translation.
+                                        */
+                                       vm_object_unlock(top_object);
+                                       top_object = VM_OBJECT_NULL;
+                               }
+
                                if (need_collapse == TRUE)
                                        vm_object_collapse(object, offset, TRUE);
 
index 7b6b17dc6019bf9e969b8032f3ae3adf67392a8b..f5e05931ed6578d2de07be65a236d423ec084b12 100644 (file)
@@ -87,6 +87,7 @@ const vm_offset_t vm_min_kernel_address = VM_MIN_KERNEL_ADDRESS;
 const vm_offset_t vm_max_kernel_address = VM_MAX_KERNEL_ADDRESS;
 
 boolean_t vm_kernel_ready = FALSE;
+boolean_t zlog_ready = FALSE;
 
 /*
  *     vm_mem_bootstrap initializes the virtual memory system.
@@ -131,6 +132,8 @@ vm_mem_bootstrap(void)
        vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling pmap_init\n"));
        pmap_init();
        
+       zlog_ready = TRUE;
+
        if (PE_parse_boot_argn("zsize", &zsizearg, sizeof (zsizearg)))
                zsize = zsizearg * 1024ULL * 1024ULL;
        else {
index 5bad35b4ba51c8cd0ff7b9e7609d7b3f990c29b0..a5b224b702d18cac7155f1f311c104cdcd856815 100644 (file)
@@ -39,6 +39,7 @@ ENTRY(PE_get_timebase)
 
         movl    S_ARG0, %ecx
 
+       lfence
         rdtsc
        lfence
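
The lfence added ahead of rdtsc mirrors the one already after it: rdtsc is not a serializing
instruction, so without fences on both sides the timestamp can be sampled out of order relative to
neighbouring instructions.  An illustrative user-space equivalent of the fenced read (not code from
this commit):

    #include <stdint.h>

    static inline uint64_t rdtsc_fenced(void)
    {
        uint32_t lo, hi;
        __asm__ __volatile__("lfence; rdtsc; lfence" : "=a" (lo), "=d" (hi));
        return ((uint64_t)hi << 32) | lo;
    }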
 
index d8a013397cab984c0075f13e58871bdfe2cfeeff..42e96d9772cee5484d26973a043aca7d266fc2b2 100644 (file)
@@ -60,6 +60,7 @@ void PE_init_platform(
 
 
 
+
 void PE_init_kprintf(
        boolean_t vm_initialized);
 
index 58f3e2b33aa5b83e538c23290cc5c779ff543bb0..c3ea614357eef702c20ee51e24f508b0dedf09f4 100644 (file)
@@ -159,7 +159,7 @@ int mac_cred_label_externalize_audit(proc_t p, struct mac *mac);
 void   mac_cred_label_free(struct label *label);
 void   mac_cred_label_init(kauth_cred_t cred);
 void   mac_cred_label_update(kauth_cred_t cred, struct label *newlabel);
-void   mac_cred_label_update_execve(vfs_context_t ctx, kauth_cred_t newcred,
+int    mac_cred_label_update_execve(vfs_context_t ctx, kauth_cred_t newcred,
            struct vnode *vp, struct label *scriptvnodelabel,
            struct label *execlabel);
 void   mac_devfs_label_associate_device(dev_t dev, struct devnode *de,
index 21b645a73f34258edf2ca141e8bf961090cce756..544565552ced6a484bf955429a7abaa180cd90de 100644 (file)
@@ -528,6 +528,10 @@ typedef int mpo_cred_label_internalize_t(
   The final label, execlabel, corresponds to a label supplied by a
   user space application through the use of the mac_execve system call.
 
+  If non-NULL, the value pointed to by disjointp will be set to 0 to
+  indicate that the old and new credentials are not disjoint, or 1 to
+  indicate that they are.
+
   The vnode lock is held during this operation.  No changes should be
   made to the old credential structure.
 */
@@ -537,7 +541,8 @@ typedef void mpo_cred_label_update_execve_t(
        struct vnode *vp,
        struct label *vnodelabel,
        struct label *scriptvnodelabel,
-       struct label *execlabel
+       struct label *execlabel,
+       int *disjointp
 );
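
Under the new signature, a policy reports disjointness through the extra out-parameter.  A minimal
sketch of a conforming hook, with the policy logic and helper purely hypothetical:

    static void
    example_cred_label_update_execve(kauth_cred_t old, kauth_cred_t new,
        struct vnode *vp, struct label *vnodelabel,
        struct label *scriptvnodelabel, struct label *execlabel,
        int *disjointp)
    {
        /* ... derive new's label from the vnode/script/exec labels as before ... */

        if (disjointp != NULL)
            *disjointp = example_labels_disjoint(old, new);   /* hypothetical helper */
    }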
 /**
   @brief Update a credential label
index 2bbfb04db49fae57e42b9ec2721dc5d52ddd1eab..8910d6d72d06e8bb65fdce64990a8368f9b99cb1 100644 (file)
@@ -413,21 +413,24 @@ mac_vnode_label_store(vfs_context_t ctx, struct vnode *vp,
        return (error);
 }
 
-void
+int
 mac_cred_label_update_execve(vfs_context_t ctx, kauth_cred_t new, struct vnode *vp,
     struct label *scriptvnodelabel, struct label *execl)
 {
        kauth_cred_t cred;
+       int disjoint = 0;
 
        if (!mac_proc_enforce && !mac_vnode_enforce)
-               return
+               return disjoint;
 
        /* mark the new cred to indicate "matching" includes the label */
        new->cr_flags |= CRF_MAC_ENFORCE;
 
        cred = vfs_context_ucred(ctx);
        MAC_PERFORM(cred_label_update_execve, cred, new, vp, vp->v_label,
-           scriptvnodelabel, execl);
+           scriptvnodelabel, execl, &disjoint);
+
+       return (disjoint);
 }
 
 int