options REV_ENDIAN_FS # Reverse Endian FS # <revfs>
options NAMEDSTREAMS # named stream vnop support # <namedstreams>
options CONFIG_VOLFS # volfs path support (legacy) # <config_volfs>
+options CONFIG_IMGSRC_ACCESS # source of imageboot dmg # <config_imgsrc_access>
#
# NFS support
options ZLIB # inflate/deflate support # <zlib>
+options IF_BRIDGE # <if_bridge>
+
makeoptions LIBDRIVER = "libDriver_kern.o" # <libdriver>
makeoptions LIBOBJC = "libkobjc.o" # <kernobjc>
options CONFIG_VFS_NAMES=3072 # <small,xsmall>
options CONFIG_VFS_NAMES=2048 # <bsmall>
+options CONFIG_MAX_CLUSTERS=8 # <xlarge,large,medium>
+options CONFIG_MAX_CLUSTERS=4 # <small,xsmall,bsmall>
+
#
# configurable kauth credential related resources
#
#
options CONFIG_ENFORCE_SIGNED_CODE # <config_embedded>
+# support dynamic signing of code
+#
+options CONFIG_DYNAMIC_CODE_SIGNING # <dynamic_codesigning>
+
#
# code decryption... used on embedded for app protection
# must be set in all the bsd/conf and osfmk/conf MASTER files
# Standard Apple Research Configurations:
# -------- ----- -------- ---------------
# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ]
-# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs hfs_compression ]
-# NETWORKING = [ inet inet6 compat_oldsock mrouting tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo ipflow ]
+# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs hfs_compression config_imgsrc_access ]
+# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo ipflow ]
# NFS = [ nfsclient nfsserver ]
# VPN = [ ipsec ]
# RELEASE = [ BASE NETWORKING NFS VPN FILESYS libdriver ]
#
# EMBEDDED_BASE = [ intel mach bsmall vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ]
# EMBEDDED_FILESYS = [ devfs hfs journaling fdesc fifo ]
-# EMBEDDED_NET = [ inet compat_oldsock mrouting tcpdrop_synfin bpfilter config_mbuf_noexpand ]
+# EMBEDDED_NET = [ inet compat_oldsock tcpdrop_synfin bpfilter config_mbuf_noexpand ]
# EMBEDDED = [ EMBEDDED_BASE EMBEDDED_NET VPN EMBEDDED_FILESYS libdriver no_printf_str no_kprintf_str no_kdebug ]
# DEVELOPMENT = [ EMBEDDED_BASE EMBEDDED_NET NFS VPN EMBEDDED_FILESYS libdriver netmibs development mach_assert config_dtrace ]
#
#
# BASE = [ ppc mach medium config_dtrace vol pst gdb noprofiling simple_clock kernstack sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue ]
# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs hfs_compression ]
-# NETWORKING = [ inet inet6 compat_oldsock mrouting tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk ipflow ]
+# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk ipflow ]
# NFS = [ nfsclient nfsserver ]
# VPN = [ ipsec ]
# RELEASE = [ BASE NETWORKING NFS VPN FILESYS libdriver ]
# Standard Apple Research Configurations:
# -------- ----- -------- ---------------
# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ]
-# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs hfs_compression ]
-# NETWORKING = [ inet inet6 compat_oldsock mrouting tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo ipflow ]
+# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs hfs_compression config_imgsrc_access ]
+# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo ipflow ]
# NFS = [ nfsclient nfsserver ]
# VPN = [ ipsec ]
# RELEASE = [ BASE NETWORKING NFS VPN FILESYS libdriver ]
#
# EMBEDDED_BASE = [ intel mach bsmall vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ]
# EMBEDDED_FILESYS = [ devfs hfs journaling fdesc fifo ]
-# EMBEDDED_NET = [ inet compat_oldsock mrouting tcpdrop_synfin bpfilter config_mbuf_noexpand ]
+# EMBEDDED_NET = [ inet compat_oldsock tcpdrop_synfin bpfilter config_mbuf_noexpand ]
# EMBEDDED = [ EMBEDDED_BASE EMBEDDED_NET VPN EMBEDDED_FILESYS libdriver no_printf_str no_kprintf_str no_kdebug ]
# DEVELOPMENT = [ EMBEDDED_BASE EMBEDDED_NET NFS VPN EMBEDDED_FILESYS libdriver netmibs development mach_assert ]
#
OPTIONS/ipfirewall optional ipfirewall
OPTIONS/ipv6firewall optional ipv6firewall
OPTIONS/tcpdebug optional tcpdebug
-OPTIONS/bridge optional bridge
+OPTIONS/if_bridge optional if_bridge
OPTIONS/faith optional faith
OPTIONS/gif optional gif
OPTIONS/netat optional netat
bsd/net/bpf.c optional bpfilter
bsd/net/bpf_filter.c optional bpfilter
-bsd/net/bridge.c optional bridge
+bsd/net/if_bridge.c optional if_bridge
+bsd/net/bridgestp.c optional if_bridge
bsd/net/bsd_comp.c optional ppp_bsdcomp
bsd/net/if.c optional networking
bsd/net/if_atmsubr.c optional atm
sizeof(boolean_t),
cpu_thermal, "I", "Dynamic Acceleration Technology (Turbo Mode)");
+SYSCTL_PROC(_machdep_cpu_thermal, OID_AUTO, invariant_APIC_timer,
+ CTLTYPE_INT | CTLFLAG_RD,
+ (void *)offsetof(cpuid_thermal_leaf_t, invariant_APIC_timer),
+ sizeof(boolean_t),
+ cpu_thermal, "I", "Invariant APIC Timer");
+
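/*
 * Illustrative sketch (not part of xnu): the leaf registered above shows up
 * as "machdep.cpu.thermal.invariant_APIC_timer" and can be read from user
 * space with sysctlbyname(3).  The small program below is hypothetical and
 * assumes the kernel exports the OID.
 */
#include <stdio.h>
#include <sys/sysctl.h>

int
main(void)
{
	int invariant = 0;
	size_t len = sizeof(invariant);

	/* Fails with ENOENT on kernels/CPUs that do not publish the leaf. */
	if (sysctlbyname("machdep.cpu.thermal.invariant_APIC_timer",
	    &invariant, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return 1;
	}
	printf("invariant APIC timer: %d\n", invariant);
	return 0;
}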
SYSCTL_PROC(_machdep_cpu_thermal, OID_AUTO, thresholds,
CTLTYPE_INT | CTLFLAG_RD,
(void *)offsetof(cpuid_thermal_leaf_t, thresholds),
extern void replace_desc(struct cnode *cp, struct cat_desc *cdp);
extern int hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp,
- struct vnode **rvpp, int can_drop_lock);
+ struct vnode **rvpp, int can_drop_lock, int error_on_unlinked);
extern int hfs_update(struct vnode *, int);
static int hfs_isordered(struct cnode *, struct cnode *);
+inline int hfs_checkdeleted (struct cnode *cp) {
+ return ((cp->c_flag & (C_DELETED | C_NOEXISTS)) ? ENOENT : 0);
+}
+
/*
* Last reference to an cnode. If necessary, write or delete it.
if ((cp->c_blocks > 0) && (forkcount == 1) && (vp != cp->c_rsrc_vp)) {
struct vnode *rvp = NULLVP;
- error = hfs_vgetrsrc(hfsmp, vp, &rvp, FALSE);
+ error = hfs_vgetrsrc(hfsmp, vp, &rvp, FALSE, FALSE);
if (error)
goto out;
/*
return (ENOENT);
}
- /* Hardlinks may need an updated catalog descriptor */
- if ((cp->c_flag & C_HARDLINK) && descp->cd_nameptr && descp->cd_namelen > 0) {
- replace_desc(cp, descp);
+ /*
+ * Hardlinks may need an updated catalog descriptor. However, if
+ * the cnode has already been marked as open-unlinked (C_DELETED), then don't
+ * replace its descriptor.
+ */
+ if (!(hfs_checkdeleted(cp))) {
+ if ((cp->c_flag & C_HARDLINK) && descp->cd_nameptr && descp->cd_namelen > 0) {
+ replace_desc(cp, descp);
+ }
}
/* Check if we found a matching vnode */
if (*vpp != NULL)
FTOC(fp)->c_rsrc_vp : \
FTOC(fp)->c_vp)
+/*
+ * This is a helper function used for determining whether or not a cnode has become open
+ * unlinked in between the time we acquired its vnode and the time we acquire the cnode lock
+ * to start manipulating it. Due to the SMP nature of VFS, it is probably necessary to
+ * use this function every time we acquire a cnode lock, as the contents of the cnode may have
+ * been modified in between the lookup and a VNOP. Whether or not to call this is dependent
+ * upon the VNOP in question. Sometimes it is OK to use an open-unlinked file, for example when
+ * reading. But other times, such as on the source of a VNOP_RENAME, it should be disallowed.
+ */
+int hfs_checkdeleted (struct cnode *cp);
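/*
 * Illustrative sketch (not part of this header): a VNOP that must not operate
 * on open-unlinked files would typically call hfs_checkdeleted() immediately
 * after taking the cnode lock.  'example_vnop_guard' is a hypothetical helper
 * shown only to demonstrate the pattern.
 */
static inline int
example_vnop_guard(struct cnode *cp)
{
	int error;

	if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK)))
		return (error);
	if ((error = hfs_checkdeleted(cp))) {
		/* The cnode went open-unlinked behind our back; bail with ENOENT. */
		hfs_unlock(cp);
		return (error);
	}
	/* ... safe to manipulate the cnode here ... */
	hfs_unlock(cp);
	return (0);
}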
/*
* Test for a resource fork
vm_offset_t a_pl_offset;
int a_flags;
int is_pageoutv2 = 0;
+ kern_return_t kret;
cp = VTOC(vp);
fp = VTOF(vp);
else {
request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
}
- ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);
+ kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags);
- if (upl == (upl_t) NULL) {
+ if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) {
retval = EINVAL;
goto pageout_done;
}
/*
- * Copyright (c) 1999-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 1999-2010 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
int hfs_dbg_err = 0;
#endif
+/* Enable/disable debugging code for live volume resizing */
+int hfs_resize_debug = 0;
lck_grp_attr_t * hfs_group_attr;
lck_attr_t * hfs_lock_attr;
static int hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, vfs_context_t context);
static int hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimblks, vfs_context_t context);
-static int hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk,
- u_int32_t catblks, u_int32_t fileID, int rsrcfork);
+static int hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t fileID);
static int hfs_journal_replay(vnode_t devvp, vfs_context_t context);
u_int32_t reclaimblks = 0;
int lockflags = 0;
int transaction_begun = 0;
+ Boolean updateFreeBlocks = false;
int error;
- lck_mtx_lock(&hfsmp->hfs_mutex);
+ HFS_MOUNT_LOCK(hfsmp, TRUE);
if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) {
- lck_mtx_unlock(&hfsmp->hfs_mutex);
+ HFS_MOUNT_UNLOCK(hfsmp, TRUE);
return (EALREADY);
}
hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS;
hfsmp->hfs_resize_filesmoved = 0;
hfsmp->hfs_resize_totalfiles = 0;
- lck_mtx_unlock(&hfsmp->hfs_mutex);
+ HFS_MOUNT_UNLOCK(hfsmp, TRUE);
/*
* - Journaled HFS Plus volumes only.
newblkcnt = newsize / hfsmp->blockSize;
reclaimblks = hfsmp->totalBlocks - newblkcnt;
+ if (hfs_resize_debug) {
+ printf ("hfs_truncatefs: old: size=%qu, blkcnt=%u, freeblks=%u\n", oldsize, hfsmp->totalBlocks, hfs_freeblks(hfsmp, 1));
+ printf ("hfs_truncatefs: new: size=%qu, blkcnt=%u, reclaimblks=%u\n", newsize, newblkcnt, reclaimblks);
+ }
+
/* Make sure new size is valid. */
if ((newsize < HFS_MIN_SIZE) ||
(newsize >= oldsize) ||
(newsize % hfsmp->hfs_logical_block_size) ||
(newsize % hfsmp->hfs_physical_block_size)) {
- printf ("hfs_truncatefs: invalid size\n");
+ printf ("hfs_truncatefs: invalid size (newsize=%qu, oldsize=%qu)\n", newsize, oldsize);
error = EINVAL;
goto out;
}
- /* Make sure there's enough space to work with. */
+	/* Make sure that the file system has enough free blocks to reclaim */
if (reclaimblks >= hfs_freeblks(hfsmp, 1)) {
- printf("hfs_truncatefs: insufficient space (need %u blocks; have %u blocks)\n", reclaimblks, hfs_freeblks(hfsmp, 1));
+ printf("hfs_truncatefs: insufficient space (need %u blocks; have %u free blocks)\n", reclaimblks, hfs_freeblks(hfsmp, 1));
error = ENOSPC;
goto out;
}
* in the allocation blocks beyond (i.e. the blocks we're trying to
	 * truncate away).
*/
- lck_mtx_lock(&hfsmp->hfs_mutex);
+ HFS_MOUNT_LOCK(hfsmp, TRUE);
if (hfsmp->blockSize == 512)
hfsmp->allocLimit = newblkcnt - 2;
else
hfsmp->allocLimit = newblkcnt - 1;
+ /* Update the volume free block count to reflect the total number of
+ * free blocks that will exist after a successful resize.
+ */
hfsmp->freeBlocks -= reclaimblks;
- lck_mtx_unlock(&hfsmp->hfs_mutex);
-
+ updateFreeBlocks = true;
+ HFS_MOUNT_UNLOCK(hfsmp, TRUE);
+
/*
* Look for files that have blocks at or beyond the location of the
- * new alternate volume header.
+ * new alternate volume header
*/
if (hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks)) {
/*
transaction_begun = 0;
/* Attempt to reclaim some space. */
- if (hfs_reclaimspace(hfsmp, hfsmp->allocLimit, reclaimblks, context) != 0) {
- printf("hfs_truncatefs: couldn't reclaim space on %s\n", hfsmp->vcbVN);
+ error = hfs_reclaimspace(hfsmp, hfsmp->allocLimit, reclaimblks, context);
+ if (error != 0) {
+ printf("hfs_truncatefs: couldn't reclaim space on %s (error=%d)\n", hfsmp->vcbVN, error);
error = ENOSPC;
goto out;
}
transaction_begun = 1;
/* Check if we're clear now. */
- if (hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks)) {
- printf("hfs_truncatefs: didn't reclaim enough space on %s\n", hfsmp->vcbVN);
+ error = hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks);
+ if (error != 0) {
+ printf("hfs_truncatefs: didn't reclaim enough space on %s (error=%d)\n", hfsmp->vcbVN, error);
error = EAGAIN; /* tell client to try again */
goto out;
}
* since this block will be outside of the truncated file system!
*/
if (hfsmp->hfs_alt_id_sector) {
- if (buf_meta_bread(hfsmp->hfs_devvp,
+ error = buf_meta_bread(hfsmp->hfs_devvp,
HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
- hfsmp->hfs_physical_block_size, NOCRED, &bp) == 0) {
-
+ hfsmp->hfs_physical_block_size, NOCRED, &bp);
+ if (error == 0) {
bzero((void*)((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size)), kMDBSize);
(void) VNOP_BWRITE(bp);
- } else if (bp) {
- buf_brelse(bp);
+ } else {
+ if (bp) {
+ buf_brelse(bp);
+ }
}
bp = NULL;
}
/*
* TODO: Adjust the size of the metadata zone based on new volume size?
*/
-
+
/*
* Adjust the size of hfsmp->hfs_attrdata_vp
*/
}
out:
- if (error)
- hfsmp->freeBlocks += reclaimblks;
-
lck_mtx_lock(&hfsmp->hfs_mutex);
+ if (error && (updateFreeBlocks == true))
+ hfsmp->freeBlocks += reclaimblks;
hfsmp->allocLimit = hfsmp->totalBlocks;
if (hfsmp->nextAllocation >= hfsmp->allocLimit)
hfsmp->nextAllocation = hfsmp->hfs_metazone_end + 1;
hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS;
- lck_mtx_unlock(&hfsmp->hfs_mutex);
+ HFS_MOUNT_UNLOCK(hfsmp, TRUE);
if (lockflags) {
hfs_systemfile_unlock(hfsmp, lockflags);
if (transaction_begun) {
hfs_end_transaction(hfsmp);
hfs_journal_flush(hfsmp);
+ /* Just to be sure, sync all data to the disk */
+ (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
}
return (error);
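/*
 * Illustrative sketch (not part of xnu): the same cache flush the resize code
 * issues through VNOP_IOCTL above can be requested from user space by sending
 * DKIOCSYNCHRONIZECACHE to the raw device node.  The device path is purely
 * hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/disk.h>
#include <sys/ioctl.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/dev/rdisk1", O_RDONLY);	/* hypothetical device */

	if (fd == -1) {
		perror("open");
		return 1;
	}
	/* Ask the driver to push any cached writes out to the media. */
	if (ioctl(fd, DKIOCSYNCHRONIZECACHE, NULL) == -1)
		perror("DKIOCSYNCHRONIZECACHE");
	close(fd);
	return 0;
}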
if (cp != hfsmp->hfs_allocation_cp && cp->c_lockowner != current_thread())
panic("hfs_copy_extent: vp=%p (cp=%p) not owned?\n", vp, cp);
- /*
- * Wait for any in-progress writes to this vnode to complete, so that we'll
- * be copying consistent bits. (Otherwise, it's possible that an async
- * write will complete to the old extent after we read from it. That
- * could lead to corruption.)
- */
- err = vnode_waitforwrites(vp, 0, 0, 0, "hfs_copy_extent");
- if (err) {
- printf("hfs_copy_extent: Error %d from vnode_waitforwrites\n", err);
- return err;
- }
-
/*
* Determine the I/O size to use
*
buf_setcount(bp, ioSize);
buf_setblkno(bp, destSector);
buf_setlblkno(bp, destSector);
- if (journal_uses_fua(hfsmp->jnl))
+ if (vnode_issystem(vp) && journal_uses_fua(hfsmp->jnl))
buf_markfua(bp);
/* Do the write */
kmem_free(kernel_map, (vm_offset_t)buffer, bufferSize);
/* Make sure all writes have been flushed to disk. */
- if (!journal_uses_fua(hfsmp->jnl)) {
+ if (vnode_issystem(vp) && !journal_uses_fua(hfsmp->jnl)) {
err = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
if (err) {
printf("hfs_copy_extent: DKIOCSYNCHRONIZECACHE failed (%d)\n", err);
}
+static int
+hfs_relocate_callback(__unused HFSPlusExtentKey *key, HFSPlusExtentRecord *record, HFSPlusExtentRecord *state)
+{
+ bcopy(state, record, sizeof(HFSPlusExtentRecord));
+ return 0;
+}
+
/*
- * Reclaim space at the end of a volume, used by a given system file.
+ * Reclaim space at the end of a volume, used by a given file.
*
* This routine attempts to move any extent which contains allocation blocks
* at or after "startblk." A separate transaction is used to do the move.
* of a transaction have their physical block numbers invalidated so they will
* eventually be written to their new locations.
*
- * This routine can be used to move overflow extents for the allocation file.
- *
* Inputs:
* hfsmp The volume being resized.
* startblk Blocks >= this allocation block need to be moved.
* locks Which locks need to be taken for the given system file.
* vp The vnode for the system file.
*
+ * The caller of this function, hfs_reclaimspace(), grabs cnode lock
+ * for non-system files before calling this function.
+ *
* Outputs:
- * moved Set to true if any extents were moved.
+ * blks_moved Total number of allocation blocks moved by this routine.
*/
static int
-hfs_relocate_callback(__unused HFSPlusExtentKey *key, HFSPlusExtentRecord *record, HFSPlusExtentRecord *state)
-{
- bcopy(state, record, sizeof(HFSPlusExtentRecord));
- return 0;
-}
-static int
-hfs_reclaim_sys_file(struct hfsmount *hfsmp, struct vnode *vp, u_long startblk, int locks, Boolean *moved, vfs_context_t context)
+hfs_reclaim_file(struct hfsmount *hfsmp, struct vnode *vp, u_long startblk, int locks, u_int32_t *blks_moved, vfs_context_t context)
{
int error;
int lockflags;
int i;
u_long datablks;
- u_long block;
+ u_long end_block;
u_int32_t oldStartBlock;
u_int32_t newStartBlock;
- u_int32_t blockCount;
+ u_int32_t oldBlockCount;
+ u_int32_t newBlockCount;
struct filefork *fp;
-
+ struct cnode *cp;
+ int is_sysfile;
+ int took_truncate_lock = 0;
+ struct BTreeIterator *iterator = NULL;
+ u_int8_t forktype;
+ u_int32_t fileID;
+
/* If there is no vnode for this file, then there's nothing to do. */
if (vp == NULL)
return 0;
- /* printf("hfs_reclaim_sys_file: %.*s\n", VTOC(vp)->c_desc.cd_namelen, VTOC(vp)->c_desc.cd_nameptr); */
+ cp = VTOC(vp);
+ fileID = cp->c_cnid;
+ is_sysfile = vnode_issystem(vp);
+ forktype = VNODE_IS_RSRC(vp) ? 0xFF : 0;
+
+ /* Flush all the buffer cache blocks and cluster pages associated with
+ * this vnode.
+ *
+ * If the current vnode is a system vnode, all the buffer cache blocks
+ * associated with it should already be sync'ed to the disk as part of
+ * journal flush in hfs_truncatefs(). Normally there should not be
+ * buffer cache blocks for regular files, but for objects like symlinks,
+ * we can have buffer cache blocks associated with the vnode. Therefore
+	 * we always call buf_flushdirtyblks(). Resource fork data for directory
+	 * hard links is written directly through the buffer cache of the device vnode,
+ * which should also be sync'ed as part of journal flush in hfs_truncatefs().
+ *
+ * Flushing cluster pages should be the normal case for regular files,
+ * and really should not do anything for system files. But just to be
+	 * sure that all blocks associated with this vnode are sync'ed to the
+ * disk, we call both buffer cache and cluster layer functions.
+ */
+ buf_flushdirtyblks(vp, MNT_NOWAIT, 0, "hfs_reclaim_file");
+ if (!is_sysfile) {
+ /* The caller grabs cnode lock for non-system files only, therefore
+ * we unlock only non-system files before calling cluster layer.
+ */
+ hfs_unlock(cp);
+ hfs_lock_truncate(cp, TRUE);
+ took_truncate_lock = 1;
+ }
+ (void) cluster_push(vp, 0);
+ if (!is_sysfile) {
+ error = hfs_lock(cp, HFS_FORCE_LOCK);
+ if (error) {
+ hfs_unlock_truncate(cp, TRUE);
+ return error;
+ }
+
+ /* If the file no longer exists, nothing left to do */
+ if (cp->c_flag & C_NOEXISTS) {
+ hfs_unlock_truncate(cp, TRUE);
+ return 0;
+ }
+ }
+
+ /* Wait for any in-progress writes to this vnode to complete, so that we'll
+ * be copying consistent bits. (Otherwise, it's possible that an async
+ * write will complete to the old extent after we read from it. That
+ * could lead to corruption.)
+ */
+ error = vnode_waitforwrites(vp, 0, 0, 0, "hfs_reclaim_file");
+ if (error) {
+ printf("hfs_reclaim_file: Error %d from vnode_waitforwrites\n", error);
+ return error;
+ }
+
+ if (hfs_resize_debug) {
+ printf("hfs_reclaim_file: Start relocating %sfork for fileid=%u name=%.*s\n", (forktype ? "rsrc" : "data"), fileID, cp->c_desc.cd_namelen, cp->c_desc.cd_nameptr);
+ }
+
/* We always need the allocation bitmap and extents B-tree */
locks |= SFL_BITMAP | SFL_EXTENTS;
error = hfs_start_transaction(hfsmp);
if (error) {
- printf("hfs_reclaim_sys_file: hfs_start_transaction returned %d\n", error);
+ printf("hfs_reclaim_file: hfs_start_transaction returned %d\n", error);
+ if (took_truncate_lock) {
+ hfs_unlock_truncate(cp, TRUE);
+ }
return error;
}
lockflags = hfs_systemfile_lock(hfsmp, locks, HFS_EXCLUSIVE_LOCK);
fp = VTOF(vp);
datablks = 0;
+ *blks_moved = 0;
/* Relocate non-overflow extents */
for (i = 0; i < kHFSPlusExtentDensity; ++i) {
if (fp->ff_extents[i].blockCount == 0)
break;
oldStartBlock = fp->ff_extents[i].startBlock;
- blockCount = fp->ff_extents[i].blockCount;
- datablks += blockCount;
- block = oldStartBlock + blockCount;
- if (block > startblk) {
- error = BlockAllocate(hfsmp, 1, blockCount, blockCount, true, true, &newStartBlock, &blockCount);
+ oldBlockCount = fp->ff_extents[i].blockCount;
+ datablks += oldBlockCount;
+ end_block = oldStartBlock + oldBlockCount;
+ /* Check if the file overlaps the target space */
+ if (end_block > startblk) {
+ /* Allocate a new extent */
+ error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, true, (is_sysfile ? true : false), &newStartBlock, &newBlockCount);
if (error) {
- printf("hfs_reclaim_sys_file: BlockAllocate returned %d\n", error);
+ printf("hfs_reclaim_file: BlockAllocate (error=%d) for fileID=%u %u:(%u,%u)\n", error, fileID, i, oldStartBlock, oldBlockCount);
goto fail;
}
- if (blockCount != fp->ff_extents[i].blockCount) {
- printf("hfs_reclaim_sys_file: new blockCount=%u, original blockCount=%u", blockCount, fp->ff_extents[i].blockCount);
- goto free_fail;
+ if (newBlockCount != oldBlockCount) {
+ printf("hfs_reclaim_file: fileID=%u - newBlockCount=%u, oldBlockCount=%u", fileID, newBlockCount, oldBlockCount);
+ if (BlockDeallocate(hfsmp, newStartBlock, newBlockCount)) {
+ hfs_mark_volume_inconsistent(hfsmp);
+ }
+ goto fail;
}
- error = hfs_copy_extent(hfsmp, vp, oldStartBlock, newStartBlock, blockCount, context);
+
+ /* Copy data from old location to new location */
+ error = hfs_copy_extent(hfsmp, vp, oldStartBlock, newStartBlock, newBlockCount, context);
if (error) {
- printf("hfs_reclaim_sys_file: hfs_copy_extent returned %d\n", error);
- goto free_fail;
+ printf("hfs_reclaim_file: hfs_copy_extent error=%d for fileID=%u %u:(%u,%u) to %u:(%u,%u)\n", error, fileID, i, oldStartBlock, oldBlockCount, i, newStartBlock, newBlockCount);
+ if (BlockDeallocate(hfsmp, newStartBlock, newBlockCount)) {
+ hfs_mark_volume_inconsistent(hfsmp);
+ }
+ goto fail;
}
fp->ff_extents[i].startBlock = newStartBlock;
- VTOC(vp)->c_flag |= C_MODIFIED;
- *moved = true;
- error = BlockDeallocate(hfsmp, oldStartBlock, blockCount);
+ cp->c_flag |= C_MODIFIED;
+ *blks_moved += newBlockCount;
+
+ /* Deallocate the old extent */
+ error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount);
if (error) {
- /* TODO: Mark volume inconsistent? */
- printf("hfs_reclaim_sys_file: BlockDeallocate returned %d\n", error);
+ printf("hfs_reclaim_file: BlockDeallocate returned %d\n", error);
+ hfs_mark_volume_inconsistent(hfsmp);
goto fail;
}
- error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
- if (error) {
- /* TODO: Mark volume inconsistent? */
- printf("hfs_reclaim_sys_file: hfs_flushvolumeheader returned %d\n", error);
- goto fail;
+
+ /* If this is a system file, sync the volume header on disk */
+ if (is_sysfile) {
+ error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
+ if (error) {
+ printf("hfs_reclaim_file: hfs_flushvolumeheader returned %d\n", error);
+ hfs_mark_volume_inconsistent(hfsmp);
+ goto fail;
+ }
+ }
+
+ if (hfs_resize_debug) {
+ printf ("hfs_reclaim_file: Relocated %u:(%u,%u) to %u:(%u,%u)\n", i, oldStartBlock, oldBlockCount, i, newStartBlock, newBlockCount);
}
}
}
/* Relocate overflow extents (if any) */
if (i == kHFSPlusExtentDensity && fp->ff_blocks > datablks) {
- struct BTreeIterator *iterator = NULL;
struct FSBufferDescriptor btdata;
HFSPlusExtentRecord record;
HFSPlusExtentKey *key;
FCB *fcb;
- u_int32_t fileID;
- u_int8_t forktype;
+ int overflow_count = 0;
- forktype = VNODE_IS_RSRC(vp) ? 0xFF : 0;
- fileID = VTOC(vp)->c_cnid;
if (kmem_alloc(kernel_map, (vm_offset_t*) &iterator, sizeof(*iterator))) {
- printf("hfs_reclaim_sys_file: kmem_alloc failed!\n");
+ printf("hfs_reclaim_file: kmem_alloc failed!\n");
error = ENOMEM;
goto fail;
}
error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator);
while (error == 0) {
/* Stop when we encounter a different file or fork. */
- if ((key->fileID != fileID) ||
- (key->forkType != forktype)) {
+ if ((key->fileID != fileID) ||
+ (key->forkType != forktype)) {
break;
}
+
+ /* Just track the overflow extent record number for debugging... */
+ if (hfs_resize_debug) {
+ overflow_count++;
+ }
+
/*
* Check if the file overlaps target space.
*/
for (i = 0; i < kHFSPlusExtentDensity; ++i) {
if (record[i].blockCount == 0) {
- goto overflow_done;
+ goto fail;
}
oldStartBlock = record[i].startBlock;
- blockCount = record[i].blockCount;
- block = oldStartBlock + blockCount;
- if (block > startblk) {
- error = BlockAllocate(hfsmp, 1, blockCount, blockCount, true, true, &newStartBlock, &blockCount);
+ oldBlockCount = record[i].blockCount;
+ end_block = oldStartBlock + oldBlockCount;
+ if (end_block > startblk) {
+ error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, true, (is_sysfile ? true : false), &newStartBlock, &newBlockCount);
if (error) {
- printf("hfs_reclaim_sys_file: BlockAllocate returned %d\n", error);
- goto overflow_done;
+ printf("hfs_reclaim_file: BlockAllocate (error=%d) for fileID=%u %u:(%u,%u)\n", error, fileID, i, oldStartBlock, oldBlockCount);
+ goto fail;
}
- if (blockCount != record[i].blockCount) {
- printf("hfs_reclaim_sys_file: new blockCount=%u, original blockCount=%u", blockCount, fp->ff_extents[i].blockCount);
- kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
- goto free_fail;
+ if (newBlockCount != oldBlockCount) {
+ printf("hfs_reclaim_file: fileID=%u - newBlockCount=%u, oldBlockCount=%u", fileID, newBlockCount, oldBlockCount);
+ if (BlockDeallocate(hfsmp, newStartBlock, newBlockCount)) {
+ hfs_mark_volume_inconsistent(hfsmp);
+ }
+ goto fail;
}
- error = hfs_copy_extent(hfsmp, vp, oldStartBlock, newStartBlock, blockCount, context);
+ error = hfs_copy_extent(hfsmp, vp, oldStartBlock, newStartBlock, newBlockCount, context);
if (error) {
- printf("hfs_reclaim_sys_file: hfs_copy_extent returned %d\n", error);
- kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
- goto free_fail;
+ printf("hfs_reclaim_file: hfs_copy_extent error=%d for fileID=%u (%u,%u) to (%u,%u)\n", error, fileID, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount);
+ if (BlockDeallocate(hfsmp, newStartBlock, newBlockCount)) {
+ hfs_mark_volume_inconsistent(hfsmp);
+ }
+ goto fail;
}
record[i].startBlock = newStartBlock;
- VTOC(vp)->c_flag |= C_MODIFIED;
- *moved = true;
+ cp->c_flag |= C_MODIFIED;
+ *blks_moved += newBlockCount;
+
/*
* NOTE: To support relocating overflow extents of the
* allocation file, we must update the BTree record BEFORE
*/
error = BTUpdateRecord(fcb, iterator, (IterateCallBackProcPtr) hfs_relocate_callback, &record);
if (error) {
- /* TODO: Mark volume inconsistent? */
- printf("hfs_reclaim_sys_file: BTUpdateRecord returned %d\n", error);
- goto overflow_done;
+ printf("hfs_reclaim_file: BTUpdateRecord returned %d\n", error);
+ hfs_mark_volume_inconsistent(hfsmp);
+ goto fail;
}
- error = BlockDeallocate(hfsmp, oldStartBlock, blockCount);
+ error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount);
if (error) {
- /* TODO: Mark volume inconsistent? */
- printf("hfs_reclaim_sys_file: BlockDeallocate returned %d\n", error);
- goto overflow_done;
+ printf("hfs_reclaim_file: BlockDeallocate returned %d\n", error);
+ hfs_mark_volume_inconsistent(hfsmp);
+ goto fail;
+ }
+ if (hfs_resize_debug) {
+ printf ("hfs_reclaim_file: Relocated overflow#%d %u:(%u,%u) to %u:(%u,%u)\n", overflow_count, i, oldStartBlock, oldBlockCount, i, newStartBlock, newBlockCount);
}
}
}
break;
}
}
-overflow_done:
- kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
- if (error) {
- goto fail;
- }
}
- hfs_systemfile_unlock(hfsmp, lockflags);
- error = hfs_end_transaction(hfsmp);
- if (error) {
- printf("hfs_reclaim_sys_file: hfs_end_transaction returned %d\n", error);
+fail:
+ if (iterator) {
+ kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
}
- return error;
-
-free_fail:
- (void) BlockDeallocate(hfsmp, newStartBlock, blockCount);
-fail:
(void) hfs_systemfile_unlock(hfsmp, lockflags);
+
+ if ((*blks_moved != 0) && (is_sysfile == false)) {
+ (void) hfs_update(vp, MNT_WAIT);
+ }
+
(void) hfs_end_transaction(hfsmp);
+
+ if (took_truncate_lock) {
+ hfs_unlock_truncate(cp, TRUE);
+ }
+
+ if (hfs_resize_debug) {
+ printf("hfs_reclaim_file: Finished relocating %sfork for fileid=%u (error=%d)\n", (forktype ? "rsrc" : "data"), fileID, error);
+ }
+
return error;
}
{
int error;
int lockflags;
+ u_int32_t oldStartBlock;
u_int32_t newStartBlock;
u_int32_t oldBlockCount;
u_int32_t newBlockCount;
printf("hfs_reclaim_journal_file: cat_idlookup returned %d\n", error);
goto free_fail;
}
+ oldStartBlock = journal_fork.cf_extents[0].startBlock;
journal_fork.cf_size = newBlockCount * hfsmp->blockSize;
journal_fork.cf_extents[0].startBlock = newStartBlock;
journal_fork.cf_extents[0].blockCount = newBlockCount;
printf("hfs_reclaim_journal_file: hfs_end_transaction returned %d\n", error);
}
+ if (!error && hfs_resize_debug) {
+ printf ("hfs_reclaim_journal_file: Successfully relocated journal from (%u,%u) to (%u,%u)\n", oldStartBlock, oldBlockCount, newStartBlock, newBlockCount);
+ }
return error;
free_fail:
fail:
hfs_systemfile_unlock(hfsmp, lockflags);
(void) hfs_end_transaction(hfsmp);
+ if (hfs_resize_debug) {
+ printf ("hfs_reclaim_journal_file: Error relocating journal file (error=%d)\n", error);
+ }
return error;
}
{
int error;
int lockflags;
+ u_int32_t oldBlock;
u_int32_t newBlock;
u_int32_t blockCount;
struct cat_desc jib_desc;
printf("hfs_reclaim_journal_file: cat_idlookup returned %d\n", error);
goto fail;
}
+ oldBlock = jib_fork.cf_extents[0].startBlock;
jib_fork.cf_size = hfsmp->blockSize;
jib_fork.cf_extents[0].startBlock = newBlock;
jib_fork.cf_extents[0].blockCount = 1;
if (error) {
printf("hfs_reclaim_journal_info_block: journal_flush returned %d\n", error);
}
+
+ if (!error && hfs_resize_debug) {
+ printf ("hfs_reclaim_journal_info_block: Successfully relocated journal info block from (%u,%u) to (%u,%u)\n", oldBlock, blockCount, newBlock, blockCount);
+ }
return error;
free_fail:
fail:
hfs_systemfile_unlock(hfsmp, lockflags);
(void) hfs_end_transaction(hfsmp);
+ if (hfs_resize_debug) {
+ printf ("hfs_reclaim_journal_info_block: Error relocating journal info block (error=%d)\n", error);
+ }
return error;
}
/*
* Reclaim space at the end of a file system.
+ *
+ * Inputs -
+ * startblk - start block of the space being reclaimed
+ * reclaimblks - number of allocation blocks to reclaim
*/
static int
hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimblks, vfs_context_t context)
int filecnt = 0;
int maxfilecnt;
u_int32_t block;
- u_int32_t datablks;
- u_int32_t rsrcblks;
- u_int32_t blkstomove = 0;
int lockflags;
- int i;
+ int i, j;
int error;
int lastprogress = 0;
- Boolean system_file_moved = false;
+ u_int32_t blks_moved = 0;
+ u_int32_t total_blks_moved = 0;
+ Boolean need_relocate;
/* Relocate extents of the Allocation file if they're in the way. */
- error = hfs_reclaim_sys_file(hfsmp, hfsmp->hfs_allocation_vp, startblk, SFL_BITMAP, &system_file_moved, context);
+ error = hfs_reclaim_file(hfsmp, hfsmp->hfs_allocation_vp, startblk, SFL_BITMAP, &blks_moved, context);
if (error) {
printf("hfs_reclaimspace: reclaim allocation file returned %d\n", error);
return error;
}
+ total_blks_moved += blks_moved;
+
/* Relocate extents of the Extents B-tree if they're in the way. */
- error = hfs_reclaim_sys_file(hfsmp, hfsmp->hfs_extents_vp, startblk, SFL_EXTENTS, &system_file_moved, context);
+ error = hfs_reclaim_file(hfsmp, hfsmp->hfs_extents_vp, startblk, SFL_EXTENTS, &blks_moved, context);
if (error) {
printf("hfs_reclaimspace: reclaim extents b-tree returned %d\n", error);
return error;
}
+ total_blks_moved += blks_moved;
+
/* Relocate extents of the Catalog B-tree if they're in the way. */
- error = hfs_reclaim_sys_file(hfsmp, hfsmp->hfs_catalog_vp, startblk, SFL_CATALOG, &system_file_moved, context);
+ error = hfs_reclaim_file(hfsmp, hfsmp->hfs_catalog_vp, startblk, SFL_CATALOG, &blks_moved, context);
if (error) {
printf("hfs_reclaimspace: reclaim catalog b-tree returned %d\n", error);
return error;
}
+ total_blks_moved += blks_moved;
+
/* Relocate extents of the Attributes B-tree if they're in the way. */
- error = hfs_reclaim_sys_file(hfsmp, hfsmp->hfs_attribute_vp, startblk, SFL_ATTRIBUTE, &system_file_moved, context);
+ error = hfs_reclaim_file(hfsmp, hfsmp->hfs_attribute_vp, startblk, SFL_ATTRIBUTE, &blks_moved, context);
if (error) {
printf("hfs_reclaimspace: reclaim attribute b-tree returned %d\n", error);
return error;
}
+ total_blks_moved += blks_moved;
+
/* Relocate extents of the Startup File if there is one and they're in the way. */
- error = hfs_reclaim_sys_file(hfsmp, hfsmp->hfs_startup_vp, startblk, SFL_STARTUP, &system_file_moved, context);
+ error = hfs_reclaim_file(hfsmp, hfsmp->hfs_startup_vp, startblk, SFL_STARTUP, &blks_moved, context);
if (error) {
printf("hfs_reclaimspace: reclaim startup file returned %d\n", error);
return error;
}
+ total_blks_moved += blks_moved;
/*
	 * We need to make sure the alternate volume header gets flushed if we moved
	 * any extents in the volume header.  But we need to do that before
	 * shrinking the size of the volume, or else the journal code will panic
* with an invalid (too large) block number.
*
- * Note that system_file_moved will be set if ANY extent was moved, even
+	 * Note that total_blks_moved will be non-zero if ANY extent was moved, even
* if it was just an overflow extent. In this case, the journal_flush isn't
* strictly required, but shouldn't hurt.
*/
- if (system_file_moved)
+ if (total_blks_moved) {
hfs_journal_flush(hfsmp);
+ }
if (hfsmp->jnl_start + (hfsmp->jnl_size / hfsmp->blockSize) > startblk) {
error = hfs_reclaim_journal_file(hfsmp, context);
}
saved_next_allocation = hfsmp->nextAllocation;
+ /* Always try allocating new blocks after the metadata zone */
HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_start);
fcb = VTOF(hfsmp->hfs_catalog_vp);
}
/*
* Iterate over all the catalog records looking for files
- * that overlap into the space we're trying to free up.
+ * that overlap into the space we're trying to free up and
+ * the total number of blocks that will require relocation.
*/
for (filecnt = 0; filecnt < maxfilecnt; ) {
error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
if (filerec.recordType != kHFSPlusFileRecord) {
continue;
}
- datablks = rsrcblks = 0;
- /*
- * Check if either fork overlaps target space.
- */
+
+ need_relocate = false;
+ /* Check if data fork overlaps the target space */
for (i = 0; i < kHFSPlusExtentDensity; ++i) {
- if (filerec.dataFork.extents[i].blockCount != 0) {
- datablks += filerec.dataFork.extents[i].blockCount;
- block = filerec.dataFork.extents[i].startBlock +
- filerec.dataFork.extents[i].blockCount;
- if (block >= startblk) {
- if ((filerec.fileID == hfsmp->hfs_jnlfileid) ||
- (filerec.fileID == hfsmp->hfs_jnlinfoblkid)) {
- printf("hfs_reclaimspace: cannot move active journal\n");
- error = EPERM;
- goto end_iteration;
- }
- cnidbufp[filecnt++] = filerec.fileID;
- blkstomove += filerec.dataFork.totalBlocks;
- break;
- }
+ if (filerec.dataFork.extents[i].blockCount == 0) {
+ break;
}
- if (filerec.resourceFork.extents[i].blockCount != 0) {
- rsrcblks += filerec.resourceFork.extents[i].blockCount;
- block = filerec.resourceFork.extents[i].startBlock +
- filerec.resourceFork.extents[i].blockCount;
- if (block >= startblk) {
- cnidbufp[filecnt++] = filerec.fileID;
- blkstomove += filerec.resourceFork.totalBlocks;
- break;
+ block = filerec.dataFork.extents[i].startBlock +
+ filerec.dataFork.extents[i].blockCount;
+ if (block >= startblk) {
+ if ((filerec.fileID == hfsmp->hfs_jnlfileid) ||
+ (filerec.fileID == hfsmp->hfs_jnlinfoblkid)) {
+ printf("hfs_reclaimspace: cannot move active journal\n");
+ error = EPERM;
+ goto end_iteration;
}
+ need_relocate = true;
+ goto save_fileid;
}
}
- /*
- * Check for any overflow extents that overlap.
- */
- if (i == kHFSPlusExtentDensity) {
- if (filerec.dataFork.totalBlocks > datablks) {
- if (hfs_overlapped_overflow_extents(hfsmp, startblk, datablks, filerec.fileID, 0)) {
- cnidbufp[filecnt++] = filerec.fileID;
- blkstomove += filerec.dataFork.totalBlocks;
- }
- } else if (filerec.resourceFork.totalBlocks > rsrcblks) {
- if (hfs_overlapped_overflow_extents(hfsmp, startblk, rsrcblks, filerec.fileID, 1)) {
- cnidbufp[filecnt++] = filerec.fileID;
- blkstomove += filerec.resourceFork.totalBlocks;
- }
+
+ /* Check if resource fork overlaps the target space */
+ for (j = 0; j < kHFSPlusExtentDensity; ++j) {
+ if (filerec.resourceFork.extents[j].blockCount == 0) {
+ break;
+ }
+ block = filerec.resourceFork.extents[j].startBlock +
+ filerec.resourceFork.extents[j].blockCount;
+ if (block >= startblk) {
+ need_relocate = true;
+ goto save_fileid;
+ }
+ }
+
+ /* Check if any forks' overflow extents overlap the target space */
+ if ((i == kHFSPlusExtentDensity) || (j == kHFSPlusExtentDensity)) {
+ if (hfs_overlapped_overflow_extents(hfsmp, startblk, filerec.fileID)) {
+ need_relocate = true;
+ goto save_fileid;
+ }
+ }
+
+save_fileid:
+ if (need_relocate == true) {
+ cnidbufp[filecnt++] = filerec.fileID;
+ if (hfs_resize_debug) {
+ printf ("hfs_reclaimspace: Will relocate extents for fileID=%u\n", filerec.fileID);
}
}
}
end_iteration:
- if (filecnt == 0 && !system_file_moved) {
+ /* If no regular file was found to be relocated and
+ * no system file was moved, we probably do not have
+ * enough space to relocate the system files, or
+ * something else went wrong.
+ */
+ if ((filecnt == 0) && (total_blks_moved == 0)) {
printf("hfs_reclaimspace: no files moved\n");
error = ENOSPC;
}
if (error || filecnt == 0)
goto out;
- /*
- * Double check space requirements to make sure
- * there is enough space to relocate any files
- * that reside in the reclaim area.
- *
- * Blocks To Move --------------
- * | | |
- * V V V
- * ------------------------------------------------------------------------
- * | | / /// // |
- * | | / /// // |
- * | | / /// // |
- * ------------------------------------------------------------------------
- *
- * <------------------- New Total Blocks ------------------><-- Reclaim -->
- *
- * <------------------------ Original Total Blocks ----------------------->
- *
- */
- if (blkstomove >= hfs_freeblks(hfsmp, 1)) {
- printf("hfs_truncatefs: insufficient space (need %u blocks; have %u blocks)\n", blkstomove, hfs_freeblks(hfsmp, 1));
- error = ENOSPC;
- goto out;
- }
hfsmp->hfs_resize_filesmoved = 0;
hfsmp->hfs_resize_totalfiles = filecnt;
/* Now move any files that are in the way. */
for (i = 0; i < filecnt; ++i) {
- struct vnode * rvp;
- struct cnode * cp;
+ struct vnode *rvp;
+ struct cnode *cp;
+ struct filefork *datafork;
if (hfs_vget(hfsmp, cnidbufp[i], &vp, 0) != 0)
continue;
+
+ cp = VTOC(vp);
+ datafork = VTOF(vp);
- /* Relocating directory hard links is not supported, so we
- * punt (see radar 6217026). */
- cp = VTOC(vp);
- if ((cp->c_flag & C_HARDLINK) && vnode_isdir(vp)) {
- printf("hfs_reclaimspace: unable to relocate directory hard link %d\n", cp->c_cnid);
- error = EINVAL;
- goto out;
- }
-
- /* Relocate any data fork blocks. */
- if (VTOF(vp) && VTOF(vp)->ff_blocks > 0) {
- error = hfs_relocate(vp, hfsmp->hfs_metazone_end + 1, kauth_cred_get(), current_proc());
+ /* Relocating directory hard links is not supported, so we punt (see radar 6217026). */
+ if ((cp->c_flag & C_HARDLINK) && vnode_isdir(vp)) {
+ printf("hfs_reclaimspace: Unable to relocate directory hard link id=%d\n", cp->c_cnid);
+ error = EINVAL;
+ goto out;
}
- if (error)
- break;
- /* Relocate any resource fork blocks. */
- if ((cp->c_blocks - (VTOF(vp) ? VTOF((vp))->ff_blocks : 0)) > 0) {
- error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE);
- if (error)
+ /* Relocate any overlapping data fork blocks. */
+ if (datafork && datafork->ff_blocks > 0) {
+ error = hfs_reclaim_file(hfsmp, vp, startblk, 0, &blks_moved, context);
+ if (error) {
+ printf ("hfs_reclaimspace: Error reclaiming datafork blocks of fileid=%u (error=%d)\n", cnidbufp[i], error);
break;
- error = hfs_relocate(rvp, hfsmp->hfs_metazone_end + 1, kauth_cred_get(), current_proc());
+ }
+ total_blks_moved += blks_moved;
+ }
+
+ /* Relocate any overlapping resource fork blocks. */
+ if ((cp->c_blocks - (datafork ? datafork->ff_blocks : 0)) > 0) {
+ error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, TRUE);
+ if (error) {
+ printf ("hfs_reclaimspace: Error looking up rvp for fileid=%u (error=%d)\n", cnidbufp[i], error);
+ break;
+ }
+ error = hfs_reclaim_file(hfsmp, rvp, startblk, 0, &blks_moved, context);
VTOC(rvp)->c_flag |= C_NEED_RVNODE_PUT;
- if (error)
+ if (error) {
+ printf ("hfs_reclaimspace: Error reclaiming rsrcfork blocks of fileid=%u (error=%d)\n", cnidbufp[i], error);
break;
+ }
+ total_blks_moved += blks_moved;
}
hfs_unlock(cp);
vnode_put(vp);
vp = NULL;
}
if (hfsmp->hfs_resize_filesmoved != 0) {
- printf("hfs_reclaimspace: relocated %d files on \"%s\"\n",
- (int)hfsmp->hfs_resize_filesmoved, hfsmp->vcbVN);
+ printf("hfs_reclaimspace: relocated %u blocks from %d files on \"%s\"\n",
+ total_blks_moved, (int)hfsmp->hfs_resize_filesmoved, hfsmp->vcbVN);
}
out:
kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
/*
- * Check if there are any overflow extents that overlap.
+ * Check if there are any overflow data or resource fork extents that overlap
+ * into the disk space that is being reclaimed.
+ *
+ * Output -
+ *	1 - One of the overflow extents needs to be relocated
+ * 0 - No overflow extents need to be relocated, or there was an error
*/
static int
-hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t catblks, u_int32_t fileID, int rsrcfork)
+hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t fileID)
{
struct BTreeIterator * iterator = NULL;
struct FSBufferDescriptor btdata;
HFSPlusExtentRecord extrec;
HFSPlusExtentKey *extkeyptr;
FCB *fcb;
- u_int32_t block;
- u_int8_t forktype;
int overlapped = 0;
int i;
int error;
- forktype = rsrcfork ? 0xFF : 0;
if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
- return (0);
+ return 0;
}
bzero(iterator, sizeof(*iterator));
extkeyptr = (HFSPlusExtentKey *)&iterator->key;
extkeyptr->keyLength = kHFSPlusExtentKeyMaximumLength;
- extkeyptr->forkType = forktype;
+ extkeyptr->forkType = 0;
extkeyptr->fileID = fileID;
- extkeyptr->startBlock = catblks;
+ extkeyptr->startBlock = 0;
btdata.bufferAddress = &extrec;
btdata.itemSize = sizeof(extrec);
fcb = VTOF(hfsmp->hfs_extents_vp);
+ /* This will position the iterator just before the first overflow
+	 * extent record for the given fileID. It will always return btNotFound,
+ * so we special case the error code.
+ */
error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator);
+ if (error && (error != btNotFound)) {
+ goto out;
+ }
+
+	/* BTIterateRecord() might return an error if the btree is empty, and
+	 * in that case we tell the caller that no overflow extents overlap.
+ */
+ error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
while (error == 0) {
/* Stop when we encounter a different file. */
- if ((extkeyptr->fileID != fileID) ||
- (extkeyptr->forkType != forktype)) {
+ if (extkeyptr->fileID != fileID) {
break;
}
- /*
- * Check if the file overlaps target space.
- */
+ /* Check if any of the forks exist in the target space. */
for (i = 0; i < kHFSPlusExtentDensity; ++i) {
if (extrec[i].blockCount == 0) {
break;
}
- block = extrec[i].startBlock + extrec[i].blockCount;
- if (block >= startblk) {
+ if ((extrec[i].startBlock + extrec[i].blockCount) >= startblk) {
overlapped = 1;
- break;
+ goto out;
}
}
/* Look for more records. */
error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
}
+out:
kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
- return (overlapped);
+ return overlapped;
}
/*
- * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
/*
* We don't bother taking the mount lock
* to look at these values since the values
- * themselves are each updated automically
+ * themselves are each updated atomically
* on aligned addresses.
*/
freeblks = hfsmp->freeBlocks;
if (cp->c_blocks - VTOF(vp)->ff_blocks) {
/* We deal with rsrc fork vnode iocount at the end of the function */
- error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE);
+ error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, TRUE);
if (error) {
+ /*
+ * hfs_vgetrsrc may have returned a vnode in rvp even though
+ * we got an error, because we specified error_on_unlinked.
+ * We need to drop the iocount after we release the cnode lock, so
+ * it will be taken care of at the end of the function if it's needed.
+ */
goto out;
}
if ((error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK))) {
return (error);
}
-
- error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE);
+ error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, TRUE);
hfs_unlock(cp);
if (error) {
- return (error);
+ /* We may have gotten a rsrc vp out even though we got an error back. */
+ if (rvp) {
+ vnode_put(rvp);
+ rvp = NULL;
+ }
+ return error;
}
drop_rsrc_vnode = 1;
}
if (error && error != ENXIO && error != ENOENT && truncated) {
if ((cp->c_datafork && cp->c_datafork->ff_size != 0) ||
(cp->c_rsrcfork && cp->c_rsrcfork->ff_size != 0)) {
+ off_t data_size = 0;
+ off_t rsrc_size = 0;
+ if (cp->c_datafork) {
+ data_size = cp->c_datafork->ff_size;
+ }
+ if (cp->c_rsrcfork) {
+ rsrc_size = cp->c_rsrcfork->ff_size;
+ }
printf("hfs: remove: couldn't delete a truncated file (%s)"
"(error %d, data sz %lld; rsrc sz %lld)",
- cp->c_desc.cd_nameptr, error, cp->c_datafork->ff_size,
- cp->c_rsrcfork->ff_size);
+ cp->c_desc.cd_nameptr, error, data_size, rsrc_size);
hfs_mark_volume_inconsistent(hfsmp);
} else {
printf("hfs: remove: strangely enough, deleting truncated file %s (%d) got err %d\n",
if ((error = hfs_lock (VTOC(fvp), HFS_EXCLUSIVE_LOCK))) {
return (error);
}
-
- error = hfs_vgetrsrc(VTOHFS(fvp), fvp, &fvp_rsrc, TRUE);
+
+ /*
+ * We care if we race against rename/delete with this cnode, so we'll
+ * error out if this file becomes open-unlinked during this call.
+ */
+ error = hfs_vgetrsrc(VTOHFS(fvp), fvp, &fvp_rsrc, TRUE, TRUE);
hfs_unlock (VTOC(fvp));
if (error) {
+ if (fvp_rsrc) {
+ vnode_put (fvp_rsrc);
+ }
return error;
}
}
* grab the resource fork if the lock succeeded.
*/
if (hfs_lock (VTOC(tvp), HFS_EXCLUSIVE_LOCK) == 0) {
- error = hfs_vgetrsrc(VTOHFS(tvp), tvp, &tvp_rsrc, TRUE);
- hfs_unlock (VTOC(tvp));
+ tcp = VTOC(tvp);
+
+ /*
+ * We only care if we get an open-unlinked file on the dst so we
+ * know to null out tvp/tcp to make the rename operation act
+ * as if they never existed. Because they're effectively out of the
+ * namespace already it's fine to do this. If this is true, then
+ * make sure to unlock the cnode and drop the iocount only after the unlock.
+ */
+ error = hfs_vgetrsrc(VTOHFS(tvp), tvp, &tvp_rsrc, TRUE, TRUE);
+ hfs_unlock (tcp);
if (error) {
- if (fvp_rsrc) {
- vnode_put (fvp_rsrc);
+ /*
+ * Since we specify TRUE for error-on-unlinked in hfs_vgetrsrc,
+ * we can get a rsrc fork vp even if it returns an error.
+ */
+ tcp = NULL;
+ tvp = NULL;
+ if (tvp_rsrc) {
+ vnode_put (tvp_rsrc);
+ tvp_rsrc = NULLVP;
}
- return error;
+ /* just bypass truncate lock and act as if we never got tcp/tvp */
+ goto retry;
}
}
}
}
-/*
- * Return a referenced vnode for the resource fork
- *
- * cnode for vnode vp must already be locked.
- *
- * can_drop_lock is true if its safe to temporarily drop/re-acquire the cnode lock
+
+/* hfs_vgetrsrc acquires a resource fork vnode corresponding to the cnode that is
+ * found in 'vp'. The rsrc fork vnode is returned with the cnode locked and an iocount
+ * held on the rsrc vnode.
+ *
+ * *rvpp is an output argument for returning the pointer to the resource fork vnode.
+ * In most cases, the resource fork vnode will not be set if we return an error.
+ * However, if error_on_unlinked is set, we may have already acquired the resource fork vnode
+ * before we discover the error (the file has gone open-unlinked). In this case only,
+ * we may return a vnode in the output argument despite an error.
+ *
+ * If can_drop_lock is set, then it is safe for this function to temporarily drop
+ * and then re-acquire the cnode lock. We may need to do this, for example, in order to
+ * acquire an iocount or promote our lock.
+ *
+ * error_on_unlinked is an argument which indicates that we are to return an error if we
+ * discover that the cnode has gone into an open-unlinked state (C_DELETED or C_NOEXISTS
+ * is set in the cnode flags). This is only necessary if can_drop_lock is true; otherwise
+ * there's really no reason to double-check for errors on the cnode.
*/
+
__private_extern__
int
-hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, struct vnode **rvpp, int can_drop_lock)
+hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp,
+ struct vnode **rvpp, int can_drop_lock, int error_on_unlinked)
{
struct vnode *rvp;
struct vnode *dvp = NULLVP;
struct cnode *cp = VTOC(vp);
int error;
int vid;
+ int delete_status = 0;
+
+
+ /*
+ * Need to check the status of the cnode to validate it hasn't
+ * gone open-unlinked on us before we can actually do work with it.
+ */
+ delete_status = hfs_checkdeleted (cp);
+ if ((delete_status) && (error_on_unlinked)) {
+ return delete_status;
+ }
restart:
	/* Attempt to use existing vnode */
if (can_drop_lock) {
(void) hfs_lock(cp, HFS_FORCE_LOCK);
+
+ /*
+ * When we relinquished our cnode lock, the cnode could have raced
+ * with a delete and gotten deleted. If the caller did not want
+ * us to ignore open-unlinked files, then re-check the C_DELETED
+ * state and see if we need to return an ENOENT here because the item
+ * got deleted in the intervening time.
+ */
+ if (error_on_unlinked) {
+ if ((delete_status = hfs_checkdeleted(cp))) {
+ /*
+ * If error == 0, this means that we succeeded in acquiring an iocount on the
+ * rsrc fork vnode. However, if we're in this block of code, that
+ * means that we noticed that the cnode has gone open-unlinked. In
+ * this case, the caller requested that we not do any other work and
+ * return an errno. The caller will be responsible for dropping the
+ * iocount we just acquired because we can't do it until we've released
+ * the cnode lock.
+ */
+ if (error == 0) {
+ *rvpp = rvp;
+ }
+ return delete_status;
+ }
+ }
+
/*
* When our lock was relinquished, the resource fork
* could have been recycled. Check for this and try
return (EINVAL);
}
/*
- * If the upgrade fails we loose the lock and
+ * If the upgrade fails we lose the lock and
* have to take the exclusive lock on our own.
*/
if (lck_rw_lock_shared_to_exclusive(&cp->c_rwlock) == FALSE)
* C_DELETED. This is because we need to continue to provide rsrc
* fork access to open-unlinked files. In this case, build a fake descriptor
* like in hfs_removefile. If we don't do this, buildkey will fail in
- * cat_lookup because this cnode has no name in its descriptor.
+ * cat_lookup because this cnode has no name in its descriptor. However,
+ * only do this if the caller did not specify that they wanted us to
+ * error out upon encountering open-unlinked files.
*/
+ if ((error_on_unlinked) && (can_drop_lock)) {
+ if ((error = hfs_checkdeleted (cp))) {
+ return error;
+ }
+ }
+
if ((cp->c_flag & C_DELETED ) && (cp->c_desc.cd_namelen == 0)) {
bzero (&to_desc, sizeof(to_desc));
bzero (delname, 32);
hfs_unlock(cp);
return (ENOATTR);
}
- error = hfs_vgetrsrc(VTOHFS(vp), vp, svpp, TRUE);
+ error = hfs_vgetrsrc(VTOHFS(vp), vp, svpp, TRUE, FALSE);
hfs_unlock(cp);
return (error);
if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) {
return (error);
}
- error = hfs_vgetrsrc(VTOHFS(vp), vp, svpp, TRUE);
+ error = hfs_vgetrsrc(VTOHFS(vp), vp, svpp, TRUE, FALSE);
hfs_unlock(cp);
return (error);
openunlinked = 1;
}
- result = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE);
+ result = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, FALSE);
hfs_unlock(cp);
if (result) {
return (result);
openunlinked = 1;
}
- result = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE);
+ result = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, FALSE);
hfs_unlock(cp);
if (result) {
return (result);
hfs_unlock(cp);
return (ENOATTR);
}
- result = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE);
+ result = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, FALSE);
hfs_unlock(cp);
if (result) {
return (result);
break;
}
(void)BlockDeallocate(hfsmp, extents[i].startBlock, extents[i].blockCount);
+ remblks -= extents[i].blockCount;
extents[i].startBlock = 0;
extents[i].blockCount = 0;
- remblks -= extents[i].blockCount;
#if HFS_XATTR_VERBOSE
printf("hfs: free_attr_blks: BlockDeallocate [%d, %d]\n",
goto ErrorExit;
}
+ if (buf_flags(bp) & B_LOCKED) {
+ /*
+ * This node is already part of a transaction and will be
+ * written when the transaction is committed so don't write it here.
+ * If we did, then we'd hit a panic in hfs_vnop_bwrite since
+ * B_LOCKED is still set
+ */
+ buf_brelse(bp);
+ continue;
+ }
+
+
buf_clear(bp);
buf_markaged(bp);
error = vfs_mountroot();
if (error == 0 && rootvnode != NULL) {
- struct vnode *tvp;
- struct vnode *newdp;
+ vnode_t newdp, old_rootvnode;
+ mount_t new_rootfs, old_rootfs;
/*
* Get the vnode for '/'.
if (VFS_ROOT(TAILQ_LAST(&mountlist,mntlist), &newdp, vfs_context_kernel()))
panic("%s: cannot find root vnode", __FUNCTION__);
+ old_rootvnode = rootvnode;
+ old_rootfs = rootvnode->v_mount;
+
+ mount_list_remove(old_rootfs);
+
+ mount_lock(old_rootfs);
+#ifdef CONFIG_IMGSRC_ACCESS
+ old_rootfs->mnt_kern_flag |= MNTK_BACKS_ROOT;
+#endif /* CONFIG_IMGSRC_ACCESS */
+ old_rootfs->mnt_flag &= ~MNT_ROOTFS;
+ mount_unlock(old_rootfs);
+
+ rootvnode = newdp;
+
+ new_rootfs = rootvnode->v_mount;
+ mount_lock(new_rootfs);
+ new_rootfs->mnt_flag |= MNT_ROOTFS;
+ mount_unlock(new_rootfs);
+
vnode_ref(newdp);
vnode_put(newdp);
- tvp = rootvnode;
- vnode_rele(tvp);
filedesc0.fd_cdir = newdp;
- rootvnode = newdp;
- mount_list_lock();
- TAILQ_REMOVE(&mountlist, TAILQ_FIRST(&mountlist), mnt_list);
- mount_list_unlock();
- mountlist.tqh_first->mnt_flag |= MNT_ROOTFS;
DBG_TRACE("%s: root switched\n", __FUNCTION__);
+
+#ifdef CONFIG_IMGSRC_ACCESS
+ if (PE_imgsrc_mount_supported()) {
+ imgsrc_rootvnode = old_rootvnode;
+ } else {
+ vnode_getalways(old_rootvnode);
+ vnode_rele(old_rootvnode);
+ vnode_put(old_rootvnode);
+ }
+#else
+ vnode_getalways(old_rootvnode);
+ vnode_rele(old_rootvnode);
+ vnode_put(old_rootvnode);
+#endif /* CONFIG_IMGSRC_ACCESS */
+
+
}
done:
FREE_ZONE(root_path, MAXPATHLEN, M_NAMEI);
void *stackshot_snapbuf = NULL;
int
-stack_snapshot2(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t options, int32_t *retval);
+stack_snapshot2(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, uint32_t dispatch_offset, int32_t *retval);
extern void
-kdp_snapshot_preflight(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t options);
+kdp_snapshot_preflight(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t flags, uint32_t dispatch_offset);
extern int
kdp_stack_snapshot_geterror(void);
return(error);
return stack_snapshot2(uap->pid, uap->tracebuf, uap->tracebuf_size,
- uap->options, retval);
+ uap->flags, uap->dispatch_offset, retval);
}
int
-stack_snapshot2(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t options, int32_t *retval)
+stack_snapshot2(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, uint32_t dispatch_offset, int32_t *retval)
{
int error = 0;
unsigned bytesTraced = 0;
goto error_exit;
}
/* Preload trace parameters*/
- kdp_snapshot_preflight(pid, stackshot_snapbuf, tracebuf_size, options);
+ kdp_snapshot_preflight(pid, stackshot_snapbuf, tracebuf_size, flags, dispatch_offset);
/* Trap to the debugger to obtain a coherent stack snapshot; this populates
* the trace buffer
knote_enqueue(kn);
}
+ /*
+ * The user may change some filter values after the
+ * initial EV_ADD, but doing so will not reset any
+ * filters that have already been triggered.
+ */
+ kn->kn_kevent.udata = kev->udata;
+ if (fops->f_isfd || fops->f_touch == NULL) {
+ kn->kn_sfflags = kev->fflags;
+ kn->kn_sdata = kev->data;
+ }
+
/*
* If somebody is in the middle of dropping this
* knote - go find/insert a new one. But we have
}
/*
- * The user may change some filter values after the
- * initial EV_ADD, but doing so will not reset any
- * filter which have already been triggered.
+ * Call touch routine to notify filter of changes
+ * in filter values.
*/
- kn->kn_kevent.udata = kev->udata;
if (!fops->f_isfd && fops->f_touch != NULL)
fops->f_touch(kn, kev, EVENT_REGISTER);
- else {
- kn->kn_sfflags = kev->fflags;
- kn->kn_sdata = kev->data;
- }
/* We may need to push some info down to a networked filesystem */
if (kn->kn_filter == EVFILT_VNODE) {
}
/* capture the kevent data - using touch if specified */
- if (result) {
- if (touch) {
- kn->kn_fop->f_touch(kn, &kev, EVENT_PROCESS);
- } else {
- kev = kn->kn_kevent;
- }
+ if (result && touch) {
+ kn->kn_fop->f_touch(kn, &kev, EVENT_PROCESS);
}
+
/* convert back to a kqlock - bail if the knote went away */
if (!knoteuse2kqlock(kq, kn)) {
return EJUSTRETURN;
if (!(kn->kn_status & KN_ACTIVE)) {
knote_activate(kn, 0);
}
+
+ /* capture all events that occurred during filter */
+ if (!touch) {
+ kev = kn->kn_kevent;
+ }
+
} else if ((kn->kn_status & KN_STAYQUEUED) == 0) {
/* was already dequeued, so just bail on this one */
return EJUSTRETURN;
if (result == 0) {
return EJUSTRETURN;
- } else if (kn->kn_flags & EV_ONESHOT) {
+ } else if ((kn->kn_flags & EV_ONESHOT) != 0) {
knote_deactivate(kn);
if (kqlock2knotedrop(kq, kn)) {
kn->kn_fop->f_detach(kn);
knote_drop(kn, p);
}
- } else if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
- knote_deactivate(kn);
- /* manually clear knotes who weren't 'touch'ed */
- if ((touch == 0) && (kn->kn_flags & EV_CLEAR)) {
+ } else if ((kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) != 0) {
+ if ((kn->kn_flags & EV_DISPATCH) != 0) {
+ /* deactivate and disable all dispatch knotes */
+ knote_deactivate(kn);
+ kn->kn_status |= KN_DISABLED;
+ } else if (!touch || kn->kn_fflags == 0) {
+ /* only deactivate if nothing since the touch */
+ knote_deactivate(kn);
+ }
+ if (!touch && (kn->kn_flags & EV_CLEAR) != 0) {
+ /* manually clear non-touch knotes */
kn->kn_data = 0;
kn->kn_fflags = 0;
}
- if (kn->kn_flags & EV_DISPATCH)
- kn->kn_status |= KN_DISABLED;
kqunlock(kq);
} else {
/*
#if DEBUG
printf("set jetsam priority pids = { ");
for (i = 0; i < jetsam_priority_list_count; i++) {
- printf("%d ", temp_list[i].pid);
+ printf("(%d, 0x%08x, %d) ", temp_list[i].pid, temp_list[i].flags, temp_list[i].hiwat_pages);
}
printf("}\n");
#endif /* DEBUG */
for (i = jetsam_priority_list_count; i < kMaxPriorityEntries; i++) {
jetsam_priority_list[i].pid = 0;
jetsam_priority_list[i].flags = 0;
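+ /* reset the high-water-mark fields of unused entries */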
+ jetsam_priority_list[i].hiwat_pages = -1;
+ jetsam_priority_list[i].hiwat_reserved1 = -1;
+ jetsam_priority_list[i].hiwat_reserved2 = -1;
+ jetsam_priority_list[i].hiwat_reserved3 = -1;
}
jetsam_priority_list_index = 0;
lck_mtx_unlock(jetsam_list_mlock);
user_addr = (mach_vm_offset_t) uap->addr;
user_size = (mach_vm_size_t) uap->len;
- prot = (vm_prot_t)(uap->prot & VM_PROT_ALL);
+ prot = (vm_prot_t)(uap->prot & (VM_PROT_ALL | VM_PROT_TRUSTED));
if (user_addr & PAGE_MASK_64) {
/* UNIX SPEC: user address is not page-aligned, return EINVAL */
if (error)
return (error);
#endif
+
+ if (prot & VM_PROT_TRUSTED) {
+#if CONFIG_DYNAMIC_CODE_SIGNING
+ /* CODE SIGNING ENFORCEMENT - JIT support */
+ /* The special protection value VM_PROT_TRUSTED requests that we treat
+ * this page as if it had a valid code signature.
+ * If this is enabled, there MUST be a MAC policy implementing the
+ * mac_proc_check_mprotect() hook above. Otherwise, code signing will be
+ * compromised because the check would always succeed and thus any
+ * process could sign dynamically. */
+ result = vm_map_sign(user_map,
+ vm_map_trunc_page(user_addr),
+ vm_map_round_page(user_addr+user_size));
+ switch (result) {
+ case KERN_SUCCESS:
+ break;
+ case KERN_INVALID_ADDRESS:
+ /* UNIX SPEC: for an invalid address range, return ENOMEM */
+ return ENOMEM;
+ default:
+ return EINVAL;
+ }
+#else
+ return ENOTSUP;
+#endif
+ }
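+ /* VM_PROT_TRUSTED was handled above; strip it before the real protection change */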
+ prot &= ~VM_PROT_TRUSTED;
+
result = mach_vm_protect(user_map, user_addr, user_size,
FALSE, prot);
switch (result) {
int donice(struct proc *curp, struct proc *chgp, int n);
int dosetrlimit(struct proc *p, u_int which, struct rlimit *limp);
+static void do_background_socket(struct proc *curp, thread_t thread, int priority);
static int do_background_thread(struct proc *curp, int priority);
+static int do_background_task(struct proc *curp, int priority);
rlim_t maxdmap = MAXDSIZ; /* XXX */
rlim_t maxsmap = MAXSSIZ - PAGE_SIZE; /* XXX */
return (EINVAL);
}
error = do_background_thread(curp, uap->prio);
+ (void) do_background_socket(curp, current_thread(), uap->prio);
found++;
break;
}
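+ /* set or clear background status for an entire process */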
+ case PRIO_DARWIN_PROCESS: {
+ if (uap->who == 0)
+ p = curp;
+ else {
+ p = proc_find(uap->who);
+ if (p == 0)
+ break;
+ refheld = 1;
+ }
+
+ error = do_background_task(p, uap->prio);
+ (void) do_background_socket(p, NULL, uap->prio);
+
+ proc_lock(p);
+ p->p_iopol_disk = (uap->prio == PRIO_DARWIN_BG ?
+ IOPOL_THROTTLE : IOPOL_DEFAULT);
+ proc_unlock(p);
+
+ found++;
+ if (refheld != 0)
+ proc_rele(p);
+ break;
+ }
+
default:
return (EINVAL);
}
return (error);
}
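+/*
+ * do_background_task
+ * Apply a task-wide category policy: throttle the task when PRIO_DARWIN_BG
+ * is requested, otherwise restore the default application role.
+ */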
+static int
+do_background_task(struct proc *p, int priority)
+{
+ int error = 0;
+ task_category_policy_data_t info;
+
+ if (priority & PRIO_DARWIN_BG) {
+ info.role = TASK_THROTTLE_APPLICATION;
+ } else {
+ info.role = TASK_DEFAULT_APPLICATION;
+ }
+
+ error = task_policy_set(p->task,
+ TASK_CATEGORY_POLICY,
+ (task_policy_t) &info,
+ TASK_CATEGORY_POLICY_COUNT);
+ return (error);
+}
+
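+/*
+ * do_background_socket
+ * Mark or unmark this process's sockets for background traffic management.
+ * When enabling, the flag is applied process-wide (only if no thread is
+ * specified); when disabling, a non-NULL thread restricts the clear to
+ * sockets associated with that background thread.
+ */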
+static void
+do_background_socket(struct proc *curp, thread_t thread, int priority)
+{
+ struct filedesc *fdp;
+ struct fileproc *fp;
+ int i;
+
+ if (priority & PRIO_DARWIN_BG) {
+ /* enable network throttle process-wide (if no thread is specified) */
+ if (thread == NULL) {
+ proc_fdlock(curp);
+ fdp = curp->p_fd;
+
+ for (i = 0; i < fdp->fd_nfiles; i++) {
+ struct socket *sockp;
+
+ fp = fdp->fd_ofiles[i];
+ if (fp == NULL || (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
+ fp->f_fglob->fg_type != DTYPE_SOCKET) {
+ continue;
+ }
+ sockp = (struct socket *)fp->f_fglob->fg_data;
+ sockp->so_traffic_mgt_flags |= TRAFFIC_MGT_SO_BACKGROUND;
+ sockp->so_background_thread = NULL;
+ }
+ proc_fdunlock(curp);
+ }
+
+ } else {
+ /* disable networking IO throttle.
+ * NOTE - It is a known limitation of the current design that we
+ * could potentially clear TRAFFIC_MGT_SO_BACKGROUND bit for
+ * sockets created by other threads within this process.
+ */
+ proc_fdlock(curp);
+ fdp = curp->p_fd;
+ for ( i = 0; i < fdp->fd_nfiles; i++ ) {
+ struct socket *sockp;
+
+ fp = fdp->fd_ofiles[ i ];
+ if ( fp == NULL || (fdp->fd_ofileflags[ i ] & UF_RESERVED) != 0 ||
+ fp->f_fglob->fg_type != DTYPE_SOCKET ) {
+ continue;
+ }
+ sockp = (struct socket *)fp->f_fglob->fg_data;
+ /* skip if only clearing this thread's sockets */
+ if ((thread) && (sockp->so_background_thread != thread)) {
+ continue;
+ }
+ sockp->so_traffic_mgt_flags &= ~TRAFFIC_MGT_SO_BACKGROUND;
+ sockp->so_background_thread = NULL;
+ }
+ proc_fdunlock(curp);
+ }
+}
+
+
/*
* do_background_thread
* Returns: 0 Success
* XXX - todo - does this need a MACF hook?
*/
static int
-do_background_thread(struct proc *curp, int priority)
+do_background_thread(struct proc *curp __unused, int priority)
{
- int i;
thread_t thread;
struct uthread *ut;
thread_precedence_policy_data_t policy;
- struct filedesc *fdp;
- struct fileproc *fp;
thread = current_thread();
ut = get_bsdthread_info(thread);
thread_policy_set( thread, THREAD_PRECEDENCE_POLICY,
(thread_policy_t)&policy,
THREAD_PRECEDENCE_POLICY_COUNT );
-
- /* disable networking IO throttle.
- * NOTE - It is a known limitation of the current design that we
- * could potentially clear TRAFFIC_MGT_SO_BACKGROUND bit for
- * sockets created by other threads within this process.
- */
- proc_fdlock(curp);
- fdp = curp->p_fd;
- for ( i = 0; i < fdp->fd_nfiles; i++ ) {
- struct socket *sockp;
-
- fp = fdp->fd_ofiles[ i ];
- if ( fp == NULL || (fdp->fd_ofileflags[ i ] & UF_RESERVED) != 0 ||
- fp->f_fglob->fg_type != DTYPE_SOCKET ) {
- continue;
- }
- sockp = (struct socket *)fp->f_fglob->fg_data;
- if ( sockp->so_background_thread != thread ) {
- continue;
- }
- sockp->so_traffic_mgt_flags &= ~TRAFFIC_MGT_SO_BACKGROUND;
- sockp->so_background_thread = NULL;
- }
- proc_fdunlock(curp);
-
return(0);
}
0, 0, sysctl_netboot, "I", "");
#endif
+#ifdef CONFIG_IMGSRC_ACCESS
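+/*
+ * sysctl_imgsrcdev
+ * Report the dev_t of the device backing the image-source root filesystem.
+ * Restricted to privileged callers; returns ENOENT if no image-source root
+ * vnode has been preserved.
+ */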
+static int
+sysctl_imgsrcdev
+(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+ vfs_context_t ctx = vfs_context_current();
+ vnode_t devvp;
+ int result;
+
+ if (!vfs_context_issuser(ctx)) {
+ return EPERM;
+ }
+
+ if (imgsrc_rootvnode == NULL) {
+ return ENOENT;
+ }
+
+ result = vnode_getwithref(imgsrc_rootvnode);
+ if (result != 0) {
+ return result;
+ }
+
+ devvp = vnode_mount(imgsrc_rootvnode)->mnt_devvp;
+ result = vnode_getwithref(devvp);
+ if (result != 0) {
+ goto out;
+ }
+
+ result = sysctl_io_number(req, vnode_specrdev(devvp), sizeof(dev_t), NULL, NULL);
+
+ vnode_put(devvp);
+out:
+ vnode_put(imgsrc_rootvnode);
+ return result;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, imgsrcdev,
+ CTLTYPE_INT | CTLFLAG_RD,
+ 0, 0, sysctl_imgsrcdev, "I", "");
+#endif /* CONFIG_IMGSRC_ACCESS */
+
static int
sysctl_usrstack
(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
SYSCTL_INT (_kern, OID_AUTO, stack_depth_max,
CTLFLAG_RD, (int *) &kernel_stack_depth_max, 0, "Max kernel stack depth at interrupt or context switch");
+/*
+ * enable backtraces for port allocations
+ */
+extern int ipc_portbt;
+
+SYSCTL_INT(_kern, OID_AUTO, ipc_portbt,
+ CTLFLAG_RW | CTLFLAG_KERN,
+ &ipc_portbt, 0, "");
+
static void wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl,
int reuse_thread, int wake_thread, int return_directly);
static void wq_unpark_continue(void);
+static void wq_unsuspend_continue(void);
static int setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl);
static boolean_t workqueue_addnewthread(struct workqueue *wq);
static void workqueue_removethread(struct threadlist *tl);
return(0);
}
-
uint32_t wq_yielded_threshold = WQ_YIELDED_THRESHOLD;
uint32_t wq_yielded_window_usecs = WQ_YIELDED_WINDOW_USECS;
uint32_t wq_stalled_window_usecs = WQ_STALLED_WINDOW_USECS;
* the thread lock for the thread being UNBLOCKED
* is also held
*/
- if (tl->th_suspended) {
- OSAddAtomic(-1, &tl->th_suspended);
- KERNEL_DEBUG1(0xefffd024, wq, wq->wq_threads_scheduled, tl->th_priority, tl->th_affinity_tag, thread_tid(thread));
- } else {
- OSAddAtomic(1, &wq->wq_thactive_count[tl->th_priority][tl->th_affinity_tag]);
+ OSAddAtomic(1, &wq->wq_thactive_count[tl->th_priority][tl->th_affinity_tag]);
- KERNEL_DEBUG1(0xefffd020 | DBG_FUNC_END, wq, wq->wq_threads_scheduled, tl->th_priority, tl->th_affinity_tag, thread_tid(thread));
- }
- break;
+ KERNEL_DEBUG1(0xefffd020 | DBG_FUNC_END, wq, wq->wq_threads_scheduled, tl->th_priority, tl->th_affinity_tag, thread_tid(thread));
+
+ break;
}
}
p = wq->wq_proc;
workqueue_unlock(p);
- kret = thread_create_workq(wq->wq_task, &th);
+ kret = thread_create_workq(wq->wq_task, (thread_continue_t)wq_unsuspend_continue, &th);
if (kret != KERN_SUCCESS)
goto failed;
tl->th_affinity_tag = -1;
tl->th_priority = WORKQUEUE_NUMPRIOS;
tl->th_policy = -1;
- tl->th_suspended = 1;
#if defined(__ppc__)
//ml_fp_setvalid(FALSE);
uth->uu_threadlist = (void *)tl;
workqueue_lock_spin(p);
-
+
TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry);
wq->wq_thidlecount++;
}
-
void
workqueue_exit(struct proc *p)
{
return (error);
}
-
-
-
static int workqueue_importance[WORKQUEUE_NUMPRIOS] =
{
2, 0, -2,
tl->th_flags &= ~TH_LIST_SUSPENDED;
reuse_thread = 0;
- thread_sched_call(tl->th_thread, workqueue_callback);
-
} else if ((tl->th_flags & TH_LIST_BLOCKED) == TH_LIST_BLOCKED) {
tl->th_flags &= ~TH_LIST_BLOCKED;
- tl->th_flags |= TH_LIST_BUSY;
wake_thread = 1;
}
- tl->th_flags |= TH_LIST_RUNNING;
+ tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY;
wq->wq_threads_scheduled++;
wq->wq_thscheduled_count[priority][affinity_tag]++;
}
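+/*
+ * wq_unsuspend_continue
+ * Continuation for a newly created workqueue thread: on a normal resume,
+ * install the scheduler callback and return to user space; if the thread
+ * was aborted while suspended or while being dispatched, unwind it instead.
+ */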
+static void
+wq_unsuspend_continue(void)
+{
+ struct uthread *uth = NULL;
+ thread_t th_to_unsuspend;
+ struct threadlist *tl;
+ proc_t p;
+
+ th_to_unsuspend = current_thread();
+ uth = get_bsdthread_info(th_to_unsuspend);
+
+ if (uth != NULL && (tl = uth->uu_threadlist) != NULL) {
+
+ if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) {
+ /*
+ * most likely a normal resume of this thread occurred...
+ * it's also possible that the thread was aborted after we
+ * finished setting it up so that it could be dispatched... if
+ * so, thread_bootstrap_return will notice the abort and put
+ * the thread on the path to self-destruction
+ */
+normal_resume_to_user:
+ thread_sched_call(th_to_unsuspend, workqueue_callback);
+
+ thread_bootstrap_return();
+ }
+ /*
+ * if we get here, it's because we've been resumed due to
+ * an abort of this thread (process is crashing)
+ */
+ p = current_proc();
+
+ workqueue_lock_spin(p);
+
+ if (tl->th_flags & TH_LIST_SUSPENDED) {
+ /*
+ * thread has been aborted while still on our idle
+ * queue... remove it from our domain...
+ * workqueue_removethread consumes the lock
+ */
+ workqueue_removethread(tl);
+
+ thread_bootstrap_return();
+ }
+ while ((tl->th_flags & TH_LIST_BUSY)) {
+ /*
+ * this thread was aborted after we started making
+ * it runnable, but before we finished dispatching it...
+ * we need to wait for that process to finish,
+ * and we need to ask for a wakeup instead of a
+ * thread_resume since the abort has already resumed us
+ */
+ tl->th_flags |= TH_LIST_NEED_WAKEUP;
+
+ assert_wait((caddr_t)tl, (THREAD_UNINT));
+
+ workqueue_unlock(p);
+
+ thread_block(THREAD_CONTINUE_NULL);
+
+ workqueue_lock_spin(p);
+ }
+ workqueue_unlock(p);
+ /*
+ * we have finished setting up the thread's context...
+ * thread_bootstrap_return will take us through the abort path
+ * where the thread will self destruct
+ */
+ goto normal_resume_to_user;
+ }
+ thread_bootstrap_return();
+}
+
+
static void
wq_unpark_continue(void)
{
} else {
KERNEL_DEBUG1(0xefffd014 | DBG_FUNC_END, tl->th_workq, 0, 0, thread_tid(current_thread()), thread_tid(th));
- thread_resume(th);
+ workqueue_lock_spin(p);
+
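+ /*
+ * an aborted thread parked in wq_unsuspend_continue asked for a
+ * wakeup rather than a thread_resume
+ */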
+ if (tl->th_flags & TH_LIST_NEED_WAKEUP)
+ wakeup(tl);
+ else
+ thread_resume(th);
+
+ tl->th_flags &= ~(TH_LIST_BUSY | TH_LIST_NEED_WAKEUP);
+
+ workqueue_unlock(p);
}
}
-
int
setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl)
{
donefileread(p, fp, fd);
- if (!error)
- KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
+ KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
out:
else
fp_drop(p, fd, fp, 0);
- if (!error)
- KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
+ KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
return(error);
362 AUE_KQUEUE ALL { int kqueue(void); }
363 AUE_NULL ALL { int kevent(int fd, const struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); }
364 AUE_LCHOWN ALL { int lchown(user_addr_t path, uid_t owner, gid_t group); }
-365 AUE_STACKSNAPSHOT ALL { int stack_snapshot(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t options) NO_SYSCALL_STUB; }
+365 AUE_STACKSNAPSHOT ALL { int stack_snapshot(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, uint32_t dispatch_offset) NO_SYSCALL_STUB; }
#if CONFIG_WORKQUEUE
366 AUE_NULL ALL { int bsdthread_register(user_addr_t threadstart, user_addr_t wqthread, int pthsize,user_addr_t dummy_value, user_addr_t targetconc_ptr, uint64_t dispatchqueue_offset) NO_SYSCALL_STUB; }
367 AUE_WORKQOPEN ALL { int workq_open(void) NO_SYSCALL_STUB; }
static void
unp_detach(struct unpcb *unp)
{
+ int so_locked = 1;
+
lck_rw_lock_exclusive(unp_list_mtx);
LIST_REMOVE(unp, unp_link);
lck_rw_done(unp_list_mtx);
if (unp->unp_conn)
unp_disconnect(unp);
while (unp->unp_refs.lh_first) {
- struct unpcb *unp2 = unp->unp_refs.lh_first;
- socket_unlock(unp->unp_socket, 0);
-
- socket_lock(unp2->unp_socket, 1);
- unp_drop(unp2, ECONNRESET);
- socket_unlock(unp2->unp_socket, 1);
+ struct unpcb *unp2 = NULL;
+
+ /* This datagram socket is connected to one or more
+ * sockets. In order to avoid a race condition between removing
+ * this reference and closing the connected socket, we need
+ * to check disconnect_in_progress
+ */
+ if (so_locked == 1) {
+ socket_unlock(unp->unp_socket, 0);
+ so_locked = 0;
+ }
+ lck_mtx_lock(unp_disconnect_lock);
+ while (disconnect_in_progress != 0) {
+ (void)msleep((caddr_t)&disconnect_in_progress, unp_disconnect_lock,
+ PSOCK, "disconnect", NULL);
+ }
+ disconnect_in_progress = 1;
+ lck_mtx_unlock(unp_disconnect_lock);
+
+ /* Now we are sure that any unpcb socket disconnect is not happening */
+ if (unp->unp_refs.lh_first != NULL) {
+ unp2 = unp->unp_refs.lh_first;
+ socket_lock(unp2->unp_socket, 1);
+ }
+
+ lck_mtx_lock(unp_disconnect_lock);
+ disconnect_in_progress = 0;
+ wakeup(&disconnect_in_progress);
+ lck_mtx_unlock(unp_disconnect_lock);
+
+ if (unp2 != NULL) {
+ /* We already locked this socket and have a reference on it */
+ unp_drop(unp2, ECONNRESET);
+ socket_unlock(unp2->unp_socket, 1);
+ }
+ }
+
+ if (so_locked == 0) {
socket_lock(unp->unp_socket, 0);
+ so_locked = 1;
}
soisdisconnected(unp->unp_socket);
/* makes sure we're getting dealloced */
switch (so->so_type) {
case SOCK_DGRAM:
- lck_rw_lock_exclusive(unp_list_mtx);
LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
- lck_rw_done(unp_list_mtx);
/* Avoid lock order reversals due to drop/acquire in soisconnected. */
switch (unp->unp_socket->so_type) {
case SOCK_DGRAM:
- lck_rw_lock_exclusive(unp_list_mtx);
LIST_REMOVE(unp, unp_reflink);
- lck_rw_done(unp_list_mtx);
unp->unp_socket->so_state &= ~SS_ISCONNECTED;
socket_unlock(so2, 1);
break;
PRIVATE_DATAFILES = \
if_atm.h if_vlan_var.h if_ppp.h firewire.h \
ppp_defs.h radix.h if_bond_var.h lacp.h ndrv_var.h \
- raw_cb.h etherdefs.h iso88025.h if_pflog.h pfvar.h
+ raw_cb.h etherdefs.h iso88025.h if_pflog.h pfvar.h \
+ if_bridgevar.h
PRIVATE_KERNELFILES = ${KERNELFILES} \
bpfdesc.h dlil_pvt.h ppp_comp.h \
+++ /dev/null
-/*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * Copyright (c) 1998 Luigi Rizzo
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD: src/sys/net/bridge.c,v 1.16.2.14 2001/02/09 23:13:41 luigi Exp $
- */
-
-/*
- * This code implements bridging in FreeBSD. It only acts on ethernet
- * type of interfaces (others are still usable for routing).
- * A bridging table holds the source MAC address/dest. interface for each
- * known node. The table is indexed using an hash of the source address.
- *
- * Input packets are tapped near the beginning of ether_input(), and
- * analysed by calling bridge_in(). Depending on the result, the packet
- * can be forwarded to one or more output interfaces using bdg_forward(),
- * and/or sent to the upper layer (e.g. in case of multicast).
- *
- * Output packets are intercepted near the end of ether_output(),
- * the correct destination is selected calling bridge_dst_lookup(),
- * and then forwarding is done using bdg_forward().
- * Bridging is controlled by the sysctl variable net.link.ether.bridge
- *
- * The arp code is also modified to let a machine answer to requests
- * irrespective of the port the request came from.
- *
- * In case of loops in the bridging topology, the bridge detects this
- * event and temporarily mutes output bridging on one of the ports.
- * Periodically, interfaces are unmuted by bdg_timeout().
- * Muting is only implemented as a safety measure, and also as
- * a mechanism to support a user-space implementation of the spanning
- * tree algorithm. In the final release, unmuting will only occur
- * because of explicit action of the user-level daemon.
- *
- * To build a bridging kernel, use the following option
- * option BRIDGE
- * and then at runtime set the sysctl variable to enable bridging.
- *
- * Only one interface is supposed to have addresses set (but
- * there are no problems in practice if you set addresses for more
- * than one interface).
- * Bridging will act before routing, but nothing prevents a machine
- * from doing both (modulo bugs in the implementation...).
- *
- * THINGS TO REMEMBER
- * - bridging is incompatible with multicast routing on the same
- * machine. There is not an easy fix to this.
- * - loop detection is still not very robust.
- * - the interface of bdg_forward() could be improved.
- */
-
-#include <sys/param.h>
-#include <sys/mbuf.h>
-#include <sys/malloc.h>
-#include <sys/systm.h>
-#include <sys/socket.h> /* for net/if.h */
-#include <sys/kernel.h>
-#include <sys/sysctl.h>
-
-#include <net/if.h>
-#include <net/if_types.h>
-
-#include <netinet/in.h> /* for struct arpcom */
-#include <netinet/in_systm.h>
-#include <netinet/in_var.h>
-#include <netinet/ip.h>
-#include <netinet/if_ether.h> /* for struct arpcom */
-
-#include "opt_ipfw.h"
-#include "opt_ipdn.h"
-
-#if defined(IPFIREWALL)
-#include <net/route.h>
-#include <netinet/ip_fw.h>
-#if defined(DUMMYNET)
-#include <netinet/ip_dummynet.h>
-#endif
-#endif
-
-#include <net/bridge.h>
-
-/*
- * For debugging, you can use the following macros.
- * remember, rdtsc() only works on Pentium-class machines
-
- quad_t ticks;
- DDB(ticks = rdtsc();)
- ... interesting code ...
- DDB(bdg_fw_ticks += (u_int32_t)(rdtsc() - ticks) ; bdg_fw_count++ ;)
-
- *
- */
-
-#define DDB(x) x
-#define DEB(x)
-
-static void bdginit(void *);
-static void bdgtakeifaces(void);
-static void flush_table(void);
-static void bdg_promisc_on(void);
-static void parse_bdg_cfg(void);
-
-static int bdg_ipfw = 0 ;
-int do_bridge = 0;
-bdg_hash_table *bdg_table = NULL ;
-
-/*
- * System initialization
- */
-
-SYSINIT(interfaces, SI_SUB_PROTO_IF, SI_ORDER_FIRST, bdginit, NULL)
-
-static struct bdg_stats bdg_stats ;
-struct bdg_softc *ifp2sc = NULL ;
-/* XXX make it static of size BDG_MAX_PORTS */
-
-#define IFP_CHK(ifp, x) \
- if (ifp2sc[ifp->if_index].magic != 0xDEADBEEF) { x ; }
-
-/*
- * turn off promisc mode, optionally clear the IFF_USED flag.
- * The flag is turned on by parse_bdg_config
- */
-static void
-bdg_promisc_off(int clear_used)
-{
- struct ifnet *ifp ;
- ifnet_head_lock_shared();
- TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
- if ( (ifp2sc[ifp->if_index].flags & IFF_BDG_PROMISC) ) {
- int s, ret ;
- s = splimp();
- ret = ifnet_set_promiscuous(ifp, 0);
- splx(s);
- ifp2sc[ifp->if_index].flags &= ~(IFF_BDG_PROMISC|IFF_MUTE) ;
- DEB(printf(">> now %s%d promisc OFF if_flags 0x%x bdg_flags 0x%x\n",
- ifp->if_name, ifp->if_unit,
- ifp->if_flags, ifp2sc[ifp->if_index].flags);)
- }
- if (clear_used) {
- ifp2sc[ifp->if_index].flags &= ~(IFF_USED) ;
- bdg_stats.s[ifp->if_index].name[0] = '\0';
- }
- }
- ifnet_head_done();
-}
-
-/*
- * set promisc mode on the interfaces we use.
- */
-static void
-bdg_promisc_on()
-{
- struct ifnet *ifp ;
- int s ;
-
- ifnet_head_lock_shared();
- TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
- if ( !BDG_USED(ifp) )
- continue ;
- if ( 0 == ( ifp->if_flags & IFF_UP) ) {
- s = splimp();
- if_up(ifp);
- splx(s);
- }
- if ( !(ifp2sc[ifp->if_index].flags & IFF_BDG_PROMISC) ) {
- int ret ;
- s = splimp();
- ret = ifnet_set_promiscuous(ifp, 1);
- splx(s);
- ifp2sc[ifp->if_index].flags |= IFF_BDG_PROMISC ;
- printf(">> now %s%d promisc ON if_flags 0x%x bdg_flags 0x%x\n",
- ifp->if_name, ifp->if_unit,
- ifp->if_flags, ifp2sc[ifp->if_index].flags);
- }
- if (BDG_MUTED(ifp)) {
- printf(">> unmuting %s%d\n", ifp->if_name, ifp->if_unit);
- BDG_UNMUTE(ifp) ;
- }
- }
- ifnet_head_done();
-}
-
-static int
-sysctl_bdg(SYSCTL_HANDLER_ARGS)
-{
- int error, oldval = do_bridge ;
-
- error = sysctl_handle_int(oidp,
- oidp->oid_arg1, oidp->oid_arg2, req);
- DEB( printf("called sysctl for bridge name %s arg2 %d val %d->%d\n",
- oidp->oid_name, oidp->oid_arg2,
- oldval, do_bridge); )
-
- if (bdg_table == NULL)
- do_bridge = 0 ;
- if (oldval != do_bridge) {
- bdg_promisc_off( 1 ); /* reset previously used interfaces */
- flush_table();
- if (do_bridge) {
- parse_bdg_cfg();
- bdg_promisc_on();
- }
- }
- return error ;
-}
-
-static char bridge_cfg[256] = { "" } ;
-
-/*
- * parse the config string, set IFF_USED, name and cluster_id
- * for all interfaces found.
- */
-static void
-parse_bdg_cfg()
-{
- char *p, *beg ;
- int i, l, cluster;
- struct bdg_softc *b;
-
- for (p= bridge_cfg; *p ; p++) {
- /* interface names begin with [a-z] and continue up to ':' */
- if (*p < 'a' || *p > 'z')
- continue ;
- for ( beg = p ; *p && *p != ':' ; p++ )
- ;
- if (*p == 0) /* end of string, ':' not found */
- return ;
- l = p - beg ; /* length of name string */
- p++ ;
- DEB(printf("-- match beg(%d) <%s> p <%s>\n", l, beg, p);)
- for (cluster = 0 ; *p && *p >= '0' && *p <= '9' ; p++)
- cluster = cluster*10 + (*p -'0');
- /*
- * now search in bridge strings
- */
- for (i=0, b = ifp2sc ; i < if_index ; i++, b++) {
- char buf[32];
- struct ifnet *ifp = b->ifp ;
-
- if (ifp == NULL)
- continue;
- sprintf(buf, "%s%d", ifp->if_name, ifp->if_unit);
- if (!strncmp(beg, buf, l)) { /* XXX not correct for >10 if! */
- b->cluster_id = htons(cluster) ;
- b->flags |= IFF_USED ;
- sprintf(bdg_stats.s[ifp->if_index].name,
- "%s%d:%d", ifp->if_name, ifp->if_unit, cluster);
-
- DEB(printf("--++ found %s\n",
- bdg_stats.s[ifp->if_index].name);)
- break ;
- }
- }
- if (*p == '\0')
- break ;
- }
-}
-
-static int
-sysctl_bdg_cfg(SYSCTL_HANDLER_ARGS)
-{
- int error = 0 ;
- char oldval[256] ;
-
- strlcpy(oldval, bridge_cfg, sizeof (oldval));
-
- error = sysctl_handle_string(oidp,
- bridge_cfg, oidp->oid_arg2, req);
- DEB(
- printf("called sysctl for bridge name %s arg2 %d err %d val %s->%s\n",
- oidp->oid_name, oidp->oid_arg2,
- error,
- oldval, bridge_cfg);
- )
- if (strcmp(oldval, bridge_cfg)) {
- bdg_promisc_off( 1 ); /* reset previously-used interfaces */
- flush_table();
- parse_bdg_cfg(); /* and set new ones... */
- if (do_bridge)
- bdg_promisc_on(); /* re-enable interfaces */
- }
- return error ;
-}
-
-static int
-sysctl_refresh(SYSCTL_HANDLER_ARGS)
-{
- if (req->newptr)
- bdgtakeifaces();
-
- return 0;
-}
-
-
-SYSCTL_DECL(_net_link_ether);
-SYSCTL_PROC(_net_link_ether, OID_AUTO, bridge_cfg, CTLTYPE_STRING|CTLFLAG_RW,
- &bridge_cfg, sizeof(bridge_cfg), &sysctl_bdg_cfg, "A",
- "Bridge configuration");
-
-SYSCTL_PROC(_net_link_ether, OID_AUTO, bridge, CTLTYPE_INT|CTLFLAG_RW,
- &do_bridge, 0, &sysctl_bdg, "I", "Bridging");
-
-SYSCTL_INT(_net_link_ether, OID_AUTO, bridge_ipfw, CTLFLAG_RW,
- &bdg_ipfw,0,"Pass bridged pkts through firewall");
-
-#define SY(parent, var, comment) \
- static int var ; \
- SYSCTL_INT(parent, OID_AUTO, var, CTLFLAG_RW, &(var), 0, comment);
-
-int bdg_ipfw_drops;
-SYSCTL_INT(_net_link_ether, OID_AUTO, bridge_ipfw_drop,
- CTLFLAG_RW, &bdg_ipfw_drops,0,"");
-
-int bdg_ipfw_colls;
-SYSCTL_INT(_net_link_ether, OID_AUTO, bridge_ipfw_collisions,
- CTLFLAG_RW, &bdg_ipfw_colls,0,"");
-
-SYSCTL_PROC(_net_link_ether, OID_AUTO, bridge_refresh, CTLTYPE_INT|CTLFLAG_WR,
- NULL, 0, &sysctl_refresh, "I", "iface refresh");
-
-#if 1 /* diagnostic vars */
-
-SY(_net_link_ether, verbose, "Be verbose");
-SY(_net_link_ether, bdg_split_pkts, "Packets split in bdg_forward");
-
-SY(_net_link_ether, bdg_thru, "Packets through bridge");
-
-SY(_net_link_ether, bdg_copied, "Packets copied in bdg_forward");
-
-SY(_net_link_ether, bdg_copy, "Force copy in bdg_forward");
-SY(_net_link_ether, bdg_predict, "Correctly predicted header location");
-
-SY(_net_link_ether, bdg_fw_avg, "Cycle counter avg");
-SY(_net_link_ether, bdg_fw_ticks, "Cycle counter item");
-SY(_net_link_ether, bdg_fw_count, "Cycle counter count");
-#endif
-
-SYSCTL_STRUCT(_net_link_ether, PF_BDG, bdgstats,
- CTLFLAG_RD, &bdg_stats , bdg_stats, "bridge statistics");
-
-static int bdg_loops ;
-
-/*
- * completely flush the bridge table.
- */
-static void
-flush_table()
-{
- int s,i;
-
- if (bdg_table == NULL)
- return ;
- s = splimp();
- for (i=0; i< HASH_SIZE; i++)
- bdg_table[i].name= NULL; /* clear table */
- splx(s);
-}
-
-/*
- * called periodically to flush entries etc.
- */
-static void
-bdg_timeout(void *dummy)
-{
- static int slowtimer = 0 ;
-
- if (bdg_inted == 0) {
- bdg_init2(0);
- } else if (do_bridge) {
- static int age_index = 0 ; /* index of table position to age */
- int l = age_index + HASH_SIZE/4 ;
- /*
- * age entries in the forwarding table.
- */
- if (l > HASH_SIZE)
- l = HASH_SIZE ;
- for (; age_index < l ; age_index++)
- if (bdg_table[age_index].used)
- bdg_table[age_index].used = 0 ;
- else if (bdg_table[age_index].name) {
- /* printf("xx flushing stale entry %d\n", age_index); */
- bdg_table[age_index].name = NULL ;
- }
- if (age_index >= HASH_SIZE)
- age_index = 0 ;
-
- if (--slowtimer <= 0 ) {
- slowtimer = 5 ;
-
- bdg_promisc_on() ; /* we just need unmute, really */
- bdg_loops = 0 ;
- }
- }
- timeout(bdg_timeout, (void *)0, 2*hz );
-}
-
-/*
- * local MAC addresses are held in a small array. This makes comparisons
- * much faster.
- */
-bdg_addr bdg_addresses[BDG_MAX_PORTS];
-int bdg_ports ;
-
-/*
- * initialization of bridge code. This needs to be done after all
- * interfaces have been configured.
- */
-
-static int bdg_inited = 0;
-
-static void
-bdg_init2(void)
-{
- if (bdg_inited != 0)
- return;
-
- if (bdg_table == NULL) {
- bdg_table = (struct hash_table *)
- _MALLOC(HASH_SIZE * sizeof(struct hash_table),
- M_IFADDR, M_WAITOK);
- if (bdg_table == NULL)
- return;
-
- flush_table();
- }
-
- if (ifp2sc == NULL) {
- ifp2sc = _MALLOC(BDG_MAX_PORTS * sizeof(struct bdg_softc),
- M_IFADDR, M_WAITOK );
- if (ifp2sc == NULL)
- return;
-
- bzero(ifp2sc, BDG_MAX_PORTS * sizeof(struct bdg_softc) );
- bdgtakeifaces();
- }
-
- bdg_inited = 1;
-}
-
-static void
-bdginit(void *dummy)
-{
- /* Initialize first what can't fail */
- bzero(&bdg_stats, sizeof(bdg_stats) );
- do_bridge=0;
-
- /* Attempt to initialize the rest and start the timer */
- bdg_timeout(0);
-}
-
-void
-bdgtakeifaces(void)
-{
- int i ;
- struct ifnet *ifp;
- bdg_addr *p = bdg_addresses ;
- struct bdg_softc *bp;
-
- bdg_ports = 0 ;
- *bridge_cfg = '\0';
-
- printf("BRIDGE 010131, have %d interfaces\n", if_index);
- ifnet_head_lock_shared();
- for (i = 0 , ifp = ifnet.tqh_first ; i < if_index ;
- i++, ifp = TAILQ_NEXT(ifp, if_link) )
- if (ifp->if_type == IFT_ETHER) { /* ethernet ? */
- ifnet_lladdr_copy_bytes(ifp, p->etheraddr, ETHER_ADDR_LEN);
- bp = &ifp2sc[ifp->if_index] ;
- sprintf(bridge_cfg + strlen(bridge_cfg),
- "%s%d:1,", ifp->if_name, ifp->if_unit);
- printf("-- index %d %s type %d phy %d addrl %d addr %6D\n",
- ifp->if_index,
- bdg_stats.s[ifp->if_index].name,
- (int)ifp->if_type, (int) ifp->if_physical,
- (int)ifp->if_addrlen,
- p->etheraddr, "." );
- p++ ;
- bp->ifp = ifp ;
- bp->flags = IFF_USED ;
- bp->cluster_id = htons(1) ;
- bp->magic = 0xDEADBEEF ;
-
- sprintf(bdg_stats.s[ifp->if_index].name,
- "%s%d:%d", ifp->if_name, ifp->if_unit,
- ntohs(bp->cluster_id));
- bdg_ports ++ ;
- }
- ifnet_head_done();
-}
-
-/*
- * bridge_in() is invoked to perform bridging decision on input packets.
- *
- * On Input:
- * eh Ethernet header of the incoming packet.
- *
- * On Return: destination of packet, one of
- * BDG_BCAST broadcast
- * BDG_MCAST multicast
- * BDG_LOCAL is only for a local address (do not forward)
- * BDG_DROP drop the packet
- * ifp ifp of the destination interface.
- *
- * Forwarding is not done directly to give a chance to some drivers
- * to fetch more of the packet, or simply drop it completely.
- */
-
-struct ifnet *
-bridge_in(struct ifnet *ifp, struct ether_header *eh)
-{
- int index;
- struct ifnet *dst , *old ;
- int dropit = BDG_MUTED(ifp) ;
-
- /*
- * hash the source address
- */
- index= HASH_FN(eh->ether_shost);
- bdg_table[index].used = 1 ;
- old = bdg_table[index].name ;
- if ( old ) { /* the entry is valid. */
- IFP_CHK(old, printf("bridge_in-- reading table\n") );
-
- if (!BDG_MATCH( eh->ether_shost, bdg_table[index].etheraddr) ) {
- bdg_ipfw_colls++ ;
- bdg_table[index].name = NULL ;
- } else if (old != ifp) {
- /*
- * found a loop. Either a machine has moved, or there
- * is a misconfiguration/reconfiguration of the network.
- * First, do not forward this packet!
- * Record the relocation anyways; then, if loops persist,
- * suspect a reconfiguration and disable forwarding
- * from the old interface.
- */
- bdg_table[index].name = ifp ; /* relocate address */
- printf("-- loop (%d) %6D to %s%d from %s%d (%s)\n",
- bdg_loops, eh->ether_shost, ".",
- ifp->if_name, ifp->if_unit,
- old->if_name, old->if_unit,
- BDG_MUTED(old) ? "muted":"active");
- dropit = 1 ;
- if ( !BDG_MUTED(old) ) {
- if (++bdg_loops > 10)
- BDG_MUTE(old) ;
- }
- }
- }
-
- /*
- * now write the source address into the table
- */
- if (bdg_table[index].name == NULL) {
- DEB(printf("new addr %6D at %d for %s%d\n",
- eh->ether_shost, ".", index, ifp->if_name, ifp->if_unit);)
- bcopy(eh->ether_shost, bdg_table[index].etheraddr, 6);
- bdg_table[index].name = ifp ;
- }
- dst = bridge_dst_lookup(eh);
- /* Return values:
- * BDG_BCAST, BDG_MCAST, BDG_LOCAL, BDG_UNKNOWN, BDG_DROP, ifp.
- * For muted interfaces, the first 3 are changed in BDG_LOCAL,
- * and others to BDG_DROP. Also, for incoming packets, ifp is changed
- * to BDG_DROP in case ifp == src . These mods are not necessary
- * for outgoing packets from ether_output().
- */
- BDG_STAT(ifp, BDG_IN);
- switch ((int)dst) {
- case (int)BDG_BCAST:
- case (int)BDG_MCAST:
- case (int)BDG_LOCAL:
- case (int)BDG_UNKNOWN:
- case (int)BDG_DROP:
- BDG_STAT(ifp, dst);
- break ;
- default :
- if (dst == ifp || dropit )
- BDG_STAT(ifp, BDG_DROP);
- else
- BDG_STAT(ifp, BDG_FORWARD);
- break ;
- }
-
- if ( dropit ) {
- if (dst == BDG_BCAST || dst == BDG_MCAST || dst == BDG_LOCAL)
- return BDG_LOCAL ;
- else
- return BDG_DROP ;
- } else {
- return (dst == ifp ? BDG_DROP : dst ) ;
- }
-}
-
-/*
- * Forward to dst, excluding src port and muted interfaces.
- * If src == NULL, the pkt comes from ether_output, and dst is the real
- * interface the packet is originally sent to. In this case we must forward
- * it to the whole cluster. We never call bdg_forward ether_output on
- * interfaces which are not part of a cluster.
- *
- * The packet is freed if possible (i.e. surely not of interest for
- * the upper layer), otherwise a copy is left for use by the caller
- * (pointer in m0).
- *
- * It would be more efficient to make bdg_forward() always consume
- * the packet, leaving to the caller the task to check if it needs a copy
- * and get one in case. As it is now, bdg_forward() can sometimes make
- * a copy whereas it is not necessary.
- *
- * XXX be careful about eh, it can be a pointer into *m
- */
-struct mbuf *
-bdg_forward(struct mbuf *m0, struct ether_header *const eh, struct ifnet *dst)
-{
- struct ifnet *src = m0->m_pkthdr.rcvif; /* could be NULL in output */
- struct ifnet *ifp, *last = NULL ;
- int s ;
- int shared = bdg_copy ; /* someone else is using the mbuf */
- int once = 0; /* loop only once */
- struct ifnet *real_dst = dst ; /* real dst from ether_output */
-#ifdef IPFIREWALL
- struct ip_fw_chain *rule = NULL ; /* did we match a firewall rule ? */
-#endif
-
- /*
- * XXX eh is usually a pointer within the mbuf (some ethernet drivers
- * do that), so we better copy it before doing anything with the mbuf,
- * or we might corrupt the header.
- */
- struct ether_header save_eh = *eh ;
-
-#if defined(IPFIREWALL) && defined(DUMMYNET)
- if (m0->m_type == MT_DUMMYNET) {
- /* extract info from dummynet header */
- rule = (struct ip_fw_chain *)(m0->m_data) ;
- m0 = m0->m_next ;
- src = m0->m_pkthdr.rcvif;
- shared = 0 ; /* For sure this is our own mbuf. */
- } else
-#endif
- bdg_thru++; /* only count once */
-
- if (src == NULL) /* packet from ether_output */
- dst = bridge_dst_lookup(eh);
- if (dst == BDG_DROP) { /* this should not happen */
- printf("xx bdg_forward for BDG_DROP\n");
- m_freem(m0);
- return NULL;
- }
- if (dst == BDG_LOCAL) { /* this should not happen as well */
- printf("xx ouch, bdg_forward for local pkt\n");
- return m0;
- }
- if (dst == BDG_BCAST || dst == BDG_MCAST || dst == BDG_UNKNOWN) {
- ifp = ifnet_head.tqh_first ; /* scan all ports */
- once = 0 ;
- if (dst != BDG_UNKNOWN) /* need a copy for the local stack */
- shared = 1 ;
- } else {
- ifp = dst ;
- once = 1 ;
- }
- if ( (u_int)(ifp) <= (u_int)BDG_FORWARD )
- panic("bdg_forward: bad dst");
-
-#ifdef IPFIREWALL
- /*
- * Do filtering in a very similar way to what is done in ip_output.
- * Only if firewall is loaded, enabled, and the packet is not
- * from ether_output() (src==NULL, or we would filter it twice).
- * Additional restrictions may apply e.g. non-IP, short packets,
- * and pkts already gone through a pipe.
- */
- if (ip_fw_chk_ptr && bdg_ipfw != 0 && src != NULL) {
- struct ip *ip ;
- int i;
-
- if (rule != NULL) /* dummynet packet, already partially processed */
- goto forward; /* HACK! I should obey the fw_one_pass */
- if (ntohs(save_eh.ether_type) != ETHERTYPE_IP)
- goto forward ; /* not an IP packet, ipfw is not appropriate */
- if (m0->m_pkthdr.len < sizeof(struct ip) )
- goto forward ; /* header too short for an IP pkt, cannot filter */
- /*
- * i need some amt of data to be contiguous, and in case others need
- * the packet (shared==1) also better be in the first mbuf.
- */
- i = min(m0->m_pkthdr.len, max_protohdr) ;
- if ( shared || m0->m_len < i) {
- m0 = m_pullup(m0, i) ;
- if (m0 == NULL) {
- printf("-- bdg: pullup failed.\n") ;
- return NULL ;
- }
- }
-
- /*
- * before calling the firewall, swap fields the same as IP does.
- * here we assume the pkt is an IP one and the header is contiguous
- */
- ip = mtod(m0, struct ip *);
- NTOHS(ip->ip_len);
- NTOHS(ip->ip_off);
-
- /*
- * The third parameter to the firewall code is the dst. interface.
- * Since we apply checks only on input pkts we use NULL.
- * The firewall knows this is a bridged packet as the cookie ptr
- * is NULL.
- */
- i = (*ip_fw_chk_ptr)(&ip, 0, NULL, NULL /* cookie */, &m0, &rule, NULL);
- if ( (i & IP_FW_PORT_DENY_FLAG) || m0 == NULL) /* drop */
- return m0 ;
- /*
- * If we get here, the firewall has passed the pkt, but the mbuf
- * pointer might have changed. Restore ip and the fields NTOHS()'d.
- */
- ip = mtod(m0, struct ip *);
- HTONS(ip->ip_len);
- HTONS(ip->ip_off);
-
- if (i == 0) /* a PASS rule. */
- goto forward ;
-#ifdef DUMMYNET
- if (i & IP_FW_PORT_DYNT_FLAG) {
- /*
- * Pass the pkt to dummynet, which consumes it.
- * If shared, make a copy and keep the original.
- * Need to prepend the ethernet header, optimize the common
- * case of eh pointing already into the original mbuf.
- */
- struct mbuf *m ;
- if (shared) {
- m = m_copypacket(m0, M_DONTWAIT);
- if (m == NULL) {
- printf("bdg_fwd: copy(1) failed\n");
- return m0;
- }
- } else {
- m = m0 ; /* pass the original to dummynet */
- m0 = NULL ; /* and nothing back to the caller */
- }
- if ( (void *)(eh + 1) == (void *)m->m_data) {
- m->m_data -= ETHER_HDR_LEN ;
- m->m_len += ETHER_HDR_LEN ;
- m->m_pkthdr.len += ETHER_HDR_LEN ;
- bdg_predict++;
- } else {
- M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT);
- if (!m && verbose) printf("M_PREPEND failed\n");
- if (m == NULL) /* nope... */
- return m0 ;
- bcopy(&save_eh, mtod(m, struct ether_header *), ETHER_HDR_LEN);
- }
- dummynet_io((i & 0xffff),DN_TO_BDG_FWD,m,real_dst,NULL,0,rule,0);
- return m0 ;
- }
-#endif
- /*
- * XXX add divert/forward actions...
- */
- /* if none of the above matches, we have to drop the pkt */
- bdg_ipfw_drops++ ;
- printf("bdg_forward: No rules match, so dropping packet!\n");
- return m0 ;
- }
-forward:
-#endif /* IPFIREWALL */
- /*
- * Again, bring up the headers in case of shared bufs to avoid
- * corruptions in the future.
- */
- if ( shared ) {
- int i = min(m0->m_pkthdr.len, max_protohdr) ;
-
- m0 = m_pullup(m0, i) ;
- if (m0 == NULL) {
- printf("-- bdg: pullup2 failed.\n") ;
- return NULL ;
- }
- }
- /* now real_dst is used to determine the cluster where to forward */
- if (src != NULL) /* pkt comes from ether_input */
- real_dst = src ;
- for (;;) {
- if (last) { /* need to forward packet leftover from previous loop */
- struct mbuf *m ;
- if (shared == 0 && once ) { /* no need to copy */
- m = m0 ;
- m0 = NULL ; /* original is gone */
- } else {
- m = m_copypacket(m0, M_DONTWAIT);
- if (m == NULL) {
- printf("bdg_forward: sorry, m_copypacket failed!\n");
- return m0 ; /* the original is still there... */
- }
- }
- /*
- * Add header (optimized for the common case of eh pointing
- * already into the mbuf) and execute last part of ether_output:
- * queue pkt and start output if interface not yet active.
- */
- if ( (void *)(eh + 1) == (void *)m->m_data) {
- m->m_data -= ETHER_HDR_LEN ;
- m->m_len += ETHER_HDR_LEN ;
- m->m_pkthdr.len += ETHER_HDR_LEN ;
- bdg_predict++;
- } else {
- M_PREPEND(m, ETHER_HDR_LEN, M_DONTWAIT);
- if (!m && verbose) printf("M_PREPEND failed\n");
- if (m == NULL)
- return m0;
- bcopy(&save_eh, mtod(m, struct ether_header *), ETHER_HDR_LEN);
- }
- s = splimp();
- if (IF_QFULL(&last->if_snd)) {
- IF_DROP(&last->if_snd);
-#if 0
- BDG_MUTE(last); /* should I also mute ? */
-#endif
- splx(s);
- m_freem(m); /* consume the pkt anyways */
- } else {
- last->if_obytes += m->m_pkthdr.len ;
- if (m->m_flags & M_MCAST)
- last->if_omcasts++;
- if (m->m_pkthdr.len != m->m_len) /* this pkt is on >1 bufs */
- bdg_split_pkts++;
-
- IF_ENQUEUE(&last->if_snd, m);
- if ((last->if_flags & IFF_OACTIVE) == 0)
- (*last->if_start)(last);
- splx(s);
- }
- BDG_STAT(last, BDG_OUT);
- last = NULL ;
- if (once)
- break ;
- }
- if (ifp == NULL)
- break ;
- /*
- * If the interface is used for bridging, not muted, not full,
- * up and running, is not the source interface, and belongs to
- * the same cluster as the 'real_dst', then send here.
- */
- if ( BDG_USED(ifp) && !BDG_MUTED(ifp) && !IF_QFULL(&ifp->if_snd) &&
- (ifp->if_flags & (IFF_UP|IFF_RUNNING)) == (IFF_UP|IFF_RUNNING) &&
- ifp != src && BDG_SAMECLUSTER(ifp, real_dst) )
- last = ifp ;
- ifp = TAILQ_NEXT(ifp, if_link) ;
- if (ifp == NULL)
- once = 1 ;
- }
- DEB(bdg_fw_ticks += (u_int32_t)(rdtsc() - ticks) ; bdg_fw_count++ ;
- if (bdg_fw_count != 0) bdg_fw_avg = bdg_fw_ticks/bdg_fw_count; )
- return m0 ;
-}
+++ /dev/null
-/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * Copyright (c) 1998 Luigi Rizzo
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- */
-#ifndef _NET_BRIDGE_H_
-#define _NET_BRIDGE_H_
-#include <sys/appleapiopts.h>
-
-#warning This is not used by Darwin, do not include
-
-extern int do_bridge;
-/*
- * the hash table for bridge
- */
-typedef struct hash_table {
- struct ifnet *name ;
- unsigned char etheraddr[6] ;
- unsigned short used ;
-} bdg_hash_table ;
-
-extern bdg_hash_table *bdg_table ;
-
-/*
- * We need additional info for the bridge. The bdg_ifp2sc[] array
- * provides a pointer to this struct using the if_index.
- * bdg_softc has a backpointer to the struct ifnet, the bridge
- * flags, and a cluster (bridging occurs only between port of the
- * same cluster).
- */
-struct bdg_softc {
- struct ifnet *ifp ;
- /* also ((struct arpcom *)ifp)->ac_enaddr is the eth. addr */
- int flags ;
-#define IFF_BDG_PROMISC 0x0001 /* set promisc mode on this if. */
-#define IFF_MUTE 0x0002 /* mute this if for bridging. */
-#define IFF_USED 0x0004 /* use this if for bridging. */
- short cluster_id ; /* in network format */
- uint32_t magic;
-} ;
-
-extern struct bdg_softc *ifp2sc;
-
-#define BDG_USED(ifp) (ifp2sc[ifp->if_index].flags & IFF_USED)
-#define BDG_MUTED(ifp) (ifp2sc[ifp->if_index].flags & IFF_MUTE)
-#define BDG_MUTE(ifp) ifp2sc[ifp->if_index].flags |= IFF_MUTE
-#define BDG_UNMUTE(ifp) ifp2sc[ifp->if_index].flags &= ~IFF_MUTE
-#define BDG_CLUSTER(ifp) (ifp2sc[ifp->if_index].cluster_id)
-
-#define BDG_SAMECLUSTER(ifp,src) \
- (src == NULL || BDG_CLUSTER(ifp) == BDG_CLUSTER(src) )
-
-
-#define BDG_MAX_PORTS 128
-typedef struct _bdg_addr {
- unsigned char etheraddr[6] ;
- short cluster_id ;
-} bdg_addr ;
-extern bdg_addr bdg_addresses[BDG_MAX_PORTS];
-extern int bdg_ports ;
-
-/*
- * out of the 6 bytes, the last ones are more "variable". Since
- * we are on a little endian machine, we have to do some gimmick...
- */
-#define HASH_SIZE 8192 /* must be a power of 2 */
-#define HASH_FN(addr) ( \
- ntohs( ((short *)addr)[1] ^ ((short *)addr)[2] ) & (HASH_SIZE -1))
-
-#define IFF_MUTE IFF_LINK2 /* will need a separate flag... */
-
-struct ifnet *bridge_in(struct ifnet *ifp, struct ether_header *eh);
-/* bdg_forward frees the mbuf if necessary, returning null */
-struct mbuf *bdg_forward(struct mbuf *m0, struct ether_header *eh, struct ifnet *dst);
-
-#ifdef __i386__
-#define BDG_MATCH(a,b) ( \
- ((unsigned short *)(a))[2] == ((unsigned short *)(b))[2] && \
- *((unsigned int *)(a)) == *((unsigned int *)(b)) )
-#define IS_ETHER_BROADCAST(a) ( \
- *((unsigned int *)(a)) == 0xffffffff && \
- ((unsigned short *)(a))[2] == 0xffff )
-#else
-#warning... must complete these for the alpha etc.
-#define BDG_MATCH(a,b) (!bcmp(a, b, ETHER_ADDR_LEN) )
-#endif
-/*
- * The following constants are not legal ifnet pointers, and are used
- * as return values from the classifier, bridge_dst_lookup()
- * The same values are used as index in the statistics arrays,
- * with BDG_FORWARD replacing specifically forwarded packets.
- */
-#define BDG_BCAST ( (struct ifnet *)1 )
-#define BDG_MCAST ( (struct ifnet *)2 )
-#define BDG_LOCAL ( (struct ifnet *)3 )
-#define BDG_DROP ( (struct ifnet *)4 )
-#define BDG_UNKNOWN ( (struct ifnet *)5 )
-#define BDG_IN ( (struct ifnet *)7 )
-#define BDG_OUT ( (struct ifnet *)8 )
-#define BDG_FORWARD ( (struct ifnet *)9 )
-
-#define PF_BDG 3 /* XXX superhack */
-/*
- * statistics, passed up with sysctl interface and ns -p bdg
- */
-
-#define STAT_MAX (int)BDG_FORWARD
-struct bdg_port_stat {
- char name[16];
- uint32_t collisions;
- uint32_t p_in[STAT_MAX+1];
-} ;
-
-struct bdg_stats {
- struct bdg_port_stat s[16];
-} ;
-
-
-#define BDG_STAT(ifp, type) bdg_stats.s[ifp->if_index].p_in[(int)type]++
-
-#ifdef KERNEL
-/*
- * Find the right pkt destination:
- * BDG_BCAST is a broadcast
- * BDG_MCAST is a multicast
- * BDG_LOCAL is for a local address
- * BDG_DROP must be dropped
- * other ifp of the dest. interface (incl.self)
- *
- * We assume this is only called for interfaces for which bridging
- * is enabled, i.e. BDG_USED(ifp) is true.
- */
-static __inline
-struct ifnet *
-bridge_dst_lookup(struct ether_header *eh)
-{
- struct ifnet *dst ;
- int index ;
- bdg_addr *p ;
-
- if (IS_ETHER_BROADCAST(eh->ether_dhost))
- return BDG_BCAST ;
- if (eh->ether_dhost[0] & 1)
- return BDG_MCAST ;
- /*
- * Lookup local addresses in case one matches.
- */
- for (index = bdg_ports, p = bdg_addresses ; index ; index--, p++ )
- if (BDG_MATCH(p->etheraddr, eh->ether_dhost) )
- return BDG_LOCAL ;
- /*
- * Look for a possible destination in table
- */
- index= HASH_FN( eh->ether_dhost );
- dst = bdg_table[index].name;
- if ( dst && BDG_MATCH( bdg_table[index].etheraddr, eh->ether_dhost) )
- return dst ;
- else
- return BDG_UNKNOWN ;
-}
-
-#endif /* KERNEL */
-
-#endif /* _NET_BRIDGE_H_ */
--- /dev/null
+/*
+ * Copyright (c) 2007-2009 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/* $fpwf: Revision 1.2 2007/05/17 03:38:46 rnewberry Exp $ */
+/* $NetBSD: bridgestp.c,v 1.10 2006/11/16 01:33:40 christos Exp $ */
+
+/*
+ * Copyright (c) 2000 Jason L. Wright (jason@thought.net)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Jason L. Wright
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * OpenBSD: bridgestp.c,v 1.5 2001/03/22 03:48:29 jason Exp
+ */
+
+/*
+ * Implementation of the spanning tree protocol as defined in
+ * ISO/IEC Final DIS 15802-3 (IEEE P802.1D/D17), May 25, 1998.
+ * (In English: IEEE 802.1D, Draft 17, 1998)
+ */
+
+/* $NetBSD: if_bridgevar.h,v 1.8 2005/12/10 23:21:38 elad Exp $ */
+
+#include <sys/cdefs.h>
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/kernel.h>
+#include <sys/callout.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/if_llc.h>
+
+#include <net/if_ether.h>
+#include <net/if_bridgevar.h>
+#include <net/if_media.h>
+
+#include <net/kpi_interface.h>
+
+/* BPDU message types */
+#define BSTP_MSGTYPE_CFG 0x00 /* Configuration */
+#define BSTP_MSGTYPE_TCN 0x80 /* Topology chg notification */
+
+/* BPDU flags */
+#define BSTP_FLAG_TC 0x01 /* Topology change */
+#define BSTP_FLAG_TCA 0x80 /* Topology change ack */
+
+#define BSTP_MESSAGE_AGE_INCR (1 * 256) /* in 256ths of a second */
+#define BSTP_TICK_VAL (1 * 256) /* in 256ths of a second */
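+
+/*
+ * All spanning tree timer values here are kept in 1/256ths of a second, the
+ * unit 802.1D uses on the wire.  bstp_tick() fires once a second and
+ * bstp_timer_expired() advances each active timer by BSTP_TICK_VAL (256),
+ * so, for example, a 2 second hello time is stored as 2 * 256 = 512.
+ */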
+
+/*
+ * Because BPDUs do not make nicely aligned structures, two different
+ * declarations are used: bstp_?bpdu (wire representation, packed) and
+ * bstp_*_unit (internal, nicely aligned version).
+ */
+
+/* configuration bridge protocol data unit */
+struct bstp_cbpdu {
+ uint8_t cbu_dsap; /* LLC: destination sap */
+ uint8_t cbu_ssap; /* LLC: source sap */
+ uint8_t cbu_ctl; /* LLC: control */
+ uint16_t cbu_protoid; /* protocol id */
+ uint8_t cbu_protover; /* protocol version */
+ uint8_t cbu_bpdutype; /* message type */
+ uint8_t cbu_flags; /* flags (below) */
+
+ /* root id */
+ uint16_t cbu_rootpri; /* root priority */
+ uint8_t cbu_rootaddr[6]; /* root address */
+
+ uint32_t cbu_rootpathcost; /* root path cost */
+
+ /* bridge id */
+ uint16_t cbu_bridgepri; /* bridge priority */
+ uint8_t cbu_bridgeaddr[6]; /* bridge address */
+
+ uint16_t cbu_portid; /* port id */
+ uint16_t cbu_messageage; /* current message age */
+ uint16_t cbu_maxage; /* maximum age */
+ uint16_t cbu_hellotime; /* hello time */
+ uint16_t cbu_forwarddelay; /* forwarding delay */
+} __attribute__((__packed__));
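+
+/*
+ * Packed, this is 38 bytes on the wire: the 35-byte 802.1D configuration
+ * BPDU preceded by the 3-byte LLC header (dsap/ssap/ctl).
+ */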
+
+/* topology change notification bridge protocol data unit */
+struct bstp_tbpdu {
+ uint8_t tbu_dsap; /* LLC: destination sap */
+ uint8_t tbu_ssap; /* LLC: source sap */
+ uint8_t tbu_ctl; /* LLC: control */
+ uint16_t tbu_protoid; /* protocol id */
+ uint8_t tbu_protover; /* protocol version */
+ uint8_t tbu_bpdutype; /* message type */
+} __attribute__((__packed__));
+
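+/*
+ * All BPDUs are addressed to the IEEE 802.1D bridge group multicast
+ * address, 01:80:c2:00:00:00.
+ */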
+const uint8_t bstp_etheraddr[] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
+
+void bstp_initialize_port(struct bridge_softc *, struct bridge_iflist *);
+void bstp_ifupdstatus(struct bridge_softc *, struct bridge_iflist *);
+void bstp_enable_port(struct bridge_softc *, struct bridge_iflist *);
+void bstp_disable_port(struct bridge_softc *, struct bridge_iflist *);
+void bstp_enable_change_detection(struct bridge_iflist *);
+void bstp_disable_change_detection(struct bridge_iflist *);
+int bstp_root_bridge(struct bridge_softc *sc);
+int bstp_supersedes_port_info(struct bridge_softc *,
+ struct bridge_iflist *, struct bstp_config_unit *);
+int bstp_designated_port(struct bridge_softc *, struct bridge_iflist *);
+int bstp_designated_for_some_port(struct bridge_softc *);
+void bstp_transmit_config(struct bridge_softc *, struct bridge_iflist *);
+void bstp_transmit_tcn(struct bridge_softc *);
+void bstp_received_config_bpdu(struct bridge_softc *,
+ struct bridge_iflist *, struct bstp_config_unit *);
+void bstp_received_tcn_bpdu(struct bridge_softc *, struct bridge_iflist *,
+ struct bstp_tcn_unit *);
+void bstp_record_config_information(struct bridge_softc *,
+ struct bridge_iflist *, struct bstp_config_unit *);
+void bstp_record_config_timeout_values(struct bridge_softc *,
+ struct bstp_config_unit *);
+void bstp_config_bpdu_generation(struct bridge_softc *);
+void bstp_send_config_bpdu(struct bridge_softc *, struct bridge_iflist *,
+ struct bstp_config_unit *);
+void bstp_configuration_update(struct bridge_softc *);
+void bstp_root_selection(struct bridge_softc *);
+void bstp_designated_port_selection(struct bridge_softc *);
+void bstp_become_designated_port(struct bridge_softc *,
+ struct bridge_iflist *);
+void bstp_port_state_selection(struct bridge_softc *);
+void bstp_make_forwarding(struct bridge_softc *, struct bridge_iflist *);
+void bstp_make_blocking(struct bridge_softc *, struct bridge_iflist *);
+void bstp_set_port_state(struct bridge_iflist *, uint8_t);
+void bstp_set_bridge_priority(struct bridge_softc *, uint64_t);
+void bstp_set_port_priority(struct bridge_softc *, struct bridge_iflist *,
+ uint16_t);
+void bstp_set_path_cost(struct bridge_softc *, struct bridge_iflist *,
+ uint32_t);
+void bstp_topology_change_detection(struct bridge_softc *);
+void bstp_topology_change_acknowledged(struct bridge_softc *);
+void bstp_acknowledge_topology_change(struct bridge_softc *,
+ struct bridge_iflist *);
+
+void bstp_tick(void *);
+void bstp_timer_start(struct bridge_timer *, uint16_t);
+void bstp_timer_stop(struct bridge_timer *);
+int bstp_timer_expired(struct bridge_timer *, uint16_t);
+
+void bstp_hold_timer_expiry(struct bridge_softc *, struct bridge_iflist *);
+void bstp_message_age_timer_expiry(struct bridge_softc *,
+ struct bridge_iflist *);
+void bstp_forward_delay_timer_expiry(struct bridge_softc *,
+ struct bridge_iflist *);
+void bstp_topology_change_timer_expiry(struct bridge_softc *);
+void bstp_tcn_timer_expiry(struct bridge_softc *);
+void bstp_hello_timer_expiry(struct bridge_softc *);
+
+void
+bstp_transmit_config(struct bridge_softc *sc, struct bridge_iflist *bif)
+{
+ if (bif->bif_hold_timer.active) {
+ bif->bif_config_pending = 1;
+ return;
+ }
+
+ bif->bif_config_bpdu.cu_message_type = BSTP_MSGTYPE_CFG;
+ bif->bif_config_bpdu.cu_rootid = sc->sc_designated_root;
+ bif->bif_config_bpdu.cu_root_path_cost = sc->sc_root_path_cost;
+ bif->bif_config_bpdu.cu_bridge_id = sc->sc_bridge_id;
+ bif->bif_config_bpdu.cu_port_id = bif->bif_port_id;
+
+ if (bstp_root_bridge(sc))
+ bif->bif_config_bpdu.cu_message_age = 0;
+ else
+ bif->bif_config_bpdu.cu_message_age =
+ sc->sc_root_port->bif_message_age_timer.value +
+ BSTP_MESSAGE_AGE_INCR;
+
+ bif->bif_config_bpdu.cu_max_age = sc->sc_max_age;
+ bif->bif_config_bpdu.cu_hello_time = sc->sc_hello_time;
+ bif->bif_config_bpdu.cu_forward_delay = sc->sc_forward_delay;
+ bif->bif_config_bpdu.cu_topology_change_acknowledgment
+ = bif->bif_topology_change_acknowledge;
+ bif->bif_config_bpdu.cu_topology_change = sc->sc_topology_change;
+
+ if (bif->bif_config_bpdu.cu_message_age < sc->sc_max_age) {
+ bif->bif_topology_change_acknowledge = 0;
+ bif->bif_config_pending = 0;
+ bstp_send_config_bpdu(sc, bif, &bif->bif_config_bpdu);
+ bstp_timer_start(&bif->bif_hold_timer, 0);
+ }
+}
+
+void
+bstp_send_config_bpdu(struct bridge_softc *sc, struct bridge_iflist *bif,
+ struct bstp_config_unit *cu)
+{
+ struct ifnet *ifp;
+ struct mbuf *m;
+ struct ether_header *eh;
+ struct bstp_cbpdu bpdu;
+
+ ifp = bif->bif_ifp;
+
+ if ((ifp->if_flags & IFF_RUNNING) == 0)
+ return;
+
+ MGETHDR(m, M_DONTWAIT, MT_DATA);
+ if (m == NULL)
+ return;
+
+ eh = mtod(m, struct ether_header *);
+
+ m->m_pkthdr.rcvif = ifp;
+ m->m_pkthdr.len = sizeof(*eh) + sizeof(bpdu);
+ m->m_len = m->m_pkthdr.len;
+
+ bpdu.cbu_ssap = bpdu.cbu_dsap = LLC_8021D_LSAP;
+ bpdu.cbu_ctl = LLC_UI;
+ bpdu.cbu_protoid = htons(0);
+ bpdu.cbu_protover = 0;
+ bpdu.cbu_bpdutype = cu->cu_message_type;
+ bpdu.cbu_flags = (cu->cu_topology_change ? BSTP_FLAG_TC : 0) |
+ (cu->cu_topology_change_acknowledgment ? BSTP_FLAG_TCA : 0);
+
+ bpdu.cbu_rootpri = htons(cu->cu_rootid >> 48);
+ bpdu.cbu_rootaddr[0] = cu->cu_rootid >> 40;
+ bpdu.cbu_rootaddr[1] = cu->cu_rootid >> 32;
+ bpdu.cbu_rootaddr[2] = cu->cu_rootid >> 24;
+ bpdu.cbu_rootaddr[3] = cu->cu_rootid >> 16;
+ bpdu.cbu_rootaddr[4] = cu->cu_rootid >> 8;
+ bpdu.cbu_rootaddr[5] = cu->cu_rootid >> 0;
+
+ bpdu.cbu_rootpathcost = htonl(cu->cu_root_path_cost);
+
+ bpdu.cbu_bridgepri = htons(cu->cu_bridge_id >> 48);
+ bpdu.cbu_bridgeaddr[0] = cu->cu_bridge_id >> 40;
+ bpdu.cbu_bridgeaddr[1] = cu->cu_bridge_id >> 32;
+ bpdu.cbu_bridgeaddr[2] = cu->cu_bridge_id >> 24;
+ bpdu.cbu_bridgeaddr[3] = cu->cu_bridge_id >> 16;
+ bpdu.cbu_bridgeaddr[4] = cu->cu_bridge_id >> 8;
+ bpdu.cbu_bridgeaddr[5] = cu->cu_bridge_id >> 0;
+
+ bpdu.cbu_portid = htons(cu->cu_port_id);
+ bpdu.cbu_messageage = htons(cu->cu_message_age);
+ bpdu.cbu_maxage = htons(cu->cu_max_age);
+ bpdu.cbu_hellotime = htons(cu->cu_hello_time);
+ bpdu.cbu_forwarddelay = htons(cu->cu_forward_delay);
+
+ memcpy(eh->ether_shost, ifnet_lladdr(ifp), ETHER_ADDR_LEN);
+ memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN);
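+ /*
+ * BPDUs use 802.3/LLC encapsulation: the type/length field carries the
+ * length of the LLC PDU that follows (sizeof(bpdu), well under 1536),
+ * so receivers treat it as a length rather than an Ethertype.
+ */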
+ eh->ether_type = htons(sizeof(bpdu));
+
+ memcpy(mtod(m, caddr_t) + sizeof(*eh), &bpdu, sizeof(bpdu));
+
+ bridge_enqueue(sc, ifp, m); // APPLE MODIFICATION - no flags param
+}
+
+int
+bstp_root_bridge(struct bridge_softc *sc)
+{
+ return (sc->sc_designated_root == sc->sc_bridge_id);
+}
+
+int
+bstp_supersedes_port_info(struct bridge_softc *sc, struct bridge_iflist *bif,
+ struct bstp_config_unit *cu)
+{
+ if (cu->cu_rootid < bif->bif_designated_root)
+ return (1);
+ if (cu->cu_rootid > bif->bif_designated_root)
+ return (0);
+
+ if (cu->cu_root_path_cost < bif->bif_designated_cost)
+ return (1);
+ if (cu->cu_root_path_cost > bif->bif_designated_cost)
+ return (0);
+
+ if (cu->cu_bridge_id < bif->bif_designated_bridge)
+ return (1);
+ if (cu->cu_bridge_id > bif->bif_designated_bridge)
+ return (0);
+
+ if (sc->sc_bridge_id != cu->cu_bridge_id)
+ return (1);
+ if (cu->cu_port_id <= bif->bif_designated_port)
+ return (1);
+ return (0);
+}
+
+void
+bstp_record_config_information(__unused struct bridge_softc *sc,
+ struct bridge_iflist *bif, struct bstp_config_unit *cu)
+{
+ bif->bif_designated_root = cu->cu_rootid;
+ bif->bif_designated_cost = cu->cu_root_path_cost;
+ bif->bif_designated_bridge = cu->cu_bridge_id;
+ bif->bif_designated_port = cu->cu_port_id;
+ bstp_timer_start(&bif->bif_message_age_timer, cu->cu_message_age);
+}
+
+void
+bstp_record_config_timeout_values(struct bridge_softc *sc,
+ struct bstp_config_unit *config)
+{
+ sc->sc_max_age = config->cu_max_age;
+ sc->sc_hello_time = config->cu_hello_time;
+ sc->sc_forward_delay = config->cu_forward_delay;
+ sc->sc_topology_change = config->cu_topology_change;
+}
+
+void
+bstp_config_bpdu_generation(struct bridge_softc *sc)
+{
+ struct bridge_iflist *bif;
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ if ((bif->bif_flags & IFBIF_STP) == 0)
+ continue;
+ if (bstp_designated_port(sc, bif) &&
+ (bif->bif_state != BSTP_IFSTATE_DISABLED))
+ bstp_transmit_config(sc, bif);
+ }
+}
+
+int
+bstp_designated_port(struct bridge_softc *sc, struct bridge_iflist *bif)
+{
+ return ((bif->bif_designated_bridge == sc->sc_bridge_id)
+ && (bif->bif_designated_port == bif->bif_port_id));
+}
+
+void
+bstp_transmit_tcn(struct bridge_softc *sc)
+{
+ struct bstp_tbpdu bpdu;
+ struct bridge_iflist *bif = sc->sc_root_port;
+ struct ifnet *ifp;
+ struct ether_header *eh;
+ struct mbuf *m;
+
+ KASSERT(bif != NULL, "bstp_transmit_tcn bif NULL");
+ ifp = bif->bif_ifp;
+ if ((ifp->if_flags & IFF_RUNNING) == 0)
+ return;
+
+ MGETHDR(m, M_DONTWAIT, MT_DATA);
+ if (m == NULL)
+ return;
+
+ m->m_pkthdr.rcvif = ifp;
+ m->m_pkthdr.len = sizeof(*eh) + sizeof(bpdu);
+ m->m_len = m->m_pkthdr.len;
+
+ eh = mtod(m, struct ether_header *);
+
+ memcpy(eh->ether_shost, ifnet_lladdr(ifp), ETHER_ADDR_LEN);
+ memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN);
+ eh->ether_type = htons(sizeof(bpdu));
+
+ bpdu.tbu_ssap = bpdu.tbu_dsap = LLC_8021D_LSAP;
+ bpdu.tbu_ctl = LLC_UI;
+ bpdu.tbu_protoid = 0;
+ bpdu.tbu_protover = 0;
+ bpdu.tbu_bpdutype = BSTP_MSGTYPE_TCN;
+
+ memcpy(mtod(m, caddr_t) + sizeof(*eh), &bpdu, sizeof(bpdu));
+
+ bridge_enqueue(sc, ifp, m); // APPLE MODIFICATION - no flags param
+}
+
+void
+bstp_configuration_update(struct bridge_softc *sc)
+{
+ bstp_root_selection(sc);
+ bstp_designated_port_selection(sc);
+}
+
+void
+bstp_root_selection(struct bridge_softc *sc)
+{
+ struct bridge_iflist *root_port = NULL, *bif;
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ if ((bif->bif_flags & IFBIF_STP) == 0)
+ continue;
+ if (bstp_designated_port(sc, bif))
+ continue;
+ if (bif->bif_state == BSTP_IFSTATE_DISABLED)
+ continue;
+ if (bif->bif_designated_root >= sc->sc_bridge_id)
+ continue;
+ if (root_port == NULL)
+ goto set_port;
+
+ if (bif->bif_designated_root < root_port->bif_designated_root)
+ goto set_port;
+ if (bif->bif_designated_root > root_port->bif_designated_root)
+ continue;
+
+ if ((bif->bif_designated_cost + bif->bif_path_cost) <
+ (root_port->bif_designated_cost + root_port->bif_path_cost))
+ goto set_port;
+ if ((bif->bif_designated_cost + bif->bif_path_cost) >
+ (root_port->bif_designated_cost + root_port->bif_path_cost))
+ continue;
+
+ if (bif->bif_designated_bridge <
+ root_port->bif_designated_bridge)
+ goto set_port;
+ if (bif->bif_designated_bridge >
+ root_port->bif_designated_bridge)
+ continue;
+
+ if (bif->bif_designated_port < root_port->bif_designated_port)
+ goto set_port;
+ if (bif->bif_designated_port > root_port->bif_designated_port)
+ continue;
+
+ if (bif->bif_port_id >= root_port->bif_port_id)
+ continue;
+set_port:
+ root_port = bif;
+ }
+
+ sc->sc_root_port = root_port;
+ if (root_port == NULL) {
+ sc->sc_designated_root = sc->sc_bridge_id;
+ sc->sc_root_path_cost = 0;
+ } else {
+ sc->sc_designated_root = root_port->bif_designated_root;
+ sc->sc_root_path_cost = root_port->bif_designated_cost +
+ root_port->bif_path_cost;
+ }
+}
+
+void
+bstp_designated_port_selection(struct bridge_softc *sc)
+{
+ struct bridge_iflist *bif;
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ if ((bif->bif_flags & IFBIF_STP) == 0)
+ continue;
+ if (bstp_designated_port(sc, bif))
+ goto designated;
+ if (bif->bif_designated_root != sc->sc_designated_root)
+ goto designated;
+
+ if (sc->sc_root_path_cost < bif->bif_designated_cost)
+ goto designated;
+ if (sc->sc_root_path_cost > bif->bif_designated_cost)
+ continue;
+
+ if (sc->sc_bridge_id < bif->bif_designated_bridge)
+ goto designated;
+ if (sc->sc_bridge_id > bif->bif_designated_bridge)
+ continue;
+
+ if (bif->bif_port_id > bif->bif_designated_port)
+ continue;
+designated:
+ bstp_become_designated_port(sc, bif);
+ }
+}
+
+void
+bstp_become_designated_port(struct bridge_softc *sc, struct bridge_iflist *bif)
+{
+ bif->bif_designated_root = sc->sc_designated_root;
+ bif->bif_designated_cost = sc->sc_root_path_cost;
+ bif->bif_designated_bridge = sc->sc_bridge_id;
+ bif->bif_designated_port = bif->bif_port_id;
+}
+
+void
+bstp_port_state_selection(struct bridge_softc *sc)
+{
+ struct bridge_iflist *bif;
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ if ((bif->bif_flags & IFBIF_STP) == 0)
+ continue;
+ if (bif == sc->sc_root_port) {
+ bif->bif_config_pending = 0;
+ bif->bif_topology_change_acknowledge = 0;
+ bstp_make_forwarding(sc, bif);
+ } else if (bstp_designated_port(sc, bif)) {
+ bstp_timer_stop(&bif->bif_message_age_timer);
+ bstp_make_forwarding(sc, bif);
+ } else {
+ bif->bif_config_pending = 0;
+ bif->bif_topology_change_acknowledge = 0;
+ bstp_make_blocking(sc, bif);
+ }
+ }
+}
+
+void
+bstp_make_forwarding(__unused struct bridge_softc *sc,
+ struct bridge_iflist *bif)
+{
+ if (bif->bif_state == BSTP_IFSTATE_BLOCKING) {
+ bstp_set_port_state(bif, BSTP_IFSTATE_LISTENING);
+ bstp_timer_start(&bif->bif_forward_delay_timer, 0);
+ }
+}
+
+void
+bstp_make_blocking(struct bridge_softc *sc, struct bridge_iflist *bif)
+{
+ if ((bif->bif_state != BSTP_IFSTATE_DISABLED) &&
+ (bif->bif_state != BSTP_IFSTATE_BLOCKING)) {
+ if ((bif->bif_state == BSTP_IFSTATE_FORWARDING) ||
+ (bif->bif_state == BSTP_IFSTATE_LEARNING)) {
+ if (bif->bif_change_detection_enabled) {
+ bstp_topology_change_detection(sc);
+ }
+ }
+ bstp_set_port_state(bif, BSTP_IFSTATE_BLOCKING);
+ bstp_timer_stop(&bif->bif_forward_delay_timer);
+ }
+}
+
+void
+bstp_set_port_state(struct bridge_iflist *bif, uint8_t state)
+{
+ bif->bif_state = state;
+}
+
+void
+bstp_topology_change_detection(struct bridge_softc *sc)
+{
+ if (bstp_root_bridge(sc)) {
+ sc->sc_topology_change = 1;
+ bstp_timer_start(&sc->sc_topology_change_timer, 0);
+ } else if (!sc->sc_topology_change_detected) {
+ bstp_transmit_tcn(sc);
+ bstp_timer_start(&sc->sc_tcn_timer, 0);
+ }
+ sc->sc_topology_change_detected = 1;
+}
+
+void
+bstp_topology_change_acknowledged(struct bridge_softc *sc)
+{
+ sc->sc_topology_change_detected = 0;
+ bstp_timer_stop(&sc->sc_tcn_timer);
+}
+
+void
+bstp_acknowledge_topology_change(struct bridge_softc *sc,
+ struct bridge_iflist *bif)
+{
+ bif->bif_topology_change_acknowledge = 1;
+ bstp_transmit_config(sc, bif);
+}
+
+__private_extern__ struct mbuf *
+bstp_input(struct bridge_softc *sc, struct ifnet *ifp, struct mbuf *m)
+{
+ struct bridge_iflist *bif = NULL;
+ struct ether_header *eh;
+ struct bstp_tbpdu tpdu;
+ struct bstp_cbpdu cpdu;
+ struct bstp_config_unit cu;
+ struct bstp_tcn_unit tu;
+ uint16_t len;
+
+ eh = mtod(m, struct ether_header *);
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ if ((bif->bif_flags & IFBIF_STP) == 0)
+ continue;
+ if (bif->bif_ifp == ifp)
+ break;
+ }
+ if (bif == NULL)
+ goto out;
+
+ len = ntohs(eh->ether_type);
+ if (len < sizeof(tpdu))
+ goto out;
+
+ m_adj(m, ETHER_HDR_LEN);
+
+ if (m->m_pkthdr.len > len)
+ m_adj(m, len - m->m_pkthdr.len);
+ if ((size_t)m->m_len < sizeof(tpdu) &&
+ (m = m_pullup(m, sizeof(tpdu))) == NULL)
+ goto out;
+
+ memcpy(&tpdu, mtod(m, caddr_t), sizeof(tpdu));
+
+ if (tpdu.tbu_dsap != LLC_8021D_LSAP ||
+ tpdu.tbu_ssap != LLC_8021D_LSAP ||
+ tpdu.tbu_ctl != LLC_UI)
+ goto out;
+ if (tpdu.tbu_protoid != 0 || tpdu.tbu_protover != 0)
+ goto out;
+
+ switch (tpdu.tbu_bpdutype) {
+ case BSTP_MSGTYPE_TCN:
+ tu.tu_message_type = tpdu.tbu_bpdutype;
+ bstp_received_tcn_bpdu(sc, bif, &tu);
+ break;
+ case BSTP_MSGTYPE_CFG:
+ if ((size_t)m->m_len < sizeof(cpdu) &&
+ (m = m_pullup(m, sizeof(cpdu))) == NULL)
+ goto out;
+ memcpy(&cpdu, mtod(m, caddr_t), sizeof(cpdu));
+
+ cu.cu_rootid =
+ (((uint64_t)ntohs(cpdu.cbu_rootpri)) << 48) |
+ (((uint64_t)cpdu.cbu_rootaddr[0]) << 40) |
+ (((uint64_t)cpdu.cbu_rootaddr[1]) << 32) |
+ (((uint64_t)cpdu.cbu_rootaddr[2]) << 24) |
+ (((uint64_t)cpdu.cbu_rootaddr[3]) << 16) |
+ (((uint64_t)cpdu.cbu_rootaddr[4]) << 8) |
+ (((uint64_t)cpdu.cbu_rootaddr[5]) << 0);
+
+ cu.cu_bridge_id =
+ (((uint64_t)ntohs(cpdu.cbu_bridgepri)) << 48) |
+ (((uint64_t)cpdu.cbu_bridgeaddr[0]) << 40) |
+ (((uint64_t)cpdu.cbu_bridgeaddr[1]) << 32) |
+ (((uint64_t)cpdu.cbu_bridgeaddr[2]) << 24) |
+ (((uint64_t)cpdu.cbu_bridgeaddr[3]) << 16) |
+ (((uint64_t)cpdu.cbu_bridgeaddr[4]) << 8) |
+ (((uint64_t)cpdu.cbu_bridgeaddr[5]) << 0);
+
+ cu.cu_root_path_cost = ntohl(cpdu.cbu_rootpathcost);
+ cu.cu_message_age = ntohs(cpdu.cbu_messageage);
+ cu.cu_max_age = ntohs(cpdu.cbu_maxage);
+ cu.cu_hello_time = ntohs(cpdu.cbu_hellotime);
+ cu.cu_forward_delay = ntohs(cpdu.cbu_forwarddelay);
+ cu.cu_port_id = ntohs(cpdu.cbu_portid);
+ cu.cu_message_type = cpdu.cbu_bpdutype;
+ cu.cu_topology_change_acknowledgment =
+ (cpdu.cbu_flags & BSTP_FLAG_TCA) ? 1 : 0;
+ cu.cu_topology_change =
+ (cpdu.cbu_flags & BSTP_FLAG_TC) ? 1 : 0;
+ bstp_received_config_bpdu(sc, bif, &cu);
+ break;
+ default:
+ goto out;
+ }
+
+ out:
+ if (m)
+ m_freem(m);
+ return (NULL);
+}
+
+void
+bstp_received_config_bpdu(struct bridge_softc *sc, struct bridge_iflist *bif,
+ struct bstp_config_unit *cu)
+{
+ int root;
+
+ root = bstp_root_bridge(sc);
+
+ if (bif->bif_state != BSTP_IFSTATE_DISABLED) {
+ if (bstp_supersedes_port_info(sc, bif, cu)) {
+ bstp_record_config_information(sc, bif, cu);
+ bstp_configuration_update(sc);
+ bstp_port_state_selection(sc);
+
+ if ((bstp_root_bridge(sc) == 0) && root) {
+ bstp_timer_stop(&sc->sc_hello_timer);
+
+ if (sc->sc_topology_change_detected) {
+ bstp_timer_stop(
+ &sc->sc_topology_change_timer);
+ bstp_transmit_tcn(sc);
+ bstp_timer_start(&sc->sc_tcn_timer, 0);
+ }
+ }
+
+ if (bif == sc->sc_root_port) {
+ bstp_record_config_timeout_values(sc, cu);
+ bstp_config_bpdu_generation(sc);
+
+ if (cu->cu_topology_change_acknowledgment)
+ bstp_topology_change_acknowledged(sc);
+ }
+ } else if (bstp_designated_port(sc, bif))
+ bstp_transmit_config(sc, bif);
+ }
+}
+
+void
+bstp_received_tcn_bpdu(struct bridge_softc *sc, struct bridge_iflist *bif,
+ __unused struct bstp_tcn_unit *tcn)
+{
+ if (bif->bif_state != BSTP_IFSTATE_DISABLED &&
+ bstp_designated_port(sc, bif)) {
+ bstp_topology_change_detection(sc);
+ bstp_acknowledge_topology_change(sc, bif);
+ }
+}
+
+void
+bstp_hello_timer_expiry(struct bridge_softc *sc)
+{
+ bstp_config_bpdu_generation(sc);
+ bstp_timer_start(&sc->sc_hello_timer, 0);
+}
+
+void
+bstp_message_age_timer_expiry(struct bridge_softc *sc,
+ struct bridge_iflist *bif)
+{
+ int root;
+
+ root = bstp_root_bridge(sc);
+ bstp_become_designated_port(sc, bif);
+ bstp_configuration_update(sc);
+ bstp_port_state_selection(sc);
+
+ if ((bstp_root_bridge(sc)) && (root == 0)) {
+ sc->sc_max_age = sc->sc_bridge_max_age;
+ sc->sc_hello_time = sc->sc_bridge_hello_time;
+ sc->sc_forward_delay = sc->sc_bridge_forward_delay;
+
+ bstp_topology_change_detection(sc);
+ bstp_timer_stop(&sc->sc_tcn_timer);
+ bstp_config_bpdu_generation(sc);
+ bstp_timer_start(&sc->sc_hello_timer, 0);
+ }
+}
+
+void
+bstp_forward_delay_timer_expiry(struct bridge_softc *sc,
+ struct bridge_iflist *bif)
+{
+ if (bif->bif_state == BSTP_IFSTATE_LISTENING) {
+ bstp_set_port_state(bif, BSTP_IFSTATE_LEARNING);
+ bstp_timer_start(&bif->bif_forward_delay_timer, 0);
+ } else if (bif->bif_state == BSTP_IFSTATE_LEARNING) {
+ bstp_set_port_state(bif, BSTP_IFSTATE_FORWARDING);
+ if (bstp_designated_for_some_port(sc) &&
+ bif->bif_change_detection_enabled)
+ bstp_topology_change_detection(sc);
+ }
+}
+
+int
+bstp_designated_for_some_port(struct bridge_softc *sc)
+{
+
+ struct bridge_iflist *bif;
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ if ((bif->bif_flags & IFBIF_STP) == 0)
+ continue;
+ if (bif->bif_designated_bridge == sc->sc_bridge_id)
+ return (1);
+ }
+ return (0);
+}
+
+void
+bstp_tcn_timer_expiry(struct bridge_softc *sc)
+{
+ bstp_transmit_tcn(sc);
+ bstp_timer_start(&sc->sc_tcn_timer, 0);
+}
+
+void
+bstp_topology_change_timer_expiry(struct bridge_softc *sc)
+{
+ sc->sc_topology_change_detected = 0;
+ sc->sc_topology_change = 0;
+}
+
+void
+bstp_hold_timer_expiry(struct bridge_softc *sc, struct bridge_iflist *bif)
+{
+ if (bif->bif_config_pending)
+ bstp_transmit_config(sc, bif);
+}
+
+__private_extern__ void
+bstp_initialization(struct bridge_softc *sc)
+{
+ struct bridge_iflist *bif, *mif;
+ struct timespec ts;
+ unsigned char *lladdr;
+
+ lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED);
+
+ mif = NULL;
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ if ((bif->bif_flags & IFBIF_STP) == 0)
+ continue;
+ if (bif->bif_ifp->if_type != IFT_ETHER)
+ continue;
+ bif->bif_port_id = (bif->bif_priority << 8) |
+ (bif->bif_ifp->if_index & 0xff);
+
+ if (mif == NULL) {
+ mif = bif;
+ continue;
+ }
+ if (memcmp(ifnet_lladdr(bif->bif_ifp),
+ ifnet_lladdr(mif->bif_ifp), ETHER_ADDR_LEN) < 0) {
+ mif = bif;
+ continue;
+ }
+ }
+ if (mif == NULL) {
+ bstp_stop(sc);
+ return;
+ }
+
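+ /*
+ * The 64-bit bridge ID is the 16-bit bridge priority in the upper bits
+ * followed by the 48-bit link-layer address of the member port with the
+ * numerically lowest MAC, chosen in the loop above.
+ */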
+ lladdr = ifnet_lladdr(mif->bif_ifp);
+ sc->sc_bridge_id =
+ (((uint64_t)sc->sc_bridge_priority) << 48) |
+ (((uint64_t)lladdr[0]) << 40) |
+ (((uint64_t)lladdr[1]) << 32) |
+ (lladdr[2] << 24) |
+ (lladdr[3] << 16) |
+ (lladdr[4] << 8) |
+ (lladdr[5]);
+
+ sc->sc_designated_root = sc->sc_bridge_id;
+ sc->sc_root_path_cost = 0;
+ sc->sc_root_port = NULL;
+
+ sc->sc_max_age = sc->sc_bridge_max_age;
+ sc->sc_hello_time = sc->sc_bridge_hello_time;
+ sc->sc_forward_delay = sc->sc_bridge_forward_delay;
+ sc->sc_topology_change_detected = 0;
+ sc->sc_topology_change = 0;
+ bstp_timer_stop(&sc->sc_tcn_timer);
+ bstp_timer_stop(&sc->sc_topology_change_timer);
+
+ bsd_untimeout(bstp_tick, sc);
+ ts.tv_sec = 1;
+ ts.tv_nsec = 0;
+ bsd_timeout(bstp_tick, sc, &ts);
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ if (bif->bif_flags & IFBIF_STP)
+ bstp_enable_port(sc, bif);
+ else
+ bstp_disable_port(sc, bif);
+ }
+
+ bstp_port_state_selection(sc);
+ bstp_config_bpdu_generation(sc);
+ bstp_timer_start(&sc->sc_hello_timer, 0);
+}
+
+__private_extern__ void
+bstp_stop(struct bridge_softc *sc)
+{
+ struct bridge_iflist *bif;
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ bstp_set_port_state(bif, BSTP_IFSTATE_DISABLED);
+ bstp_timer_stop(&bif->bif_hold_timer);
+ bstp_timer_stop(&bif->bif_message_age_timer);
+ bstp_timer_stop(&bif->bif_forward_delay_timer);
+ }
+
+ bsd_untimeout(bstp_tick, sc);
+
+ bstp_timer_stop(&sc->sc_topology_change_timer);
+ bstp_timer_stop(&sc->sc_tcn_timer);
+ bstp_timer_stop(&sc->sc_hello_timer);
+
+}
+
+void
+bstp_initialize_port(struct bridge_softc *sc, struct bridge_iflist *bif)
+{
+ bstp_become_designated_port(sc, bif);
+ bstp_set_port_state(bif, BSTP_IFSTATE_BLOCKING);
+ bif->bif_topology_change_acknowledge = 0;
+ bif->bif_config_pending = 0;
+ bif->bif_change_detection_enabled = 1;
+ bstp_timer_stop(&bif->bif_message_age_timer);
+ bstp_timer_stop(&bif->bif_forward_delay_timer);
+ bstp_timer_stop(&bif->bif_hold_timer);
+}
+
+void
+bstp_enable_port(struct bridge_softc *sc, struct bridge_iflist *bif)
+{
+ bstp_initialize_port(sc, bif);
+ bstp_port_state_selection(sc);
+}
+
+void
+bstp_disable_port(struct bridge_softc *sc, struct bridge_iflist *bif)
+{
+ int root;
+
+ root = bstp_root_bridge(sc);
+ bstp_become_designated_port(sc, bif);
+ bstp_set_port_state(bif, BSTP_IFSTATE_DISABLED);
+ bif->bif_topology_change_acknowledge = 0;
+ bif->bif_config_pending = 0;
+ bstp_timer_stop(&bif->bif_message_age_timer);
+ bstp_timer_stop(&bif->bif_forward_delay_timer);
+ bstp_configuration_update(sc);
+ bstp_port_state_selection(sc);
+
+ if (bstp_root_bridge(sc) && (root == 0)) {
+ sc->sc_max_age = sc->sc_bridge_max_age;
+ sc->sc_hello_time = sc->sc_bridge_hello_time;
+ sc->sc_forward_delay = sc->sc_bridge_forward_delay;
+
+ bstp_topology_change_detection(sc);
+ bstp_timer_stop(&sc->sc_tcn_timer);
+ bstp_config_bpdu_generation(sc);
+ bstp_timer_start(&sc->sc_hello_timer, 0);
+ }
+}
+
+void
+bstp_set_bridge_priority(struct bridge_softc *sc, uint64_t new_bridge_id)
+{
+ struct bridge_iflist *bif;
+ int root;
+
+ root = bstp_root_bridge(sc);
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ if ((bif->bif_flags & IFBIF_STP) == 0)
+ continue;
+ if (bstp_designated_port(sc, bif))
+ bif->bif_designated_bridge = new_bridge_id;
+ }
+
+ sc->sc_bridge_id = new_bridge_id;
+
+ bstp_configuration_update(sc);
+ bstp_port_state_selection(sc);
+
+ if (bstp_root_bridge(sc) && (root == 0)) {
+ sc->sc_max_age = sc->sc_bridge_max_age;
+ sc->sc_hello_time = sc->sc_bridge_hello_time;
+ sc->sc_forward_delay = sc->sc_bridge_forward_delay;
+
+ bstp_topology_change_detection(sc);
+ bstp_timer_stop(&sc->sc_tcn_timer);
+ bstp_config_bpdu_generation(sc);
+ bstp_timer_start(&sc->sc_hello_timer, 0);
+ }
+}
+
+void
+bstp_set_port_priority(struct bridge_softc *sc, struct bridge_iflist *bif,
+ uint16_t new_port_id)
+{
+ if (bstp_designated_port(sc, bif))
+ bif->bif_designated_port = new_port_id;
+
+ bif->bif_port_id = new_port_id;
+
+ if ((sc->sc_bridge_id == bif->bif_designated_bridge) &&
+ (bif->bif_port_id < bif->bif_designated_port)) {
+ bstp_become_designated_port(sc, bif);
+ bstp_port_state_selection(sc);
+ }
+}
+
+void
+bstp_set_path_cost(struct bridge_softc *sc, struct bridge_iflist *bif,
+ uint32_t path_cost)
+{
+ bif->bif_path_cost = path_cost;
+ bstp_configuration_update(sc);
+ bstp_port_state_selection(sc);
+}
+
+void
+bstp_enable_change_detection(struct bridge_iflist *bif)
+{
+ bif->bif_change_detection_enabled = 1;
+}
+
+void
+bstp_disable_change_detection(struct bridge_iflist *bif)
+{
+ bif->bif_change_detection_enabled = 0;
+}
+
+void
+bstp_ifupdstatus(struct bridge_softc *sc, struct bridge_iflist *bif)
+{
+ struct ifnet *ifp = bif->bif_ifp;
+ struct ifmediareq ifmr;
+
+ if ((ifnet_flags(ifp) & IFF_UP)) {
+ bzero(&ifmr, sizeof(ifmr));
+ if (ifnet_ioctl(ifp, 0, SIOCGIFMEDIA, &ifmr) == 0) {
+ // enable the port when the link is up, or its state is unknown
+ if ((ifmr.ifm_status & IFM_ACTIVE) || !(ifmr.ifm_status & IFM_AVALID)) {
+ if (bif->bif_state == BSTP_IFSTATE_DISABLED)
+ bstp_enable_port(sc, bif);
+ } else {
+ if (bif->bif_state != BSTP_IFSTATE_DISABLED)
+ bstp_disable_port(sc, bif);
+ }
+ }
+ return;
+ }
+
+ if (bif->bif_state != BSTP_IFSTATE_DISABLED)
+ bstp_disable_port(sc, bif);
+}
+
+void
+bstp_tick(void *arg)
+{
+ struct bridge_softc *sc = arg;
+ struct bridge_iflist *bif;
+ struct timespec ts;
+
+ lck_mtx_lock(sc->sc_mtx);
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ if ((bif->bif_flags & IFBIF_STP) == 0)
+ continue;
+ /*
+ * XXX This can cause a lag in "link goes away"
+ * XXX and "spanning tree gets updated". We need
+ * XXX some sort of callback from the link state
+ * XXX update code to kick spanning tree.
+ * XXX --thorpej@NetBSD.org
+ */
+ bstp_ifupdstatus(sc, bif);
+ }
+
+ if (bstp_timer_expired(&sc->sc_hello_timer, sc->sc_hello_time))
+ bstp_hello_timer_expiry(sc);
+
+ if (bstp_timer_expired(&sc->sc_tcn_timer, sc->sc_bridge_hello_time))
+ bstp_tcn_timer_expiry(sc);
+
+ if (bstp_timer_expired(&sc->sc_topology_change_timer,
+ sc->sc_topology_change_time))
+ bstp_topology_change_timer_expiry(sc);
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ if ((bif->bif_flags & IFBIF_STP) == 0)
+ continue;
+ if (bstp_timer_expired(&bif->bif_message_age_timer,
+ sc->sc_max_age))
+ bstp_message_age_timer_expiry(sc, bif);
+ }
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ if ((bif->bif_flags & IFBIF_STP) == 0)
+ continue;
+ if (bstp_timer_expired(&bif->bif_forward_delay_timer,
+ sc->sc_forward_delay))
+ bstp_forward_delay_timer_expiry(sc, bif);
+
+ if (bstp_timer_expired(&bif->bif_hold_timer,
+ sc->sc_hold_time))
+ bstp_hold_timer_expiry(sc, bif);
+ }
+
+ lck_mtx_unlock(sc->sc_mtx);
+
+ /* APPLE MODIFICATION - bridge changes */
+ if (ifnet_flags(sc->sc_if) & IFF_RUNNING) {
+ ts.tv_sec = 1;
+ ts.tv_nsec = 0;
+ bsd_timeout(bstp_tick, sc, &ts);
+ }
+}
+
+void
+bstp_timer_start(struct bridge_timer *t, uint16_t v)
+{
+ t->value = v;
+ t->active = 1;
+}
+
+void
+bstp_timer_stop(struct bridge_timer *t)
+{
+ t->value = 0;
+ t->active = 0;
+}
+
+int
+bstp_timer_expired(struct bridge_timer *t, uint16_t v)
+{
+ if (t->active == 0)
+ return (0);
+ t->value += BSTP_TICK_VAL;
+ if (t->value >= v) {
+ bstp_timer_stop(t);
+ return (1);
+ }
+ return (0);
+
+}
/*
- * Copyright (c) 1999-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 1999-2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
}
}
}
+
+ /*
+ * Strip away M_PROTO1 bit prior to sending packet up the stack as
+ * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
+ */
+ if (*m_p != NULL)
+ (*m_p)->m_flags &= ~M_PROTO1;
+
return (0);
}
}
}
-#if BRIDGE
- /* !!!LOCKING!!!
- *
- * Need to consider how to handle this.
- * Also note that return should be a goto cleanup
- */
- broken-locking
- if (do_bridge) {
- struct mbuf *m0 = m;
- struct ether_header *eh = mtod(m, struct ether_header *);
-
- if (m->m_pkthdr.rcvif)
- m->m_pkthdr.rcvif = NULL;
- ifp = bridge_dst_lookup(eh);
- bdg_forward(&m0, ifp);
- if (m0)
- m_freem(m0);
-
- return 0 - should be goto cleanup?
- }
-#endif
-
/*
* Let interface filters (if any) do their thing ...
*/
}
}
}
+ /*
+ * Strip away M_PROTO1 bit prior to sending packet to the driver
+ * as this field may be used by the driver
+ */
+ m->m_flags &= ~M_PROTO1;
/*
* Finally, call the driver.
m->m_pkthdr.rcvif = NULL;
}
-#if BRIDGE
- /* !!!LOCKING!!!
- *
- * Need to consider how to handle this.
- * Also note that return should be a goto cleanup
- */
- broken-locking
- if (do_bridge) {
- struct mbuf *m0 = m;
- struct ether_header *eh = mtod(m, struct ether_header *);
-
- if (m->m_pkthdr.rcvif)
- m->m_pkthdr.rcvif = NULL;
- ifp = bridge_dst_lookup(eh);
- bdg_forward(&m0, ifp);
- if (m0)
- m_freem(m0);
-
- return 0 - should be goto cleanup?
- }
-#endif
-
/*
* Let interface filters (if any) do their thing ...
*/
}
}
+ /*
+ * Strip away M_PROTO1 bit prior to sending packet to the driver
+ * as this field may be used by the driver
+ */
+ m->m_flags &= ~M_PROTO1;
+
/*
* If the underlying interface is not capable of handling a
* packet whose data portion spans across physically disjoint
/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000,2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#endif
-#if BRIDGE
-#include <net/bridge.h>
-#endif
-
/* #include "vlan.h" */
#if NVLAN > 0
#include <net/if_vlan_var.h>
/*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <sys/socketvar.h>
#include <net/if_vlan_var.h>
#include <net/if_bond_var.h>
+#if IF_BRIDGE
+#include <net/if_bridgevar.h>
+#endif
#include <net/dlil.h>
#endif
-#if BRIDGE
-#include <net/bridge.h>
-#endif
-
#define memcpy(x,y,z) bcopy(y, x, z)
#if BOND
bond_family_init();
#endif /* BOND */
+#if IF_BRIDGE
+ bridgeattach(0);
+#endif
done:
/*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
extern struct ifqueue pkintrq;
#endif
-
-#if BRIDGE
-#include <net/bridge.h>
-#endif
-
/* #include "vlan.h" */
#if NVLAN > 0
#include <net/if_vlan_var.h>
/*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <net/dlil.h>
-#if BRIDGE
-#include <net/bridge.h>
-#endif
-
/* #include "vlan.h" */
#if NVLAN > 0
#include <net/if_vlan_var.h>
/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000,2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#define ETHERTYPE_REVARP 0x8035 /* reverse Addr. resolution protocol */
#define ETHERTYPE_VLAN 0x8100 /* IEEE 802.1Q VLAN tagging */
#define ETHERTYPE_IPV6 0x86dd /* IPv6 */
+#define ETHERTYPE_PAE 0x888e /* EAPOL PAE/802.1x */
+#define ETHERTYPE_RSN_PREAUTH 0x88c7 /* 802.11i / RSN Pre-Authentication */
#define ETHERTYPE_LOOPBACK 0x9000 /* used to test interfaces */
/* XXX - add more useful types here */
#ifdef BSD_KERNEL_PRIVATE
extern u_char etherbroadcastaddr[ETHER_ADDR_LEN];
#endif
+
+#define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? */
+
#endif /* KERNEL_PRIVATE */
#ifndef KERNEL
/*
- * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#endif
#ifdef KERNEL_PRIVATE
+#define IF_MAXUNIT 0x7fff /* historical value */
+
struct if_clonereq {
int ifcr_total; /* total cloners (out) */
int ifcr_count; /* room for this many in user buffer */
#pragma pack()
#endif /* KERNEL_PRIVATE */
+
+#pragma pack(4)
+struct ifdrv {
+ char ifd_name[IFNAMSIZ]; /* if name, e.g. "en0" */
+ unsigned long ifd_cmd;
+ size_t ifd_len;
+ void *ifd_data;
+};
+#pragma pack()
+
+#ifdef KERNEL_PRIVATE
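+/*
+ * Fixed-width mirrors of struct ifdrv, used by the kernel to decode the
+ * same driver-specific ioctl argument from 32-bit and 64-bit user
+ * processes; ifd_data is carried as a user32_addr_t/user64_addr_t rather
+ * than a kernel pointer.
+ */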
+#pragma pack(4)
+struct ifdrv32 {
+ char ifd_name[IFNAMSIZ]; /* if name, e.g. "en0" */
+ u_int32_t ifd_cmd;
+ u_int32_t ifd_len;
+ user32_addr_t ifd_data;
+};
+
+struct ifdrv64 {
+ char ifd_name[IFNAMSIZ]; /* if name, e.g. "en0" */
+ u_int64_t ifd_cmd;
+ u_int64_t ifd_len;
+ user64_addr_t ifd_data;
+};
+#pragma pack()
+#endif /* KERNEL_PRIVATE */
+
/*
* Structure used to retrieve aux status data from interfaces.
* Kernel suppliers to this interface should respect the formatting
--- /dev/null
+/*
+ * Copyright (c) 2004-2009 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/* $apfw: Revision 1.19 2008/10/24 02:34:06 cbzimmer Exp $ */
+/* $NetBSD: if_bridge.c,v 1.46 2006/11/23 04:07:07 rpaulo Exp $ */
+
+/*
+ * Copyright 2001 Wasabi Systems, Inc.
+ * All rights reserved.
+ *
+ * Written by Jason R. Thorpe for Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed for the NetBSD Project by
+ * Wasabi Systems, Inc.
+ * 4. The name of Wasabi Systems, Inc. may not be used to endorse
+ * or promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Jason L. Wright
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * OpenBSD: if_bridge.c,v 1.60 2001/06/15 03:38:33 itojun Exp
+ */
+
+/*
+ * Network interface bridge support.
+ *
+ * TODO:
+ *
+ * - Currently only supports Ethernet-like interfaces (Ethernet,
+ * 802.11, VLANs on Ethernet, etc.) Figure out a nice way
+ * to bridge other types of interfaces (FDDI-FDDI, and maybe
+ * consider heterogenous bridges).
+ */
+
+#include <sys/cdefs.h>
+//_KERNEL_RCSID(0, "$NetBSD: if_bridge.c,v 1.46 2006/11/23 04:07:07 rpaulo Exp $");
+
+//#include "opt_bridge_ipf.h"
+//#include "opt_inet.h"
+//#include "opt_pfil_hooks.h"
+//#include "opt_wlan.h" /* APPLE MODIFICATION <cbz@apple.com> - Proxy STA support */
+//#include "bpfilter.h"
+//#include "gif.h" // APPLE MODIFICATION - add gif support
+
+#define BRIDGE_DEBUG 0
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/queue.h>
+#include <sys/socket.h>
+#include <sys/sockio.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+//#include <sys/pool.h>
+#include <sys/kauth.h>
+#include <sys/random.h>
+#include <sys/kern_event.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+
+#include <libkern/libkern.h>
+
+#include <kern/zalloc.h>
+
+#if NBPFILTER > 0
+#include <net/bpf.h>
+#endif
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/if_llc.h>
+
+#include <net/if_ether.h>
+#include <net/if_bridgevar.h>
+#include <net/dlil.h>
+
+#include <net/kpi_interfacefilter.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet6/in6_var.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#if BRIDGE_DEBUG
+#define static __private_extern__
+#endif
+
+extern void dlil_input_packet_list(struct ifnet *, struct mbuf *);
+
+/*
+ * Size of the route hash table. Must be a power of two.
+ */
+/* APPLE MODIFICATION - per Wasabi performance improvement, change the hash table size */
+#if 0
+#ifndef BRIDGE_RTHASH_SIZE
+#define BRIDGE_RTHASH_SIZE 1024
+#endif
+#else
+#ifndef BRIDGE_RTHASH_SIZE
+#define BRIDGE_RTHASH_SIZE 256
+#endif
+#endif
+
+/* APPLE MODIFICATION - support for HW checksums */
+#if APPLE_BRIDGE_HWCKSUM_SUPPORT
+#include <netinet/udp.h>
+#include <netinet/tcp.h>
+#endif
+
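+/*
+ * Because BRIDGE_RTHASH_SIZE is a power of two, a hash value is reduced to
+ * a table index with (hash & BRIDGE_RTHASH_MASK) rather than a modulo.
+ */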
+#define BRIDGE_RTHASH_MASK (BRIDGE_RTHASH_SIZE - 1)
+
+//#include "carp.h"
+#if NCARP > 0
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/ip_carp.h>
+#endif
+
+/*
+ * Maximum number of addresses to cache.
+ */
+#ifndef BRIDGE_RTABLE_MAX
+#define BRIDGE_RTABLE_MAX 100
+#endif
+
+/* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+/*
+ * Maximum (additional to maxcache) number of proxysta addresses to cache.
+ */
+#ifndef BRIDGE_RTABLE_MAX_PROXYSTA
+#define BRIDGE_RTABLE_MAX_PROXYSTA 16
+#endif
+#endif
+
+/*
+ * Spanning tree defaults.
+ */
+#define BSTP_DEFAULT_MAX_AGE (20 * 256)
+#define BSTP_DEFAULT_HELLO_TIME (2 * 256)
+#define BSTP_DEFAULT_FORWARD_DELAY (15 * 256)
+#define BSTP_DEFAULT_HOLD_TIME (1 * 256)
+#define BSTP_DEFAULT_BRIDGE_PRIORITY 0x8000
+#define BSTP_DEFAULT_PORT_PRIORITY 0x80
+#define BSTP_DEFAULT_PATH_COST 55
+
+/*
+ * Timeout (in seconds) for entries learned dynamically.
+ */
+#ifndef BRIDGE_RTABLE_TIMEOUT
+#define BRIDGE_RTABLE_TIMEOUT (20 * 60) /* same as ARP */
+#endif
+
+/*
+ * Number of seconds between walks of the route list.
+ */
+#ifndef BRIDGE_RTABLE_PRUNE_PERIOD
+#define BRIDGE_RTABLE_PRUNE_PERIOD (5 * 60)
+#endif
+
+/*
+ * List of capabilities to mask on the member interface.
+ */
+#define BRIDGE_IFCAPS_MASK \
+ (IFCAP_CSUM_IPv4_Tx | \
+ IFCAP_CSUM_TCPv4_Tx | \
+ IFCAP_CSUM_UDPv4_Tx | \
+ IFCAP_CSUM_TCPv6_Tx | \
+ IFCAP_CSUM_UDPv6_Tx)
+
+
+int bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD;
+
+static zone_t bridge_rtnode_pool = NULL;
+
+static errno_t
+bridge_iff_input(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol,
+ mbuf_t *data, char **frame_ptr);
+static void
+bridge_iff_event(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol,
+ const struct kev_msg *event_msg);
+static void
+bridge_iff_detached(void* cookie, __unused ifnet_t interface);
+
+static uint32_t
+bridge_rthash(__unused struct bridge_softc *sc, const uint8_t *addr);
+
+static int bridge_clone_create(struct if_clone *, int);
+static void bridge_clone_destroy(struct ifnet *);
+
+static errno_t bridge_ioctl(ifnet_t ifp, unsigned long cmd, void *data);
+#if HAS_IF_CAP
+static void bridge_mutecaps(struct bridge_iflist *, int);
+#endif
+static int bridge_init(struct ifnet *);
+static void bridge_stop(struct ifnet *, int);
+
+#if BRIDGE_MEMBER_OUT_FILTER
+static errno_t
+bridge_iff_output(void *cookie, ifnet_t ifp, protocol_family_t protocol, mbuf_t *data);
+static int bridge_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t m);
+#endif /* BRIDGE_MEMBER_OUT_FILTER */
+
+static errno_t bridge_start(struct ifnet *, mbuf_t);
+static errno_t bridge_set_bpf_tap(ifnet_t ifn, bpf_tap_mode mode, bpf_packet_func bpf_callback);
+__private_extern__ errno_t bridge_bpf_input(ifnet_t ifp, struct mbuf *m);
+__private_extern__ errno_t bridge_bpf_output(ifnet_t ifp, struct mbuf *m);
+
+static void bridge_detach(ifnet_t ifp);
+
+static errno_t bridge_input(struct bridge_iflist *, struct ifnet *, struct mbuf *, void *frame_header);
+
+static void bridge_forward(struct bridge_softc *, struct mbuf *m);
+
+static void bridge_timer(void *);
+
+static void bridge_broadcast(struct bridge_softc *, struct ifnet *,
+ struct mbuf *, int);
+
+static int bridge_rtupdate(struct bridge_softc *, const uint8_t *,
+ struct ifnet *, int, uint8_t);
+static struct ifnet *bridge_rtlookup(struct bridge_softc *, const uint8_t *);
+static void bridge_rttrim(struct bridge_softc *);
+static void bridge_rtage(struct bridge_softc *);
+static void bridge_rtflush(struct bridge_softc *, int);
+/* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+static void bridge_rtdiscovery(struct bridge_softc *);
+static void bridge_rtpurge(struct bridge_softc *, struct ifnet *);
+#endif
+static int bridge_rtdaddr(struct bridge_softc *, const uint8_t *);
+
+static int bridge_rtable_init(struct bridge_softc *);
+static void bridge_rtable_fini(struct bridge_softc *);
+
+static struct bridge_rtnode *bridge_rtnode_lookup(struct bridge_softc *,
+ const uint8_t *);
+static int bridge_rtnode_insert(struct bridge_softc *,
+ struct bridge_rtnode *);
+static void bridge_rtnode_destroy(struct bridge_softc *,
+ struct bridge_rtnode *);
+
+static struct bridge_iflist *bridge_lookup_member(struct bridge_softc *,
+ const char *name);
+static struct bridge_iflist *bridge_lookup_member_if(struct bridge_softc *,
+ struct ifnet *ifp);
+static void bridge_delete_member(struct bridge_softc *,
+ struct bridge_iflist *);
+
+static void bridge_ifdetach(struct bridge_iflist *bif, struct ifnet *ifp);
+
+
+static int bridge_ioctl_add(struct bridge_softc *, void *);
+static int bridge_ioctl_del(struct bridge_softc *, void *);
+/* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+static int bridge_ioctl_purge(struct bridge_softc *sc, void *arg);
+#endif
+static int bridge_ioctl_gifflags(struct bridge_softc *, void *);
+static int bridge_ioctl_sifflags(struct bridge_softc *, void *);
+static int bridge_ioctl_scache(struct bridge_softc *, void *);
+static int bridge_ioctl_gcache(struct bridge_softc *, void *);
+static int bridge_ioctl_gifs32(struct bridge_softc *, void *);
+static int bridge_ioctl_gifs64(struct bridge_softc *, void *);
+static int bridge_ioctl_rts32(struct bridge_softc *, void *);
+static int bridge_ioctl_rts64(struct bridge_softc *, void *);
+static int bridge_ioctl_saddr32(struct bridge_softc *, void *);
+static int bridge_ioctl_saddr64(struct bridge_softc *, void *);
+static int bridge_ioctl_sto(struct bridge_softc *, void *);
+static int bridge_ioctl_gto(struct bridge_softc *, void *);
+static int bridge_ioctl_daddr32(struct bridge_softc *, void *);
+static int bridge_ioctl_daddr64(struct bridge_softc *, void *);
+static int bridge_ioctl_flush(struct bridge_softc *, void *);
+static int bridge_ioctl_gpri(struct bridge_softc *, void *);
+static int bridge_ioctl_spri(struct bridge_softc *, void *);
+static int bridge_ioctl_ght(struct bridge_softc *, void *);
+static int bridge_ioctl_sht(struct bridge_softc *, void *);
+static int bridge_ioctl_gfd(struct bridge_softc *, void *);
+static int bridge_ioctl_sfd(struct bridge_softc *, void *);
+static int bridge_ioctl_gma(struct bridge_softc *, void *);
+static int bridge_ioctl_sma(struct bridge_softc *, void *);
+static int bridge_ioctl_sifprio(struct bridge_softc *, void *);
+static int bridge_ioctl_sifcost(struct bridge_softc *, void *);
+
+struct bridge_control {
+ int (*bc_func)(struct bridge_softc *, void *);
+ unsigned int bc_argsize;
+ unsigned int bc_flags;
+};
+
+#define BC_F_COPYIN 0x01 /* copy arguments in */
+#define BC_F_COPYOUT 0x02 /* copy arguments out */
+#define BC_F_SUSER 0x04 /* do super-user check */
+
+static const struct bridge_control bridge_control_table32[] = {
+ { bridge_ioctl_add, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_SUSER },
+ { bridge_ioctl_del, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_gifflags, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_COPYOUT },
+ { bridge_ioctl_sifflags, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_scache, sizeof(struct ifbrparam),
+ BC_F_COPYIN|BC_F_SUSER },
+ { bridge_ioctl_gcache, sizeof(struct ifbrparam),
+ BC_F_COPYOUT },
+
+ { bridge_ioctl_gifs32, sizeof(struct ifbifconf32),
+ BC_F_COPYIN|BC_F_COPYOUT },
+ { bridge_ioctl_rts32, sizeof(struct ifbaconf32),
+ BC_F_COPYIN|BC_F_COPYOUT },
+
+ { bridge_ioctl_saddr32, sizeof(struct ifbareq32),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_sto, sizeof(struct ifbrparam),
+ BC_F_COPYIN|BC_F_SUSER },
+ { bridge_ioctl_gto, sizeof(struct ifbrparam),
+ BC_F_COPYOUT },
+
+ { bridge_ioctl_daddr32, sizeof(struct ifbareq32),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_flush, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_gpri, sizeof(struct ifbrparam),
+ BC_F_COPYOUT },
+ { bridge_ioctl_spri, sizeof(struct ifbrparam),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_ght, sizeof(struct ifbrparam),
+ BC_F_COPYOUT },
+ { bridge_ioctl_sht, sizeof(struct ifbrparam),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_gfd, sizeof(struct ifbrparam),
+ BC_F_COPYOUT },
+ { bridge_ioctl_sfd, sizeof(struct ifbrparam),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_gma, sizeof(struct ifbrparam),
+ BC_F_COPYOUT },
+ { bridge_ioctl_sma, sizeof(struct ifbrparam),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_sifprio, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_sifcost, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ /* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+ { bridge_ioctl_purge, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_SUSER },
+#endif
+};
+
+static const struct bridge_control bridge_control_table64[] = {
+ { bridge_ioctl_add, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_SUSER },
+ { bridge_ioctl_del, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_gifflags, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_COPYOUT },
+ { bridge_ioctl_sifflags, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_scache, sizeof(struct ifbrparam),
+ BC_F_COPYIN|BC_F_SUSER },
+ { bridge_ioctl_gcache, sizeof(struct ifbrparam),
+ BC_F_COPYOUT },
+
+ { bridge_ioctl_gifs64, sizeof(struct ifbifconf64),
+ BC_F_COPYIN|BC_F_COPYOUT },
+ { bridge_ioctl_rts64, sizeof(struct ifbaconf64),
+ BC_F_COPYIN|BC_F_COPYOUT },
+
+ { bridge_ioctl_saddr64, sizeof(struct ifbareq64),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_sto, sizeof(struct ifbrparam),
+ BC_F_COPYIN|BC_F_SUSER },
+ { bridge_ioctl_gto, sizeof(struct ifbrparam),
+ BC_F_COPYOUT },
+
+ { bridge_ioctl_daddr64, sizeof(struct ifbareq64),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_flush, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_gpri, sizeof(struct ifbrparam),
+ BC_F_COPYOUT },
+ { bridge_ioctl_spri, sizeof(struct ifbrparam),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_ght, sizeof(struct ifbrparam),
+ BC_F_COPYOUT },
+ { bridge_ioctl_sht, sizeof(struct ifbrparam),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_gfd, sizeof(struct ifbrparam),
+ BC_F_COPYOUT },
+ { bridge_ioctl_sfd, sizeof(struct ifbrparam),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_gma, sizeof(struct ifbrparam),
+ BC_F_COPYOUT },
+ { bridge_ioctl_sma, sizeof(struct ifbrparam),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_sifprio, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ { bridge_ioctl_sifcost, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_SUSER },
+
+ /* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+ { bridge_ioctl_purge, sizeof(struct ifbreq),
+ BC_F_COPYIN|BC_F_SUSER },
+#endif
+};
+
+static const unsigned int bridge_control_table_size =
+sizeof(bridge_control_table32) / sizeof(bridge_control_table32[0]);
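+
+/*
+ * The 32-bit and 64-bit tables must be kept in lockstep: the shared entry
+ * count is computed from the 32-bit table, and the same command index is
+ * presumably used to pick the matching entry from whichever table suits
+ * the calling process.
+ */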
+
+static LIST_HEAD(, bridge_softc) bridge_list = LIST_HEAD_INITIALIZER(bridge_list);
+
+static lck_grp_t *bridge_lock_grp = NULL;
+static lck_attr_t *bridge_lock_attr = NULL;
+
+static lck_rw_t *bridge_list_lock = NULL;
+
+
+static struct if_clone bridge_cloner =
+ IF_CLONE_INITIALIZER("bridge",
+ bridge_clone_create,
+ bridge_clone_destroy,
+ 0,
+ IF_MAXUNIT);
+
+#if BRIDGE_DEBUG
+
+SYSCTL_DECL(_net_link);
+
+SYSCTL_NODE(_net_link, IFT_BRIDGE, bridge, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Bridge");
+
+__private_extern__ int _if_brige_debug = 0;
+
+SYSCTL_INT(_net_link_bridge, OID_AUTO, debug, CTLFLAG_RW,
+ &_if_brige_debug, 0, "Bridge debug");
+
+static void printf_ether_header(struct ether_header *eh);
+static void printf_mbuf_data(mbuf_t m, size_t offset, size_t len);
+static void printf_mbuf_pkthdr(mbuf_t m, const char *prefix, const char *suffix);
+static void printf_mbuf(mbuf_t m, const char *prefix, const char *suffix);
+static void link_print(struct sockaddr_dl * dl_p);
+
+void
+printf_mbuf_pkthdr(mbuf_t m, const char *prefix, const char *suffix)
+{
+ if (m)
+ printf("%spktlen: %u rcvif: %p header: %p nextpkt: %p%s",
+ prefix ? prefix : "",
+ (unsigned int)mbuf_pkthdr_len(m), mbuf_pkthdr_rcvif(m), mbuf_pkthdr_header(m), mbuf_nextpkt(m),
+ suffix ? suffix : "");
+ else
+ printf("%s<NULL>%s\n", prefix, suffix);
+}
+
+void
+printf_mbuf(mbuf_t m, const char *prefix, const char *suffix)
+{
+ if (m) {
+ printf("%s%p type: %u flags: 0x%x len: %u data: %p maxlen: %u datastart: %p next: %p%s",
+ prefix ? prefix : "",
+ m, mbuf_type(m), mbuf_flags(m), (unsigned int)mbuf_len(m), mbuf_data(m),
+ (unsigned int)mbuf_maxlen(m), mbuf_datastart(m), mbuf_next(m),
+ !suffix || (mbuf_flags(m) & MBUF_PKTHDR) ? "" : suffix);
+ if ((mbuf_flags(m) & MBUF_PKTHDR))
+ printf_mbuf_pkthdr(m, " ", suffix);
+ } else
+ printf("%s<NULL>%s\n", prefix, suffix);
+}
+
+void
+printf_mbuf_data(mbuf_t m, size_t offset, size_t len)
+{
+ mbuf_t n;
+ size_t i, j;
+ size_t pktlen, mlen, maxlen;
+ unsigned char *ptr;
+
+ pktlen = mbuf_pkthdr_len(m);
+
+ if (offset > pktlen)
+ return;
+
+ maxlen = (pktlen - offset > len) ? len : pktlen;
+ n = m;
+ mlen = mbuf_len(n);
+ ptr = mbuf_data(n);
+ for (i = 0, j = 0; i < maxlen; i++, j++) {
+ if (j >= mlen) {
+ n = mbuf_next(n);
+ if (n == 0)
+ break;
+ ptr = mbuf_data(n);
+ mlen = mbuf_len(n);
+ j = 0;
+ }
+ if (i >= offset) {
+ printf("%02x%s", ptr[j], i % 2 ? " " : "");
+ }
+ }
+ return;
+}
+
+static void
+printf_ether_header(struct ether_header *eh)
+{
+ printf("%02x:%02x:%02x:%02x:%02x:%02x > %02x:%02x:%02x:%02x:%02x:%02x 0x%04x ",
+ eh->ether_shost[0], eh->ether_shost[1], eh->ether_shost[2],
+ eh->ether_shost[3], eh->ether_shost[4], eh->ether_shost[5],
+ eh->ether_dhost[0], eh->ether_dhost[1], eh->ether_dhost[2],
+ eh->ether_dhost[3], eh->ether_dhost[4], eh->ether_dhost[5],
+ eh->ether_type);
+}
+#endif /* BRIDGE_DEBUG */
+
+/*
+ * bridgeattach:
+ *
+ * Pseudo-device attach routine.
+ */
+__private_extern__ int
+bridgeattach(__unused int n)
+{
+ int error;
+ lck_grp_attr_t *lck_grp_attr = NULL;
+
+ bridge_rtnode_pool = zinit(sizeof(struct bridge_rtnode), 1024 * sizeof(struct bridge_rtnode),
+ 0, "bridge_rtnode");
+
+ lck_grp_attr = lck_grp_attr_alloc_init();
+
+ bridge_lock_grp = lck_grp_alloc_init("if_bridge", lck_grp_attr);
+
+ bridge_lock_attr = lck_attr_alloc_init();
+
+#if BRIDGE_DEBUG
+ lck_attr_setdebug(bridge_lock_attr);
+#endif
+
+ bridge_list_lock = lck_rw_alloc_init(bridge_lock_grp, bridge_lock_attr);
+
+ // can free the attributes once we've allocated the group lock
+ lck_grp_attr_free(lck_grp_attr);
+
+ LIST_INIT(&bridge_list);
+ error = if_clone_attach(&bridge_cloner);
+
+ return error;
+}
+
+#if BRIDGE_DEBUG
+
+static void
+link_print(struct sockaddr_dl * dl_p)
+{
+ int i;
+
+#if 1
+ printf("sdl len %d index %d family %d type 0x%x nlen %d alen %d"
+ " slen %d addr ", dl_p->sdl_len,
+ dl_p->sdl_index, dl_p->sdl_family, dl_p->sdl_type,
+ dl_p->sdl_nlen, dl_p->sdl_alen, dl_p->sdl_slen);
+#endif
+ for (i = 0; i < dl_p->sdl_alen; i++)
+ printf("%s%x", i ? ":" : "",
+ (CONST_LLADDR(dl_p))[i]);
+ printf("\n");
+ return;
+}
+#endif /* BRIDGE_DEBUG */
+
+
+/*
+ * bridge_clone_create:
+ *
+ * Create a new bridge instance.
+ */
+/* APPLE MODIFICATION <cbz@apple.com> - add opaque <const caddr_t params> argument for cloning. This is done for
+ net80211's VAP creation (with the Marvell codebase). I think this could end up being useful
+ for other devices, too. This is not in an ifdef because it doesn't hurt anything to have
+ this extra param */
+static int
+bridge_clone_create(struct if_clone *ifc, int unit)
+{
+ struct bridge_softc *sc = NULL;
+ struct ifnet *ifp = NULL;
+ u_char eaddr[6];
+ uint32_t r;
+ struct ifnet_init_params init_params;
+ errno_t error = 0;
+ uint32_t sdl_buffer[offsetof(struct sockaddr_dl, sdl_data) + IFNAMSIZ + ETHER_ADDR_LEN];
+ struct sockaddr_dl *sdl = (struct sockaddr_dl *)sdl_buffer;
+
+ sc = _MALLOC(sizeof(*sc), M_DEVBUF, M_WAITOK);
+ memset(sc, 0, sizeof(*sc));
+
+ sc->sc_brtmax = BRIDGE_RTABLE_MAX;
+ /* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+ sc->sc_brtmax_proxysta = BRIDGE_RTABLE_MAX_PROXYSTA;
+#endif
+ sc->sc_brttimeout = BRIDGE_RTABLE_TIMEOUT;
+ sc->sc_bridge_max_age = BSTP_DEFAULT_MAX_AGE;
+ sc->sc_bridge_hello_time = BSTP_DEFAULT_HELLO_TIME;
+ sc->sc_bridge_forward_delay = BSTP_DEFAULT_FORWARD_DELAY;
+ sc->sc_bridge_priority = BSTP_DEFAULT_BRIDGE_PRIORITY;
+ sc->sc_hold_time = BSTP_DEFAULT_HOLD_TIME;
+ sc->sc_filter_flags = IFBF_FILT_DEFAULT;
+#ifndef BRIDGE_IPF
+ /*
+ * For backwards compatibility with previous behaviour...
+ * Switch off filtering on the bridge itself if BRIDGE_IPF is
+ * not defined.
+ */
+ sc->sc_filter_flags &= ~IFBF_FILT_USEIPF;
+#endif
+
+ /* Initialize our routing table. */
+ error = bridge_rtable_init(sc);
+ if (error != 0) {
+ printf("bridge_clone_create: bridge_rtable_init failed %d\n", error);
+ goto done;
+ }
+
+ LIST_INIT(&sc->sc_iflist);
+
+ sc->sc_mtx = lck_mtx_alloc_init(bridge_lock_grp, bridge_lock_attr);
+
+ /* use the interface name as the unique id for ifp recycle */
+ snprintf(sc->sc_if_xname, sizeof(sc->sc_if_xname), "%s%d",
+ ifc->ifc_name, unit);
+ memset(&init_params, 0, sizeof(struct ifnet_init_params));
+ init_params.uniqueid = sc->sc_if_xname;
+ init_params.uniqueid_len = strlen(sc->sc_if_xname);
+ init_params.name = ifc->ifc_name;
+ init_params.unit = unit;
+ init_params.family = IFNET_FAMILY_ETHERNET;
+ init_params.type = IFT_BRIDGE;
+ init_params.output = bridge_start;
+ init_params.demux = ether_demux;
+ init_params.add_proto = ether_add_proto;
+ init_params.del_proto = ether_del_proto;
+ init_params.check_multi = ether_check_multi;
+ init_params.framer = ether_frameout;
+ init_params.softc = sc;
+ init_params.ioctl = bridge_ioctl;
+ init_params.set_bpf_tap = bridge_set_bpf_tap;
+ init_params.detach = bridge_detach;
+ init_params.broadcast_addr = etherbroadcastaddr;
+ init_params.broadcast_len = ETHER_ADDR_LEN;
+ error = ifnet_allocate(&init_params, &ifp);
+ if (error != 0) {
+ printf("bridge_clone_create: ifnet_allocate failed %d\n", error);
+ goto done;
+ }
+ sc->sc_if = ifp;
+
+ error = ifnet_set_mtu(ifp, ETHERMTU);
+ if (error != 0) {
+ printf("bridge_clone_create: ifnet_set_mtu failed %d\n", error);
+ goto done;
+ }
+ error = ifnet_set_addrlen(ifp, ETHER_ADDR_LEN);
+ if (error != 0) {
+ printf("bridge_clone_create: ifnet_set_addrlen failed %d\n", error);
+ goto done;
+ }
+ error = ifnet_set_baudrate(ifp, 10000000) ; // XXX: this is what IONetworking does
+ if (error != 0) {
+ printf("bridge_clone_create: ifnet_set_baudrate failed %d\n", error);
+ goto done;
+ }
+ error = ifnet_set_hdrlen(ifp, ETHER_HDR_LEN);
+ if (error != 0) {
+ printf("bridge_clone_create: ifnet_set_hdrlen failed %d\n", error);
+ goto done;
+ }
+ error = ifnet_set_flags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_NOTRAILERS | IFF_MULTICAST,
+ 0xffff);
+ if (error != 0) {
+ printf("bridge_clone_create: ifnet_set_flags failed %d\n", error);
+ goto done;
+ }
+
+ /*
+ * Generate a random ethernet address and use the private AC:DE:48
+ * OUI code.
+ */
+ read_random(&r, sizeof(r));
+ eaddr[0] = 0xAC;
+ eaddr[1] = 0xDE;
+ eaddr[2] = 0x48;
+ eaddr[3] = (r >> 0) & 0xffu;
+ eaddr[4] = (r >> 8) & 0xffu;
+ eaddr[5] = (r >> 16) & 0xffu;
+
+ memset(sdl, 0, sizeof(sdl_buffer));
+ sdl->sdl_family = AF_LINK;
+ sdl->sdl_nlen = strlen(sc->sc_if_xname);
+ sdl->sdl_alen = ETHER_ADDR_LEN;
+ sdl->sdl_len = offsetof(struct sockaddr_dl, sdl_data);
+ memcpy(sdl->sdl_data, sc->sc_if_xname, sdl->sdl_nlen);
+ memcpy(LLADDR(sdl), eaddr, ETHER_ADDR_LEN);
+
+#if BRIDGE_DEBUG
+ link_print(sdl);
+#endif
+
+ error = ifnet_attach(ifp, NULL);
+ if (error != 0) {
+ printf("bridge_clone_create: ifnet_attach failed %d\n", error);
+ goto done;
+ }
+
+ error = ifnet_set_lladdr_and_type(ifp, eaddr, ETHER_ADDR_LEN, IFT_ETHER);
+ if (error != 0) {
+ printf("bridge_clone_create: ifnet_set_lladdr_and_type failed %d\n", error);
+ goto done;
+ }
+
+#if APPLE_BRIDGE_HWCKSUM_SUPPORT
+ /*
+ * APPLE MODIFICATION - our bridge can support HW checksums
+ * (useful if underlying interfaces support them) on TX.
+ * RX is not that interesting, since the stack just looks to
+ * see if the packet has been checksummed already (I think),
+ * but we might as well indicate we support it
+ */
+ ifp->if_capabilities =
+ IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_UDPv4_Tx |
+ IFCAP_CSUM_IPv4_Rx | IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_UDPv4_Rx ;
+#endif
+
+ lck_rw_lock_exclusive(bridge_list_lock);
+ LIST_INSERT_HEAD(&bridge_list, sc, sc_list);
+ lck_rw_done(bridge_list_lock);
+
+ /* attach as ethernet */
+ error = bpf_attach(ifp, DLT_EN10MB, sizeof(struct ether_header), NULL, NULL);
+
+done:
+ if (error != 0) {
+ printf("bridge_clone_create failed error %d\n", error);
+ /* Cleanup TBD */
+ }
+
+ return error;
+}
+
+/*
+ * bridge_clone_destroy:
+ *
+ * Destroy a bridge instance.
+ */
+static void
+bridge_clone_destroy(struct ifnet *ifp)
+{
+ struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp);
+ struct bridge_iflist *bif;
+ int error;
+
+ lck_mtx_lock(sc->sc_mtx);
+ if ((sc->sc_flags & SCF_DETACHING)) {
+ lck_mtx_unlock(sc->sc_mtx);
+ return;
+ }
+ sc->sc_flags |= SCF_DETACHING;
+
+ bridge_stop(ifp, 1);
+
+ error = ifnet_set_flags(ifp, 0, IFF_UP);
+ if (error != 0) {
+ printf("bridge_clone_destroy: ifnet_set_flags failed %d\n", error);
+ }
+
+ while ((bif = LIST_FIRST(&sc->sc_iflist)) != NULL)
+ bridge_delete_member(sc, bif);
+
+ lck_mtx_unlock(sc->sc_mtx);
+
+ error = ifnet_detach(ifp);
+ if (error != 0) {
+ printf("bridge_clone_destroy: ifnet_detach failed %d\n", error);
+ if ((sc = (struct bridge_softc *)ifnet_softc(ifp)) != NULL) {
+ lck_mtx_lock(sc->sc_mtx);
+ sc->sc_flags &= ~SCF_DETACHING;
+ lck_mtx_unlock(sc->sc_mtx);
+ }
+ }
+
+ return;
+}
+
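+/*
+ * DRVSPEC:
+ *
+ * Shared body of the SIOC[SG]DRVSPEC handlers in bridge_ioctl() below.
+ * It validates the command index against bridge_control_table, checks the
+ * ioctl copy direction against the entry's BC_F_COPYIN/BC_F_COPYOUT flags,
+ * enforces superuser rights for BC_F_SUSER entries, copies the argument in,
+ * calls the handler with the bridge mutex held, and copies the result back
+ * out. Written as a macro so the same code can operate on both the 32-bit
+ * and 64-bit layouts of struct ifdrv.
+ */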
+#define DRVSPEC \
+ if (ifd->ifd_cmd >= bridge_control_table_size) { \
+ error = EINVAL; \
+ break; \
+ } \
+ bc = &bridge_control_table[ifd->ifd_cmd]; \
+ \
+ if ((cmd & IOC_DIRMASK) == IOC_INOUT && \
+ (bc->bc_flags & BC_F_COPYOUT) == 0) { \
+ error = EINVAL; \
+ break; \
+ } \
+ else if (((cmd & IOC_DIRMASK) == IOC_IN) && \
+ (bc->bc_flags & BC_F_COPYOUT) != 0) { \
+ error = EINVAL; \
+ break; \
+ } \
+ \
+ if (bc->bc_flags & BC_F_SUSER) { \
+ error = kauth_authorize_generic(kauth_cred_get(), KAUTH_GENERIC_ISSUSER); \
+ if (error) \
+ break; \
+ } \
+ \
+ if (ifd->ifd_len != bc->bc_argsize || \
+ ifd->ifd_len > sizeof(args)) { \
+ error = EINVAL; \
+ break; \
+ } \
+ \
+ memset(&args, 0, sizeof(args)); \
+ if (bc->bc_flags & BC_F_COPYIN) { \
+ error = copyin(ifd->ifd_data, &args, ifd->ifd_len); \
+ if (error) \
+ break; \
+ } \
+ \
+ lck_mtx_lock(sc->sc_mtx); \
+ error = (*bc->bc_func)(sc, &args); \
+ lck_mtx_unlock(sc->sc_mtx); \
+ if (error) \
+ break; \
+ \
+ if (bc->bc_flags & BC_F_COPYOUT) \
+ error = copyout(&args, ifd->ifd_data, ifd->ifd_len)
+
+/*
+ * bridge_ioctl:
+ *
+ * Handle a control request from the operator.
+ */
+static errno_t
+bridge_ioctl(ifnet_t ifp, unsigned long cmd, void *data)
+{
+ struct bridge_softc *sc = ifnet_softc(ifp);
+ struct ifreq *ifr = (struct ifreq *) data;
+ int error = 0;
+
+ lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED);
+
+#if BRIDGE_DEBUG
+ printf("bridge_ioctl: ifp %p cmd 0x%08lx (%c%c [%lu] %c %lu)\n",
+ ifp,
+ cmd,
+ (cmd & IOC_IN) ? 'I' : ' ',
+ (cmd & IOC_OUT) ? 'O' : ' ',
+ IOCPARM_LEN(cmd),
+ (char)IOCGROUP(cmd),
+ cmd & 0xff);
+ printf("SIOCGDRVSPEC32 %lx SIOCGDRVSPEC64 %lx\n", SIOCGDRVSPEC32, SIOCGDRVSPEC64);
+#endif
+
+ switch (cmd) {
+ case SIOCADDMULTI:
+ break;
+ case SIOCDELMULTI:
+ break;
+
+ case SIOCSDRVSPEC32:
+ case SIOCGDRVSPEC32: {
+ union {
+ struct ifbreq ifbreq;
+ struct ifbifconf32 ifbifconf;
+ struct ifbareq32 ifbareq;
+ struct ifbaconf32 ifbaconf;
+ struct ifbrparam ifbrparam;
+ } args;
+ struct ifdrv32 *ifd = (struct ifdrv32 *) data;
+ const struct bridge_control *bridge_control_table = bridge_control_table32, *bc;
+
+ DRVSPEC;
+
+ break;
+ }
+ case SIOCSDRVSPEC64:
+ case SIOCGDRVSPEC64: {
+ union {
+ struct ifbreq ifbreq;
+ struct ifbifconf64 ifbifconf;
+ struct ifbareq64 ifbareq;
+ struct ifbaconf64 ifbaconf;
+ struct ifbrparam ifbrparam;
+ } args;
+ struct ifdrv64 *ifd = (struct ifdrv64 *) data;
+ const struct bridge_control *bridge_control_table = bridge_control_table64, *bc;
+
+ DRVSPEC;
+
+ break;
+ }
+
+ case SIOCSIFFLAGS:
+ if ((ifnet_flags(ifp) & (IFF_UP|IFF_RUNNING)) == IFF_RUNNING) {
+ /*
+ * If interface is marked down and it is running,
+ * then stop and disable it.
+ */
+ lck_mtx_lock(sc->sc_mtx);
+ bridge_stop(ifp, 1);
+ lck_mtx_unlock(sc->sc_mtx);
+ } else if ((ifnet_flags(ifp) & (IFF_UP|IFF_RUNNING)) == IFF_UP) {
+ /*
+ * If interface is marked up and it is stopped, then
+ * start it.
+ */
+ lck_mtx_lock(sc->sc_mtx);
+ error = bridge_init(ifp);
+ lck_mtx_unlock(sc->sc_mtx);
+ }
+ break;
+
+ case SIOCSIFMTU:
+#if 0
+ /* APPLE MODIFICATION <cbz@apple.com>
+ if we wanted to support changing the MTU */
+ {
+ struct ifreq *ifr = (struct ifreq *)data;
+ struct bridge_iflist *bif;
+ struct ifnet *dst_if;
+ sc->sc_if.if_mtu = ifr->ifr_mtu;
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ dst_if = bif->bif_ifp;
+ error = ifnet_ioctl(dst_if, 0, cmd, data);
+ if (error)
+ break;
+ }
+ }
+#else
+ /* Do not allow the MTU to be changed on the bridge */
+ error = EINVAL;
+#endif
+ break;
+
+ /* APPLE MODIFICATION - don't pass this down to ether_ioctl, just indicate we don't handle it */
+ case SIOCGIFMEDIA:
+ error = EINVAL;
+ break;
+
+ case SIOCSIFLLADDR:
+ error = ifnet_set_lladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len);
+ if (error != 0)
+ printf("bridge_ioctl: ifnet_set_lladdr failed %d\n", error);
+ break;
+
+ default:
+ error = ether_ioctl(ifp, cmd, data);
+#if BRIDGE_DEBUG
+ if (error != 0)
+ printf("bridge_ioctl: ether_ioctl ifp %p cmd 0x%08lx (%c%c [%lu] %c %lu) failed error: %d\n",
+ ifp,
+ cmd,
+ (cmd & IOC_IN) ? 'I' : ' ',
+ (cmd & IOC_OUT) ? 'O' : ' ',
+ IOCPARM_LEN(cmd),
+ (char) IOCGROUP(cmd),
+ cmd & 0xff,
+ error);
+#endif /* BRIDGE_DEBUG */
+ break;
+ }
+ lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED);
+
+ return (error);
+}
+
+/*
+ * bridge_mutecaps:
+ *
+ * Clear or restore unwanted capabilities on the member interface
+ */
+#if HAS_IF_CAP
+void
+bridge_mutecaps(struct bridge_iflist *bif, int mute)
+{
+ struct ifnet *ifp = bif->bif_ifp;
+ struct ifcapreq ifcr;
+
+ if (ifp->if_ioctl == NULL)
+ return;
+
+ memset(&ifcr, 0, sizeof(ifcr));
+ ifcr.ifcr_capenable = ifp->if_capenable;
+
+ if (mute) {
+ /* mask off and save capabilities */
+ bif->bif_mutecap = ifcr.ifcr_capenable & BRIDGE_IFCAPS_MASK;
+ if (bif->bif_mutecap != 0)
+ ifcr.ifcr_capenable &= ~BRIDGE_IFCAPS_MASK;
+ } else
+ /* restore muted capabilities */
+ ifcr.ifcr_capenable |= bif->bif_mutecap;
+
+ if (bif->bif_mutecap != 0) {
+ (void) (*ifp->if_ioctl)(ifp, SIOCSIFCAP, (caddr_t)&ifcr);
+ }
+}
+#endif /* HAS_IF_CAP */
+
+/*
+ * bridge_lookup_member:
+ */
+static struct bridge_iflist *
+bridge_lookup_member(struct bridge_softc *sc, const char *name)
+{
+ struct bridge_iflist *bif;
+ struct ifnet *ifp;
+ char if_xname[IFNAMSIZ];
+
+ lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED);
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ ifp = bif->bif_ifp;
+ snprintf(if_xname, sizeof(if_xname), "%s%d",
+ ifnet_name(ifp), ifnet_unit(ifp));
+ if (strncmp(if_xname, name, sizeof(if_xname)) == 0)
+ return (bif);
+ }
+
+ return (NULL);
+}
+
+/*
+ * bridge_lookup_member_if:
+ */
+static struct bridge_iflist *
+bridge_lookup_member_if(struct bridge_softc *sc, struct ifnet *member_ifp)
+{
+ struct bridge_iflist *bif;
+
+ lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED);
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ if (bif->bif_ifp == member_ifp)
+ return (bif);
+ }
+
+ return (NULL);
+}
+
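+/*
+ * bridge_iff_input:
+ *
+ * Interface filter input callback, invoked for each packet received on a
+ * member interface. Pulls the frame header back in front of the mbuf data,
+ * hands the packet to bridge_input(), and restores the original data
+ * pointer if the packet is returned to the caller.
+ */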
+static errno_t
+bridge_iff_input(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol,
+ mbuf_t *data, char **frame_ptr)
+{
+ errno_t error = 0;
+ struct bridge_iflist *bif = (struct bridge_iflist *)cookie;
+ struct bridge_softc *sc = bif->bif_sc;
+ int included = 0;
+ size_t frmlen = 0;
+ mbuf_t m = *data;
+
+ if ((m->m_flags & M_PROTO1))
+ goto out;
+
+ if (*frame_ptr >= (char *)mbuf_datastart(m) && *frame_ptr <= (char *)mbuf_data(m)) {
+ included = 1;
+ frmlen = (char *)mbuf_data(m) - *frame_ptr;
+ }
+#if BRIDGE_DEBUG
+ if (_if_brige_debug) {
+ printf("bridge_iff_input %s%d from %s%d m %p data %p frame %p %s frmlen %lu\n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if),
+ ifnet_name(ifp), ifnet_unit(ifp),
+ m, mbuf_data(m), *frame_ptr, included ? "inside" : "outside", frmlen);
+
+ if (_if_brige_debug > 1) {
+ printf_mbuf(m, "bridge_iff_input[", "\n");
+ printf_ether_header((struct ether_header *)*frame_ptr);
+ printf_mbuf_data(m, 0, 20);
+ printf("\n");
+ }
+ }
+#endif /* BRIDGE_DEBUG */
+
+ /* Move the data pointer back to the start of the frame, i.e. the link layer header */
+ if (included) {
+ (void) mbuf_setdata(m, (char *)mbuf_data(m) - frmlen, mbuf_len(m) + frmlen);
+ (void) mbuf_pkthdr_adjustlen(m, frmlen);
+ } else {
+ printf("bridge_iff_input: frame_ptr outside mbuf\n");
+ goto out;
+ }
+
+ error = bridge_input(bif, ifp, m, *frame_ptr);
+
+ /* Adjust packet back to original */
+ if (error == 0) {
+ (void) mbuf_setdata(m, (char *)mbuf_data(m) + frmlen, mbuf_len(m) - frmlen);
+ (void) mbuf_pkthdr_adjustlen(m, -frmlen);
+ }
+#if BRIDGE_DEBUG
+ if (_if_brige_debug > 1) {
+ printf("\n");
+ printf_mbuf(m, "bridge_iff_input]", "\n");
+ }
+#endif /* BRIDGE_DEBUG */
+
+out:
+ lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED);
+
+ return error;
+}
+
+
+#if BRIDGE_MEMBER_OUT_FILTER
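+/*
+ * bridge_iff_output:
+ *
+ * Interface filter output callback. Intercepts packets being sent out a
+ * member interface and passes them to bridge_output() so locally
+ * originated traffic is bridged to the other members.
+ */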
+static errno_t
+bridge_iff_output(void *cookie, ifnet_t ifp, __unused protocol_family_t protocol, mbuf_t *data)
+{
+ errno_t error = 0;
+ struct bridge_iflist *bif = (struct bridge_iflist *)cookie;
+ struct bridge_softc *sc = bif->bif_sc;
+ mbuf_t m = *data;
+
+ if ((m->m_flags & M_PROTO1))
+ goto out;
+
+#if BRIDGE_DEBUG
+ if (_if_brige_debug) {
+ printf("bridge_iff_output %s%d from %s%d m %p data %p\n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if),
+ ifnet_name(ifp), ifnet_unit(ifp),
+ m, mbuf_data(m));
+ }
+#endif /* BRIDGE_DEBUG */
+
+ error = bridge_output(sc, ifp, m);
+ if (error != 0) {
+ printf("bridge_iff_output: bridge_output failed error %d\n", error);
+ }
+
+out:
+ lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED);
+
+ return error;
+}
+#endif /* BRIDGE_MEMBER_OUT_FILTER */
+
+
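+/*
+ * bridge_iff_event:
+ *
+ * Interface filter event callback. Watches for KEV_DL_IF_DETACHING on a
+ * member interface and detaches that member from the bridge.
+ */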
+static void
+bridge_iff_event(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol,
+ const struct kev_msg *event_msg)
+{
+ struct bridge_iflist *bif = (struct bridge_iflist *)cookie;
+
+ if (event_msg->vendor_code == KEV_VENDOR_APPLE &&
+ event_msg->kev_class == KEV_NETWORK_CLASS &&
+ event_msg->kev_subclass == KEV_DL_SUBCLASS) {
+ switch (event_msg->event_code) {
+ case KEV_DL_IF_DETACHING:
+ bridge_ifdetach(bif, ifp);
+ break;
+
+ default:
+ break;
+ }
+ }
+}
+
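+/*
+ * bridge_iff_detached:
+ *
+ * Interface filter detach callback; frees the bridge_iflist entry passed
+ * as the filter cookie.
+ */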
+static void
+bridge_iff_detached(void* cookie, __unused ifnet_t interface)
+{
+ struct bridge_iflist *bif = (struct bridge_iflist *)cookie;
+
+ _FREE(bif, M_DEVBUF);
+
+ return;
+}
+
+/*
+ * bridge_delete_member:
+ *
+ * Delete the specified member interface.
+ */
+static void
+bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif)
+{
+ struct ifnet *ifs = bif->bif_ifp;
+
+ switch (ifnet_type(ifs)) {
+ case IFT_ETHER:
+ /*
+ * Take the interface out of promiscuous mode.
+ */
+ (void) ifnet_set_promiscuous(ifs, 0);
+ break;
+#if NGIF > 0
+ case IFT_GIF:
+ break;
+#endif
+ default:
+#ifdef DIAGNOSTIC
+ panic("bridge_delete_member: impossible");
+#endif
+ break;
+ }
+
+ ifs->if_bridge = NULL;
+ LIST_REMOVE(bif, bif_next);
+
+ /* Respect lock ordering with DLIL lock */
+ lck_mtx_unlock(sc->sc_mtx);
+ iflt_detach(bif->bif_iff_ref);
+ lck_mtx_lock(sc->sc_mtx);
+
+ bridge_rtdelete(sc, ifs, IFBF_FLUSHALL);
+
+ if (ifnet_flags(sc->sc_if) & IFF_RUNNING)
+ bstp_initialization(sc);
+
+ /* On the last deleted interface revert the MTU */
+
+ if (LIST_EMPTY(&sc->sc_iflist))
+ (void) ifnet_set_mtu(sc->sc_if, ETHERMTU);
+}
+
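+/*
+ * bridge_ioctl_add:
+ *
+ * Add the named interface as a member of the bridge: put it in promiscuous
+ * mode (for Ethernet members), attach an interface filter to intercept its
+ * traffic, and link it onto the bridge's member list.
+ */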
+static int
+bridge_ioctl_add(struct bridge_softc *sc, void *arg)
+{
+ struct ifbreq *req = arg;
+ struct bridge_iflist *bif = NULL;
+ struct ifnet *ifs;
+ int error = 0;
+ /* APPLE MODIFICATION <cbz@apple.com> - is this a proxy sta being added? */
+#if IEEE80211_PROXYSTA
+ struct bridge_rtnode *brt;
+#endif
+
+ error = ifnet_find_by_name(req->ifbr_ifsname, &ifs);
+ if (error || ifs == NULL)
+ return (ENOENT);
+
+ /* Is the interface already attached to this bridge interface */
+ if (ifs->if_bridge == sc)
+ return (EEXIST);
+
+ if (ifs->if_bridge != NULL)
+ return (EBUSY);
+
+ /* First added interface resets the MTU */
+
+ if (LIST_EMPTY(&sc->sc_iflist))
+ (void) ifnet_set_mtu(sc->sc_if, ETHERMTU);
+
+ if (ifnet_mtu(sc->sc_if) != ifnet_mtu(ifs))
+ return (EINVAL);
+
+ bif = _MALLOC(sizeof(*bif), M_DEVBUF, M_WAITOK|M_ZERO);
+ if (bif == NULL)
+ return (ENOMEM);
+
+ bif->bif_ifp = ifs;
+ bif->bif_flags = IFBIF_LEARNING | IFBIF_DISCOVER;
+ bif->bif_priority = BSTP_DEFAULT_PORT_PRIORITY;
+ bif->bif_path_cost = BSTP_DEFAULT_PATH_COST;
+ bif->bif_sc = sc;
+
+ switch (ifnet_type(ifs)) {
+ case IFT_ETHER:
+ /*
+ * Place the interface into promiscuous mode.
+ */
+ error = ifnet_set_promiscuous(ifs, 1);
+ if (error)
+ goto out;
+#if HAS_IF_CAP
+ bridge_mutecaps(bif, 1);
+#endif
+ break;
+#if NGIF > 0
+ case IFT_GIF:
+ break;
+#endif
+ default:
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * If the LINK0 flag is set, and this is the first member interface,
+ * attempt to inherit its link-layer address.
+ */
+ if ((ifnet_flags(sc->sc_if) & IFF_LINK0) && LIST_EMPTY(&sc->sc_iflist) &&
+ ifnet_type(ifs) == IFT_ETHER) {
+ (void) ifnet_set_lladdr(sc->sc_if, ifnet_lladdr(ifs),
+ ETHER_ADDR_LEN);
+ }
+
+ // install an interface filter
+ {
+ struct iff_filter iff;
+
+ memset(&iff, 0, sizeof(struct iff_filter));
+
+ iff.iff_cookie = bif;
+ iff.iff_name = "com.apple.kernel.bsd.net.if_bridge";
+ iff.iff_input = bridge_iff_input;
+#if BRIDGE_MEMBER_OUT_FILTER
+ iff.iff_output = bridge_iff_output;
+#endif /* BRIDGE_MEMBER_OUT_FILTER */
+ iff.iff_event = bridge_iff_event;
+ iff.iff_detached = bridge_iff_detached;
+
+ /* Respect lock ordering with DLIL lock */
+ lck_mtx_unlock(sc->sc_mtx);
+ error = iflt_attach(ifs, &iff, &bif->bif_iff_ref);
+ lck_mtx_lock(sc->sc_mtx);
+ if (error != 0) {
+ printf("bridge_ioctl_add: iflt_attach failed %d\n", error);
+ goto out;
+ }
+ }
+ ifs->if_bridge = sc;
+ LIST_INSERT_HEAD(&sc->sc_iflist, bif, bif_next);
+
+
+ if (ifnet_flags(sc->sc_if) & IFF_RUNNING)
+ bstp_initialization(sc);
+ else
+ bstp_stop(sc);
+
+ /* APPLE MODIFICATION <cbz@apple.com> - is this a proxy sta being added? */
+#if IEEE80211_PROXYSTA
+ brt = bridge_rtnode_lookup(sc, ifnet_lladdr(ifs));
+ if (brt) {
+#if DIAGNOSTIC
+ printf( "%s: attach %s to bridge as proxysta for %02x:%02x:%02x:%02x:%02x:%02x discovered on %s\n",
+ __func__, ifs->if_xname, brt->brt_addr[0], brt->brt_addr[1], brt->brt_addr[2],
+ brt->brt_addr[3], brt->brt_addr[4], brt->brt_addr[5], brt->brt_ifp->if_xname );
+#endif
+ brt->brt_ifp_proxysta = ifs;
+ }
+#endif
+
+
+out:
+ if (error) {
+ if (bif != NULL)
+ _FREE(bif, M_DEVBUF);
+ }
+ return (error);
+}
+
+static int
+bridge_ioctl_del(struct bridge_softc *sc, void *arg)
+{
+ struct ifbreq *req = arg;
+ struct bridge_iflist *bif;
+
+ bif = bridge_lookup_member(sc, req->ifbr_ifsname);
+ if (bif == NULL)
+ return (ENOENT);
+
+ bridge_delete_member(sc, bif);
+
+ return (0);
+}
+
+/* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+static int
+bridge_ioctl_purge(struct bridge_softc *sc, void *arg)
+{
+ struct ifbreq *req = arg;
+ struct bridge_iflist *bif;
+ struct ifnet *ifs;
+
+ bif = bridge_lookup_member(sc, req->ifbr_ifsname);
+ if (bif == NULL)
+ return (ENOENT);
+
+ ifs = bif->bif_ifp;
+ bridge_rtpurge(sc, ifs);
+
+ return (0);
+}
+#endif
+
+static int
+bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg)
+{
+ struct ifbreq *req = arg;
+ struct bridge_iflist *bif;
+
+ bif = bridge_lookup_member(sc, req->ifbr_ifsname);
+ if (bif == NULL)
+ return (ENOENT);
+
+ req->ifbr_ifsflags = bif->bif_flags;
+ req->ifbr_state = bif->bif_state;
+ req->ifbr_priority = bif->bif_priority;
+ req->ifbr_path_cost = bif->bif_path_cost;
+ req->ifbr_portno = ifnet_index(bif->bif_ifp) & 0xffff;
+
+ return (0);
+}
+
+static int
+bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg)
+{
+ struct ifbreq *req = arg;
+ struct bridge_iflist *bif;
+
+ bif = bridge_lookup_member(sc, req->ifbr_ifsname);
+ if (bif == NULL)
+ return (ENOENT);
+
+ if (req->ifbr_ifsflags & IFBIF_STP) {
+ switch (ifnet_type(bif->bif_ifp)) {
+ case IFT_ETHER:
+ /* These can do spanning tree. */
+ break;
+
+ default:
+ /* Nothing else can. */
+ return (EINVAL);
+ }
+ }
+
+ /* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+ if ((bif->bif_flags & IFBIF_PROXYSTA_DISCOVER) &&
+ ((req->ifbr_ifsflags & IFBIF_PROXYSTA_DISCOVER) == 0))
+ bridge_rtpurge(sc, bif->bif_ifp);
+#endif
+
+ bif->bif_flags = req->ifbr_ifsflags;
+
+ if (ifnet_flags(sc->sc_if) & IFF_RUNNING)
+ bstp_initialization(sc);
+
+ /* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+ if (bif->bif_flags & IFBIF_PROXYSTA_DISCOVER)
+ bridge_rtdiscovery(sc);
+#endif
+
+ return (0);
+}
+
+static int
+bridge_ioctl_scache(struct bridge_softc *sc, void *arg)
+{
+ struct ifbrparam *param = arg;
+
+ sc->sc_brtmax = param->ifbrp_csize;
+ bridge_rttrim(sc);
+
+ return (0);
+}
+
+static int
+bridge_ioctl_gcache(struct bridge_softc *sc, void *arg)
+{
+ struct ifbrparam *param = arg;
+
+ param->ifbrp_csize = sc->sc_brtmax;
+
+ return (0);
+}
+
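+/*
+ * BRIDGE_IOCTL_GIFS:
+ *
+ * Shared body of the "get member interfaces" handlers: walks the member
+ * list and copies one struct ifbreq per member out to user space, or, when
+ * ifbic_len is zero, just reports the buffer size required. A macro so the
+ * 32-bit and 64-bit ifbifconf layouts can share the code.
+ */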
+#define BRIDGE_IOCTL_GIFS \
+ struct bridge_iflist *bif; \
+ struct ifbreq breq; \
+ int count, error = 0; \
+ uint32_t len; \
+ \
+ count = 0; \
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) \
+ count++; \
+ \
+ if (bifc->ifbic_len == 0) { \
+ bifc->ifbic_len = sizeof(breq) * count; \
+ return (0); \
+ } \
+ \
+ count = 0; \
+ len = bifc->ifbic_len; \
+ memset(&breq, 0, sizeof breq); \
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { \
+ if (len < sizeof(breq)) \
+ break; \
+ \
+ snprintf(breq.ifbr_ifsname, sizeof(breq.ifbr_ifsname), "%s%d", \
+ ifnet_name(bif->bif_ifp), ifnet_unit(bif->bif_ifp)); \
+ breq.ifbr_ifsflags = bif->bif_flags; \
+ breq.ifbr_state = bif->bif_state; \
+ breq.ifbr_priority = bif->bif_priority; \
+ breq.ifbr_path_cost = bif->bif_path_cost; \
+ breq.ifbr_portno = ifnet_index(bif->bif_ifp) & 0xffff; \
+ error = copyout(&breq, bifc->ifbic_req + count * sizeof(breq), sizeof(breq)); \
+ if (error) \
+ break; \
+ count++; \
+ len -= sizeof(breq); \
+ } \
+ \
+ bifc->ifbic_len = sizeof(breq) * count
+
+
+static int
+bridge_ioctl_gifs64(struct bridge_softc *sc, void *arg)
+{
+ struct ifbifconf64 *bifc = arg;
+
+ BRIDGE_IOCTL_GIFS;
+
+ return (error);
+}
+
+static int
+bridge_ioctl_gifs32(struct bridge_softc *sc, void *arg)
+{
+ struct ifbifconf32 *bifc = arg;
+
+ BRIDGE_IOCTL_GIFS;
+
+ return (error);
+}
+
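+/*
+ * BRIDGE_IOCTL_RTS:
+ *
+ * Shared body of the "get routing table" handlers: copies one address entry
+ * per learned station out to user space, converting the expiry of dynamic
+ * entries into seconds from now. A macro so the 32-bit and 64-bit ifbareq
+ * layouts can share the code.
+ */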
+#define BRIDGE_IOCTL_RTS \
+ struct bridge_rtnode *brt; \
+ int count = 0, error = 0; \
+ uint32_t len; \
+ struct timespec now; \
+ \
+ if (bac->ifbac_len == 0) \
+ return (0); \
+ \
+ len = bac->ifbac_len; \
+ LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { \
+ if (len < sizeof(bareq)) \
+ goto out; \
+ memset(&bareq, 0, sizeof(bareq)); \
+ snprintf(bareq.ifba_ifsname, sizeof(bareq.ifba_ifsname), "%s%d", \
+ ifnet_name(brt->brt_ifp), ifnet_unit(brt->brt_ifp)); \
+ memcpy(bareq.ifba_dst, brt->brt_addr, sizeof(brt->brt_addr)); \
+ if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { \
+ nanouptime(&now); \
+ if (brt->brt_expire >= (unsigned long)now.tv_sec) \
+ bareq.ifba_expire = brt->brt_expire - now.tv_sec; \
+ else \
+ bareq.ifba_expire = 0; \
+ } else \
+ bareq.ifba_expire = 0; \
+ bareq.ifba_flags = brt->brt_flags; \
+ \
+ error = copyout(&bareq, bac->ifbac_req + count * sizeof(bareq), sizeof(bareq)); \
+ if (error) \
+ goto out; \
+ count++; \
+ len -= sizeof(bareq); \
+ } \
+out: \
+ bac->ifbac_len = sizeof(bareq) * count
+
+
+static int
+bridge_ioctl_rts64(struct bridge_softc *sc, void *arg)
+{
+ struct ifbaconf64 *bac = arg;
+ struct ifbareq64 bareq;
+
+ BRIDGE_IOCTL_RTS;
+
+ return (error);
+}
+
+static int
+bridge_ioctl_rts32(struct bridge_softc *sc, void *arg)
+{
+ struct ifbaconf32 *bac = arg;
+ struct ifbareq32 bareq;
+
+ BRIDGE_IOCTL_RTS;
+
+ return (error);
+}
+
+static int
+bridge_ioctl_saddr64(struct bridge_softc *sc, void *arg)
+{
+ struct ifbareq64 *req = arg;
+ struct bridge_iflist *bif;
+ int error;
+
+ bif = bridge_lookup_member(sc, req->ifba_ifsname);
+ if (bif == NULL)
+ return (ENOENT);
+
+ error = bridge_rtupdate(sc, req->ifba_dst, bif->bif_ifp, 1,
+ req->ifba_flags);
+
+ return (error);
+}
+
+static int
+bridge_ioctl_saddr32(struct bridge_softc *sc, void *arg)
+{
+ struct ifbareq32 *req = arg;
+ struct bridge_iflist *bif;
+ int error;
+
+ bif = bridge_lookup_member(sc, req->ifba_ifsname);
+ if (bif == NULL)
+ return (ENOENT);
+
+ error = bridge_rtupdate(sc, req->ifba_dst, bif->bif_ifp, 1,
+ req->ifba_flags);
+
+ return (error);
+}
+
+static int
+bridge_ioctl_sto(struct bridge_softc *sc, void *arg)
+{
+ struct ifbrparam *param = arg;
+
+ sc->sc_brttimeout = param->ifbrp_ctime;
+
+ return (0);
+}
+
+static int
+bridge_ioctl_gto(struct bridge_softc *sc, void *arg)
+{
+ struct ifbrparam *param = arg;
+
+ param->ifbrp_ctime = sc->sc_brttimeout;
+
+ return (0);
+}
+
+static int
+bridge_ioctl_daddr64(struct bridge_softc *sc, void *arg)
+{
+ struct ifbareq64 *req = arg;
+
+ return (bridge_rtdaddr(sc, req->ifba_dst));
+}
+
+static int
+bridge_ioctl_daddr32(struct bridge_softc *sc, void *arg)
+{
+ struct ifbareq32 *req = arg;
+
+ return (bridge_rtdaddr(sc, req->ifba_dst));
+}
+
+static int
+bridge_ioctl_flush(struct bridge_softc *sc, void *arg)
+{
+ struct ifbreq *req = arg;
+
+ bridge_rtflush(sc, req->ifbr_ifsflags);
+
+ return (0);
+}
+
+static int
+bridge_ioctl_gpri(struct bridge_softc *sc, void *arg)
+{
+ struct ifbrparam *param = arg;
+
+ param->ifbrp_prio = sc->sc_bridge_priority;
+
+ return (0);
+}
+
+static int
+bridge_ioctl_spri(struct bridge_softc *sc, void *arg)
+{
+ struct ifbrparam *param = arg;
+
+ sc->sc_bridge_priority = param->ifbrp_prio;
+
+ if (ifnet_flags(sc->sc_if) & IFF_RUNNING)
+ bstp_initialization(sc);
+
+ return (0);
+}
+
+static int
+bridge_ioctl_ght(struct bridge_softc *sc, void *arg)
+{
+ struct ifbrparam *param = arg;
+
+ param->ifbrp_hellotime = sc->sc_bridge_hello_time >> 8;
+
+ return (0);
+}
+
+static int
+bridge_ioctl_sht(struct bridge_softc *sc, void *arg)
+{
+ struct ifbrparam *param = arg;
+
+ if (param->ifbrp_hellotime == 0)
+ return (EINVAL);
+ sc->sc_bridge_hello_time = param->ifbrp_hellotime << 8;
+
+ if (ifnet_flags(sc->sc_if) & IFF_RUNNING)
+ bstp_initialization(sc);
+
+ return (0);
+}
+
+static int
+bridge_ioctl_gfd(struct bridge_softc *sc, void *arg)
+{
+ struct ifbrparam *param = arg;
+
+ param->ifbrp_fwddelay = sc->sc_bridge_forward_delay >> 8;
+
+ return (0);
+}
+
+static int
+bridge_ioctl_sfd(struct bridge_softc *sc, void *arg)
+{
+ struct ifbrparam *param = arg;
+
+ if (param->ifbrp_fwddelay == 0)
+ return (EINVAL);
+ sc->sc_bridge_forward_delay = param->ifbrp_fwddelay << 8;
+
+ if (ifnet_flags(sc->sc_if) & IFF_RUNNING)
+ bstp_initialization(sc);
+
+ return (0);
+}
+
+static int
+bridge_ioctl_gma(struct bridge_softc *sc, void *arg)
+{
+ struct ifbrparam *param = arg;
+
+ param->ifbrp_maxage = sc->sc_bridge_max_age >> 8;
+
+ return (0);
+}
+
+static int
+bridge_ioctl_sma(struct bridge_softc *sc, void *arg)
+{
+ struct ifbrparam *param = arg;
+
+ if (param->ifbrp_maxage == 0)
+ return (EINVAL);
+ sc->sc_bridge_max_age = param->ifbrp_maxage << 8;
+
+ if (ifnet_flags(sc->sc_if) & IFF_RUNNING)
+ bstp_initialization(sc);
+
+ return (0);
+}
+
+static int
+bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg)
+{
+ struct ifbreq *req = arg;
+ struct bridge_iflist *bif;
+
+ bif = bridge_lookup_member(sc, req->ifbr_ifsname);
+ if (bif == NULL)
+ return (ENOENT);
+
+ bif->bif_priority = req->ifbr_priority;
+
+ if (ifnet_flags(sc->sc_if) & IFF_RUNNING)
+ bstp_initialization(sc);
+
+ return (0);
+}
+
+/* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
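+/*
+ * bridge_proxysta_notify_macaddr:
+ *
+ * Report a proxy STA event (discovery or idle timeout) for the given MAC
+ * address on the interface via rt_proxystamsg().
+ */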
+static void
+bridge_proxysta_notify_macaddr(struct ifnet *ifp, int op, const uint8_t *mac)
+{
+ struct proxy_sta_event iev;
+
+ memset(&iev, 0, sizeof(iev));
+ memcpy(iev.iev_addr, mac, ETHER_ADDR_LEN);
+
+ rt_proxystamsg(ifp, op, &iev, sizeof(iev));
+}
+
+static void
+bridge_proxysta_discover(struct ifnet *ifp, const uint8_t *mac)
+{
+ bridge_proxysta_notify_macaddr( ifp, RTM_PROXYSTA_DISCOVERY, mac );
+}
+
+static void
+bridge_proxysta_idle_timeout(struct ifnet *ifp, const uint8_t *mac)
+{
+ bridge_proxysta_notify_macaddr( ifp, RTM_PROXYSTA_IDLE_TIMEOUT, mac );
+}
+#endif
+
+static int
+bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg)
+{
+ struct ifbreq *req = arg;
+ struct bridge_iflist *bif;
+
+ bif = bridge_lookup_member(sc, req->ifbr_ifsname);
+ if (bif == NULL)
+ return (ENOENT);
+
+ bif->bif_path_cost = req->ifbr_path_cost;
+
+ if (ifnet_flags(sc->sc_if) & IFF_RUNNING)
+ bstp_initialization(sc);
+
+ return (0);
+}
+
+/*
+ * bridge_ifdetach:
+ *
+ * Detach an interface from a bridge. Called when a member
+ * interface is detaching.
+ */
+static void
+bridge_ifdetach(struct bridge_iflist *bif, struct ifnet *ifp)
+{
+ struct bridge_softc *sc = bif->bif_sc;
+ struct ifbreq breq;
+
+ memset(&breq, 0, sizeof(breq));
+ snprintf(breq.ifbr_ifsname, sizeof(breq.ifbr_ifsname), "%s%d",
+ ifnet_name(ifp), ifnet_unit(ifp));
+
+ lck_mtx_lock(sc->sc_mtx);
+
+ (void) bridge_ioctl_del(sc, &breq);
+
+ lck_mtx_unlock(sc->sc_mtx);
+}
+
+/*
+ * bridge_init:
+ *
+ * Initialize a bridge interface.
+ */
+static int
+bridge_init(struct ifnet *ifp)
+{
+ struct bridge_softc *sc = ifnet_softc(ifp);
+ struct timespec ts;
+ errno_t error;
+
+ if (ifnet_flags(ifp) & IFF_RUNNING)
+ return (0);
+
+ ts.tv_sec = bridge_rtable_prune_period;
+ ts.tv_nsec = 0;
+ bsd_timeout(bridge_timer, sc, &ts);
+
+ error = ifnet_set_flags(ifp, IFF_RUNNING, IFF_RUNNING);
+ if (error == 0)
+ bstp_initialization(sc);
+
+ return error;
+}
+
+/*
+ * bridge_stop:
+ *
+ * Stop the bridge interface.
+ */
+static void
+bridge_stop(struct ifnet *ifp, __unused int disable)
+{
+ struct bridge_softc *sc = ifnet_softc(ifp);
+
+ if ((ifnet_flags(ifp) & IFF_RUNNING) == 0)
+ return;
+
+ bsd_untimeout(bridge_timer, sc);
+ bstp_stop(sc);
+
+ bridge_rtflush(sc, IFBF_FLUSHDYN);
+
+ (void) ifnet_set_flags(ifp, 0, IFF_RUNNING);
+}
+
+/*
+ * bridge_enqueue:
+ *
+ * Enqueue a packet on a bridge member interface.
+ *
+ * Note: this is called both on the input and output path so this routine
+ * cannot simply muck with the HW checksum flag. For the time being we
+ * rely on the caller to do the right thing.
+ */
+__private_extern__ void
+bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m)
+{
+ int len, error;
+
+ lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED);
+
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf("bridge_enqueue sc %s%d to dst_ifp %s%d m %p\n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if),
+ ifnet_name(dst_ifp), ifnet_unit(dst_ifp), m);
+#endif /* BRIDGE_DEBUG */
+
+ len = m->m_pkthdr.len;
+ m->m_flags |= M_PROTO1; //set to avoid loops
+
+ error = ifnet_output_raw(dst_ifp, 0, m);
+ if (error == 0) {
+ (void) ifnet_stat_increment_out(sc->sc_if, 1, len, 0);
+ } else {
+ (void) ifnet_stat_increment_out(sc->sc_if, 0, 0, 1);
+ }
+
+ return;
+}
+
+
+#if BRIDGE_MEMBER_OUT_FILTER
+
+/*
+ * bridge_output:
+ *
+ * Send output from a bridge member interface. This
+ * performs the bridging function for locally originated
+ * packets.
+ *
+ * The mbuf has the Ethernet header already attached. We must
+ * enqueue or free the mbuf before returning.
+ */
+static int
+bridge_output(struct bridge_softc *sc, ifnet_t ifp, mbuf_t m)
+{
+ struct ether_header *eh;
+ struct ifnet *dst_if;
+
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf("bridge_output ifp %p %s%d\n", ifp, ifnet_name(ifp), ifnet_unit(ifp));
+#endif /* BRIDGE_DEBUG */
+
+ if (m->m_len < ETHER_HDR_LEN) {
+ m = m_pullup(m, ETHER_HDR_LEN);
+ if (m == NULL) {
+ printf("bridge_output ifp %p m_pullup failed\n", ifp);
+ return EJUSTRETURN;
+ }
+ }
+
+ eh = mtod(m, struct ether_header *);
+
+ /* APPLE MODIFICATION <jhw@apple.com>
+ * If the packet is an 802.1X ethertype, then only send on the
+ * original output interface.
+ */
+ if (eh->ether_type == htons(ETHERTYPE_PAE)) {
+ dst_if = ifp;
+ goto sendunicast;
+ }
+
+ /*
+ * If bridge is down, but the original output interface is up,
+ * go ahead and send out that interface. Otherwise, the packet
+ * is dropped below.
+ */
+ if ((ifnet_flags(sc->sc_if) & IFF_RUNNING) == 0) {
+ dst_if = ifp;
+ goto sendunicast;
+ }
+
+ lck_mtx_lock(sc->sc_mtx);
+
+ /*
+ * If the packet is a multicast, or we don't know a better way to
+ * get there, send to all interfaces.
+ */
+ if (ETHER_IS_MULTICAST(eh->ether_dhost))
+ dst_if = NULL;
+ else
+ dst_if = bridge_rtlookup(sc, eh->ether_dhost);
+ if (dst_if == NULL) {
+ struct bridge_iflist *bif;
+ struct mbuf *mc;
+ int used = 0;
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ dst_if = bif->bif_ifp;
+ if ((ifnet_flags(dst_if) & IFF_RUNNING) == 0)
+ continue;
+
+ /*
+ * If this is not the original output interface,
+ * and the interface is participating in spanning
+ * tree, make sure the port is in a state that
+ * allows forwarding.
+ */
+ if (dst_if != ifp &&
+ (bif->bif_flags & IFBIF_STP) != 0) {
+ switch (bif->bif_state) {
+ case BSTP_IFSTATE_BLOCKING:
+ case BSTP_IFSTATE_LISTENING:
+ case BSTP_IFSTATE_DISABLED:
+ continue;
+ }
+ }
+
+ if (LIST_NEXT(bif, bif_next) == NULL) {
+ used = 1;
+ mc = m;
+ } else {
+ mc = m_copym(m, 0, M_COPYALL, M_NOWAIT);
+ if (mc == NULL) {
+ printf("bridge_output ifp %p m_copym failed\n", ifp);
+ (void) ifnet_stat_increment_out(sc->sc_if, 0, 0, 1);
+ continue;
+ }
+ }
+
+ bridge_enqueue(sc, dst_if, mc);
+ }
+ if (used == 0) {
+ printf("bridge_output ifp %p not used\n", ifp);
+ m_freem(m);
+ }
+ lck_mtx_unlock(sc->sc_mtx);
+
+ return EJUSTRETURN;
+ }
+
+sendunicast:
+ /*
+ * XXX Spanning tree consideration here?
+ */
+
+ if ((ifnet_flags(dst_if) & IFF_RUNNING) == 0) {
+ printf("bridge_output ifp %p dst_if %p not running\n", ifp, dst_if);
+ m_freem(m);
+
+ return EJUSTRETURN;
+ }
+
+ if (dst_if != ifp) {
+ lck_mtx_lock(sc->sc_mtx);
+
+ bridge_enqueue(sc, dst_if, m);
+
+ lck_mtx_unlock(sc->sc_mtx);
+
+ return EJUSTRETURN;
+ }
+
+ return (0);
+}
+#endif /* BRIDGE_MEMBER_OUT_FILTER */
+
+#if APPLE_BRIDGE_HWCKSUM_SUPPORT
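+/*
+ * bridge_fix_txcsum:
+ *
+ * Complete IPv4/TCP/UDP checksums that were deferred for hardware offload
+ * before the frame goes out an interface that cannot compute them.
+ * Returns the (possibly re-chained) mbuf, or NULL if the packet was freed
+ * on error.
+ */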
+static struct mbuf* bridge_fix_txcsum( struct mbuf *m )
+{
+ // basic tests indicate that the vast majority of packets being processed
+ // here have an Ethernet header mbuf pre-pended to them (the first case below);
+ // the second most common are those where the Ethernet and IP/TCP/UDP headers are
+ // all in one mbuf (the second case below);
+ // the third case has, in fact, never hit for me -- although if I comment out
+ // the first two cases, that code works for them, so I consider it a
+ // decent general solution
+
+ int amt = ETHER_HDR_LEN;
+ int hlen = M_CSUM_DATA_IPv4_IPHL( m->m_pkthdr.csum_data );
+ int off = M_CSUM_DATA_IPv4_OFFSET( m->m_pkthdr.csum_data );
+
+ /*
+ * NOTE we should never get vlan-attached packets here;
+ * support for those COULD be added, but we don't use them
+ * and it really kinda slows things down to worry about them
+ */
+
+#ifdef DIAGNOSTIC
+ if ( m_tag_find( m, PACKET_TAG_VLAN, NULL ) != NULL )
+ {
+ printf( "bridge: transmitting packet tagged with VLAN?\n" );
+ KASSERT( 0 );
+ m_freem( m );
+ return NULL;
+ }
+#endif
+
+ if ( m->m_pkthdr.csum_flags & M_CSUM_IPv4 )
+ {
+ amt += hlen;
+ }
+ if ( m->m_pkthdr.csum_flags & M_CSUM_TCPv4 )
+ {
+ amt += off + sizeof( uint16_t );
+ }
+
+ if ( m->m_pkthdr.csum_flags & M_CSUM_UDPv4 )
+ {
+ amt += off + sizeof( uint16_t );
+ }
+
+ if ( m->m_len == ETHER_HDR_LEN )
+ {
+ // this is the case where there's an Ethernet header in an mbuf
+
+ // the first mbuf is the Ethernet header -- just strip it off and do the checksum
+ struct mbuf *m_ip = m->m_next;
+
+ // set up m_ip so the cksum operations work
+ /* APPLE MODIFICATION 22 Apr 2008 <mvega@apple.com>
+ * <rdar://5817385> Clear the m_tag list before setting
+ * M_PKTHDR.
+ *
+ * If this m_buf chain was extended via M_PREPEND(), then
+ * m_ip->m_pkthdr is identical to m->m_pkthdr (see
+ * M_MOVE_PKTHDR()). The only thing preventing access to this
+ * invalid packet header data is the fact that the M_PKTHDR
+ * flag is clear, i.e., m_ip->m_flag & M_PKTHDR == 0, but we're
+ * about to set the M_PKTHDR flag, so to be safe we initialize,
+ * more accurately, we clear, m_ip->m_pkthdr.tags via
+ * m_tag_init().
+ *
+ * Suppose that we do not do this; if m_pullup(), below, fails,
+ * then m_ip will be freed along with m_ip->m_pkthdr.tags, but
+ * we will also free m soon after, via m_freem(), and
+ * consequently attempt to free m->m_pkthdr.tags in the
+ * process. The problem is that m->m_pkthdr.tags will have
+ * already been freed by virtue of being equal to
+ * m_ip->m_pkthdr.tags. Attempts to dereference
+ * m->m_pkthdr.tags in m_tag_delete_chain() will result in a
+ * panic.
+ */
+ m_tag_init(m_ip);
+ /* END MODIFICATION */
+ m_ip->m_flags |= M_PKTHDR;
+ m_ip->m_pkthdr.csum_flags = m->m_pkthdr.csum_flags;
+ m_ip->m_pkthdr.csum_data = m->m_pkthdr.csum_data;
+ m_ip->m_pkthdr.len = m->m_pkthdr.len - ETHER_HDR_LEN;
+
+ // set up the header mbuf so we can prepend it back on again later
+ m->m_pkthdr.csum_flags = 0;
+ m->m_pkthdr.csum_data = 0;
+ m->m_pkthdr.len = ETHER_HDR_LEN;
+ m->m_next = NULL;
+
+
+ // now do the checksums we need -- first IP
+ if ( m_ip->m_pkthdr.csum_flags & M_CSUM_IPv4 )
+ {
+ // make sure the IP header (or at least the part with the cksum) is there
+ m_ip = m_pullup( m_ip, sizeof( struct ip ) );
+ if ( m_ip == NULL )
+ {
+ printf( "bridge: failed to flatten header\n ");
+ m_freem( m );
+ return NULL;
+ }
+
+ // now do the checksum
+ {
+ struct ip *ip = mtod( m_ip, struct ip* );
+ ip->ip_sum = in_cksum( m_ip, hlen );
+
+#ifdef VERY_VERY_VERY_DIAGNOSTIC
+ printf( "bridge: performed IPv4 checksum\n" );
+#endif
+ }
+ }
+
+ // now do a TCP or UDP delayed checksum
+ if ( m_ip->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4) )
+ {
+ in_delayed_cksum( m_ip );
+
+#ifdef VERY_VERY_VERY_DIAGNOSTIC
+ printf( "bridge: performed TCPv4/UDPv4 checksum\n" );
+#endif
+ }
+
+ // now attach the ethernet header back onto the IP packet
+ m->m_next = m_ip;
+ m->m_pkthdr.len += m_length( m_ip );
+
+ // clear the M_PKTHDR flags on the ip packet (again, we re-attach later)
+ m_ip->m_flags &= ~M_PKTHDR;
+
+ // and clear any csum flags
+ m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4);
+ }
+ else if ( m->m_len >= amt )
+ {
+ // everything fits in the first mbuf, so futz with m->m_data, m->m_len and m->m_pkthdr.len to
+ // make it work
+ m->m_len -= ETHER_HDR_LEN;
+ m->m_data += ETHER_HDR_LEN;
+ m->m_pkthdr.len -= ETHER_HDR_LEN;
+
+ // now do the checksums we need -- first IP
+ if ( m->m_pkthdr.csum_flags & M_CSUM_IPv4 )
+ {
+ struct ip *ip = mtod( m, struct ip* );
+ ip->ip_sum = in_cksum( m, hlen );
+
+#ifdef VERY_VERY_VERY_DIAGNOSTIC
+ printf( "bridge: performed IPv4 checksum\n" );
+#endif
+ }
+
+ // now do a TCP or UDP delayed checksum
+ if ( m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4) )
+ {
+ in_delayed_cksum( m );
+
+#ifdef VERY_VERY_VERY_DIAGNOSTIC
+ printf( "bridge: performed TCPv4/UDPv4 checksum\n" );
+#endif
+ }
+
+ // now stick the ethernet header back on
+ m->m_len += ETHER_HDR_LEN;
+ m->m_data -= ETHER_HDR_LEN;
+ m->m_pkthdr.len += ETHER_HDR_LEN;
+
+ // and clear any csum flags
+ m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4);
+ }
+ else
+ {
+ struct mbuf *m_ip;
+
+ // general case -- need to simply split it off and deal
+
+ // first, calculate how much needs to be made writable (we may have a read-only mbuf here)
+ hlen = M_CSUM_DATA_IPv4_IPHL( m->m_pkthdr.csum_data );
+#if PARANOID
+ off = M_CSUM_DATA_IPv4_OFFSET( m->m_pkthdr.csum_data );
+
+ if ( m->m_pkthdr.csum_flags & M_CSUM_IPv4 )
+ {
+ amt += hlen;
+ }
+
+ if ( m->m_pkthdr.csum_flags & M_CSUM_TCPv4 )
+ {
+ amt += sizeof( struct tcphdr );
+ amt += off;
+ }
+
+ if ( m->m_pkthdr.csum_flags & M_CSUM_UDPv4 )
+ {
+ amt += sizeof( struct udphdr );
+ amt += off;
+ }
+#endif
+
+ // now split the ethernet header off of the IP packet (we'll re-attach later)
+ m_ip = m_split( m, ETHER_HDR_LEN, M_NOWAIT );
+ if ( m_ip == NULL )
+ {
+ printf( "bridge_fix_txcsum: could not split ether header\n" );
+
+ m_freem( m );
+ return NULL;
+ }
+
+#if PARANOID
+ // make sure that the IP packet is writable for the portion we need
+ if ( m_makewritable( &m_ip, 0, amt, M_DONTWAIT ) != 0 )
+ {
+ printf( "bridge_fix_txcsum: could not make %d bytes writable\n", amt );
+
+ m_freem( m );
+ m_freem( m_ip );
+ return NULL;
+ }
+#endif
+
+ m_ip->m_pkthdr.csum_flags = m->m_pkthdr.csum_flags;
+ m_ip->m_pkthdr.csum_data = m->m_pkthdr.csum_data;
+
+ m->m_pkthdr.csum_flags = 0;
+ m->m_pkthdr.csum_data = 0;
+
+ // now do the checksums we need -- first IP
+ if ( m_ip->m_pkthdr.csum_flags & M_CSUM_IPv4 )
+ {
+ // make sure the IP header (or at least the part with the cksum) is there
+ m_ip = m_pullup( m_ip, sizeof( struct ip ) );
+ if ( m_ip == NULL )
+ {
+ printf( "bridge: failed to flatten header\n ");
+ m_freem( m );
+ return NULL;
+ }
+
+ // now do the checksum
+ {
+ struct ip *ip = mtod( m_ip, struct ip* );
+ ip->ip_sum = in_cksum( m_ip, hlen );
+
+#ifdef VERY_VERY_VERY_DIAGNOSTIC
+ printf( "bridge: performed IPv4 checksum\n" );
+#endif
+ }
+ }
+
+ // now do a TCP or UDP delayed checksum
+ if ( m_ip->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4) )
+ {
+ in_delayed_cksum( m_ip );
+
+#ifdef VERY_VERY_VERY_DIAGNOSTIC
+ printf( "bridge: performed TCPv4/UDPv4 checksum\n" );
+#endif
+ }
+
+ // now attach the ethernet header back onto the IP packet
+ m->m_next = m_ip;
+ m->m_pkthdr.len += m_length( m_ip );
+
+ // clear the M_PKTHDR flags on the ip packet (again, we re-attach later)
+ m_ip->m_flags &= ~M_PKTHDR;
+
+ // and clear any csum flags
+ m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4);
+ }
+
+ return m;
+}
+#endif
+
+/*
+ * bridge_start:
+ *
+ * Start output on a bridge.
+ */
+static errno_t
+bridge_start(ifnet_t ifp, mbuf_t m)
+{
+ struct bridge_softc *sc = ifnet_softc(ifp);
+ struct ether_header *eh;
+ struct ifnet *dst_if;
+
+ lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED);
+
+ eh = mtod(m, struct ether_header *);
+
+ if ((m->m_flags & (M_BCAST|M_MCAST)) == 0 &&
+ (dst_if = bridge_rtlookup(sc, eh->ether_dhost)) != NULL) {
+
+ {
+#if APPLE_BRIDGE_HWCKSUM_SUPPORT
+ /*
+ * APPLE MODIFICATION - if the packet needs a checksum (i.e.,
+ * checksum has been deferred for HW support) AND the destination
+ * interface doesn't support HW checksums, then we
+ * need to fix-up the checksum here
+ */
+ if (
+ ( (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4) ) != 0 ) &&
+ ( (dst_if->if_csum_flags_tx & m->m_pkthdr.csum_flags ) != m->m_pkthdr.csum_flags )
+ )
+ {
+ m = bridge_fix_txcsum( m );
+ if ( m == NULL )
+ {
+ goto done;
+ }
+ }
+
+#else
+ if (eh->ether_type == htons(ETHERTYPE_IP))
+ mbuf_outbound_finalize(m, PF_INET, sizeof(struct ether_header));
+ else
+ m->m_pkthdr.csum_flags = 0;
+#endif
+ lck_mtx_lock(sc->sc_mtx);
+ #if NBPFILTER > 0
+ if (sc->sc_bpf_output)
+ bridge_bpf_output(ifp, m);
+ #endif
+ bridge_enqueue(sc, dst_if, m);
+ lck_mtx_unlock(sc->sc_mtx);
+ }
+ } else
+ {
+#if APPLE_BRIDGE_HWCKSUM_SUPPORT
+
+ /*
+ * APPLE MODIFICATION - if the MULTICAST packet needs a checksum (i.e.,
+ * checksum has been deferred for HW support) AND at least one destination
+ * interface doesn't support HW checksums, then we go ahead and fix it up
+ * here, since it doesn't make sense to do it more than once
+ */
+
+ if (
+ (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4)) &&
+ /*
+ * XXX FIX ME: keep track of whether or not we have any interfaces that
+ * do not support checksums (for now, assume we do)
+ */
+ ( 1 )
+ )
+ {
+ m = bridge_fix_txcsum( m );
+ if ( m == NULL )
+ {
+ goto done;
+ }
+ }
+#else
+ if (eh->ether_type == htons(ETHERTYPE_IP))
+ mbuf_outbound_finalize(m, PF_INET, sizeof(struct ether_header));
+ else
+ m->m_pkthdr.csum_flags = 0;
+#endif
+
+ lck_mtx_lock(sc->sc_mtx);
+ #if NBPFILTER > 0
+ if (sc->sc_bpf_output)
+ bridge_bpf_output(ifp, m);
+ #endif
+ bridge_broadcast(sc, ifp, m, 0);
+ lck_mtx_unlock(sc->sc_mtx);
+ }
+#if APPLE_BRIDGE_HWCKSUM_SUPPORT
+done:
+#endif
+
+ return 0;
+}
+
+/*
+ * bridge_forward:
+ *
+ * The forwarding function of the bridge.
+ */
+static void
+bridge_forward(struct bridge_softc *sc, struct mbuf *m)
+{
+ struct bridge_iflist *bif;
+ struct ifnet *src_if, *dst_if;
+ struct ether_header *eh;
+
+ lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED);
+
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf("bridge_forward %s%d m%p\n", ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if), m);
+#endif /* BRIDGE_DEBUG */
+
+ src_if = m->m_pkthdr.rcvif;
+
+ (void) ifnet_stat_increment_in(sc->sc_if, 1, m->m_pkthdr.len, 0);
+
+ /*
+ * Look up the bridge_iflist.
+ */
+ bif = bridge_lookup_member_if(sc, src_if);
+ if (bif == NULL) {
+ /* Interface is not a bridge member (anymore?) */
+ m_freem(m);
+ return;
+ }
+
+ /* APPLE MODIFICATION <cbz@apple.com> - add the ability to block forwarding of packets; for the guest network */
+#if ( APPLE_HAVE_80211_GUEST_NETWORK )
+ if (bif->bif_flags & IFBIF_NO_FORWARDING) {
+ /* Drop the packet and we're done. */
+ m_freem(m);
+ return;
+ }
+#endif
+
+ if (bif->bif_flags & IFBIF_STP) {
+ switch (bif->bif_state) {
+ case BSTP_IFSTATE_BLOCKING:
+ case BSTP_IFSTATE_LISTENING:
+ case BSTP_IFSTATE_DISABLED:
+ m_freem(m);
+ return;
+ }
+ }
+
+ eh = mtod(m, struct ether_header *);
+
+ /*
+ * If the interface is learning, and the source
+ * address is valid and not multicast, record
+ * the address.
+ */
+ if ((bif->bif_flags & IFBIF_LEARNING) != 0 &&
+ ETHER_IS_MULTICAST(eh->ether_shost) == 0 &&
+ (eh->ether_shost[0] | eh->ether_shost[1] |
+ eh->ether_shost[2] | eh->ether_shost[3] |
+ eh->ether_shost[4] | eh->ether_shost[5]) != 0) {
+ (void) bridge_rtupdate(sc, eh->ether_shost,
+ src_if, 0, IFBAF_DYNAMIC);
+ }
+
+ if ((bif->bif_flags & IFBIF_STP) != 0 &&
+ bif->bif_state == BSTP_IFSTATE_LEARNING) {
+ m_freem(m);
+ return;
+ }
+
+ /*
+ * At this point, the port either doesn't participate
+ * in spanning tree or it is in the forwarding state.
+ */
+
+ /*
+ * If the packet is unicast, destined for someone on
+ * "this" side of the bridge, drop it.
+ */
+ if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) {
+ /* APPLE MODIFICATION <cbz@apple.com> - if the packet came in on a proxy sta discovery interface,
+ do not look up the node by the packet's DA; instead look up the proxy sta that
+ matches the SA. If it has not been discovered yet, drop the packet. */
+#if IEEE80211_PROXYSTA
+ if (bif->bif_flags & IFBIF_PROXYSTA_DISCOVER)
+ {
+ struct bridge_rtnode *brt;
+ dst_if = NULL;
+ brt = bridge_rtnode_lookup(sc, eh->ether_shost);
+ if (brt) {
+ dst_if = brt->brt_ifp_proxysta;
+ }
+ if (dst_if == NULL) {
+ m_freem(m);
+ return;
+ }
+ }
+ else
+#endif
+ dst_if = bridge_rtlookup(sc, eh->ether_dhost);
+ if (src_if == dst_if) {
+ m_freem(m);
+ return;
+ }
+ } else {
+ /* ...forward it to all interfaces. */
+ sc->sc_if->if_imcasts++;
+ dst_if = NULL;
+ }
+
+ /* APPLE MODIFICATION
+ <rnewberry@apple.com> - this is now handled by bridge_input
+ <cbz@apple.com> - turning this back on because not all packets are bpf_mtap'd
+ equally. RSN Preauth frames were not getting through; we're
+ conditionalizing this call on
+ (eh->ether_type == htons(ETHERTYPE_RSN_PREAUTH))
+ */
+#if 1
+ if (eh->ether_type == htons(ETHERTYPE_RSN_PREAUTH))
+ {
+ m->m_pkthdr.rcvif = sc->sc_if;
+#if NBPFILTER > 0
+ if (sc->sc_bpf_input)
+ bridge_bpf_input(sc->sc_if, m);
+#endif
+ }
+#endif
+
+ if (dst_if == NULL) {
+
+#if APPLE_BRIDGE_HWCKSUM_SUPPORT
+ /*
+ * Clear any in-bound checksum flags for this packet.
+ */
+ m->m_pkthdr.csum_flags = 0;
+#else
+ mbuf_inbound_modified(m);
+#endif
+
+ bridge_broadcast(sc, src_if, m, 1);
+ return;
+ }
+
+ /*
+ * At this point, we're dealing with a unicast frame
+ * going to a different interface.
+ */
+ if ((ifnet_flags(dst_if) & IFF_RUNNING) == 0) {
+ m_freem(m);
+ return;
+ }
+ bif = bridge_lookup_member_if(sc, dst_if);
+ if (bif == NULL) {
+ /* Not a member of the bridge (anymore?) */
+ m_freem(m);
+ return;
+ }
+
+ if (bif->bif_flags & IFBIF_STP) {
+ switch (bif->bif_state) {
+ case BSTP_IFSTATE_DISABLED:
+ case BSTP_IFSTATE_BLOCKING:
+ m_freem(m);
+ return;
+ }
+ }
+
+#if APPLE_BRIDGE_HWCKSUM_SUPPORT
+ /*
+ * Clear any in-bound checksum flags for this packet.
+ */
+ {
+ m->m_pkthdr.csum_flags = 0;
+ }
+#else
+ mbuf_inbound_modified(m);
+#endif
+
+ bridge_enqueue(sc, dst_if, m);
+}
+
+char * ether_ntop(char *, size_t , const u_char *);
+
+__private_extern__ char *
+ether_ntop(char *buf, size_t len, const u_char *ap)
+{
+ snprintf(buf, len, "%02x:%02x:%02x:%02x:%02x:%02x",
+ ap[0], ap[1], ap[2], ap[3], ap[4], ap[5]);
+
+ return buf;
+}
+
+/*
+ * bridge_input:
+ *
+ * Receive input from a member interface. Queue the packet for
+ * bridging if it is not for us.
+ */
+errno_t
+bridge_input(struct bridge_iflist *bif, struct ifnet *ifp, struct mbuf *m, void *frame_header)
+{
+ struct ifnet *bifp;
+ struct ether_header *eh;
+ struct mbuf *mc;
+ int is_for_us = 0;
+ struct bridge_softc *sc = bif->bif_sc;
+ struct bridge_iflist *brm;
+
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf("bridge_input: %s%d from %s%d m %p data %p\n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if),
+ ifnet_name(ifp), ifnet_unit(ifp),
+ m, mbuf_data(m));
+#endif /* BRIDGE_DEBUG */
+
+ if ((ifnet_flags(sc->sc_if) & IFF_RUNNING) == 0) {
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf( "bridge_input: %s%d not running passing along\n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if));
+#endif /* BRIDGE_DEBUG */
+ return 0;
+ }
+
+ /* Need to clear the promiscuous flag, otherwise the packet will be dropped by DLIL after the filters have been processed */
+ if ((mbuf_flags(m) & MBUF_PROMISC))
+ mbuf_setflags_mask(m, 0, MBUF_PROMISC);
+
+ lck_mtx_lock(sc->sc_mtx);
+
+ bifp = sc->sc_if;
+
+ /* Is it a good idea to reassign a new value to bif ? TBD */
+ bif = bridge_lookup_member_if(sc, ifp);
+ if (bif == NULL) {
+ lck_mtx_unlock(sc->sc_mtx);
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf( "bridge_input: %s%d bridge_lookup_member_if failed\n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if));
+#endif /* BRIDGE_DEBUG */
+ return 0;
+ }
+
+ eh = (struct ether_header *)mbuf_data(m);
+
+ /*
+ * If the packet is for us, set the packets source as the
+ * bridge, and return the packet back to ether_input for
+ * local processing.
+ */
+ if (memcmp(eh->ether_dhost, ifnet_lladdr(bifp),
+ ETHER_ADDR_LEN) == 0) {
+
+ /* Mark the packet as arriving on the bridge interface */
+ (void) mbuf_pkthdr_setrcvif(m, bifp);
+ mbuf_pkthdr_setheader(m, frame_header);
+
+ /*
+ * If the interface is learning, and the source
+ * address is valid and not multicast, record
+ * the address.
+ */
+ if ((bif->bif_flags & IFBIF_LEARNING) != 0 &&
+ ETHER_IS_MULTICAST(eh->ether_shost) == 0 &&
+ (eh->ether_shost[0] | eh->ether_shost[1] |
+ eh->ether_shost[2] | eh->ether_shost[3] |
+ eh->ether_shost[4] | eh->ether_shost[5]) != 0) {
+ (void) bridge_rtupdate(sc, eh->ether_shost,
+ ifp, 0, IFBAF_DYNAMIC);
+ }
+
+#if NBPFILTER > 0
+ if (sc->sc_bpf_input)
+ bridge_bpf_input(bifp, m);
+#endif
+
+ (void) mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN, mbuf_len(m) - ETHER_HDR_LEN);
+ (void) mbuf_pkthdr_adjustlen(m, - ETHER_HDR_LEN);
+
+ (void) ifnet_stat_increment_in(bifp, 1, mbuf_pkthdr_len(m), 0);
+
+ lck_mtx_unlock(sc->sc_mtx);
+
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf( "bridge_input: %s%d packet for bridge\n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if));
+#endif /* BRIDGE_DEBUG */
+
+ dlil_input_packet_list(bifp, m);
+
+ return EJUSTRETURN;
+ }
+
+ /*
+ * if the packet is destined for the MAC address of the member
+ * interface itself, then we don't need to forward it -- just pass
+ * it back. Note that it'll likely just be dropped by the stack,
+ * but if something else is bound to the interface directly (for
+ * example, the wireless stats protocol -- although that actually
+ * uses BPF right now), then it will consume the packet
+ *
+ * ALSO, note that we do this check AFTER checking for the
+ * bridge's own MAC address, because the bridge may be
+ * using the SAME MAC address as one of its interfaces
+ */
+ if (memcmp(eh->ether_dhost, ifnet_lladdr(ifp),
+ ETHER_ADDR_LEN) == 0) {
+ /* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+ if ((bif->bif_flags & IFBIF_PROXYSTA) == 0) {
+#endif
+
+#ifdef VERY_VERY_VERY_DIAGNOSTIC
+ printf("bridge_input: not forwarding packet bound for member interface\n" );
+#endif
+ lck_mtx_unlock(sc->sc_mtx);
+ return 0;
+ /* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+ }
+#if VERY_VERY_VERY_DIAGNOSTIC
+ else {
+ printf( "%s: pkt rx on %s [proxysta iface], da is %02x:%02x:%02x:%02x:%02x:%02x\n",
+ __func__, ifp->if_xname, eh->ether_dhost[0], eh->ether_dhost[1], eh->ether_dhost[2],
+ eh->ether_dhost[3], eh->ether_dhost[4], eh->ether_dhost[5] );
+ }
+#endif
+#endif
+ }
+
+ if ((m->m_flags & (M_BCAST|M_MCAST))) {
+ struct ifmultiaddr *ifma = NULL;
+
+ if ((m->m_flags & M_BCAST)) {
+ is_for_us = 1;
+ } else {
+#if BRIDGE_DEBUG
+ printf("mulicast: %02x:%02x:%02x:%02x:%02x:%02x\n",
+ eh->ether_dhost[0], eh->ether_dhost[1], eh->ether_dhost[2],
+ eh->ether_dhost[3], eh->ether_dhost[4], eh->ether_dhost[5]);
+
+ for (ifma = bifp->if_multiaddrs.lh_first; ifma;
+ ifma = ifma->ifma_link.le_next) {
+
+ if (ifma->ifma_addr == NULL)
+ printf(" <none> ");
+ else if (ifma->ifma_addr->sa_family == AF_INET) {
+ struct sockaddr_in *sin = (struct sockaddr_in *)ifma->ifma_addr;
+
+ printf(" %u.%u.%u.%u ",
+ (sin->sin_addr.s_addr & 0xff000000) >> 24,
+ (sin->sin_addr.s_addr & 0x00ff0000) >> 16,
+ (sin->sin_addr.s_addr & 0x0000ff00) >> 8,
+ (sin->sin_addr.s_addr & 0x000000ff));
+ }
+ if (!ifma->ifma_ll || !ifma->ifma_ll->ifma_addr)
+ printf("<none>\n");
+ else {
+ struct sockaddr_dl *sdl = (struct sockaddr_dl *)ifma->ifma_ll->ifma_addr;
+
+ printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
+ CONST_LLADDR(sdl)[0], CONST_LLADDR(sdl)[1], CONST_LLADDR(sdl)[2],
+ CONST_LLADDR(sdl)[3], CONST_LLADDR(sdl)[4], CONST_LLADDR(sdl)[5]);
+
+ }
+ }
+#endif /* BRIDGE_DEBUG */
+
+ /*
+ * the upper layers of the stack have attached a list of multicast addresses to the bridge itself
+ * (for example, the IP stack has bound 01:00:5e:00:00:01 to the 224.0.0.1 all-hosts address), since
+ * the IP stack is bound to the bridge. so we need to see if the packets arriving here SHOULD be
+ * passed up as coming from the bridge.
+ *
+ * furthermore, since we know the IP stack is attached to the bridge, and NOTHING is attached
+ * to the underlying devices themselves, we can drop packets that don't need to go up (by returning NULL
+ * from bridge_input to the caller) after we forward the packet to other interfaces
+ */
+
+ for (ifma = bifp->if_multiaddrs.lh_first; ifma;
+ ifma = ifma->ifma_link.le_next) {
+ if (ifma->ifma_ll && ifma->ifma_ll->ifma_addr) {
+ struct sockaddr_dl *sdl = (struct sockaddr_dl *)ifma->ifma_ll->ifma_addr;
+
+ if (memcmp(eh->ether_dhost, CONST_LLADDR(sdl), ETHER_ADDR_LEN) == 0)
+ break;
+ }
+ }
+ if (ifma != NULL) {
+ /* this packet matches the bridge's own filter, so pass it up as coming from us */
+
+ /* Mark the packet as arriving on the bridge interface */
+ // don't do this until AFTER we forward the packet -- bridge_forward uses this information
+ //m->m_pkthdr.rcvif = bifp;
+
+ /* keep track of this to help us decide about forwarding */
+ is_for_us = 1;
+
+#if BRIDGE_DEBUG
+ char addr[sizeof("XX:XX:XX:XX:XX:XX")+1];
+ printf( "bridge_input: multicast frame for us (%s)\n",
+ ether_ntop(addr, sizeof(addr), eh->ether_dhost) );
+#endif
+ } else {
+#if BRIDGE_DEBUG
+ char addr[sizeof("XX:XX:XX:XX:XX:XX")+1];
+ printf( "bridge_input: multicast frame for unbound address (%s), forwarding but not passing to stack\n",
+ ether_ntop(addr, sizeof(addr), eh->ether_dhost) );
+#endif
+ }
+ }
+ /* Tap off 802.1D packets; they do not get forwarded. */
+ if (memcmp(eh->ether_dhost, bstp_etheraddr,
+ ETHER_ADDR_LEN) == 0) {
+ m = bstp_input(sc, ifp, m);
+ if (m == NULL) {
+ lck_mtx_unlock(sc->sc_mtx);
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf( "bridge_input: %s%d mcast BSTP not forwarded\n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if));
+#endif /* BRIDGE_DEBUG */
+ return EJUSTRETURN;
+ }
+ }
+
+ if (bif->bif_flags & IFBIF_STP) {
+ switch (bif->bif_state) {
+ case BSTP_IFSTATE_BLOCKING:
+ case BSTP_IFSTATE_LISTENING:
+ case BSTP_IFSTATE_DISABLED:
+ {
+ lck_mtx_unlock(sc->sc_mtx);
+
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf( "bridge_input: %s%d mcast bridge not learning or forwarding \n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if));
+#endif /* BRIDGE_DEBUG */
+
+ m_freem(m);
+ return EJUSTRETURN;
+ }
+ }
+ }
+
+ /*
+ * If the interface is learning, and the source
+ * address is valid and not multicast, record
+ * the address.
+ */
+ if ((bif->bif_flags & IFBIF_LEARNING) != 0 &&
+ ETHER_IS_MULTICAST(eh->ether_shost) == 0 &&
+ (eh->ether_shost[0] | eh->ether_shost[1] |
+ eh->ether_shost[2] | eh->ether_shost[3] |
+ eh->ether_shost[4] | eh->ether_shost[5]) != 0) {
+ (void) bridge_rtupdate(sc, eh->ether_shost,
+ ifp, 0, IFBAF_DYNAMIC);
+ }
+
+ if (is_for_us) {
+ /*
+ * Make a deep copy of the packet and enqueue the copy
+ * for bridge processing; return the original packet for
+ * local processing.
+ */
+ mc = m_dup(m, M_NOWAIT);
+ if (mc == NULL) {
+#ifdef DIAGNOSTIC
+ printf( "bridge_input: failed to duplicate multicast frame, not forwarding\n" );
+#endif
+#if BRIDGE_DEBUG
+ } else {
+ if (_if_brige_debug) {
+ printf_mbuf(mc, "mc for us: ", "\n");
+ printf_mbuf_data(m, 0, 20);
+ printf("\n");
+ }
+#endif /* BRIDGE_DEBUG */
+ }
+ } else {
+ /*
+ * we'll just pass the original, since we don't need to pass it
+ * up the stack
+ */
+ mc = m;
+ }
+
+ /* Perform the bridge forwarding function with the copy. */
+ if (mc != NULL) {
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf( "bridge_input: %s%d mcast forwarding \n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if));
+#endif /* BRIDGE_DEBUG */
+ bridge_forward(sc, mc);
+ }
+
+ // TBD should have an option for type of bridge
+#if 0
+ /*
+ * Reinject the mbuf as arriving on the bridge so we have a
+ * chance at claiming multicast packets. We can not loop back
+ * here from ether_input as a bridge is never a member of a
+ * bridge.
+ */
+ if (bifp->if_bridge != NULL)
+ panic("brige_input: brige %p in a bridge %p\n", bifp, bifp->if_bridge);
+ mc = m_dup(m, M_NOWAIT);
+ if (mc != NULL) {
+ mc->m_pkthdr.rcvif = bifp;
+#if NBPFILTER > 0
+ if (sc->sc_bpf_input)
+ bridge_bpf_input(bifp, mc);
+#endif
+ }
+#endif
+ /* Return the original packet for local processing. */
+ if ( !is_for_us )
+ {
+ /* we don't free the packet -- bridge_forward already did so */
+ lck_mtx_unlock(sc->sc_mtx);
+
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf( "bridge_input: %s%d mcast local processing\n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if));
+#endif
+
+ return EJUSTRETURN;
+ }
+
+ // mark packet as arriving on the bridge
+ m->m_pkthdr.rcvif = bifp;
+ m->m_pkthdr.header = mbuf_data(m);
+
+#if NBPFILTER > 0
+ if (sc->sc_bpf_input)
+ bridge_bpf_input(bifp, m);
+#endif
+ (void) mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN, mbuf_len(m) - ETHER_HDR_LEN);
+ (void) mbuf_pkthdr_adjustlen(m, - ETHER_HDR_LEN);
+
+ (void) ifnet_stat_increment_in(bifp, 1, mbuf_pkthdr_len(m), 0);
+
+ lck_mtx_unlock(sc->sc_mtx);
+
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf( "bridge_input: %s%d mcast for us\n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if));
+#endif /* BRIDGE_DEBUG */
+
+ dlil_input_packet_list(bifp, m);
+
+ return EJUSTRETURN;
+ }
+
+ if (bif->bif_flags & IFBIF_STP) {
+ switch (bif->bif_state) {
+ case BSTP_IFSTATE_BLOCKING:
+ case BSTP_IFSTATE_LISTENING:
+ case BSTP_IFSTATE_DISABLED:
+ lck_mtx_unlock(sc->sc_mtx);
+
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf( "bridge_input: %s%d ucast bridge not learning or forwarding \n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if));
+#endif /* BRIDGE_DEBUG */
+
+ m_freem(m);
+ return EJUSTRETURN;
+ }
+ }
+
+ /* this code is not needed for Apple's bridge where the stack attaches directly */
+#if 1 /* TBD should be an option */
+ /*
+ * Unicast. Make sure it's not for us.
+ */
+ LIST_FOREACH(brm, &sc->sc_iflist, bif_next) {
+ if(ifnet_type(brm->bif_ifp) != IFT_ETHER)
+ continue;
+
+ /* It is destined for us. */
+ if (memcmp(ifnet_lladdr(brm->bif_ifp), eh->ether_dhost,
+ ETHER_ADDR_LEN) == 0) {
+ if (brm->bif_flags & IFBIF_LEARNING)
+ (void) bridge_rtupdate(sc,
+ eh->ether_shost, ifp, 0, IFBAF_DYNAMIC);
+ m->m_pkthdr.rcvif = brm->bif_ifp;
+ m->m_pkthdr.header = mbuf_data(m);
+
+ (void) mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN, mbuf_len(m) - ETHER_HDR_LEN);
+ (void) mbuf_pkthdr_adjustlen(m, - ETHER_HDR_LEN);
+#if BRIDGE_SUPPORT_GIF
+#if NGIF > 0
+ if (ifnet_type(ifp) == IFT_GIF) {
+ m->m_flags |= M_PROTO1;
+ m->m_pkthdr.rcvif = brm->bif_ifp;
+ (*brm->bif_ifp->if_input)(brm->bif_ifp, m);
+ m = NULL;
+ }
+#endif
+#endif
+ lck_mtx_unlock(sc->sc_mtx);
+
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf( "bridge_input: %s%d ucast to member %s%d\n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if),
+ ifnet_name(brm->bif_ifp), ifnet_unit(brm->bif_ifp));
+#endif /* BRIDGE_DEBUG */
+
+ dlil_input_packet_list(brm->bif_ifp, m);
+
+ return EJUSTRETURN;
+ }
+
+ /* We just received a packet that we sent out. */
+ if (memcmp(ifnet_lladdr(brm->bif_ifp), eh->ether_shost,
+ ETHER_ADDR_LEN) == 0) {
+ lck_mtx_unlock(sc->sc_mtx);
+
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf( "bridge_input: %s%d ucast drop packet we sent out\n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if));
+#endif /* BRIDGE_DEBUG */
+
+ m_freem(m);
+ return EJUSTRETURN;
+ }
+ }
+#endif
+
+ /*
+ * If the interface is learning, and the source
+ * address is valid and not multicast, record
+ * the address.
+ */
+ if ((bif->bif_flags & IFBIF_LEARNING) != 0 &&
+ ETHER_IS_MULTICAST(eh->ether_shost) == 0 &&
+ (eh->ether_shost[0] | eh->ether_shost[1] |
+ eh->ether_shost[2] | eh->ether_shost[3] |
+ eh->ether_shost[4] | eh->ether_shost[5]) != 0) {
+ (void) bridge_rtupdate(sc, eh->ether_shost,
+ ifp, 0, IFBAF_DYNAMIC);
+ }
+
+ /* Perform the bridge forwarding function. */
+#if BRIDGE_DEBUG
+ if (_if_brige_debug)
+ printf( "bridge_input: %s%d ucast forwarding\n",
+ ifnet_name(sc->sc_if), ifnet_unit(sc->sc_if));
+#endif /* BRIDGE_DEBUG */
+
+ bridge_forward(sc, m);
+ lck_mtx_unlock(sc->sc_mtx);
+ return EJUSTRETURN;
+}
+
+/*
+ * bridge_broadcast:
+ *
+ * Send a frame to all interfaces that are members of
+ * the bridge, except for the one on which the packet
+ * arrived.
+ */
+static void
+bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if,
+ struct mbuf *m, __unused int runfilt)
+{
+ struct bridge_iflist *bif;
+ struct mbuf *mc;
+ struct ifnet *dst_if;
+ int used = 0;
+
+ lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED);
+
+ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) {
+ dst_if = bif->bif_ifp;
+ if (dst_if == src_if)
+ continue;
+
+ if (bif->bif_flags & IFBIF_STP) {
+ switch (bif->bif_state) {
+ case BSTP_IFSTATE_BLOCKING:
+ case BSTP_IFSTATE_DISABLED:
+ continue;
+ }
+ }
+
+ if ((bif->bif_flags & IFBIF_DISCOVER) == 0 &&
+ (m->m_flags & (M_BCAST|M_MCAST)) == 0)
+ continue;
+
+ if ((ifnet_flags(dst_if) & IFF_RUNNING) == 0)
+ continue;
+
+ if (LIST_NEXT(bif, bif_next) == NULL) {
+ mc = m;
+ used = 1;
+ } else {
+ mc = m_copym(m, 0, M_COPYALL, M_DONTWAIT);
+ if (mc == NULL) {
+ (void) ifnet_stat_increment_out(sc->sc_if, 0, 0, 1);
+ continue;
+ }
+ }
+
+ bridge_enqueue(sc, dst_if, mc);
+ }
+ if (used == 0)
+ m_freem(m);
+}
+
+/*
+ * bridge_rtupdate:
+ *
+ * Add a bridge routing entry.
+ */
+static int
+bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst,
+ struct ifnet *dst_if, int setflags, uint8_t flags)
+{
+ struct bridge_rtnode *brt;
+ int error;
+ /* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+ struct bridge_iflist *bif;
+ int is_pds; /* are we a proxy sta discovery interface? */
+#endif
+ struct timespec now;
+
+ /* stamp the time here so it is valid below even when only refreshing an existing entry */
+ nanouptime(&now);
+
+ /* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA - is this an interface
+ we want to do proxy sta discovery on? */
+#if IEEE80211_PROXYSTA
+ bif = bridge_lookup_member_if(sc, dst_if);
+ if ((bif) && (bif->bif_flags & IFBIF_PROXYSTA_DISCOVER)) {
+ is_pds = 1;
+ }
+ else {
+ is_pds = 0;
+ }
+#endif
+ /*
+ * A route for this destination might already exist. If so,
+ * update it, otherwise create a new one.
+ */
+ if ((brt = bridge_rtnode_lookup(sc, dst)) == NULL) {
+ /* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+ /* if we are a proxy sta discovery interface, don't count this address against the normal
+ bridge cache limit; instead let proxy stas double that limit, so there is still *some* boundary on it */
+ if (is_pds) {
+ if (sc->sc_brtcnt >= (sc->sc_brtmax+sc->sc_brtmax_proxysta))
+ return (ENOSPC);
+ }
+ else
+#endif
+ if (sc->sc_brtcnt >= sc->sc_brtmax)
+ return (ENOSPC);
+
+ /*
+ * Allocate a new bridge forwarding node, and
+ * initialize the expiration time and Ethernet
+ * address.
+ */
+ brt = zalloc_noblock(bridge_rtnode_pool);
+ if (brt == NULL)
+ return (ENOMEM);
+
+ memset(brt, 0, sizeof(*brt));
+ nanouptime(&now);
+ brt->brt_expire = now.tv_sec + sc->sc_brttimeout;
+ brt->brt_flags = IFBAF_DYNAMIC;
+ memcpy(brt->brt_addr, dst, ETHER_ADDR_LEN);
+
+ /* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA - is this an interface
+ we want to do proxy sta discovery on? If so, post a monitoring event */
+#if IEEE80211_PROXYSTA
+ if (is_pds) {
+ brt->brt_flags_ext |= IFBAF_EXT_PROXYSTA;
+#if DIAGNOSTIC
+ printf( "%s: proxysta %02x:%02x:%02x:%02x:%02x:%02x on %s; discovery\n",
+ __func__, dst[0], dst[1], dst[2], dst[3], dst[4], dst[5], dst_if->if_xname );
+#endif
+ bridge_proxysta_discover( dst_if, dst );
+ }
+#endif
+
+ if ((error = bridge_rtnode_insert(sc, brt)) != 0) {
+ zfree(bridge_rtnode_pool, brt);
+ return (error);
+ }
+ }
+
+ brt->brt_ifp = dst_if;
+ if (setflags) {
+ brt->brt_flags = flags;
+ brt->brt_expire = (flags & IFBAF_STATIC) ? 0 :
+ now.tv_sec + sc->sc_brttimeout;
+ }
+
+ /* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA - */
+#if IEEE80211_PROXYSTA
+ if (is_pds) {
+#if VERY_VERY_DIAGNOSTIC
+ printf( "%s: proxysta %02x:%02x:%02x:%02x:%02x:%02x on %s; reset timeout\n",
+ __func__, dst[0], dst[1], dst[2], dst[3], dst[4], dst[5], dst_if->if_xname );
+#endif
+ brt->brt_expire = (flags & IFBAF_STATIC) ? 0 :
+ now.tv_sec + sc->sc_brttimeout;
+ }
+#endif
+
+ return (0);
+}
+
+/*
+ * bridge_rtlookup:
+ *
+ * Lookup the destination interface for an address.
+ */
+static struct ifnet *
+bridge_rtlookup(struct bridge_softc *sc, const uint8_t *addr)
+{
+ struct bridge_rtnode *brt;
+
+ if ((brt = bridge_rtnode_lookup(sc, addr)) == NULL)
+ return (NULL);
+
+ return (brt->brt_ifp);
+}
+
+/*
+ * bridge_rttrim:
+ *
+ * Trim the routing table so that we have a number
+ * of routing entries less than or equal to the
+ * maximum number.
+ */
+static void
+bridge_rttrim(struct bridge_softc *sc)
+{
+ struct bridge_rtnode *brt, *nbrt;
+
+ /* Make sure we actually need to do this. */
+ if (sc->sc_brtcnt <= sc->sc_brtmax)
+ return;
+
+ /* Force an aging cycle; this might trim enough addresses. */
+ bridge_rtage(sc);
+ if (sc->sc_brtcnt <= sc->sc_brtmax)
+ return;
+
+ for (brt = LIST_FIRST(&sc->sc_rtlist); brt != NULL; brt = nbrt) {
+ nbrt = LIST_NEXT(brt, brt_list);
+ if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) {
+ bridge_rtnode_destroy(sc, brt);
+ if (sc->sc_brtcnt <= sc->sc_brtmax)
+ return;
+ }
+ }
+}
+
+/*
+ * bridge_timer:
+ *
+ * Aging timer for the bridge.
+ */
+static void
+bridge_timer(void *arg)
+{
+ struct bridge_softc *sc = arg;
+ struct timespec ts;
+
+ lck_mtx_lock(sc->sc_mtx);
+
+ bridge_rtage(sc);
+
+ lck_mtx_unlock(sc->sc_mtx);
+
+ if (ifnet_flags(sc->sc_if) & IFF_RUNNING) {
+ ts.tv_sec = bridge_rtable_prune_period;
+ ts.tv_nsec = 0;
+ bsd_timeout(bridge_timer, sc, &ts);
+ }
+}
+
+/*
+ * bridge_rtage:
+ *
+ * Perform an aging cycle.
+ */
+static void
+bridge_rtage(struct bridge_softc *sc)
+{
+ struct bridge_rtnode *brt, *nbrt;
+ struct timespec now;
+
+ nanouptime(&now);
+
+ for (brt = LIST_FIRST(&sc->sc_rtlist); brt != NULL; brt = nbrt) {
+ nbrt = LIST_NEXT(brt, brt_list);
+ if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) {
+ if ((unsigned long)now.tv_sec >= brt->brt_expire)
+ bridge_rtnode_destroy(sc, brt);
+ }
+ }
+}
+
+/*
+ * bridge_rtflush:
+ *
+ * Remove all dynamic addresses from the bridge.
+ */
+static void
+bridge_rtflush(struct bridge_softc *sc, int full)
+{
+ struct bridge_rtnode *brt, *nbrt;
+
+ for (brt = LIST_FIRST(&sc->sc_rtlist); brt != NULL; brt = nbrt) {
+ nbrt = LIST_NEXT(brt, brt_list);
+ if (full || (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)
+ bridge_rtnode_destroy(sc, brt);
+ }
+}
+
+/* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+/*
+ * bridge_rtdiscovery:
+ *
+ * Mark entries learned on proxy STA discovery interfaces and kick
+ * off discovery for entries not yet bound to a proxy STA interface.
+ */
+static void
+bridge_rtdiscovery(struct bridge_softc *sc)
+{
+ struct bridge_rtnode *brt, *nbrt;
+ struct bridge_iflist *bif;
+
+ for (brt = LIST_FIRST(&sc->sc_rtlist); brt != NULL; brt = nbrt) {
+ nbrt = LIST_NEXT(brt, brt_list);
+ bif = bridge_lookup_member_if(sc, brt->brt_ifp);
+ if ((bif) && (bif->bif_flags & IFBIF_PROXYSTA_DISCOVER) &&
+ ((brt->brt_flags_ext & IFBAF_EXT_PROXYSTA) == 0)) {
+#if DIAGNOSTIC
+ printf( "%s: proxysta %02x:%02x:%02x:%02x:%02x:%02x on %s; found before IFBIF_PROXYSTA_DISCOVER\n",
+ __func__, brt->brt_addr[0], brt->brt_addr[1], brt->brt_addr[2], brt->brt_addr[3],
+ brt->brt_addr[4], brt->brt_addr[5], brt->brt_ifp->if_xname );
+#endif
+ brt->brt_flags_ext |= IFBAF_EXT_PROXYSTA;
+ }
+
+ if (brt->brt_ifp_proxysta == NULL) {
+#if DIAGNOSTIC
+ printf( "%s: proxysta %02x:%02x:%02x:%02x:%02x:%02x on %s; discovery\n",
+ __func__, brt->brt_addr[0], brt->brt_addr[1], brt->brt_addr[2], brt->brt_addr[3],
+ brt->brt_addr[4], brt->brt_addr[5], brt->brt_ifp->if_xname );
+#endif
+ bridge_proxysta_discover( brt->brt_ifp, brt->brt_addr );
+ }
+ }
+}
+
+/*
+ * bridge_rtpurge:
+ *
+ * Remove all dynamic addresses from a specific interface on the bridge.
+ */
+static void
+bridge_rtpurge(struct bridge_softc *sc, struct ifnet *ifs)
+{
+ struct bridge_rtnode *brt, *nbrt;
+
+ for (brt = LIST_FIRST(&sc->sc_rtlist); brt != NULL; brt = nbrt) {
+ nbrt = LIST_NEXT(brt, brt_list);
+ if (brt->brt_ifp == ifs) {
+#if DIAGNOSTIC
+ printf( "%s: purge %s [%02x:%02x:%02x:%02x:%02x:%02x] discovered on %s\n",
+ __func__, brt->brt_ifp_proxysta ? brt->brt_ifp_proxysta->if_xname : brt->brt_ifp->if_xname,
+ brt->brt_addr[0], brt->brt_addr[1], brt->brt_addr[2],
+ brt->brt_addr[3], brt->brt_addr[4], brt->brt_addr[5], brt->brt_ifp->if_xname );
+#endif
+ bridge_rtnode_destroy(sc, brt);
+ }
+ }
+}
+#endif
+
+/*
+ * bridge_rtdaddr:
+ *
+ * Remove an address from the table.
+ */
+static int
+bridge_rtdaddr(struct bridge_softc *sc, const uint8_t *addr)
+{
+ struct bridge_rtnode *brt;
+
+ if ((brt = bridge_rtnode_lookup(sc, addr)) == NULL)
+ return (ENOENT);
+
+ bridge_rtnode_destroy(sc, brt);
+ return (0);
+}
+
+/*
+ * bridge_rtdelete:
+ *
+ * Delete routes to a specific member interface.
+ */
+__private_extern__ void
+bridge_rtdelete(struct bridge_softc *sc, struct ifnet *ifp, int full)
+{
+ struct bridge_rtnode *brt, *nbrt;
+
+ for (brt = LIST_FIRST(&sc->sc_rtlist); brt != NULL; brt = nbrt) {
+ nbrt = LIST_NEXT(brt, brt_list);
+ if (brt->brt_ifp == ifp && (full ||
+ (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC))
+ bridge_rtnode_destroy(sc, brt);
+ }
+}
+
+/*
+ * bridge_rtable_init:
+ *
+ * Initialize the route table for this bridge.
+ */
+static int
+bridge_rtable_init(struct bridge_softc *sc)
+{
+ int i;
+
+ sc->sc_rthash = _MALLOC(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE,
+ M_DEVBUF, M_WAITOK);
+ if (sc->sc_rthash == NULL)
+ return (ENOMEM);
+
+ for (i = 0; i < BRIDGE_RTHASH_SIZE; i++)
+ LIST_INIT(&sc->sc_rthash[i]);
+
+ sc->sc_rthash_key = random();
+
+ LIST_INIT(&sc->sc_rtlist);
+
+ return (0);
+}
+
+/*
+ * bridge_rtable_fini:
+ *
+ * Deconstruct the route table for this bridge.
+ */
+static void
+bridge_rtable_fini(struct bridge_softc *sc)
+{
+
+ _FREE(sc->sc_rthash, M_DEVBUF);
+}
+
+/*
+ * The following hash function is adapted from "Hash Functions" by Bob Jenkins
+ * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
+ */
+#define mix(a, b, c) \
+do { \
+a -= b; a -= c; a ^= (c >> 13); \
+b -= c; b -= a; b ^= (a << 8); \
+c -= a; c -= b; c ^= (b >> 13); \
+a -= b; a -= c; a ^= (c >> 12); \
+b -= c; b -= a; b ^= (a << 16); \
+c -= a; c -= b; c ^= (b >> 5); \
+a -= b; a -= c; a ^= (c >> 3); \
+b -= c; b -= a; b ^= (a << 10); \
+c -= a; c -= b; c ^= (b >> 15); \
+} while (/*CONSTCOND*/0)
+
+static uint32_t
+bridge_rthash(__unused struct bridge_softc *sc, const uint8_t *addr)
+{
+ /* APPLE MODIFICATION - wasabi performance improvement - simplify the hash algorithm */
+#if 0
+ uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->sc_rthash_key;
+
+ b += addr[5] << 8;
+ b += addr[4];
+ a += addr[3] << 24;
+ a += addr[2] << 16;
+ a += addr[1] << 8;
+ a += addr[0];
+
+ mix(a, b, c);
+
+ return (c & BRIDGE_RTHASH_MASK);
+#else
+ return addr[5];
+#endif
+}
+
+#undef mix
+
+/*
+ * bridge_rtnode_lookup:
+ *
+ * Look up a bridge route node for the specified destination.
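+ * The hash chains are kept sorted by address (see bridge_rtnode_insert),
+ * so the walk below can stop as soon as it passes the slot where the
+ * address would have to be.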
+ */
+static struct bridge_rtnode *
+bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr)
+{
+ struct bridge_rtnode *brt;
+ uint32_t hash;
+ int dir;
+
+ hash = bridge_rthash(sc, addr);
+ LIST_FOREACH(brt, &sc->sc_rthash[hash], brt_hash) {
+ dir = memcmp(addr, brt->brt_addr, ETHER_ADDR_LEN);
+ if (dir == 0)
+ return (brt);
+ if (dir > 0)
+ return (NULL);
+ }
+
+ return (NULL);
+}
+
+/*
+ * bridge_rtnode_insert:
+ *
+ * Insert the specified bridge node into the route table. We
+ * assume the entry is not already in the table.
+ */
+static int
+bridge_rtnode_insert(struct bridge_softc *sc, struct bridge_rtnode *brt)
+{
+ struct bridge_rtnode *lbrt;
+ uint32_t hash;
+ int dir;
+
+ hash = bridge_rthash(sc, brt->brt_addr);
+
+ lbrt = LIST_FIRST(&sc->sc_rthash[hash]);
+ if (lbrt == NULL) {
+ LIST_INSERT_HEAD(&sc->sc_rthash[hash], brt, brt_hash);
+ goto out;
+ }
+
+ do {
+ dir = memcmp(brt->brt_addr, lbrt->brt_addr, ETHER_ADDR_LEN);
+ if (dir == 0)
+ return (EEXIST);
+ if (dir > 0) {
+ LIST_INSERT_BEFORE(lbrt, brt, brt_hash);
+ goto out;
+ }
+ if (LIST_NEXT(lbrt, brt_hash) == NULL) {
+ LIST_INSERT_AFTER(lbrt, brt, brt_hash);
+ goto out;
+ }
+ lbrt = LIST_NEXT(lbrt, brt_hash);
+ } while (lbrt != NULL);
+
+#ifdef DIAGNOSTIC
+ panic("bridge_rtnode_insert: impossible");
+#endif
+
+out:
+ LIST_INSERT_HEAD(&sc->sc_rtlist, brt, brt_list);
+ sc->sc_brtcnt++;
+
+ return (0);
+}
+
+/*
+ * bridge_rtnode_destroy:
+ *
+ * Destroy a bridge rtnode.
+ */
+static void
+bridge_rtnode_destroy(struct bridge_softc *sc, struct bridge_rtnode *brt)
+{
+ lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED);
+
+ /* APPLE MODIFICATION <cbz@apple.com> - add support for Proxy STA */
+#if IEEE80211_PROXYSTA
+ if (brt->brt_flags_ext & IFBAF_EXT_PROXYSTA) {
+#if DIAGNOSTIC
+ printf( "%s: proxysta %02x:%02x:%02x:%02x:%02x:%02x %s from %s; idle timeout\n",
+ __func__, brt->brt_addr[0], brt->brt_addr[1], brt->brt_addr[2],
+ brt->brt_addr[3], brt->brt_addr[4], brt->brt_addr[5],
+ brt->brt_ifp_proxysta ? brt->brt_ifp_proxysta->if_xname : "unknown",
+ brt->brt_ifp->if_xname );
+#endif
+ bridge_proxysta_idle_timeout( brt->brt_ifp, brt->brt_addr );
+ }
+#endif
+
+ LIST_REMOVE(brt, brt_hash);
+
+ LIST_REMOVE(brt, brt_list);
+ sc->sc_brtcnt--;
+ zfree(bridge_rtnode_pool, brt);
+}
+
+static errno_t
+bridge_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func bpf_callback)
+{
+ struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp);
+
+ //printf("bridge_set_bpf_tap ifp %p mode %d\n", ifp, mode);
+
+ /* TBD locking */
+ if (sc == NULL || (sc->sc_flags & SCF_DETACHING)) {
+ return ENODEV;
+ }
+
+ switch (mode) {
+ case BPF_TAP_DISABLE:
+ sc->sc_bpf_input = sc->sc_bpf_output = NULL;
+ break;
+
+ case BPF_TAP_INPUT:
+ sc->sc_bpf_input = bpf_callback;
+ break;
+
+ case BPF_TAP_OUTPUT:
+ sc->sc_bpf_output = bpf_callback;
+ break;
+
+ case BPF_TAP_INPUT_OUTPUT:
+ sc->sc_bpf_input = sc->sc_bpf_output = bpf_callback;
+ break;
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static void
+bridge_detach(__unused ifnet_t ifp)
+{
+ struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp);
+
+ /* Tear down the routing table. */
+ bridge_rtable_fini(sc);
+
+ lck_rw_lock_exclusive(bridge_list_lock);
+ LIST_REMOVE(sc, sc_list);
+ lck_rw_done(bridge_list_lock);
+
+ ifnet_release(ifp);
+
+ lck_mtx_free(sc->sc_mtx, bridge_lock_grp);
+
+ _FREE(sc, M_DEVBUF);
+ return;
+}
+
+__private_extern__ errno_t bridge_bpf_input(ifnet_t ifp, struct mbuf *m)
+{
+ struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp);
+
+ if (sc->sc_bpf_input) {
+ if (mbuf_pkthdr_rcvif(m) != ifp)
+ printf("bridge_bpf_input rcvif: %p != ifp %p\n", mbuf_pkthdr_rcvif(m), ifp);
+ (*sc->sc_bpf_input)(ifp, m);
+ }
+ return 0;
+}
+
+__private_extern__ errno_t bridge_bpf_output(ifnet_t ifp, struct mbuf *m)
+{
+ struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp);
+
+ if (sc->sc_bpf_output) {
+ (*sc->sc_bpf_output)(ifp, m);
+ }
+ return 0;
+}
+
--- /dev/null
+/*
+ * Copyright (c) 2004-2009 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/* $apfw: if_bridgevar,v 1.7 2008/10/24 02:34:06 cbzimmer Exp $ */
+/* $NetBSD: if_bridgevar.h,v 1.8 2005/12/10 23:21:38 elad Exp $ */
+
+/*
+ * Copyright 2001 Wasabi Systems, Inc.
+ * All rights reserved.
+ *
+ * Written by Jason R. Thorpe for Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed for the NetBSD Project by
+ * Wasabi Systems, Inc.
+ * 4. The name of Wasabi Systems, Inc. may not be used to endorse
+ * or promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Jason L. Wright
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * OpenBSD: if_bridge.h,v 1.14 2001/03/22 03:48:29 jason Exp
+ */
+
+/*
+ * Data structure and control definitions for bridge interfaces.
+ */
+
+#ifndef _NET_IF_BRIDGEVAR_H_
+#define _NET_IF_BRIDGEVAR_H_
+
+#ifdef PRIVATE
+
+#include <sys/queue.h>
+
+#include <net/if.h>
+#include <net/ethernet.h>
+
+/*
+ * Commands used in the SIOCSDRVSPEC ioctl. Note the lookup of the
+ * bridge interface itself is keyed off the ifdrv structure.
+ */
+#define BRDGADD 0 /* add bridge member (ifbreq) */
+#define BRDGDEL 1 /* delete bridge member (ifbreq) */
+#define BRDGGIFFLGS 2 /* get member if flags (ifbreq) */
+#define BRDGSIFFLGS 3 /* set member if flags (ifbreq) */
+#define BRDGSCACHE 4 /* set cache size (ifbrparam) */
+#define BRDGGCACHE 5 /* get cache size (ifbrparam) */
+#define BRDGGIFS 6 /* get member list (ifbifconf) */
+#define BRDGRTS 7 /* get address list (ifbaconf) */
+#define BRDGSADDR 8 /* set static address (ifbareq) */
+#define BRDGSTO 9 /* set cache timeout (ifbrparam) */
+#define BRDGGTO 10 /* get cache timeout (ifbrparam) */
+#define BRDGDADDR 11 /* delete address (ifbareq) */
+#define BRDGFLUSH 12 /* flush address cache (ifbreq) */
+
+#define BRDGGPRI 13 /* get priority (ifbrparam) */
+#define BRDGSPRI 14 /* set priority (ifbrparam) */
+#define BRDGGHT 15 /* get hello time (ifbrparam) */
+#define BRDGSHT 16 /* set hello time (ifbrparam) */
+#define BRDGGFD 17 /* get forward delay (ifbrparam) */
+#define BRDGSFD 18 /* set forward delay (ifbrparam) */
+#define BRDGGMA 19 /* get max age (ifbrparam) */
+#define BRDGSMA 20 /* set max age (ifbrparam) */
+#define BRDGSIFPRIO 21 /* set if priority (ifbreq) */
+#define BRDGSIFCOST 22 /* set if path cost (ifbreq) */
+#define BRDGGFILT 23 /* get filter flags (ifbrparam) */
+#define BRDGSFILT 24 /* set filter flags (ifbrparam) */
+#define BRDGPURGE 25 /* purge address cache for a particular interface (ifbreq) */
+
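+/*
+ * Illustrative sketch (editorial, not part of the original interface): the
+ * commands above are issued from user space with the SIOCSDRVSPEC (set) and
+ * SIOCGDRVSPEC (get) ioctls, using a struct ifdrv that names the bridge and
+ * carries the command argument in ifd_data.  This assumes the usual BSD
+ * ifdrv layout (ifd_name/ifd_cmd/ifd_len/ifd_data) and a datagram socket.
+ * For example, to add en0 as a member of bridge0:
+ *
+ *	struct ifdrv ifd;
+ *	struct ifbreq req;
+ *
+ *	memset(&ifd, 0, sizeof (ifd));
+ *	memset(&req, 0, sizeof (req));
+ *	strlcpy(ifd.ifd_name, "bridge0", sizeof (ifd.ifd_name));
+ *	strlcpy(req.ifbr_ifsname, "en0", sizeof (req.ifbr_ifsname));
+ *	ifd.ifd_cmd = BRDGADD;
+ *	ifd.ifd_len = sizeof (req);
+ *	ifd.ifd_data = &req;
+ *	if (ioctl(sock, SIOCSDRVSPEC, &ifd) == -1)
+ *		err(1, "BRDGADD en0");
+ */
+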
+/*
+ * Generic bridge control request.
+ */
+#pragma pack(4)
+
+struct ifbreq {
+ char ifbr_ifsname[IFNAMSIZ]; /* member if name */
+ uint32_t ifbr_ifsflags; /* member if flags */
+ uint16_t ifbr_portno; /* member if port number */
+ uint8_t ifbr_state; /* member if STP state */
+ uint8_t ifbr_priority; /* member if STP priority */
+ uint8_t ifbr_path_cost; /* member if STP cost */
+};
+
+#pragma pack()
+
+/* BRDGGIFFLAGS, BRDGSIFFLAGS */
+#define IFBIF_LEARNING 0x01 /* if can learn */
+#define IFBIF_DISCOVER 0x02 /* if sends packets w/ unknown dest. */
+#define IFBIF_STP 0x04 /* if participates in spanning tree */
+/* APPLE MODIFICATION <cbz@apple.com>
+ add the following bits for ProxySTA:
+ IFBIF_PROXYSTA, IFBIF_PROXYSTA_DISCOVER
+ add the following bits for Guest Network
+ IFBIF_NO_FORWARDING
+ */
+#define IFBIF_PROXYSTA 0x08 /* if interface is a proxy sta */
+#define IFBIF_PROXYSTA_DISCOVER 0x10 /* if interface is used to discover proxy sta candidates */
+#define IFBIF_NO_FORWARDING 0x20 /* if interface cannot forward traffic from one interface to the next */
+
+/* APPLE MODIFICATION <cbz@apple.com>
+ add the following bits for ProxySTA:
+ PROXYSTA, PROXYSTA_DISCOVER
+ add the following bits for Guest Network
+ NO_FORWARDING
+ this was...
+
+ #define IFBIFBITS "\020\1LEARNING\2DISCOVER\3STP"
+ */
+#define IFBIFBITS "\020\1LEARNING\2DISCOVER\3STP\4PROXYSTA\5PROXYSTA_DISCOVER\6NO_FORWARDING"
+
+/* BRDGFLUSH */
+#define IFBF_FLUSHDYN 0x00 /* flush learned addresses only */
+#define IFBF_FLUSHALL 0x01 /* flush all addresses */
+
+/* BRDGSFILT */
+#define IFBF_FILT_USEIPF 0x00000001 /* run pfil hooks on the bridge interface */
+#define IFBF_FILT_MEMBER 0x00000002 /* run pfil hooks on the member interfaces */
+#define IFBF_FILT_ONLYIP 0x00000004 /* only pass IP[46] packets when pfil is enabled */
+#define IFBF_FILT_MASK 0x00000007 /* mask of valid values */
+
+
+/* APPLE MODIFICATION <jhw@apple.com>: Default is to pass non-IP packets. */
+#define IFBF_FILT_DEFAULT ( IFBF_FILT_USEIPF | IFBF_FILT_MEMBER )
+#if 0
+#define IFBF_FILT_DEFAULT (IFBF_FILT_USEIPF | IFBF_FILT_MEMBER | IFBF_FILT_ONLYIP)
+#endif
+
+/* STP port states */
+#define BSTP_IFSTATE_DISABLED 0
+#define BSTP_IFSTATE_LISTENING 1
+#define BSTP_IFSTATE_LEARNING 2
+#define BSTP_IFSTATE_FORWARDING 3
+#define BSTP_IFSTATE_BLOCKING 4
+
+/*
+ * Interface list structure.
+ */
+
+#pragma pack(4)
+
+struct ifbifconf {
+ uint32_t ifbic_len; /* buffer size */
+ union {
+ caddr_t ifbicu_buf;
+ struct ifbreq *ifbicu_req;
+ } ifbic_ifbicu;
+#define ifbic_buf ifbic_ifbicu.ifbicu_buf
+#define ifbic_req ifbic_ifbicu.ifbicu_req
+};
+
+#ifdef KERNEL_PRIVATE
+struct ifbifconf32 {
+ uint32_t ifbic_len; /* buffer size */
+ union {
+ user32_addr_t ifbicu_buf;
+ user32_addr_t ifbicu_req;
+ } ifbic_ifbicu;
+};
+
+struct ifbifconf64 {
+ uint32_t ifbic_len; /* buffer size */
+ union {
+ user64_addr_t ifbicu_buf;
+ user64_addr_t ifbicu_req;
+ } ifbic_ifbicu;
+};
+#endif /* KERNEL_PRIVATE */
+
+#pragma pack()
+
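+/*
+ * Illustrative sketch (editorial, not part of the original interface): the
+ * member list is read with BRDGGIFS.  The caller provides a buffer and its
+ * size in ifbic_len; following the usual BSD bridge convention, the kernel
+ * fills it with one struct ifbreq per member and reports the number of
+ * bytes used back in ifbic_len:
+ *
+ *	struct ifbifconf bifc;
+ *	struct ifbreq reqs[8];		// example size; grow and retry as needed
+ *	unsigned int i;
+ *
+ *	memset(&bifc, 0, sizeof (bifc));
+ *	bifc.ifbic_len = sizeof (reqs);
+ *	bifc.ifbic_req = reqs;
+ *	// wrap in a struct ifdrv with ifd_cmd = BRDGGIFS and issue
+ *	// SIOCGDRVSPEC as in the sketch after the command list above
+ *	for (i = 0; i < bifc.ifbic_len / sizeof (struct ifbreq); i++)
+ *		printf("%s flags 0x%x\n", reqs[i].ifbr_ifsname,
+ *		    reqs[i].ifbr_ifsflags);
+ */
+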
+/*
+ * Bridge address request.
+ */
+
+#pragma pack(4)
+
+struct ifbareq {
+ char ifba_ifsname[IFNAMSIZ]; /* member if name */
+ unsigned long ifba_expire; /* address expire time */
+ uint8_t ifba_flags; /* address flags */
+ uint8_t ifba_dst[ETHER_ADDR_LEN];/* destination address */
+};
+
+#ifdef KERNEL_PRIVATE
+struct ifbareq32 {
+ char ifba_ifsname[IFNAMSIZ]; /* member if name */
+ uint32_t ifba_expire; /* address expire time */
+ uint8_t ifba_flags; /* address flags */
+ uint8_t ifba_dst[ETHER_ADDR_LEN];/* destination address */
+};
+
+struct ifbareq64 {
+ char ifba_ifsname[IFNAMSIZ]; /* member if name */
+ uint64_t ifba_expire; /* address expire time */
+ uint8_t ifba_flags; /* address flags */
+ uint8_t ifba_dst[ETHER_ADDR_LEN];/* destination address */
+};
+#endif /* KERNEL_PRIVATE */
+
+#pragma pack()
+
+#define IFBAF_TYPEMASK 0x03 /* address type mask */
+#define IFBAF_DYNAMIC 0x00 /* dynamically learned address */
+#define IFBAF_STATIC 0x01 /* static address */
+
+#define IFBAFBITS "\020\1STATIC"
+
+/*
+ * Address list structure.
+ */
+
+#pragma pack(4)
+
+struct ifbaconf {
+ uint32_t ifbac_len; /* buffer size */
+ union {
+ caddr_t ifbacu_buf;
+ struct ifbareq *ifbacu_req;
+ } ifbac_ifbacu;
+#define ifbac_buf ifbac_ifbacu.ifbacu_buf
+#define ifbac_req ifbac_ifbacu.ifbacu_req
+};
+
+#ifdef KERNEL_PRIVATE
+struct ifbaconf32 {
+ uint32_t ifbac_len; /* buffer size */
+ union {
+ user32_addr_t ifbacu_buf;
+ user32_addr_t ifbacu_req;
+ } ifbac_ifbacu;
+};
+
+struct ifbaconf64 {
+ uint32_t ifbac_len; /* buffer size */
+ union {
+ user64_addr_t ifbacu_buf;
+ user64_addr_t ifbacu_req;
+ } ifbac_ifbacu;
+};
+#endif /* KERNEL_PRIVATE */
+
+#pragma pack()
+
+/*
+ * Bridge parameter structure.
+ */
+
+#pragma pack(4)
+
+struct ifbrparam {
+ union {
+ uint32_t ifbrpu_int32;
+ uint16_t ifbrpu_int16;
+ uint8_t ifbrpu_int8;
+ } ifbrp_ifbrpu;
+};
+
+#pragma pack()
+
+#define ifbrp_csize ifbrp_ifbrpu.ifbrpu_int32 /* cache size */
+#define ifbrp_ctime ifbrp_ifbrpu.ifbrpu_int32 /* cache time (sec) */
+#define ifbrp_prio ifbrp_ifbrpu.ifbrpu_int16 /* bridge priority */
+#define ifbrp_hellotime ifbrp_ifbrpu.ifbrpu_int8 /* hello time (sec) */
+#define ifbrp_fwddelay ifbrp_ifbrpu.ifbrpu_int8 /* fwd time (sec) */
+#define ifbrp_maxage ifbrp_ifbrpu.ifbrpu_int8 /* max age (sec) */
+#define ifbrp_filter ifbrp_ifbrpu.ifbrpu_int32 /* filtering flags */
+
+#ifdef KERNEL
+/*
+ * Timekeeping structure used in spanning tree code.
+ */
+struct bridge_timer {
+ uint16_t active;
+ uint16_t value;
+};
+
+struct bstp_config_unit {
+ uint64_t cu_rootid;
+ uint64_t cu_bridge_id;
+ uint32_t cu_root_path_cost;
+ uint16_t cu_message_age;
+ uint16_t cu_max_age;
+ uint16_t cu_hello_time;
+ uint16_t cu_forward_delay;
+ uint16_t cu_port_id;
+ uint8_t cu_message_type;
+ uint8_t cu_topology_change_acknowledgment;
+ uint8_t cu_topology_change;
+};
+
+struct bstp_tcn_unit {
+ uint8_t tu_message_type;
+};
+
+struct bridge_softc;
+
+/*
+ * Bridge interface list entry.
+ * (VL) bridge_ifmember would be a better name, more descriptive
+ */
+struct bridge_iflist {
+ LIST_ENTRY(bridge_iflist) bif_next;
+ uint64_t bif_designated_root;
+ uint64_t bif_designated_bridge;
+ uint32_t bif_path_cost;
+ uint32_t bif_designated_cost;
+ struct bridge_timer bif_hold_timer;
+ struct bridge_timer bif_message_age_timer;
+ struct bridge_timer bif_forward_delay_timer;
+ uint16_t bif_port_id;
+ uint16_t bif_designated_port;
+ struct bstp_config_unit bif_config_bpdu;
+ uint8_t bif_state;
+ uint8_t bif_topology_change_acknowledge;
+ uint8_t bif_config_pending;
+ uint8_t bif_change_detection_enabled;
+ uint8_t bif_priority;
+ struct ifnet *bif_ifp; /* member if */
+ uint32_t bif_flags; /* member if flags */
+ int bif_mutecap; /* member muted caps */
+ interface_filter_t bif_iff_ref;
+ struct bridge_softc *bif_sc;
+};
+
+/*
+ * Bridge route node.
+ */
+struct bridge_rtnode {
+ LIST_ENTRY(bridge_rtnode) brt_hash; /* hash table linkage */
+ LIST_ENTRY(bridge_rtnode) brt_list; /* list linkage */
+ struct ifnet *brt_ifp; /* destination if */
+ unsigned long brt_expire; /* expiration time */
+ uint8_t brt_flags; /* address flags */
+ uint8_t brt_addr[ETHER_ADDR_LEN];
+ /* APPLE MODIFICATION <cbz@apple.com> - add the following elements:
+ brt_flags_ext, brt_ifp_proxysta */
+#define IFBAF_EXT_PROXYSTA 0x01
+ uint8_t brt_flags_ext; /* extended flags */
+ struct ifnet *brt_ifp_proxysta; /* proxy sta if */
+};
+
+
+/*
+ * Software state for each bridge.
+ */
+struct bridge_softc {
+ LIST_ENTRY(bridge_softc) sc_list;
+ struct ifnet *sc_if;
+ uint64_t sc_designated_root;
+ uint64_t sc_bridge_id;
+ struct bridge_iflist *sc_root_port;
+ uint32_t sc_root_path_cost;
+ uint16_t sc_max_age;
+ uint16_t sc_hello_time;
+ uint16_t sc_forward_delay;
+ uint16_t sc_bridge_max_age;
+ uint16_t sc_bridge_hello_time;
+ uint16_t sc_bridge_forward_delay;
+ uint16_t sc_topology_change_time;
+ uint16_t sc_hold_time;
+ uint16_t sc_bridge_priority;
+ uint8_t sc_topology_change_detected;
+ uint8_t sc_topology_change;
+ struct bridge_timer sc_hello_timer;
+ struct bridge_timer sc_topology_change_timer;
+ struct bridge_timer sc_tcn_timer;
+ uint32_t sc_brtmax; /* max # of addresses */
+ uint32_t sc_brtcnt; /* cur. # of addresses */
+ /* APPLE MODIFICATION <cbz@apple.com> - add the following elements:
+ sc_brtmax_proxysta */
+ uint32_t sc_brtmax_proxysta; /* max # of proxy sta addresses */
+ uint32_t sc_brttimeout; /* rt timeout in seconds */
+ LIST_HEAD(, bridge_iflist) sc_iflist; /* member interface list */
+ LIST_HEAD(, bridge_rtnode) *sc_rthash; /* our forwarding table */
+ LIST_HEAD(, bridge_rtnode) sc_rtlist; /* list version of above */
+ uint32_t sc_rthash_key; /* key for hash */
+ uint32_t sc_filter_flags; /* ipf and flags */
+
+ //(VL)
+ char sc_if_xname[IFNAMSIZ];
+ bpf_packet_func sc_bpf_input;
+ bpf_packet_func sc_bpf_output;
+ u_int32_t sc_flags;
+ lck_mtx_t *sc_mtx;
+};
+
+#define SCF_DETACHING 0x1
+
+extern const uint8_t bstp_etheraddr[];
+
+int bridgeattach(int);
+void bridge_enqueue(struct bridge_softc *, struct ifnet *, struct mbuf *);
+void bridge_rtdelete(struct bridge_softc *, struct ifnet *, int);
+
+void bstp_initialization(struct bridge_softc *);
+void bstp_stop(struct bridge_softc *);
+struct mbuf *bstp_input(struct bridge_softc *, struct ifnet *, struct mbuf *);
+
+
+#endif /* KERNEL */
+#endif /* PRIVATE */
+#endif /* !_NET_IF_BRIDGEVAR_H_ */
+
/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000, 2009 Apple Computer, Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
extern struct ifqueue pkintrq;
#endif
-#if BRIDGE
-#include <net/bridge.h>
-#endif
-
/* #include "vlan.h" */
#if NVLAN > 0
#include <net/if_vlan_var.h>
/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000,2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)if_llc.h 8.1 (Berkeley) 6/10/93
+ * @(#)if_llc.h 8.1 (Berkeley) 6/10/93
*/
#ifndef _NET_IF_LLC_H_
*/
struct llc {
- u_char llc_dsap;
- u_char llc_ssap;
+ u_int8_t llc_dsap;
+ u_int8_t llc_ssap;
union {
struct {
- u_char control;
- u_char format_id;
- u_char class_id;
- u_char window_x2;
+ u_int8_t control;
+ u_int8_t format_id;
+ u_int8_t class_id;
+ u_int8_t window_x2;
} type_u;
struct {
- u_char num_snd_x2;
- u_char num_rcv_x2;
+ u_int8_t num_snd_x2;
+ u_int8_t num_rcv_x2;
} type_i;
struct {
- u_char control;
- u_char num_rcv_x2;
+ u_int8_t control;
+ u_int8_t num_rcv_x2;
} type_s;
struct {
- u_char control;
- struct frmrinfo {
- u_char rej_pdu_0;
- u_char rej_pdu_1;
- u_char frmr_control;
- u_char frmr_control_ext;
- u_char frmr_cause;
- } frmrinfo;
+ u_int8_t control;
+ /*
+ * We cannot put the following fields in a structure because
+ * the structure rounding might cause padding.
+ */
+ u_int8_t frmr_rej_pdu0;
+ u_int8_t frmr_rej_pdu1;
+ u_int8_t frmr_control;
+ u_int8_t frmr_control_ext;
+ u_int8_t frmr_cause;
} type_frmr;
struct {
- u_char control;
- u_char org_code[3];
- u_short ether_type;
- } type_snap;
+ u_int8_t control;
+ u_int8_t org_code[3];
+ u_int16_t ether_type;
+ } type_snap __attribute__((__packed__));
struct {
- u_char control;
- u_char control_ext;
+ u_int8_t control;
+ u_int8_t control_ext;
} type_raw;
} llc_un;
-};
-#define llc_control llc_un.type_u.control
-#define llc_control_ext llc_un.type_raw.control_ext
-#define llc_fid llc_un.type_u.format_id
-#define llc_class llc_un.type_u.class_id
-#define llc_window llc_un.type_u.window_x2
-#define llc_frmrinfo llc_un.type_frmr.frmrinfo
-#define llc_frmr_pdu0 llc_un.type_frmr.frmrinfo.rej_pdu0
-#define llc_frmr_pdu1 llc_un.type_frmr.frmrinfo.rej_pdu1
-#define llc_frmr_control llc_un.type_frmr.frmrinfo.frmr_control
-#define llc_frmr_control_ext llc_un.type_frmr.frmrinfo.frmr_control_ext
-#define llc_frmr_cause llc_un.type_frmr.frmrinfo.frmr_control_ext
+} __attribute__((__packed__));
+
+struct frmrinfo {
+ u_int8_t frmr_rej_pdu0;
+ u_int8_t frmr_rej_pdu1;
+ u_int8_t frmr_control;
+ u_int8_t frmr_control_ext;
+ u_int8_t frmr_cause;
+} __attribute__((__packed__));
+
+#define llc_control llc_un.type_u.control
+#define llc_control_ext llc_un.type_raw.control_ext
+#define llc_fid llc_un.type_u.format_id
+#define llc_class llc_un.type_u.class_id
+#define llc_window llc_un.type_u.window_x2
+#define llc_frmrinfo llc_un.type_frmr.frmr_rej_pdu0
+#define llc_frmr_pdu0 llc_un.type_frmr.frmr_rej_pdu0
+#define llc_frmr_pdu1 llc_un.type_frmr.frmr_rej_pdu1
+#define llc_frmr_control llc_un.type_frmr.frmr_control
+#define llc_frmr_control_ext llc_un.type_frmr.frmr_control_ext
+#define llc_frmr_cause llc_un.type_frmr.frmr_cause
+#define llc_snap llc_un.type_snap
/*
* Don't use sizeof(struct llc_un) for LLC header sizes
#define LLC_ISFRAMELEN 4
#define LLC_UFRAMELEN 3
#define LLC_FRMRLEN 7
+#define LLC_SNAPFRAMELEN 8
/*
* Unnumbered LLC format commands
/*
* ISO PDTR 10178 contains among others
*/
+#define LLC_8021D_LSAP 0x42
#define LLC_X25_LSAP 0x7e
#define LLC_SNAP_LSAP 0xaa
#define LLC_ISO_LSAP 0xfe
-#endif
+/*
+ * LLC XID definitions from 802.2, as needed
+ */
+
+#define LLC_XID_FORMAT_BASIC 0x81
+#define LLC_XID_BASIC_MINLEN (LLC_UFRAMELEN + 3)
+
+#define LLC_XID_CLASS_I 0x1
+#define LLC_XID_CLASS_II 0x3
+#define LLC_XID_CLASS_III 0x5
+#define LLC_XID_CLASS_IV 0x7
+
+
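+/*
+ * Illustrative sketch (editorial, not part of the original header): how a
+ * receiver might classify an 802.2 header using the definitions above.
+ * "payload" is a hypothetical pointer just past the Ethernet header, and
+ * 0x03 is the 802.2 UI control value (not defined in this excerpt):
+ *
+ *	struct llc *l = (struct llc *)payload;
+ *
+ *	if (l->llc_dsap == LLC_8021D_LSAP && l->llc_ssap == LLC_8021D_LSAP &&
+ *	    l->llc_control == 0x03) {
+ *		// spanning tree BPDU: LLC_UFRAMELEN bytes of LLC header,
+ *		// BPDU body follows
+ *	} else if (l->llc_dsap == LLC_SNAP_LSAP && l->llc_ssap == LLC_SNAP_LSAP &&
+ *	    l->llc_control == 0x03) {
+ *		// SNAP encapsulation: the protocol is l->llc_snap.ether_type
+ *		// and the header consumes LLC_SNAPFRAMELEN bytes
+ *	}
+ */
+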
+#endif /* !_NET_IF_LLC_H_ */
/*
- * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#define IFT_L2VLAN 0x87 /* Layer 2 Virtual LAN using 802.1Q */
#define IFT_IEEE8023ADLAG 0x88 /* IEEE802.3ad Link Aggregate */
#define IFT_IEEE1394 0x90 /* IEEE1394 High Performance SerialBus*/
+#define IFT_BRIDGE 0xd1 /* Transparent bridge interface */
/*
* These are not based on IANA assignments:
/*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
void *if_fwd_route_lock;
#endif
struct route if_fwd_route; /* cached IPv4 forwarding route */
+ void *if_bridge; /* bridge glue */
};
#ifndef __APPLE__
/*
- * Copyright (c) 2003-2008 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2003-2009 Apple Computer, Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <net/multicast_list.h>
#include <net/ether_if_module.h>
-#define IF_MAXUNIT 0x7fff /* historical value */
-
#define VLANNAME "vlan"
typedef int (bpf_callback_func)(struct ifnet *, struct mbuf *);
/*
- * Copyright (c) 2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
-/* $apfw: pf.c,v 1.37 2008/12/05 23:10:20 jhw Exp $ */
+/* $apfw: git commit 7c8016ea91f7b68950cf41729c92dd8e3e423ba7 $ */
/* $OpenBSD: pf.c,v 1.567 2008/02/20 23:40:13 henning Exp $ */
/*
void *, struct pf_pdesc *, u_short *);
static int pf_test_state_udp(struct pf_state **, int,
struct pfi_kif *, struct mbuf *, int,
- void *, struct pf_pdesc *);
+ void *, struct pf_pdesc *, u_short *);
static int pf_test_state_icmp(struct pf_state **, int,
struct pfi_kif *, struct mbuf *, int,
void *, struct pf_pdesc *, u_short *);
#define BOUND_IFACE(r, k) \
((r)->rule_flag & PFRULE_IFBOUND) ? (k) : pfi_all
-#define STATE_INC_COUNTERS(s) \
- do { \
- s->rule.ptr->states++; \
- if (s->anchor.ptr != NULL) \
- s->anchor.ptr->states++; \
- if (s->nat_rule.ptr != NULL) \
- s->nat_rule.ptr->states++; \
+#define STATE_INC_COUNTERS(s) \
+ do { \
+ s->rule.ptr->states++; \
+ VERIFY(s->rule.ptr->states != 0); \
+ if (s->anchor.ptr != NULL) { \
+ s->anchor.ptr->states++; \
+ VERIFY(s->anchor.ptr->states != 0); \
+ } \
+ if (s->nat_rule.ptr != NULL) { \
+ s->nat_rule.ptr->states++; \
+ VERIFY(s->nat_rule.ptr->states != 0); \
+ } \
} while (0)
-#define STATE_DEC_COUNTERS(s) \
- do { \
- if (s->nat_rule.ptr != NULL) \
- s->nat_rule.ptr->states--; \
- if (s->anchor.ptr != NULL) \
- s->anchor.ptr->states--; \
- s->rule.ptr->states--; \
+#define STATE_DEC_COUNTERS(s) \
+ do { \
+ if (s->nat_rule.ptr != NULL) { \
+ VERIFY(s->nat_rule.ptr->states > 0); \
+ s->nat_rule.ptr->states--; \
+ } \
+ if (s->anchor.ptr != NULL) { \
+ VERIFY(s->anchor.ptr->states > 0); \
+ s->anchor.ptr->states--; \
+ } \
+ VERIFY(s->rule.ptr->states > 0); \
+ s->rule.ptr->states--; \
} while (0)
static __inline int pf_src_compare(struct pf_src_node *, struct pf_src_node *);
#define PF_DT_SKIP_EXTGWY 0x02
#ifndef NO_APPLE_EXTENSIONS
-static const u_int16_t PF_PPTP_PORT = htons(1723);
-static const u_int32_t PF_PPTP_MAGIC_NUMBER = htonl(0x1A2B3C4D);
+static const u_int16_t PF_PPTP_PORT = 1723;
+static const u_int32_t PF_PPTP_MAGIC_NUMBER = 0x1A2B3C4D;
struct pf_pptp_hdr {
u_int16_t length;
*/
};
-static const u_int16_t PF_IKE_PORT = htons(500);
+static const u_int16_t PF_IKE_PORT = 500;
struct pf_ike_hdr {
u_int64_t initiator_cookie, responder_cookie;
int bad = 0;
(*state)->src_node->conn++;
+ VERIFY((*state)->src_node->conn != 0);
(*state)->src.tcp_est = 1;
pf_add_threshold(&(*state)->src_node->conn_rate);
TAILQ_INSERT_TAIL(&state_list, s, entry_list);
pf_status.fcounters[FCNT_STATE_INSERT]++;
pf_status.states++;
+ VERIFY(pf_status.states != 0);
pfi_kif_ref(kif, PFI_KIF_REF_STATE);
#if NPFSYNC
pfsync_insert_state(s);
lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED);
if (s->src_node != NULL) {
- if (s->src.tcp_est)
+ if (s->src.tcp_est) {
+ VERIFY(s->src_node->conn > 0);
--s->src_node->conn;
+ }
+ VERIFY(s->src_node->states > 0);
if (--s->src_node->states <= 0) {
t = s->rule.ptr->timeout[PFTM_SRC_NODE];
if (!t)
}
}
if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
+ VERIFY(s->nat_src_node->states > 0);
if (--s->nat_src_node->states <= 0) {
t = s->rule.ptr->timeout[PFTM_SRC_NODE];
if (!t)
return;
#endif
VERIFY(cur->timeout == PFTM_UNLINKED);
+ VERIFY(cur->rule.ptr->states > 0);
if (--cur->rule.ptr->states <= 0 &&
cur->rule.ptr->src_nodes <= 0)
pf_rm_rule(NULL, cur->rule.ptr);
- if (cur->nat_rule.ptr != NULL)
+ if (cur->nat_rule.ptr != NULL) {
+ VERIFY(cur->nat_rule.ptr->states > 0);
if (--cur->nat_rule.ptr->states <= 0 &&
cur->nat_rule.ptr->src_nodes <= 0)
pf_rm_rule(NULL, cur->nat_rule.ptr);
- if (cur->anchor.ptr != NULL)
+ }
+ if (cur->anchor.ptr != NULL) {
+ VERIFY(cur->anchor.ptr->states > 0);
if (--cur->anchor.ptr->states <= 0)
pf_rm_rule(NULL, cur->anchor.ptr);
+ }
pf_normalize_tcp_cleanup(cur);
pfi_kif_unref(cur->kif, PFI_KIF_REF_STATE);
TAILQ_REMOVE(&state_list, cur, entry_list);
pf_tag_unref(cur->tag);
pool_put(&pf_state_pl, cur);
pf_status.fcounters[FCNT_STATE_REMOVALS]++;
+ VERIFY(pf_status.states > 0);
pf_status.states--;
}
unsigned int cut;
sa_family_t af = pd->af;
u_int8_t proto = pd->proto;
- unsigned int low = ntohs(r->rpool.proxy_port[0]);
- unsigned int high = ntohs(r->rpool.proxy_port[1]);
+ unsigned int low = r->rpool.proxy_port[0];
+ unsigned int high = r->rpool.proxy_port[1];
#else
u_int16_t cut;
#endif
if (proto == IPPROTO_UDP) {
/*--- Never float IKE source port ---*/
- if (sxport->port == PF_IKE_PORT) {
+ if (ntohs(sxport->port) == PF_IKE_PORT) {
nxport->port = sxport->port;
return (0);
}
return (0);
}
}
+ } else if (proto == IPPROTO_TCP) {
+ struct pf_state* s;
+ /*
+ * APPLE MODIFICATION: <rdar://problem/6546358>
+ * Fix allows NAT to use a single binding for TCP sessions
+ * with the same source IP and source port
+ */
+ TAILQ_FOREACH(s, &state_list, entry_list) {
+ struct pf_state_key* sk = s->state_key;
+ if (!sk)
+ continue;
+ if (s->nat_rule.ptr != r)
+ continue;
+ if (sk->proto != IPPROTO_TCP || sk->af != af)
+ continue;
+ if (sk->lan.xport.port != sxport->port)
+ continue;
+ if (!(PF_AEQ(&sk->lan.addr, saddr, af)))
+ continue;
+ nxport->port = sk->gwy.xport.port;
+ return (0);
+ }
}
#endif
-
do {
key.af = af;
key.proto = proto;
#else
key.ext.port = dport;
#endif
-
/*
* port search; start random, step;
* similar 2 portloop in in_pcbbind
src->neg, kif))
r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR :
PF_SKIP_DST_ADDR].ptr;
- else if (!pf_match_xport(r->proto, r->proto_variant, &src->xport,
- sxport))
+ else if (!pf_match_xport(r->proto,
+ r->proto_variant, &src->xport, sxport))
#else
else if (PF_MISMATCHAW(&src->addr, saddr, pd->af,
src->neg, kif))
case AF_INET:
inp = in_pcblookup_hash(pi, saddr->v4, sport, daddr->v4, dport,
0, NULL);
+#if INET6
+ if (inp == NULL) {
+ struct in6_addr s6, d6;
+
+ memset(&s6, 0, sizeof (s6));
+ s6.s6_addr16[5] = htons(0xffff);
+ memcpy(&s6.s6_addr32[3], &saddr->v4,
+ sizeof (saddr->v4));
+
+ memset(&d6, 0, sizeof (d6));
+ d6.s6_addr16[5] = htons(0xffff);
+ memcpy(&d6.s6_addr32[3], &daddr->v4,
+ sizeof (daddr->v4));
+
+ inp = in6_pcblookup_hash(pi, &s6, sport,
+ &d6, dport, 0, NULL);
+ if (inp == NULL) {
+ inp = in_pcblookup_hash(pi, saddr->v4, sport,
+ daddr->v4, dport, INPLOOKUP_WILDCARD, NULL);
+ if (inp == NULL) {
+ inp = in6_pcblookup_hash(pi, &s6, sport,
+ &d6, dport, INPLOOKUP_WILDCARD,
+ NULL);
+ if (inp == NULL)
+ return (-1);
+ }
+ }
+ }
+#else
if (inp == NULL) {
inp = in_pcblookup_hash(pi, saddr->v4, sport,
daddr->v4, dport, INPLOOKUP_WILDCARD, NULL);
if (inp == NULL)
return (-1);
}
+#endif /* !INET6 */
break;
#endif /* INET */
#if INET6
struct udphdr *uh = pd->hdr.udp;
size_t plen = m->m_pkthdr.len - off - sizeof (*uh);
- if (uh->uh_sport == PF_IKE_PORT &&
- uh->uh_dport == PF_IKE_PORT &&
+ if (ntohs(uh->uh_sport) == PF_IKE_PORT &&
+ ntohs(uh->uh_dport) == PF_IKE_PORT &&
plen >= PF_IKE_PACKET_MINSIZE) {
if (plen > PF_IKE_PACKET_MINSIZE)
plen = PF_IKE_PACKET_MINSIZE;
if (sn != NULL) {
s->src_node = sn;
s->src_node->states++;
+ VERIFY(s->src_node->states != 0);
}
if (nsn != NULL) {
PF_ACPY(&nsn->raddr, &pd->naddr, af);
s->nat_src_node = nsn;
s->nat_src_node->states++;
+ VERIFY(s->nat_src_node->states != 0);
}
if (pd->proto == IPPROTO_TCP) {
if ((pd->flags & PFDESC_TCP_NORM) &&
sk->af = af;
#ifndef NO_APPLE_EXTENSIONS
if (pd->proto == IPPROTO_UDP) {
- if (pd->hdr.udp->uh_sport == PF_IKE_PORT &&
- pd->hdr.udp->uh_dport == PF_IKE_PORT) {
+ if (ntohs(pd->hdr.udp->uh_sport) == PF_IKE_PORT &&
+ ntohs(pd->hdr.udp->uh_dport) == PF_IKE_PORT) {
sk->proto_variant = PF_EXTFILTER_APD;
} else {
sk->proto_variant = nr ? nr->extfilter :
u_int16_t dport = (direction == PF_OUT) ?
sk->ext.xport.port : sk->gwy.xport.port;
- if (nr != NULL && dport == PF_PPTP_PORT) {
+ if (nr != NULL &&
+ ntohs(dport) == PF_PPTP_PORT) {
struct pf_app_state *as;
as = pool_get(&pf_app_state_pl,
case IPPROTO_UDP: {
struct udphdr *uh = pd->hdr.udp;
- if (nr != NULL && uh->uh_sport == PF_IKE_PORT &&
- uh->uh_dport == PF_IKE_PORT) {
+ if (nr != NULL &&
+ ntohs(uh->uh_sport) == PF_IKE_PORT &&
+ ntohs(uh->uh_dport) == PF_IKE_PORT) {
struct pf_app_state *as;
as = pool_get(&pf_app_state_pl,
as = &s->state_key->app_state->u.pptp;
m_copydata(m, off, plen, &cm);
- if (cm.hdr.magic != PF_PPTP_MAGIC_NUMBER)
+ if (ntohl(cm.hdr.magic) != PF_PPTP_MAGIC_NUMBER)
return;
- if (cm.hdr.type != htons(1))
+ if (ntohs(cm.hdr.type) != 1)
return;
sk = s->state_key;
gsk->gwy.xport.call_id = 0;
gsk->ext.xport.call_id = 0;
+ STATE_INC_COUNTERS(gs);
as->grev1_state = gs;
} else {
gsk = gs->state_key;
}
m = pf_lazy_makewritable(pd, m, off + plen);
- if (!m)
+ if (!m) {
+ as->grev1_state = NULL;
+ STATE_DEC_COUNTERS(gs);
+ pool_put(&pf_state_pl, gs);
return;
+ }
m_copyback(m, off, plen, &cm);
}
gs->creation = pf_time_second();
gs->expire = pf_time_second();
gs->timeout = PFTM_GREv1_FIRST_PACKET;
- if (gs->src_node) ++gs->src_node->states;
- if (gs->nat_src_node) ++gs->nat_src_node->states;
+ if (gs->src_node != NULL) {
+ ++gs->src_node->states;
+ VERIFY(gs->src_node->states != 0);
+ }
+ if (gs->nat_src_node != NULL) {
+ ++gs->nat_src_node->states;
+ VERIFY(gs->nat_src_node->states != 0);
+ }
pf_set_rt_ifp(gs, &sk->lan.addr);
if (pf_insert_state(BOUND_IFACE(s->rule.ptr, kif), gs)) {
* succeed. Failures are expected to be rare enough
* that fixing this is a low priority.
*/
-
+ as->grev1_state = NULL;
+ pd->lmw = -1;
pf_src_tree_remove_state(gs);
STATE_DEC_COUNTERS(gs);
pool_put(&pf_state_pl, gs);
>> sws;
dws = dst->wscale & PF_WSCALE_MASK;
} else {
+#ifndef NO_APPLE_MODIFICATION
+ /*
+ * <rdar://5786370>
+ *
+ * Window scale negotiation has failed,
+ * therefore we must restore the window
+ * scale in the state record that we
+ * optimistically removed in
+ * pf_test_rule(). Care is required to
+ * prevent arithmetic overflow from
+ * zeroing the window when it's
+ * truncated down to 16-bits. --jhw
+ */
+ u_int32_t _win = dst->max_win;
+ _win <<= dst->wscale & PF_WSCALE_MASK;
+ dst->max_win = MIN(0xffff, _win);
+#else
/* fixup other window */
dst->max_win <<= dst->wscale &
PF_WSCALE_MASK;
+#endif
/* in case of a retrans SYN|ACK */
dst->wscale = 0;
}
* the crappy stack check or if we picked up the connection
* after establishment)
*/
+#ifndef NO_APPLE_MODIFICATIONS
+ if (src->seqhi == 1 ||
+ SEQ_GEQ(end + MAX(1, (u_int32_t)dst->max_win << dws),
+ src->seqhi))
+ src->seqhi = end + MAX(1, (u_int32_t)dst->max_win << dws);
+#else
if (src->seqhi == 1 ||
SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
src->seqhi = end + MAX(1, dst->max_win << dws);
+#endif
if (win > src->max_win)
src->max_win = win;
#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
if (SEQ_GEQ(src->seqhi, end) &&
/* Last octet inside other's window space */
+#ifndef NO_APPLE_MODIFICATIONS
+ SEQ_GEQ(seq, src->seqlo - ((u_int32_t)dst->max_win << dws)) &&
+#else
SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
+#endif
/* Retrans: not more than one window back */
(ackskew >= -MAXACKWINDOW) &&
/* Acking not more than one reassembled fragment backwards */
if (SEQ_GT(end, src->seqlo))
src->seqlo = end;
/* slide the window of what the other end can send */
+#ifndef NO_APPLE_MODIFICATIONS
+ if (SEQ_GEQ(ack + ((u_int32_t)win << sws), dst->seqhi))
+ dst->seqhi = ack + MAX(((u_int32_t)win << sws), 1);
+#else
if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
dst->seqhi = ack + MAX((win << sws), 1);
-
+#endif
/* update states */
if (th->th_flags & TH_SYN)
if (SEQ_GT(end, src->seqlo))
src->seqlo = end;
/* slide the window of what the other end can send */
+#ifndef NO_APPLE_MODIFICATIONS
+ if (SEQ_GEQ(ack + ((u_int32_t)win << sws), dst->seqhi))
+ dst->seqhi = ack + MAX(((u_int32_t)win << sws), 1);
+#else
if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
dst->seqhi = ack + MAX((win << sws), 1);
+#endif
/*
* Cannot set dst->seqhi here since this could be a shotgunned
"fwd" : "rev");
printf("pf: State failure on: %c %c %c %c | %c %c\n",
SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
+#ifndef NO_APPLE_MODIFICATIONS
+ SEQ_GEQ(seq,
+ src->seqlo - ((u_int32_t)dst->max_win << dws)) ?
+#else
SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
+#endif
' ': '2',
(ackskew >= -MAXACKWINDOW) ? ' ' : '3',
(ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
static int
pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
- struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
+ struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
{
#pragma unused(h)
struct pf_state_peer *src, *dst;
}
#ifndef NO_APPLE_EXTENSIONS
- if (uh->uh_sport == PF_IKE_PORT && uh->uh_dport == PF_IKE_PORT) {
+ if (ntohs(uh->uh_sport) == PF_IKE_PORT &&
+ ntohs(uh->uh_dport) == PF_IKE_PORT) {
struct pf_ike_hdr ike;
size_t plen = m->m_pkthdr.len - off - sizeof (*uh);
if (plen < PF_IKE_PACKET_MINSIZE) {
(*state)->state_key->app_state->handler) {
(*state)->state_key->app_state->handler(*state, direction,
off + uh->uh_ulen, pd, kif);
+ if (pd->lmw < 0) {
+ REASON_SET(reason, PFRES_MEMORY);
+ return (PF_DROP);
+ }
m = pd->mp;
}
#endif
}
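The switch to ntohs() above (and in the ICMP lookup further down) matters because uh_sport/uh_dport sit in the packet in network byte order while PF_IKE_PORT is a host-order constant. A small sketch, assuming PF_IKE_PORT is the usual IKE/ISAKMP port 500:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    #define PF_IKE_PORT 500             /* assumed host-order value */

    int main(void) {
        uint16_t uh_sport = htons(PF_IKE_PORT);   /* as carried in the UDP header */

        /* On a little-endian machine the raw compare sees 0xf401, not 500. */
        printf("raw:   %s\n", uh_sport == PF_IKE_PORT ? "match" : "no match");
        printf("ntohs: %s\n", ntohs(uh_sport) == PF_IKE_PORT ? "match" : "no match");
        return 0;
    }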
if (!SEQ_GEQ(src->seqhi, seq) ||
+#ifndef NO_APPLE_MODIFICATION
+ !SEQ_GEQ(seq,
+ src->seqlo - ((u_int32_t)dst->max_win << dws))) {
+#else
!SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))) {
+#endif
if (pf_status.debug >= PF_DEBUG_MISC) {
printf("pf: BAD ICMP %d:%d ",
icmptype, pd->hdr.icmp->icmp_code);
#ifndef NO_APPLE_EXTENSIONS
key.proto_variant = PF_EXTFILTER_APD;
- if (uh.uh_sport == PF_IKE_PORT &&
- uh.uh_dport == PF_IKE_PORT) {
+ if (ntohs(uh.uh_sport) == PF_IKE_PORT &&
+ ntohs(uh.uh_dport) == PF_IKE_PORT) {
struct pf_ike_hdr ike;
size_t plen =
m->m_pkthdr.len - off2 - sizeof (uh);
h = mtod(m, struct ip *); \
} \
} while (0)
-#else
-#define PF_APPLE_UPDATE_PDESC_IPv4()
#endif
int
if ((th.th_flags & TH_ACK) && pd.p_len == 0)
pqid = 1;
action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
- if (action == PF_DROP)
+#ifndef NO_APPLE_EXTENSIONS
+ if (pd.lmw < 0)
goto done;
PF_APPLE_UPDATE_PDESC_IPv4();
+#endif
+ if (action == PF_DROP)
+ goto done;
action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
&reason);
#ifndef NO_APPLE_EXTENSIONS
REASON_SET(&reason, PFRES_SHORT);
goto done;
}
- action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
+ action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd,
+ &reason);
#ifndef NO_APPLE_EXTENSIONS
if (pd.lmw < 0)
goto done;
}
done:
+#ifndef NO_APPLE_EXTENSIONS
+ *m0 = pd.mp;
PF_APPLE_UPDATE_PDESC_IPv4();
+#endif
if (action == PF_PASS && h->ip_hl > 5 &&
!((s && s->allow_opts) || r->allow_opts)) {
}
#ifndef NO_APPLE_EXTENSIONS
+ VERIFY(m == NULL || pd.mp == NULL || pd.mp == m);
+
if (*m0) {
if (pd.lmw < 0) {
+ REASON_SET(&reason, PFRES_MEMORY);
+ action = PF_DROP;
+ }
+
+ if (action == PF_DROP) {
m_freem(*m0);
*m0 = NULL;
return (PF_DROP);
h = mtod(m, struct ip6_hdr *); \
} \
} while (0)
-#else
-#define PF_APPLE_UPDATE_PDESC_IPv6()
#endif
int
}
pd.p_len = pd.tot_len - off - (th.th_off << 2);
action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
- if (action == PF_DROP)
+#ifndef NO_APPLE_EXTENSIONS
+ if (pd.lmw < 0)
goto done;
PF_APPLE_UPDATE_PDESC_IPv6();
+#endif
+ if (action == PF_DROP)
+ goto done;
action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
&reason);
#ifndef NO_APPLE_EXTENSIONS
REASON_SET(&reason, PFRES_SHORT);
goto done;
}
- action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
+ action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd,
+ &reason);
#ifndef NO_APPLE_EXTENSIONS
if (pd.lmw < 0)
goto done;
}
done:
+#ifndef NO_APPLE_EXTENSIONS
+ *m0 = pd.mp;
PF_APPLE_UPDATE_PDESC_IPv6();
+#endif
if (n != m) {
m_freem(n);
pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd);
#else
#ifndef NO_APPLE_EXTENSIONS
+ VERIFY(m == NULL || pd.mp == NULL || pd.mp == m);
+
if (*m0) {
if (pd.lmw < 0) {
+ REASON_SET(&reason, PFRES_MEMORY);
+ action = PF_DROP;
+ }
+
+ if (action == PF_DROP) {
m_freem(*m0);
*m0 = NULL;
return (PF_DROP);
{
struct timeval t;
+ microuptime(&t);
+ return (t.tv_sec);
+}
+
+uint64_t
+pf_calendar_time_second(void)
+{
+ struct timeval t;
+
microtime(&t);
return (t.tv_sec);
}
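With this change pf_time_second() reports monotonic uptime (microuptime) and the new pf_calendar_time_second() keeps wall-clock time (microtime), presumably so state timers stop jumping when the date is changed while pf_status.since stays human-readable. A rough userland analogue of the distinction, using POSIX clocks rather than the kernel routines:

    #include <stdio.h>
    #include <time.h>

    int main(void) {
        struct timespec up, cal;

        clock_gettime(CLOCK_MONOTONIC, &up);   /* ~ pf_time_second(): immune to date changes */
        clock_gettime(CLOCK_REALTIME, &cal);   /* ~ pf_calendar_time_second(): wall clock */

        printf("uptime-ish: %lld s\n", (long long)up.tv_sec);
        printf("calendar:   %lld s\n", (long long)cal.tv_sec);
        return 0;
    }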
error = ENOMEM;
} else {
pf_status.running = 1;
- pf_status.since = pf_time_second();
+ pf_status.since = pf_calendar_time_second();
if (pf_status.stateid == 0) {
pf_status.stateid = pf_time_second();
pf_status.stateid = pf_status.stateid << 32;
mbuf_growth_normal();
pf_detach_hooks();
pf_status.running = 0;
- pf_status.since = pf_time_second();
+ pf_status.since = pf_calendar_time_second();
wakeup(pf_purge_thread_fn);
DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n"));
}
break;
}
pf_default_rule.states++;
+ VERIFY(pf_default_rule.states != 0);
break;
}
bzero(pf_status.counters, sizeof (pf_status.counters));
bzero(pf_status.fcounters, sizeof (pf_status.fcounters));
bzero(pf_status.scounters, sizeof (pf_status.scounters));
- pf_status.since = pf_time_second();
+ pf_status.since = pf_calendar_time_second();
if (*pf_status.ifname)
pfi_update_status(pf_status.ifname, NULL);
break;
__private_extern__ void *pool_get(struct pool *, int);
__private_extern__ void pool_put(struct pool *, void *);
__private_extern__ u_int64_t pf_time_second(void);
+__private_extern__ u_int64_t pf_calendar_time_second(void);
#endif /* KERNEL */
union sockaddr_union {
#if INET && MROUTING
return mrt_ioctl(req, data);
#else
+#pragma unused(req)
+#pragma unused(data)
return ENXIO;
#endif
}
/*
- * Copyright (c) 2004-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/dlil.h>
+#include <net/if_types.h>
#include <net/route.h>
#include <netinet/if_ether.h>
#include <netinet/in_var.h>
struct llinfo_arp *llinfo;
errno_t error;
int created_announcement = 0;
-
+ int bridged = 0, is_bridge = 0;
+
/* Do not respond to requests for 0.0.0.0 */
if (target_ip->sin_addr.s_addr == 0 && arpop == ARPOP_REQUEST)
goto done;
+
+ if (ifp->if_bridge)
+ bridged = 1;
+ if (ifp->if_type == IFT_BRIDGE)
+ is_bridge = 1;
/*
* Determine if this ARP is for us
+ * For a bridge, we want to check the address irrespective
+ * of the receive interface.
*/
lck_rw_lock_shared(in_ifaddr_rwlock);
TAILQ_FOREACH(ia, INADDR_HASH(target_ip->sin_addr.s_addr), ia_hash) {
- /* do_bridge should be tested here for bridging */
- if (ia->ia_ifp == ifp &&
+ if (((bridged && ia->ia_ifp->if_bridge != NULL) ||
+ (ia->ia_ifp == ifp)) &&
ia->ia_addr.sin_addr.s_addr == target_ip->sin_addr.s_addr) {
- best_ia = ia;
- ifaref(&best_ia->ia_ifa);
- lck_rw_done(in_ifaddr_rwlock);
- goto match;
+ best_ia = ia;
+ ifaref(&best_ia->ia_ifa);
+ lck_rw_done(in_ifaddr_rwlock);
+ goto match;
}
}
TAILQ_FOREACH(ia, INADDR_HASH(sender_ip->sin_addr.s_addr), ia_hash) {
- /* do_bridge should be tested here for bridging */
- if (ia->ia_ifp == ifp &&
+ if (((bridged && ia->ia_ifp->if_bridge != NULL) ||
+ (ia->ia_ifp == ifp)) &&
ia->ia_addr.sin_addr.s_addr == sender_ip->sin_addr.s_addr) {
- best_ia = ia;
- ifaref(&best_ia->ia_ifa);
- lck_rw_done(in_ifaddr_rwlock);
- goto match;
+ best_ia = ia;
+ ifaref(&best_ia->ia_ifa);
+ lck_rw_done(in_ifaddr_rwlock);
+ goto match;
+ }
+ }
+
+#define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \
+ (ia->ia_ifp->if_bridge == ifp->if_softc && \
+ !bcmp(ifnet_lladdr(ia->ia_ifp), ifnet_lladdr(ifp), ifp->if_addrlen) && \
+ addr == ia->ia_addr.sin_addr.s_addr)
+ /*
+ * Check the case when bridge shares its MAC address with
+ * some of its children, so packets are claimed by bridge
+ * itself (bridge_input() does it first), but they are really
+ * meant to be destined to the bridge member.
+ */
+ if (is_bridge) {
+ TAILQ_FOREACH(ia, INADDR_HASH(target_ip->sin_addr.s_addr), ia_hash) {
+ if (BDG_MEMBER_MATCHES_ARP(target_ip->sin_addr.s_addr, ifp, ia)) {
+ ifp = ia->ia_ifp;
+ best_ia = ia;
+ ifaref(&best_ia->ia_ifa);
+ lck_rw_done(in_ifaddr_rwlock);
+ goto match;
+ }
}
}
lck_rw_done(in_ifaddr_rwlock);
continue;
best_ia = (struct in_ifaddr *)ifa;
ifaref(&best_ia->ia_ifa);
- break;
+ ifnet_lock_done(ifp);
+ goto match;
}
ifnet_lock_done(ifp);
- /* If we don't have an IP address on this interface, ignore the packet */
- if (best_ia == NULL)
+ /*
+ * If we're not a bridge member, or if we are but there's no
+ * IPv4 address to use for the interface, drop the packet.
+ */
+ if (!bridged || best_ia == NULL)
goto done;
match:
}
/* Check for a conflict */
- if (sender_ip->sin_addr.s_addr == best_ia->ia_addr.sin_addr.s_addr) {
+ if (!bridged && sender_ip->sin_addr.s_addr == best_ia->ia_addr.sin_addr.s_addr) {
struct kev_msg ev_msg;
struct kev_in_collision *in_collision;
u_char storage[sizeof(struct kev_in_collision) + MAX_HW_LEN];
RT_LOCK_ASSERT_HELD(route);
gateway = SDL(route->rt_gateway);
- if (route->rt_ifp != ifp) {
+ if (!bridged && route->rt_ifp != ifp) {
if (!IN_LINKLOCAL(ntohl(sender_ip->sin_addr.s_addr)) || (ifp->if_eflags & IFEF_ARPLL) == 0) {
if (log_arp_warnings)
log(LOG_ERR, "arp: %s is on %s%d but got reply from %s on %s%d\n",
if (error == 0) {
RT_LOCK_ASSERT_HELD(route);
+ /*
+ * Return proxied ARP replies only on the interface
+ * or bridge cluster where this network resides.
+ * Otherwise we may conflict with the host we are
+ * proxying for.
+ */
+ if (route->rt_ifp != ifp &&
+ (route->rt_ifp->if_bridge != ifp->if_bridge ||
+ ifp->if_bridge == NULL)) {
+ RT_REMREF_LOCKED(route);
+ RT_UNLOCK(route);
+ goto done;
+ }
proxied = *SDL(route->rt_gateway);
target_hw = &proxied;
} else {
/*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <netinet/ip_dummynet.h>
#include <netinet/ip_var.h>
-#if BRIDGE
-#include <netinet/if_ether.h> /* for struct arpcom */
-#include <net/bridge.h>
-#endif
-
/*
* We keep a private variable for the simulation time, but we could
* probably use an existing one ("softticks" in sys/kern/kern_timer.c)
proto_inject(PF_INET, m);
break ;
-#if BRIDGE
- case DN_TO_BDG_FWD :
- /*
- * The bridge requires/assumes the Ethernet header is
- * contiguous in the first mbuf header. Insure this is true.
- */
- if (BDG_LOADED) {
- if (m->m_len < ETHER_HDR_LEN &&
- (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
- printf("dummynet/bridge: pullup fail, dropping pkt\n");
- break;
- }
- m = bdg_forward_ptr(m, pkt->ifp);
- } else {
- /* somebody unloaded the bridge module. Drop pkt */
- /* XXX rate limit */
- printf("dummynet: dropping bridged packet trapped in pipe\n");
- }
- if (m)
- m_freem(m);
- break;
-#endif
default:
printf("dummynet: bad switch %d!\n", pkt->dn_dir);
m_freem(m);
struct ip_moptions **imop;
{
int error = 0;
- int i;
struct in_addr addr;
struct ip_mreq mreq;
struct ifnet *ifp = NULL;
switch (sopt->sopt_name) {
/* store an index number for the vif you wanna use in the send */
#if MROUTING
- case IP_MULTICAST_VIF:
- if (legal_vif_num == 0) {
- error = EOPNOTSUPP;
- break;
- }
- error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
- if (error)
- break;
- if (!legal_vif_num(i) && (i != -1)) {
- error = EINVAL;
+ case IP_MULTICAST_VIF:
+ {
+ int i;
+ if (legal_vif_num == 0) {
+ error = EOPNOTSUPP;
+ break;
+ }
+ error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
+ if (error)
+ break;
+ if (!legal_vif_num(i) && (i != -1)) {
+ error = EINVAL;
+ break;
+ }
+ imo->imo_multicast_vif = i;
break;
}
- imo->imo_multicast_vif = i;
- break;
#endif /* MROUTING */
case IP_MULTICAST_IF:
goto drop;
#endif
+ /* Radar 7377561: Avoid processing packets while closing a listen socket */
+ if (tp->t_state == TCPS_LISTEN && (so->so_options & SO_ACCEPTCONN) == 0)
+ goto drop;
+
if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
#if TCPDEBUG
if (so->so_options & SO_DEBUG) {
KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0);
}
}
-
#if 1
lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
#endif
/*
- * Copyright (c) 2003-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
getmicrotime(&timenow);
privileged = (proc_suser(p) == 0);
-
+#if MROUTING
switch (cmd) {
case SIOCGETSGCNT_IN6:
case SIOCGETMIFCNT_IN6_32:
case SIOCGETMIFCNT_IN6_64:
return (mrt6_ioctl(cmd, data));
}
-
+#endif
if (ifp == NULL)
return (EOPNOTSUPP);
case SIOCPROTOATTACH_IN6_32:
case SIOCPROTOATTACH_IN6_64:
- switch (ifp->if_type) {
-#if IFT_BRIDGE /*OpenBSD 2.8*/
- /* some of the interfaces are inherently not IPv6 capable */
- case IFT_BRIDGE:
- return;
- /* NOTREACHED */
-#endif
- default:
- if ((error = proto_plumb(PF_INET6, ifp)))
- printf("SIOCPROTOATTACH_IN6: %s "
- "error=%d\n", if_name(ifp), error);
- break;
-
- }
+ if ((error = proto_plumb(PF_INET6, ifp)))
+ printf("SIOCPROTOATTACH_IN6: %s "
+ "error=%d\n", if_name(ifp), error);
return (error);
/* NOTREACHED */
#if IFT_IEEE80211
case IFT_IEEE80211:
#endif
+ case IFT_BRIDGE:
/* IEEE802/EUI64 cases - what others? */
/* IEEE1394 uses 16byte length address starting with EUI64 */
if (addrlen > 8)
0, rip_unlock, 0,
{ 0, 0 }, NULL, { 0 }
},
+#if MROUTING
{ SOCK_RAW, &inet6domain, IPPROTO_PIM, PR_ATOMIC|PR_ADDR|PR_LASTHDR,
pim6_input, rip6_pr_output, 0, rip6_ctloutput,
0,
0, rip_unlock, 0,
{ 0, 0 }, NULL, { 0 }
},
+#else
+{ SOCK_RAW, &inet6domain, IPPROTO_PIM, PR_ATOMIC|PR_ADDR|PR_LASTHDR,
+ 0, 0, 0, rip6_ctloutput,
+ 0,
+ 0, 0, 0, 0,
+ 0,
+ &rip6_usrreqs,
+ 0, rip_unlock, 0,
+ { 0, 0 }, NULL, { 0 }
+},
+#endif
/* raw wildcard */
{ SOCK_RAW, &inet6domain, 0, PR_ATOMIC|PR_ADDR|PR_LASTHDR,
rip6_input, rip6_pr_output, 0, rip6_ctloutput,
auto_linklocal, CTLFLAG_RW, &ip6_auto_linklocal, 0, "");
SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, CTLFLAG_RD,
&rip6stat, rip6stat, "");
+#if MROUTING
SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RD,
&mrt6stat, mrt6stat, "");
+#endif
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NEIGHBORGCTHRESH,
neighborgcthresh, CTLFLAG_RW, &ip6_neighborgcthresh, 0, "");
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXIFPREFIXES,
ifnet_lock_done(ifp);
if (in6m)
ours = 1;
+#if MROUTING
else if (!ip6_mrouter) {
+#else
+ else {
+#endif
ip6stat.ip6s_notmember++;
ip6stat.ip6s_cantforward++;
in6_ifstat_inc(ifp, ifs6_in_discard);
* ip6_mforward() returns a non-zero value, the packet
* must be discarded, else it may be accepted below.
*/
+#if MROUTING
if (ip6_mrouter && ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) {
ip6stat.ip6s_cantforward++;
m_freem(m);
lck_mtx_unlock(ip6_mutex);
return;
}
+#endif
if (!ours) {
m_freem(m);
lck_mtx_unlock(ip6_mutex);
struct socket *ip6_mrouter = NULL;
int ip6_mrouter_ver = 0;
int ip6_mrtproto = IPPROTO_PIM; /* for netstat only */
+
+#if MROUTING
+
struct mrt6stat mrt6stat;
#define NO_RTE_FOUND 0x1
rip6_input(&m, offp);
return(IPPROTO_DONE);
}
+#endif
};
#endif /* _NETINET_IP_MROUTE_H_ */
+#if MROUTING
#ifdef KERNEL_PRIVATE
extern struct mrt6stat mrt6stat;
extern int mrt6_ioctl(u_long, caddr_t);
#endif /* KERNEL_PRIVATE */
#endif /* PRIVATE */
+#endif
#endif /* !_NETINET6_IP6_MROUTE_H_ */
* above, will be forwarded by the ip6_input() routine,
* if necessary.
*/
+#if MROUTING
if (ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
if (ip6_mforward(ip6, ifp, m) != 0) {
m_freem(m);
goto done;
}
}
+#endif
}
/*
* Multicasts with a hoplimit of zero may be looped back,
extern int ip6_maxifprefixes; /* Max acceptable prefixes via RA per IF */
extern int ip6_maxifdefrouters; /* Max acceptable def routers via RA */
extern int ip6_maxdynroutes; /* Max # of routes created via redirect */
-
+#ifdef MROUTING
extern struct socket *ip6_mrouter; /* multicast routing daemon */
+#endif
extern int ip6_sendredirects; /* send IP redirects when forwarding? */
extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */
extern int ip6_maxfrags; /* Maximum fragments in reassembly queue */
struct ip *ip;
struct sockaddr_in* dst4;
struct route *ro4 = NULL;
+ struct ip_out_args ipoa = { IFSCOPE_NONE };
/*
* must be last isr because encapsulated IPv6 packet
dst4->sin_family = AF_INET;
dst4->sin_len = sizeof(*dst4);
dst4->sin_addr = ip->ip_dst;
- rtalloc(ro4);
}
- if (ro4->ro_rt == NULL) {
- OSAddAtomic(1, &ipstat.ips_noroute);
- error = EHOSTUNREACH;
- goto bad;
- }
-
state->m = ipsec4_splithdr(state->m);
if (!state->m) {
error = ENOMEM;
}
ip = mtod(state->m, struct ip *);
ip->ip_len = ntohs(ip->ip_len); /* flip len field before calling ip_output */
- ip_output(state->m, NULL, ro4, 0, NULL, NULL);
+ error = ip_output(state->m, NULL, ro4, IP_OUTARGS, NULL, &ipoa);
state->m = NULL;
+ if (error != 0)
+ goto bad;
goto done;
} else {
ipseclog((LOG_ERR, "ipsec6_output_tunnel: "
struct udphdr *uh;
struct ip *ip;
int error;
+ struct ip_out_args ipoa = { IFSCOPE_NONE };
lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED);
uh->uh_sum = 0;
*(u_int8_t*)((char*)m_mtod(m) + sizeof(struct ip) + sizeof(struct udphdr)) = 0xFF;
- error = ip_output(m, NULL, &sav->sah->sa_route, IP_NOIPSEC, NULL, NULL);
+ error = ip_output(m, NULL, &sav->sah->sa_route, IP_OUTARGS | IP_NOIPSEC, NULL, &ipoa);
if (error == 0) {
sav->natt_last_activity = natt_now;
return TRUE;
* Request loopback of the report if we are acting as a multicast
* router, so that the process-level routing daemon can hear it.
*/
+#if MROUTING
im6o.im6o_multicast_loop = (ip6_mrouter != NULL);
+#else
+ im6o.im6o_multicast_loop = 0;
+#endif
/* increment output statistics */
icmp6stat.icp6s_outhist[type]++;
/*
- * Copyright (c) 2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
dr = defrouter_lookup(&((struct sockaddr_in6 *)rt_key(rt))->
sin6_addr, rt->rt_ifp);
- if (ln->ln_router || dr) {
+ if ((ln && ln->ln_router) || dr) {
/*
* rt6_flush must be called whether or not the neighbor
* is in the Default Router List.
#if IFT_IEEE80211
case IFT_IEEE80211:
#endif
+ case IFT_BRIDGE:
case IFT_GIF: /* XXX need more cases? */
return(1);
default:
#if IFT_IEEE80211
case IFT_IEEE80211:
#endif
+ case IFT_BRIDGE:
ETHER_MAP_IPV6_MULTICAST(&SIN6(dst)->sin6_addr,
desten);
return(1);
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
case MRT6_PIM:
+#if MROUTING
error = ip6_mrouter_get(so, sopt);
break;
+#endif
default:
error = ip6_ctloutput(so, sopt);
break;
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
case MRT6_PIM:
+#if MROUTING
error = ip6_mrouter_set(so, sopt);
break;
+#endif
default:
error = ip6_ctloutput(so, sopt);
break;
if (inp == 0)
panic("rip6_detach");
/* xxx: RSVP */
+#if MROUTING
if (so == ip6_mrouter)
ip6_mrouter_done();
+#endif
if (inp->in6p_icmp6filt) {
FREE(inp->in6p_icmp6filt, M_PCB);
inp->in6p_icmp6filt = NULL;
if (slp->ns_sotype == SOCK_STREAM) {
/*
* If there are already records on the queue, defer soreceive()
- * to an nfsd so that there is feedback to the TCP layer that
+ * to an(other) nfsd so that there is feedback to the TCP layer that
* the nfs servers are heavily loaded.
*/
- if (slp->ns_rec && waitflag == MBUF_DONTWAIT) {
+ if (slp->ns_rec) {
ns_flag = SLP_NEEDQ;
goto dorecs;
}
* Parameters for buffer cache garbage collection
*/
#define BUF_STALE_THRESHHOLD 30 /* Collect if untouched in the last 30 seconds */
+#define BUF_MAX_GC_COUNT 1000 /* Generally 6-8 MB */
/*
* mask used by buf_flags... these are the readable external flags
typedef struct jetsam_priority_entry {
pid_t pid;
uint32_t flags;
+ int32_t hiwat_pages;
+ int32_t hiwat_reserved1;
+ int32_t hiwat_reserved2;
+ int32_t hiwat_reserved3;
} jetsam_priority_entry_t;
/*
* NFS export related mount flags.
*/
#define MNT_EXPORTED 0x00000100 /* file system is exported */
+#ifdef PRIVATE
+#define MNT_IMGSRC 0x00000200
+#endif /* PRIVATE */
/*
* MAC labeled / "quarantined" flag
* because the bits here were broken out from the high bits
* of the mount flags.
*/
+#ifdef CONFIG_IMGSRC_ACCESS
+#define MNTK_HAS_MOVED 0x00002000
+#define MNTK_BACKS_ROOT 0x00004000
+#endif /* CONFIG_IMGSRC_ACCESS */
#define MNTK_AUTH_CACHE_TTL 0x00008000 /* rights cache has TTL - TTL of 0 disables cache */
#define MNTK_PATH_FROM_ID 0x00010000 /* mounted file system supports id-to-path lookups */
#define MNTK_UNMOUNT_PREFLIGHT 0x00020000 /* mounted file system wants preflight check during unmount */
TAILQ_ENTRY(threadlist) th_entry;
thread_t th_thread;
int th_flags;
- uint32_t th_suspended;
uint16_t th_affinity_tag;
uint8_t th_priority;
uint8_t th_policy;
#define TH_LIST_BLOCKED 0x04
#define TH_LIST_SUSPENDED 0x08
#define TH_LIST_BUSY 0x10
+#define TH_LIST_NEED_WAKEUP 0x20
struct workitem {
TAILQ_ENTRY(workitem) wi_entry;
#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE)
#define PRIO_DARWIN_THREAD 3 /* Second argument is always 0 (current thread) */
+#define PRIO_DARWIN_PROCESS 4 /* Second argument is a PID */
/*
* Range limitations for the value of the third parameter to setpriority().
#define PRIO_MIN -20
#define PRIO_MAX 20
-/* use PRIO_DARWIN_BG to set the current thread into "background" state
+/*
+ * use PRIO_DARWIN_BG to set the current thread into "background" state
* which lowers CPU, disk IO, and networking priorities until thread terminates
* or "background" state is revoked
*/
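A hedged usage sketch for the new PRIO_DARWIN_PROCESS selector. It assumes the selector accepts PRIO_DARWIN_BG the same way PRIO_DARWIN_THREAD does and that who == 0 names the calling process; neither assumption is confirmed by this diff.

    #include <stdio.h>
    #include <sys/resource.h>

    int main(void) {
        /* Assumption: prio PRIO_DARWIN_BG enters background state, 0 restores it. */
        if (setpriority(PRIO_DARWIN_PROCESS, 0, PRIO_DARWIN_BG) == -1)
            perror("setpriority(background)");

        /* ... do low-impact work ... */

        if (setpriority(PRIO_DARWIN_PROCESS, 0, 0) == -1)
            perror("setpriority(restore)");
        return 0;
    }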
/*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
short so_options; /* from socket call, see socket.h */
short so_linger; /* time to linger while closing */
short so_state; /* internal state flags SS_*, below */
- caddr_t so_pcb; /* protocol control block */
+ void *so_pcb; /* protocol control block */
struct protosw *so_proto; /* protocol handle */
/*
* Variables for connection queueing.
/*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#define SIOCGIFBOND _IOWR('i', 71, struct ifreq) /* get bond if config */
#define SIOCIFCREATE _IOWR('i', 120, struct ifreq) /* create clone if */
#define SIOCIFDESTROY _IOW('i', 121, struct ifreq) /* destroy clone if */
+
+#define SIOCSDRVSPEC _IOW('i', 123, struct ifdrv) /* set driver-specific
+ parameters */
+#define SIOCGDRVSPEC _IOWR('i', 123, struct ifdrv) /* get driver-specific
+ parameters */
+#ifdef KERNEL_PRIVATE
+#define SIOCSDRVSPEC32 _IOW('i', 123, struct ifdrv32) /* set driver-specific
+ parameters */
+#define SIOCGDRVSPEC32 _IOWR('i', 123, struct ifdrv32) /* get driver-specific
+ parameters */
+#define SIOCSDRVSPEC64 _IOW('i', 123, struct ifdrv64) /* set driver-specific
+ parameters */
+#define SIOCGDRVSPEC64 _IOWR('i', 123, struct ifdrv64) /* get driver-specific
+ parameters */
+
+#endif /* KERNEL_PRIVATE */
#define SIOCSIFVLAN _IOW('i', 126, struct ifreq) /* set VLAN config */
#define SIOCGIFVLAN _IOWR('i', 127, struct ifreq) /* get VLAN config */
#define SIOCSETVLAN SIOCSIFVLAN
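A rough sketch of how a userland tool might issue one of the new driver-specific ioctls. The struct ifdrv layout (ifd_name/ifd_cmd/ifd_len/ifd_data), the interface name, and the sub-command are all assumptions made for illustration, not taken from this diff:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <net/if.h>

    int main(void) {
        struct ifdrv ifd;               /* assumed BSD-style layout */
        char buf[256];
        int s = socket(AF_INET, SOCK_DGRAM, 0);

        memset(&ifd, 0, sizeof(ifd));
        strlcpy(ifd.ifd_name, "bridge0", sizeof(ifd.ifd_name));  /* hypothetical interface */
        ifd.ifd_cmd = 0;                /* hypothetical driver sub-command */
        ifd.ifd_len = sizeof(buf);
        ifd.ifd_data = buf;

        if (ioctl(s, SIOCGDRVSPEC, &ifd) == -1)
            perror("SIOCGDRVSPEC");
        close(s);
        return 0;
    }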
extern struct zone *ubc_info_zone;
+/*
+ * Maximum number of vfs clusters per vnode
+ */
+#define MAX_CLUSTERS CONFIG_MAX_CLUSTERS
-#define MAX_CLUSTERS 8 /* maximum number of vfs clusters per vnode */
#define SPARSE_PUSH_LIMIT 4 /* limit on number of concurrent sparse pushes outside of the cl_lockw */
/* once we reach this limit, we'll hold the lock */
*/
extern struct vnode *rootvnode; /* root (i.e. "/") vnode */
+#ifdef CONFIG_IMGSRC_ACCESS
+extern struct vnode *imgsrc_rootvnode;
+#endif /* CONFIG_IMGSRC_ACCESS */
+
/*
* Mods for exensibility.
static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
static int buf_iterprepare(vnode_t vp, struct buflists *, int flags);
static void buf_itercomplete(vnode_t vp, struct buflists *, int flags);
-static boolean_t buffer_cache_gc(void);
+boolean_t buffer_cache_gc(void);
__private_extern__ int bdwrite_internal(buf_t, int);
return(0);
}
-static boolean_t
+boolean_t
buffer_cache_gc(void)
{
buf_t bp;
boolean_t did_large_zfree = FALSE;
int now = buf_timestamp();
+ uint32_t count = 0;
lck_mtx_lock_spin(buf_mtxp);
bp = TAILQ_FIRST(&bufqueues[BQ_META]);
/* Only collect buffers unused in the last N seconds. Note: ordered by timestamp. */
- while ((bp != NULL) && ((now - bp->b_timestamp) > BUF_STALE_THRESHHOLD)) {
+ while ((bp != NULL) && ((now - bp->b_timestamp) > BUF_STALE_THRESHHOLD) && (count < BUF_MAX_GC_COUNT)) {
int result, size;
boolean_t is_zalloc;
did_large_zfree = TRUE;
}
bp = TAILQ_FIRST(&bufqueues[BQ_META]);
+ count++;
}
lck_mtx_unlock(buf_mtxp);
* because IO_HEADZEROFILL and IO_TAILZEROFILL not set
*/
if ((start_offset + total_size) > max_io_size)
- total_size -= start_offset;
+ total_size = max_io_size - start_offset;
xfer_resid = total_size;
retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
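The rewritten clamp above caps the transfer so that start_offset + total_size no longer exceeds max_io_size; the old in-place subtraction could still leave the request past the limit. A worked sketch with made-up numbers:

    #include <stdio.h>

    int main(void) {
        unsigned int start_offset = 4096;      /* hypothetical offset */
        unsigned int total_size   = 1048576;   /* requested bytes */
        unsigned int max_io_size  = 524288;    /* hypothetical per-pass limit */

        unsigned int old_clamp = total_size - start_offset;   /* 1044480: still over the limit */
        unsigned int new_clamp = max_io_size - start_offset;  /*  520192: offset + size == limit */

        printf("old=%u new=%u limit=%u\n", old_clamp, new_clamp, max_io_size);
        return 0;
    }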
#define DRT_HASH_SMALL_MODULUS 23
#define DRT_HASH_LARGE_MODULUS 401
+/*
+ * Physical memory required before the large hash modulus is permitted.
+ *
+ * On small memory systems, the large hash modulus can lead to physical
+ * memory starvation, so we avoid using it there.
+ */
+#define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL) /* 1GiB */
+
#define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */
#define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */
* see whether we should grow to the large one.
*/
if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
- /* if the ring is nearly full */
- if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
+ /*
+ * If the ring is nearly full and we are allowed to
+ * use the large modulus, upgrade.
+ */
+ if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
+ (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
nsize = DRT_HASH_LARGE_MODULUS;
} else {
nsize = DRT_HASH_SMALL_MODULUS;
*/
struct mount *rootfs;
struct vnode *rootvnode;
+
+#ifdef CONFIG_IMGSRC_ACCESS
+struct vnode *imgsrc_rootvnode;
+#endif /* CONFIG_IMGSRC_ACCESS */
+
int (*mountroot)(void) = NULL;
/*
} while (!eofflag);
/*
* If we've made it here all the files in the dir are ._ files.
- * As we iterate through to delete them, we will verify that
- * they are true AppleDouble files.
* We can delete the files even though the node is suspended
* because we are the owner of the file.
*/
(dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'))
) {
- /*
- * This is a ._ file, so verify it is actually an AppleDouble
- * file by checking the header before we remove it.
- */
- vnode_t xvp = NULL;
- int did_namei = 0;
-
- NDINIT(&nd_temp, DELETE, USEDVP | LOCKPARENT,
- UIO_SYSSPACE, CAST_USER_ADDR_T(dp->d_name), ctx);
+ NDINIT(&nd_temp, DELETE, USEDVP, UIO_SYSSPACE, CAST_USER_ADDR_T(dp->d_name), ctx);
nd_temp.ni_dvp = vp;
- error = namei(&nd_temp);
-
- if (error) {
- if (error == ENOENT) {
- error = 0;
- } else {
- error = ENOTEMPTY;
- }
- goto out1;
- }
- did_namei = 1;
-
- xvp = nd_temp.ni_vp;
-
- error = check_appledouble_header(xvp, ctx);
- if (error) {
- error = ENOTEMPTY;
- goto out1;
- }
-
- /* Remove the file. */
- error = VNOP_REMOVE(vp, xvp, &nd_temp.ni_cnd, 0, ctx);
- if (error) {
- if (error == ENOENT) {
- error = 0;
- }
- goto out1;
- }
-
-out1:
- /* drop extra reference on vp from LOCKPARENT namei */
- vnode_put (vp);
-
- if (did_namei) {
- nameidone(&nd_temp);
- did_namei = 0;
- }
- if (xvp) {
- vnode_put(xvp);
- xvp = NULL;
- }
- if (error) {
+ error = unlink1(ctx, &nd_temp, 0);
+ if (error && error != ENOENT) {
goto outsc;
}
-
}
cpos += dp->d_reclen;
dp = (struct dirent*)cpos;
static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp,
user_addr_t bufp);
static int fsync_common(proc_t p, struct fsync_args *uap, int flags);
+
+#ifdef CONFIG_IMGSRC_ACCESS
+static int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname);
+static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx);
+static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx);
+static void undo_place_on_covered_vp(mount_t mp, vnode_t vp);
+static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags);
+static void mount_end_update(mount_t mp);
+static int relocate_imageboot_source(vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs);
+#endif /* CONFIG_IMGSRC_ACCESS */
+
int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
__private_extern__
if (error)
goto out1;
+#ifdef CONFIG_IMGSRC_ACCESS
+ if (uap->flags == MNT_IMGSRC) {
+ error = relocate_imageboot_source(vp, &nd.ni_cnd, fstypename, ctx, is_64bit, fsmountargs);
+ vnode_put(pvp);
+ vnode_put(vp);
+ return error;
+ }
+#endif /* CONFIG_IMGSRC_ACCESS */
+
if (uap->flags & MNT_UPDATE) {
if ((vp->v_flag & VROOT) == 0) {
error = EINVAL;
error = ENOTSUP;
goto out1;
}
+
+#ifdef CONFIG_IMGSRC_ACCESS
+ /* Can't downgrade the backer of the root FS */
+ if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) &&
+ (!vfs_isrdonly(mp)) && (uap->flags & MNT_RDONLY))
+ {
+ error = ENOTSUP;
+ goto out1;
+ }
+#endif /* CONFIG_IMGSRC_ACCESS */
+
/*
* Only root, or the user that did the original mount is
* permitted to update it.
return(error);
}
+#ifdef CONFIG_IMGSRC_ACCESS
+/*
+ * Flush in-core data, check for competing mount attempts,
+ * and set VMOUNT
+ */
+static int
+prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname)
+{
+ struct vnode_attr va;
+ int error;
+
+ /*
+ * If the user is not root, ensure that they own the directory
+ * onto which we are attempting to mount.
+ */
+ VATTR_INIT(&va);
+ VATTR_WANTED(&va, va_uid);
+ if ((error = vnode_getattr(vp, &va, ctx)) ||
+ (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
+ (!vfs_context_issuser(ctx)))) {
+ error = EPERM;
+ goto out;
+ }
+
+ if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) )
+ goto out;
+
+ if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) )
+ goto out;
+
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+
+ if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) {
+ error = EBUSY;
+ goto out;
+ }
+
+#if CONFIG_MACF
+ error = mac_mount_check_mount(ctx, vp,
+ cnp, fsname);
+ if (error != 0)
+ goto out;
+#endif
+
+ vnode_lock_spin(vp);
+ SET(vp->v_flag, VMOUNT);
+ vnode_unlock(vp);
+
+out:
+ return error;
+}
+
+static int
+authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx)
+{
+ struct nameidata nd;
+ vnode_t vp;
+ mode_t accessmode;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
+ if ( (error = namei(&nd)) )
+ return error;
+
+ strncpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN);
+ vp = nd.ni_vp;
+ nameidone(&nd);
+
+ if (vp->v_type != VBLK) {
+ error = ENOTBLK;
+ goto out;
+ }
+ if (major(vp->v_rdev) >= nblkdev) {
+ error = ENXIO;
+ goto out;
+ }
+ /*
+ * If mount by non-root, then verify that user has necessary
+ * permissions on the device.
+ */
+ if (!vfs_context_issuser(ctx)) {
+ accessmode = KAUTH_VNODE_READ_DATA;
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ accessmode |= KAUTH_VNODE_WRITE_DATA;
+ if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0)
+ goto out;
+ }
+
+ *devvpp = vp;
+out:
+ if (error) {
+ vnode_put(vp);
+ }
+
+ return error;
+}
+
+/*
+ * Clear VMOUNT, set v_mountedhere, and mnt_vnodecovered, ref the vnode,
+ * and call checkdirs()
+ */
+static int
+place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx)
+{
+ int error;
+
+ mp->mnt_vnodecovered = vp; /* XXX This is normally only set at init-time ... */
+
+ vnode_lock_spin(vp);
+ CLR(vp->v_flag, VMOUNT);
+ vp->v_mountedhere = mp;
+ vnode_unlock(vp);
+
+ /*
+ * taking the name_cache_lock exclusively will
+ * insure that everyone is out of the fast path who
+ * might be trying to use a now stale copy of
+ * vp->v_mountedhere->mnt_realrootvp
+ * bumping mount_generation causes the cached values
+ * to be invalidated
+ */
+ name_cache_lock();
+ mount_generation++;
+ name_cache_unlock();
+
+ error = vnode_ref(vp);
+ if (error != 0) {
+ goto out;
+ }
+
+ error = checkdirs(vp, ctx);
+ if (error != 0) {
+ /* Unmount the filesystem as cdir/rdirs cannot be updated */
+ vnode_rele(vp);
+ goto out;
+ }
+
+out:
+ if (error != 0) {
+ mp->mnt_vnodecovered = NULLVP;
+ }
+ return error;
+}
+
+static void
+undo_place_on_covered_vp(mount_t mp, vnode_t vp)
+{
+ vnode_rele(vp);
+ vnode_lock_spin(vp);
+ vp->v_mountedhere = (mount_t)NULL;
+ vnode_unlock(vp);
+
+ mp->mnt_vnodecovered = NULLVP;
+}
+
+static int
+mount_begin_update(mount_t mp, vfs_context_t ctx, int flags)
+{
+ int error;
+
+ /* unmount in progress return error */
+ mount_lock_spin(mp);
+ if (mp->mnt_lflag & MNT_LUNMOUNT) {
+ mount_unlock(mp);
+ return EBUSY;
+ }
+ mount_unlock(mp);
+ lck_rw_lock_exclusive(&mp->mnt_rwlock);
+
+ /*
+ * We only allow the filesystem to be reloaded if it
+ * is currently mounted read-only.
+ */
+ if ((flags & MNT_RELOAD) &&
+ ((mp->mnt_flag & MNT_RDONLY) == 0)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /*
+ * Only root, or the user that did the original mount is
+ * permitted to update it.
+ */
+ if (mp->mnt_vfsstat.f_owner != kauth_cred_getuid(vfs_context_ucred(ctx)) &&
+ (!vfs_context_issuser(ctx))) {
+ error = EPERM;
+ goto out;
+ }
+#if CONFIG_MACF
+ error = mac_mount_check_remount(ctx, mp);
+ if (error != 0) {
+ goto out;
+ }
+#endif
+
+out:
+ if (error) {
+ lck_rw_done(&mp->mnt_rwlock);
+ }
+
+ return error;
+}
+
+static void
+mount_end_update(mount_t mp)
+{
+ lck_rw_done(&mp->mnt_rwlock);
+}
+
+static int
+relocate_imageboot_source(vnode_t vp, struct componentname *cnp,
+ const char *fsname, vfs_context_t ctx,
+ boolean_t is64bit, user_addr_t fsmountargs)
+{
+ int error;
+ mount_t mp;
+ boolean_t placed = FALSE;
+ vnode_t devvp;
+ struct vfstable *vfsp;
+ user_addr_t devpath;
+ char *old_mntonname;
+
+ /* If we didn't imageboot, nothing to move */
+ if (imgsrc_rootvnode == NULLVP) {
+ return EINVAL;
+ }
+
+ /* Only root can do this */
+ if (!vfs_context_issuser(ctx)) {
+ return EPERM;
+ }
+
+ error = vnode_get(imgsrc_rootvnode);
+ if (error != 0) {
+ return error;
+ }
+
+ MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK);
+
+ /* Can only move once */
+ mp = vnode_mount(imgsrc_rootvnode);
+ if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
+ error = EBUSY;
+ goto out0;
+ }
+
+ /* Get exclusive rwlock on mount, authorize update on mp */
+ error = mount_begin_update(mp , ctx, 0);
+ if (error != 0) {
+ goto out0;
+ }
+
+ /*
+ * It can only be moved once. Flag is set under the rwlock,
+ * so we're now safe to proceed.
+ */
+ if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) {
+ goto out1;
+ }
+
+ /* Mark covered vnode as mount in progress, authorize placing mount on top */
+ error = prepare_coveredvp(vp, ctx, cnp, fsname);
+ if (error != 0) {
+ goto out1;
+ }
+
+ /* Sanity check the name caller has provided */
+ vfsp = mp->mnt_vtable;
+ if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) {
+ error = EINVAL;
+ goto out2;
+ }
+
+ /* Check the device vnode and update mount-from name, for local filesystems */
+ if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) {
+ if (is64bit) {
+ if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) )
+ goto out2;
+ fsmountargs += sizeof(devpath);
+ } else {
+ user32_addr_t tmp;
+ if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) )
+ goto out2;
+ /* munge into LP64 addr */
+ devpath = CAST_USER_ADDR_T(tmp);
+ fsmountargs += sizeof(tmp);
+ }
+
+ if (devpath != USER_ADDR_NULL) {
+ error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx);
+ if (error) {
+ goto out2;
+ }
+
+ vnode_put(devvp);
+ }
+ }
+
+ /*
+ * Place mp on top of vnode, ref the vnode, call checkdirs(),
+ * and increment the name cache's mount generation
+ */
+ error = place_mount_and_checkdirs(mp, vp, ctx);
+ if (error != 0) {
+ goto out2;
+ }
+
+ placed = TRUE;
+
+ strncpy(old_mntonname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
+ strncpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN);
+
+ /* Forbid future moves */
+ mount_lock(mp);
+ mp->mnt_kern_flag |= MNTK_HAS_MOVED;
+ mount_unlock(mp);
+
+ /* Finally, add to mount list, completely ready to go */
+ error = mount_list_add(mp);
+ if (error != 0) {
+ goto out3;
+ }
+
+ mount_end_update(mp);
+ vnode_put(imgsrc_rootvnode);
+ FREE(old_mntonname, M_TEMP);
+
+ return 0;
+out3:
+ strncpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN);
+
+ mount_lock(mp);
+ mp->mnt_kern_flag &= ~(MNTK_HAS_MOVED);
+ mount_unlock(mp);
+
+out2:
+ /*
+ * Placing the mp on the vnode clears VMOUNT,
+ * so cleanup is different after that point
+ */
+ if (placed) {
+ /* Rele the vp, clear VMOUNT and v_mountedhere */
+ undo_place_on_covered_vp(mp, vp);
+ } else {
+ vnode_lock_spin(vp);
+ CLR(vp->v_flag, VMOUNT);
+ vnode_unlock(vp);
+ }
+out1:
+ mount_end_update(mp);
+
+out0:
+ vnode_put(imgsrc_rootvnode);
+ FREE(old_mntonname, M_TEMP);
+ return error;
+}
+
+#endif /* CONFIG_IMGSRC_ACCESS */
+
void
enablequotas(struct mount *mp, vfs_context_t ctx)
{
goto out;
}
+#ifdef CONFIG_IMGSRC_ACCESS
+ if (mp->mnt_kern_flag & MNTK_BACKS_ROOT) {
+ error = EBUSY;
+ goto out;
+ }
+#endif /* CONFIG_IMGSRC_ACCESS */
+
return (dounmount(mp, flags, 1, ctx));
out:
do_build_install: install_symbol_sets
+EXPORTS_FILE_LIST = $(addprefix $(SOURCE)/,$(foreach set,$(SYMBOL_COMPONENT_LIST), $(set).exports $(set).$(ARCH_CONFIG_LC).exports))
+EXPORTS_FILE_LIST_NOSYSTEM60 = $(addprefix $(SOURCE)/,$(foreach set, $(filter-out System6.0,$(SYMBOL_COMPONENT_LIST)), $(set).exports $(set).$(ARCH_CONFIG_LC).exports))
+
+# Does not include "whole-kernel" clients
+build_mach_kernel_exports:
+ $(_v)if [ $(SUPPORT_SYSTEM60_KEXT) -eq 1 ]; then \
+ $(SOURCE)/generate_linker_exports.sh $(OBJPATH)/kernel-kpi.exp \
+ $(EXPORTS_FILE_LIST) || exit 1; \
+ else \
+ $(SOURCE)/generate_linker_exports.sh $(OBJPATH)/kernel-kpi.exp \
+ $(EXPORTS_FILE_LIST_NOSYSTEM60) || exit 1; \
+ fi;
+
include $(MakeInc_rule)
include $(MakeInc_dir)
-10.2.0
+10.3.0
# The first line of this file contains the master version number for the kernel.
# All other instances of the kernel version in xnu are derived from this file.
--- /dev/null
+#!/bin/sh
+
+set -e
+
+if [ $# -lt 2 ]; then
+ echo "Usage: $0 output.exp input1 [input2 ... ]" 1>&2
+ exit 1
+fi
+
+OUTPUT="$1"
+shift
+
+( grep -h -v ":" "$@"; grep -h ":" "$@" | awk -F: '{print $2}' ) | sort -u > "$OUTPUT"
+
+exit 0
if (state->fLocalMapper)
{
- state->fLocalMapperPageCount = atop_64(round_page(state->fPreparedLength));
+ state->fLocalMapperPageCount = atop_64(round_page(
+ state->fPreparedLength + ((state->fPreparedOffset + fMDSummary.fPageAlign) & page_mask)));
state->fLocalMapperPageAlloc = fMapper->iovmAllocDMACommand(this, state->fLocalMapperPageCount);
state->fMapContig = true;
}
&powerState, changeFlags);
#endif
+ // Invalidate the last recorded tickle power state when a power transition
+ // is about to occur, and not as a result of a tickle request.
+
+ if ((getPMRequestType() != kIOPMRequestTypeActivityTickle) &&
+ (fActivityTicklePowerState != -1))
+ {
+ IOLockLock(fActivityLock);
+ fActivityTicklePowerState = -1;
+ IOLockUnlock(fActivityLock);
+ }
+
// Initialize the change note.
fHeadNoteFlags = changeFlags;
| showtaskvme Display info about the task's vm_map entries
| showtaskipc Display info about the specified task's ipc space
| showtaskrights Display info about the task's ipc space entries
+| showtaskrightsbt Display info about the task's ipc space entries with back traces
| showtaskbusyports Display all of the task's ports with unread messages
|
| showact Display info about a thread specified by activation
| showallgdbcorestacks Corefile equivalent of "showallgdbstacks"
| kdp-reenter Schedule reentry into the debugger and continue.
| kdp-reboot Restart remote target
-| kdp-version Get KDP version number
+| kdp-version Get KDP version number
+| kdp-connect "shorthand" connection macro
|
| zstack Print zalloc caller stack (zone leak debugging)
| findoldest Find oldest zone leak debugging record
if $kgm_ie.ie_bits & 0x001f0000
set $kgm_name = (($kgm_iindex << 8)|($kgm_ie.ie_bits >> 24))
showipceint $kgm_iep $kgm_name
+ if $arg2 != 0 && $kgm_ie.ie_object != 0 && ($kgm_ie.ie_bits & 0x00070000) && ((ipc_port_t) $kgm_ie.ie_object)->ip_callstack[0] != 0
+ printf " user bt: "
+ showportbt $kgm_ie.ie_object $kgm_is.is_task
+ end
end
set $kgm_iindex = $kgm_iindex + 1
set $kgm_iep = &($kgm_is.is_table[$kgm_iindex])
define showipc
set $kgm_isp = (ipc_space_t)$arg0
showipcheader
- showipcint $kgm_isp 0
+ showipcint $kgm_isp 0 0
end
document showipc
Syntax: (gdb) showipc <ipc_space>
define showrights
set $kgm_isp = (ipc_space_t)$arg0
showipcheader
- showipcint $kgm_isp 1
+ showipcint $kgm_isp 1 0
end
document showrights
Syntax: (gdb) showrights <ipc_space>
showtaskheader
showipcheader
showtaskint $kgm_taskp
- showipcint $kgm_taskp->itk_space 0
+ showipcint $kgm_taskp->itk_space 0 0
end
document showtaskipc
Syntax: (gdb) showtaskipc <task>
showtaskheader
showipcheader
showtaskint $kgm_taskp
- showipcint $kgm_taskp->itk_space 1
+ showipcint $kgm_taskp->itk_space 1 0
end
document showtaskrights
Syntax: (gdb) showtaskrights <task>
| Routine to print info about the ipc rights for a task
end
+define showtaskrightsbt
+ set $kgm_taskp = (task_t)$arg0
+ showtaskheader
+ showipcheader
+ showtaskint $kgm_taskp
+ showipcint $kgm_taskp->itk_space 1 1
+end
+document showtaskrightsbt
+Syntax: (gdb) showtaskrightsbt <task>
+| Routine to print info about the ipc rights for a task with backtraces
+end
+
define showallipc
set $kgm_head_taskp = &tasks
set $kgm_cur_taskp = (struct task *)($kgm_head_taskp->next)
showtaskheader
showipcheader
showtaskint $kgm_cur_taskp
- showipcint $kgm_cur_taskp->itk_space 0
+ showipcint $kgm_cur_taskp->itk_space 0 0
set $kgm_cur_taskp = (struct task *)($kgm_cur_taskp->tasks.next)
end
end
showtaskheader
showipcheader
showtaskint $kgm_cur_taskp
- showipcint $kgm_cur_taskp->itk_space 1
+ showipcint $kgm_cur_taskp->itk_space 1 0
set $kgm_cur_taskp = (struct task *)($kgm_cur_taskp->tasks.next)
end
end
printf "0x%08x\n", $kgm_portp->ip_messages.data.port.msgcount
end
+define showportbt
+ set $kgm_iebt = ((ipc_port_t) $arg0)->ip_callstack
+ set $kgm_iepid = ((ipc_port_t) $arg0)->ip_spares[0]
+ set $kgm_procpid = ((proc_t) (((task_t) $arg1)->bsd_info))->p_pid
+ if $kgm_iebt[0] != 0
+ showptr $kgm_iebt[0]
+ set $kgm_iebt_loop_ctr = 1
+ while ($kgm_iebt_loop_ctr < 16 && $kgm_iebt[$kgm_iebt_loop_ctr])
+ printf " "
+ showptr $kgm_iebt[$kgm_iebt_loop_ctr]
+ set $kgm_iebt_loop_ctr = $kgm_iebt_loop_ctr + 1
+ end
+ if $kgm_iepid != $kgm_procpid
+ printf " (%d)", $kgm_iepid
+ end
+ printf "\n"
+ end
+end
+
define showportint
printf "0x%08x ", $arg0
set $kgm_portp = (struct ipc_port *)$arg0
dumpinfoint KDP_DUMPINFO_GETINFO
set $kgm_dumpinfo = (kdp_dumpinfo_reply_t *) manual_pkt.data
if $kgm_dumpinfo->type & KDP_DUMPINFO_REBOOT
- printf "Sysem will reboot after kernel info gets dumped.\n"
+ printf "System will reboot after kernel info gets dumped.\n"
else
printf "Sysem will not reboot after kernel info gets dumped.\n"
end
|Routine to print information about all receive rights on the system that
|have enqueued messages.
end
+
+define kdp-connect
+ if $argc > 0
+ kdp-reattach $arg0
+ else
+ printf "Attempting to attach to localhost...\n"
+ kdp-reattach localhost
+ end
+end
+
+document kdp-connect
+Syntax: (gdb) kdp-connect <address-of-remote-host>
+| Attach to the machine with the given hostname or IP address, or to 'localhost' if none is given
+end
Boolean alreadyLoaded = false;
OSKext * lastLoadedKext = NULL;
- if (!sLoadEnabled) {
- if (!isLoaded() || (!isStarted() && startOpt != kOSKextExcludeNone) ||
- (startMatchingOpt != kOSKextExcludeNone)) {
-
- OSKextLog(this,
- kOSKextLogErrorLevel |
- kOSKextLogLoadFlag,
- "Kext loading is disabled "
- "(attempt to load/start/start matching for kext %s).",
- getIdentifierCString());
- }
- result = kOSKextReturnDisabled;
- goto finish;
- }
-
if (isLoaded()) {
alreadyLoaded = true;
result = kOSReturnSuccess;
goto loaded;
}
+ if (!sLoadEnabled) {
+ OSKextLog(this,
+ kOSKextLogErrorLevel |
+ kOSKextLogLoadFlag,
+ "Kext loading is disabled (attempt to load kext %s).",
+ getIdentifierCString());
+ result = kOSKextReturnDisabled;
+ goto finish;
+ }
+
/* If we've pushed the next available load tag to the invalid value,
* we can't load any more kexts.
*/
OSKext::saveLoadedKextPanicList();
loaded:
- /* This is a bit of a hack, because we shouldn't be handling
- * personalities within the load function.
- */
+
if (declaresExecutable() && (startOpt == kOSKextExcludeNone)) {
result = start();
if (result != kOSReturnSuccess) {
/* If not excluding matching, send the personalities to the kernel.
* This never affects the result of the load operation.
+ * This is a bit of a hack, because we shouldn't be handling
+ * personalities within the load function.
*/
if (result == kOSReturnSuccess && startMatchingOpt == kOSKextExcludeNone) {
- sendPersonalitiesToCatalog(true, personalityNames);
+ result = sendPersonalitiesToCatalog(true, personalityNames);
}
-
finish:
+
+ /* More hack! If the kext doesn't declare an executable, even if we
+ * "loaded" it, we have to remove any personalities naming it, or we'll
+ * never see the registry go quiet. Errors here do not count for the
+ * load operation itself.
+ *
+ * Note that in every other regard it's perfectly ok for a kext to
+ * not declare an executable and serve only as a package for personalities
+ * naming another kext, so we do have to allow such kexts to be "loaded"
+ * so that those other personalities get added & matched.
+ */
+ if (!declaresExecutable()) {
+ OSKextLog(this,
+ kOSKextLogStepLevel | kOSKextLogLoadFlag,
+ "Kext %s has no executable; removing any personalities naming it.",
+ getIdentifierCString());
+ removePersonalitiesFromCatalog();
+ }
+
if (result != kOSReturnSuccess) {
OSKextLog(this,
kOSKextLogErrorLevel |
goto finish;
}
+ if (!sLoadEnabled) {
+ OSKextLog(this,
+ kOSKextLogErrorLevel |
+ kOSKextLogLoadFlag,
+ "Kext loading is disabled (attempt to start kext %s).",
+ getIdentifierCString());
+ result = kOSKextReturnDisabled;
+ goto finish;
+ }
+
result = validateKextMapping(/* start? */ true);
if (result != kOSReturnSuccess) {
goto finish;
/*********************************************************************
Might want to change this to a bool return?
*********************************************************************/
-void
+OSReturn
OSKext::sendPersonalitiesToCatalog(
bool startMatching,
OSArray * personalityNames)
{
- OSArray * personalitiesToSend = NULL; // must release
- OSDictionary * kextPersonalities = NULL; // do not release
+ OSReturn result = kOSReturnSuccess;
+ OSArray * personalitiesToSend = NULL; // must release
+ OSDictionary * kextPersonalities = NULL; // do not release
int count, i;
+ if (!sLoadEnabled) {
+ OSKextLog(this,
+ kOSKextLogErrorLevel |
+ kOSKextLogLoadFlag,
+ "Kext loading is disabled (attempt to start matching for kext %s).",
+ getIdentifierCString());
+ result = kOSKextReturnDisabled;
+ goto finish;
+ }
+
if (sSafeBoot && !isLoadableInSafeBoot()) {
OSKextLog(this,
kOSKextLogErrorLevel |
"Kext %s is not loadable during safe boot; "
"not sending personalities to the IOCatalogue.",
getIdentifierCString());
- return;
+ result = kOSKextReturnNotLoadable;
+ goto finish;
}
if (!personalityNames || !personalityNames->getCount()) {
kextPersonalities = OSDynamicCast(OSDictionary,
getPropertyForHostArch(kIOKitPersonalitiesKey));
if (!kextPersonalities || !kextPersonalities->getCount()) {
+ // not an error
goto finish;
}
personalitiesToSend = OSArray::withCapacity(0);
if (!personalitiesToSend) {
+ result = kOSKextReturnNoMemory;
goto finish;
}
count = personalityNames->getCount();
if (personalitiesToSend) {
personalitiesToSend->release();
}
- return;
+ return result;
}
/*********************************************************************
+* xxx - We should allow removing the kext's declared personalities,
+* xxx - even with other bundle identifiers.
*********************************************************************/
void
OSKext::removePersonalitiesFromCatalog(void)
options CONFIG_KXLD # kxld/runtime linking of kexts # <config_kxld>
+# secure_kernel - secure kernel from user programs
+options SECURE_KERNEL # <secure_kernel>
+
libkern/kxld/kxld.c optional config_kxld
libkern/kxld/kxld_array.c optional config_kxld
libkern/kxld/kxld_copyright.c optional config_kxld
+libkern/kxld/kxld_demangle.c optional config_kxld
libkern/kxld/kxld_dict.c optional config_kxld
libkern/kxld/kxld_kext.c optional config_kxld
libkern/kxld/kxld_reloc.c optional config_kxld
-isysroot $(SDKROOT)
LDFLAGS=$(ARCHS) -dynamiclib -install_name $(LIBKXLDNAME) \
-compatibility_version $(COMPATIBILITY_VERSION) \
- -current_version $(CURRENT_VERSION) -isysroot $(SDKROOT)
+ -current_version $(CURRENT_VERSION) -isysroot $(SDKROOT) -lstdc++
INCLUDES=-I$(HDRSRC) $(INCFLAGS_EXTERN)
# Tools
# Files
HDR_NAMES=kxld.h kxld_types.h
-OBJ_NAMES=kxld.o kxld_array.o kxld_copyright.o kxld_dict.o kxld_kext.o kxld_reloc.o \
- kxld_sect.o kxld_seg.o kxld_sym.o kxld_state.o kxld_symtab.o kxld_util.o \
- kxld_uuid.o kxld_vtable.o
+OBJ_NAMES=kxld.o kxld_array.o kxld_copyright.o kxld_demangle.o kxld_dict.o \
+ kxld_kext.o kxld_reloc.o kxld_sect.o kxld_seg.o kxld_sym.o kxld_state.o \
+ kxld_symtab.o kxld_util.o kxld_uuid.o kxld_vtable.o
HDRS=$(addprefix $(HDRSRC)/, $(HDR_NAMES))
OBJS=$(addprefix $(OBJROOT)/, $(OBJ_NAMES))
*/
if (array->maxitems < nitems) {
STAILQ_FOREACH_SAFE(srcpool, &array->pools, entries, tmp) {
- STAILQ_INSERT_TAIL(&srcpools, srcpool, entries);
STAILQ_REMOVE(&array->pools, srcpool, kxld_array_pool, entries);
+ STAILQ_INSERT_TAIL(&srcpools, srcpool, entries);
}
srcpool_capacity = array->pool_capacity;
bzero(array, sizeof(*array));
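The reordering above matters because a pool can sit on only one STAILQ at a time: it must be unlinked from array->pools before it is threaded onto the local list, or the shared linkage field is clobbered and the source list is corrupted mid-walk. A small self-contained sketch of the corrected pattern; the types and names here are illustrative, not the kxld ones:

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/queue.h>

    struct pool {
        int id;
        STAILQ_ENTRY(pool) entries;
    };
    STAILQ_HEAD(pool_head, pool);

    int main(void) {
        struct pool_head src = STAILQ_HEAD_INITIALIZER(src);
        struct pool_head dst = STAILQ_HEAD_INITIALIZER(dst);
        struct pool *p, *tmp;

        for (int i = 0; i < 3; i++) {
            p = calloc(1, sizeof(*p));
            p->id = i;
            STAILQ_INSERT_TAIL(&src, p, entries);
        }

        /* Remove first, then insert -- the order the patch restores. */
        STAILQ_FOREACH_SAFE(p, &src, entries, tmp) {
            STAILQ_REMOVE(&src, p, pool, entries);
            STAILQ_INSERT_TAIL(&dst, p, entries);
        }

        STAILQ_FOREACH(p, &dst, entries)
            printf("moved pool %d\n", p->id);
        return 0;
    }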
--- /dev/null
+#if !KERNEL
+
+#include <stdlib.h>
+
+/* This demangler is part of the C++ ABI. We don't include it directly from
+ * <cxxabi.h> so that we can avoid using C++ in the kernel linker.
+ */
+extern char *
+__cxa_demangle(const char* __mangled_name, char* __output_buffer,
+ size_t* __length, int* __status);
+
+#endif /* !KERNEL */
+
+#include "kxld_demangle.h"
+
+/*******************************************************************************
+*******************************************************************************/
+const char *
+kxld_demangle(const char *str, char **buffer __unused, size_t *length __unused)
+{
+#if KERNEL
+ return str;
+#else
+ const char *rval = NULL;
+ char *demangled = NULL;
+ int status;
+
+ if (!str) goto finish;
+
+ rval = str;
+
+ if (!buffer || !length) goto finish;
+
+ /* Symbol names in the symbol table have an extra '_' prepended to them,
+ * so we skip the first character to make the demangler happy.
+ */
+ demangled = __cxa_demangle(str+1, *buffer, length, &status);
+ if (!demangled || status) goto finish;
+
+ *buffer = demangled;
+ rval = demangled;
+finish:
+ return rval;
+#endif
+}
+
--- /dev/null
+#ifndef _KXLD_DEMANGLE_H_
+#define _KXLD_DEMANGLE_H_
+
+#include <sys/types.h>
+
+/* @function kxld_demangle
+
+ * @abstract Demangles c++ symbols.
+ *
+ * @param str The C-string to be demangled.
+ * @param buffer A pointer to a character buffer for storing the result.
+ * If NULL, a buffer will be malloc'd and stored here.
+ * If the buffer is not large enough, it will be realloc'd.
+ *
+ * @param length The length of the buffer.
+ *
+ * @result If the input string could be demangled, it returns the
+ * demangled string. Otherwise, returns the input string.
+ *
+ */
+const char * kxld_demangle(const char *str, char **buffer, size_t *length)
+ __attribute__((pure, nonnull, visibility("hidden")));
+
+#endif /* !_KXLD_DEMANGLE_H_ */
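An illustrative userland caller for this interface, mirroring the reuse of one buffer/length pair across calls that the kxld call sites below adopt. The mangled name is made up, and the free() relies on the !KERNEL path allocating through __cxa_demangle (which uses malloc):

    #include <stdio.h>
    #include <stdlib.h>
    #include "kxld_demangle.h"

    int main(void) {
        char *buf = NULL;          /* grown/reused by the demangler */
        size_t len = 0;

        /* "__ZTV6OSKext" is a hypothetical Mach-O symbol ("vtable for OSKext");
         * kxld_demangle() skips the leading '_' before handing it to the ABI. */
        const char *pretty = kxld_demangle("__ZTV6OSKext", &buf, &len);
        printf("%s\n", pretty);

        free(buf);                 /* still NULL if demangling failed; free(NULL) is fine */
        return 0;
    }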
#define DEBUG_ASSERT_COMPONENT_NAME_STRING "kxld"
#include <AssertMacros.h>
+#include "kxld_demangle.h"
#include "kxld_dict.h"
#include "kxld_kext.h"
#include "kxld_reloc.h"
char class_name[KXLD_MAX_NAME_LEN];
char vtable_name[KXLD_MAX_NAME_LEN];
char meta_vtable_name[KXLD_MAX_NAME_LEN];
+ char *demangled_name1 = NULL;
+ char *demangled_name2 = NULL;
+ size_t demangled_length1 = 0;
+ size_t demangled_length2 = 0;
u_int i = 0;
u_int nvtables = 0;
} else {
kxld_log(kKxldLogPatching, kKxldLogErr,
"Warning: " kKxldLogMissingVtable,
- meta_vtable_name, class_name);
+ kxld_demangle(meta_vtable_name, &demangled_name1,
+ &demangled_length1),
+ kxld_demangle(class_name, &demangled_name2,
+ &demangled_length2));
kxld_array_resize(&kext->vtables, --nvtables);
}
}
rval = KERN_SUCCESS;
finish:
+
+ if (demangled_name1) kxld_free(demangled_name1, demangled_length1);
+ if (demangled_name2) kxld_free(demangled_name2, demangled_length2);
+
return rval;
}
boolean_t tests_for_weak = FALSE;
boolean_t error = FALSE;
boolean_t warning = FALSE;
+ char *demangled_name = NULL;
+ size_t demangled_length = 0;
check(kext);
check(defined_symbols);
"The following symbols were defined more than once:");
}
- kxld_log(kKxldLogLinking, kKxldLogErr,
- "\t%s: %p - %p", sym->name,
+ kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s: %p - %p",
+ kxld_demangle(sym->name, &demangled_name, &demangled_length),
(void *) (uintptr_t) sym->link_addr,
(void *) (uintptr_t) addr);
}
"The following are common symbols:");
}
}
- kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s", sym->name);
+ kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s",
+ kxld_demangle(sym->name, &demangled_name, &demangled_length));
} else {
if (obsolete_symbols && kxld_dict_find(obsolete_symbols, name)) {
kxld_log(kKxldLogLinking, kKxldLogWarn,
- "This kext uses obsolete symbol %s.", name);
+ "This kext uses obsolete symbol %s.",
+ kxld_demangle(name, &demangled_name, &demangled_length));
}
} else if (kext->link_type == KXLD_LINK_PSEUDO_KEXT) {
"This symbol set has the following unresolved symbols:");
warning = TRUE;
}
- kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s", sym->name);
+ kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s",
+ kxld_demangle(sym->name, &demangled_name, &demangled_length));
kxld_sym_delete(sym);
} else if (kxld_sym_is_weak(sym)) {
rval = KERN_SUCCESS;
finish:
+ if (demangled_name) kxld_free(demangled_name, demangled_length);
return rval;
}
char vtable_name[KXLD_MAX_NAME_LEN];
char super_vtable_name[KXLD_MAX_NAME_LEN];
char final_sym_name[KXLD_MAX_NAME_LEN];
+ char *demangled_name1 = NULL;
+ char *demangled_name2 = NULL;
+ size_t demangled_length1 = 0;
+ size_t demangled_length2 = 0;
size_t len = 0;
u_int nvtables = 0;
u_int npatched = 0;
if (failure) {
kxld_log(kKxldLogPatching, kKxldLogErr,
- "\t%s (super vtable %s)", vtable_name, super_vtable_name);
+ "\t'%s' (super vtable '%s')",
+ kxld_demangle(vtable_name, &demangled_name1,
+ &demangled_length1),
+ kxld_demangle(super_vtable_name, &demangled_name2,
+ &demangled_length2));
continue;
}
require_action(!final_sym, finish,
rval=KERN_FAILURE;
kxld_log(kKxldLogPatching, kKxldLogErr,
- "Class %s is a subclass of final class %s.",
- class_name, super_class_name));
+ "Class '%s' is a subclass of final class '%s'.",
+ kxld_demangle(class_name, &demangled_name1,
+ &demangled_length1),
+ kxld_demangle(super_class_name, &demangled_name2,
+ &demangled_length2)));
/* Patch the class's vtable */
rval = kxld_vtable_patch(vtable, super_vtable, kext->symtab,
rval = KERN_SUCCESS;
finish:
+ if (demangled_name1) kxld_free(demangled_name1, demangled_length1);
+ if (demangled_name2) kxld_free(demangled_name2, demangled_length2);
+
return rval;
}
KXLDSymtabIterator iter;
KXLDSym *sym = NULL;
u_int error = FALSE;
+ char *demangled_name = NULL;
+ size_t demangled_length = 0;
/* Check for any unresolved symbols */
kxld_symtab_iterator_init(&iter, kext->symtab, kxld_sym_is_unresolved, FALSE);
kxld_log(kKxldLogLinking, kKxldLogErr,
"The following symbols are unresolved for this kext:");
}
- kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s", sym->name);
+ kxld_log(kKxldLogLinking, kKxldLogErr, "\t%s",
+ kxld_demangle(sym->name, &demangled_name, &demangled_length));
}
require_noerr_action(error, finish, rval=KERN_FAILURE);
rval = KERN_SUCCESS;
finish:
+ if (demangled_name) kxld_free(demangled_name, demangled_length);
return rval;
}
#define kKxldLogArchNotSupported "The target architecture (cputype 0x%x) is not supported by kxld."
#define kKxldLogArchNotFound "The kext does not contain a fat slice for the target architecture."
#define kKxldLogFiletypeNotSupported "The Mach-O filetype 0x%x is not supported on the target architecture."
-#define kKxldLogTruncatedMachO "The Mach-O file has been truncated. Make sure the Mach-O header structures are correct."
+#define kKxldLogTruncatedMachO "The Mach-O file has been truncated. Make sure the Mach-O header structures are correct."
#define kKxldLogMalformedMachO "The Mach-O file is malformed: "
-#define kKxldLogMalformedVTable "The vtable %s is malformed. Make sure your kext has been built against the correct headers."
-#define kKxldLogMissingVtable "Cannot find the vtable %s for class %s. This vtable symbol is required for binary compatibility, and it may have been stripped."
-#define kKxldLogParentOutOfDate "The super class vtable %s for vtable %s is out of date. Make sure your kext has been built against the correct headers."
+#define kKxldLogMalformedVTable "The vtable '%s' is malformed. Make sure your kext has been built against the correct headers."
+#define kKxldLogMissingVtable "Cannot find the vtable '%s' for class '%s'. This vtable symbol is required for binary compatibility, and it may have been stripped."
+#define kKxldLogParentOutOfDate "The super class vtable '%s' for vtable '%s' is out of date. Make sure your kext has been built against the correct headers."
#define kKxldLogNoKmodInfo "The kext is missing its kmod_info structure."
#define kKxldLogInvalidSectReloc "Relocation entry %u from section %s,%s cannot be processed."
#define kKxldLogInvalidExtReloc "External relocation entry %u cannot be processed."
#define kKxldLogInvalidIntReloc "Internal relocation entry %u cannot be processed."
-#define kKxldLogRelocationOverflow "A relocation entry has overflowed. The kext may be too far from one " \
- "of its dependencies. Check your kext's load address."
+#define kKxldLogRelocationOverflow "A relocation entry has overflowed. The kext may be too far from one " \
+ "of its dependencies. Check your kext's load address."
/*******************************************************************************
* Allocators
#define DEBUG_ASSERT_COMPONENT_NAME_STRING "kxld"
#include <AssertMacros.h>
+#include "kxld_demangle.h"
#include "kxld_reloc.h"
#include "kxld_sect.h"
#include "kxld_state.h"
const KXLDRelocator *relocator)
{
kern_return_t rval = KERN_FAILURE;
+ char *demangled_name = NULL;
+ size_t demangled_length = 0;
check(vtable);
check(sym);
require_action(kxld_sect_get_num_relocs(sect) == 0, finish,
rval=KERN_FAILURE;
kxld_log(kKxldLogPatching, kKxldLogErr,
- kKxldLogMalformedVTable, vtable->name));
+ kKxldLogMalformedVTable,
+ kxld_demangle(vtable->name, &demangled_name, &demangled_length)));
rval = init_by_entries(vtable, symtab, relocator);
require_noerr(rval, finish);
rval = KERN_SUCCESS;
finish:
-
if (rval) kxld_vtable_deinit(vtable);
+ if (demangled_name) kxld_free(demangled_name, demangled_length);
return rval;
}
const KXLDRelocator *relocator)
{
kern_return_t rval = KERN_FAILURE;
+ char *demangled_name = NULL;
+ size_t demangled_length = 0;
check(vtable);
check(sym);
require_action(kxld_sect_get_num_relocs(sect) > 0, finish,
rval=KERN_FAILURE;
kxld_log(kKxldLogPatching, kKxldLogErr,
- kKxldLogMalformedVTable, vtable->name));
+ kKxldLogMalformedVTable,
+ kxld_demangle(vtable->name, &demangled_name, &demangled_length)));
rval = init_by_relocs(vtable, sym, sect, symtab, relocator);
require_noerr(rval, finish);
rval = KERN_SUCCESS;
finish:
-
if (rval) kxld_vtable_deinit(vtable);
+ if (demangled_name) kxld_free(demangled_name, demangled_length);
return rval;
}
const KXLDRelocator *relocator, const KXLDArray *relocs)
{
kern_return_t rval = KERN_FAILURE;
+ char *demangled_name = NULL;
+ size_t demangled_length = 0;
check(vtable);
check(sym);
require_action(kxld_sect_get_num_relocs(sect) == 0, finish,
rval=KERN_FAILURE;
kxld_log(kKxldLogPatching, kKxldLogErr,
- kKxldLogMalformedVTable, vtable->name));
+ kKxldLogMalformedVTable,
+ kxld_demangle(vtable->name, &demangled_name, &demangled_length)));
rval = init_by_entries_and_relocs(vtable, sym, symtab,
relocator, relocs);
finish:
if (rval) kxld_vtable_deinit(vtable);
+ if (demangled_name) kxld_free(demangled_name, demangled_length);
return rval;
}
kxld_addr_t entry_offset = 0;
u_int nentries = 0;
u_int i = 0;
+ char *demangled_name1 = NULL;
+ size_t demangled_length1 = 0;
check(vtable);
check(sym);
require_action(reloc, finish,
rval=KERN_FAILURE;
kxld_log(kKxldLogPatching, kKxldLogErr,
- kKxldLogMalformedVTable, vtable->name));
+ kKxldLogMalformedVTable,
+ kxld_demangle(vtable->name, &demangled_name1,
+ &demangled_length1)));
tmpsym = kxld_reloc_get_symbol(relocator, reloc,
/* data */ NULL, symtab);
KXLDSym *sym = NULL;
u_int symindex = 0;
u_int i = 0;
+ char *demangled_name1 = NULL;
+ char *demangled_name2 = NULL;
+ char *demangled_name3 = NULL;
+ size_t demangled_length1 = 0;
+ size_t demangled_length2 = 0;
+ size_t demangled_length3 = 0;
check(vtable);
check(super_vtable);
require_action(!vtable->is_patched, finish, rval=KERN_SUCCESS);
require_action(vtable->entries.nitems >= super_vtable->entries.nitems, finish,
rval=KERN_FAILURE;
- kxld_log(kKxldLogPatching, kKxldLogErr,
- kKxldLogMalformedVTable, vtable->name));
+ kxld_log(kKxldLogPatching, kKxldLogErr, kKxldLogMalformedVTable,
+ kxld_demangle(vtable->name, &demangled_name1, &demangled_length1)));
for (i = 0; i < super_vtable->entries.nitems; ++i) {
child_entry = kxld_array_get_item(&vtable->entries, i);
require_action(!kxld_sym_name_is_padslot(parent_entry->patched.name),
finish, rval=KERN_FAILURE;
kxld_log(kKxldLogPatching, kKxldLogErr,
- kKxldLogParentOutOfDate, super_vtable->name, vtable->name));
+ kKxldLogParentOutOfDate,
+ kxld_demangle(super_vtable->name, &demangled_name1,
+ &demangled_length1),
+ kxld_demangle(vtable->name, &demangled_name2,
+ &demangled_length2)));
#if KXLD_USER_OR_STRICT_PATCHING
/* 5) If we are doing strict patching, we prevent kexts from declaring
require_noerr(rval, finish);
kxld_log(kKxldLogPatching, kKxldLogDetail,
- "In vtable %s, patching %s with %s.",
- vtable->name, child_entry->unpatched.sym->name, sym->name);
+ "In vtable '%s', patching '%s' with '%s'.",
+ kxld_demangle(vtable->name, &demangled_name1, &demangled_length1),
+ kxld_demangle(child_entry->unpatched.sym->name,
+ &demangled_name2, &demangled_length2),
+ kxld_demangle(sym->name, &demangled_name3, &demangled_length3));
kxld_sym_patch(child_entry->unpatched.sym);
child_entry->unpatched.sym = sym;
rval = KERN_SUCCESS;
finish:
+ if (demangled_name1) kxld_free(demangled_name1, demangled_length1);
+ if (demangled_name2) kxld_free(demangled_name2, demangled_length2);
+ if (demangled_name3) kxld_free(demangled_name3, demangled_length3);
+
return rval;
}
* reading and updating of values.
*/
-#if defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__) || defined(__x86_64__) || defined(__arm__)
/*!
* @function OSCompareAndSwap64
static void sendAllKextPersonalitiesToCatalog(
bool startMatching = false);
- virtual void sendPersonalitiesToCatalog(
+ virtual OSReturn sendPersonalitiesToCatalog(
bool startMatching = false,
OSArray * personalityNames = NULL);
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
+#include <stdint.h> // For uintptr_t.
#include <string.h>
#include <libkern/mkext.h>
+
#define BASE 65521L /* largest prime smaller than 65536 */
-#define NMAX 5000
-// NMAX (was 5521) the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
+#define NMAX 5552 // the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
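+// Worked check of that bound (illustrative only): for n = 5552,
+// 255*n*(n+1)/2 + (n+1)*(BASE-1) = 3,930,857,640 + 363,832,560 = 4,294,690,200 <= 2^32-1,
+// while n = 5553 gives 4,296,171,735, which already overflows 32 bits.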
#define DO1(buf,i) {s1 += buf[i]; s2 += s1;}
#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1);
unsigned long s2 = 0; // (adler >> 16) & 0xffff;
int k;
+#if defined _ARM_ARCH_6
+
+ /* align buf to 16-byte boundary */
+ while ((((uintptr_t)buf)&15)&&(len>0)) { /* not on a 16-byte boundary */
+ len--;
+ s1 += *buf++;
+ s2 += s1;
+ if (s1 >= BASE) s1 -= BASE;
+ }
+ s2 %= BASE;
+
+ if (len>=16) {
+ return adler32_vec(s1, s2, buf, len);
+ }
+
+#endif
+
while (len > 0) {
k = len < NMAX ? len : NMAX;
len -= k;
/* @(#) $Id$ */
+#include <stdint.h> // For uintptr_t.
+
+
#define ZLIB_INTERNAL
#if KERNEL
#include <libkern/zlib.h>
#include "zlib.h"
#endif /* KERNEL */
+#if defined _ARM_ARCH_6
+ extern uLong adler32_vec(uLong adler, uLong sum2, const Bytef *buf, uInt len);
+#endif
+
#define BASE 65521UL /* largest prime smaller than 65536 */
#define NMAX 5552
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
uInt len;
{
unsigned long sum2;
+#if !defined _ARM_ARCH_6
unsigned n;
+#endif
/* split Adler-32 into component sums */
sum2 = (adler >> 16) & 0xffff;
return adler | (sum2 << 16);
}
+#if defined _ARM_ARCH_6
+ /* align buf to 16-byte boundary */
+ while (((uintptr_t)buf)&15) { /* not on a 16-byte boundary */
+ len--;
+ adler += *buf++;
+ sum2 += adler;
+ if (adler >= BASE) adler -= BASE;
+ MOD4(sum2); /* only added so many BASE's */
+ }
+
+ return adler32_vec(adler, sum2, buf, len); // armv7 neon vectorized implementation
+
+#else // _ARM_ARCH_6
+
/* do length NMAX blocks -- requires just one modulo operation */
while (len >= NMAX) {
len -= NMAX;
/* return recombined sums */
return adler | (sum2 << 16);
+
+#endif // _ARM_ARCH_6
}
/* ========================================================================= */
--- /dev/null
+#include <arm/arch.h>
+
+#define BASE 65521 /* largest prime smaller than 65536 */
+#define NMAX 5552 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
+
+// Note: buf should have been 16-byte aligned in the caller function.
+
+// uLong adler32_vec(unsigned int adler, unsigned int sum2, const Bytef* buf, int len) {
+// unsigned n;
+// while (len >= NMAX) {
+// len -= NMAX;
+// n = NMAX / 16; /* NMAX is divisible by 16 */
+// do {
+// DO16(buf); /* 16 sums unrolled */
+// buf += 16;
+// } while (--n);
+// MOD(adler);
+// MOD(sum2);
+// }
+// if (len) { /* avoid modulos if none remaining */
+// while (len >= 16) {
+// len -= 16;
+// DO16(buf);
+// buf += 16;
+// }
+// while (len--) {
+// adler += *buf++;
+// sum2 += adler;
+// }
+// MOD(adler);
+// MOD(sum2);
+// }
+// return adler | (sum2 << 16); /* return recombined sums */
+// }
+
+
+/*
+ DO16 vectorization:
+ given initial unsigned int sum2 and adler, and a new set of 16 input bytes (x[0:15]), it can be shown that
+ sum2 += (16*adler + 16*x[0] + 15*x[1] + ... + 1*x[15]);
+ adler += (x[0] + x[1] + ... + x[15]);
+
+ therefore, this is what can be done to vectorize the above computation
+ 1. 16-byte aligned vector load into q2 (x[0:x15])
+ 2. sum2 += (adler<<4);
+ 3. vmull.u8 (q9,q8),q2,d2 where d2 = (1,1,1,1...,1), (q9,q8) : 16 16-bit elements x[0:15]
+ 4. vmull.u8 (q11,q10),q2,q0 where q0 = (1,2,3,4...,16), (q11,q10) : 16 16-bit elements (16:1)*x[0:15]
+ 5. parallel add (with once expansion to 32-bit) (q9,q8) and (q11,q10) all the way to accumulate to adler and sum2
+
+ In this revision, whenever possible, 2 DO16 loops are combined into a DO32 loop.
+ 1. 32-byte aligned vector load into q2,q14 (x[0:x31])
+ 2. sum2 += (adler<<5);
+ 3. vmull.u8 (4 q registers),(q2,q14),d2 where d2 = (1,1,1,1...,1), (4 q registers) : 32 16-bit elements x[0:31]
+ 4. vmull.u8 (4 q registers),(q2,q14),(q0,q15) where q0 = (1,...,32), (4 q regs) : 32 16-bit elements (32:1)*x[0:31]
+ 5. parallel add (with once expansion to 32-bit) the pair of (4 q regs) all the way to accumulate to adler and sum2
+
+ This change improves the performance by ~0.55 cycles per uncompressed byte on ARM Cortex-A8.
+
+*/
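+
+/*
+ Illustrative scalar check of the DO16 identity above (a sketch, not part of the build);
+ x[0:15] are the 16 new input bytes, and the sums are assumed to stay below 2^32,
+ which the NMAX bound guarantees:
+
+ unsigned s = 0, w = 0, i;
+ for (i = 0; i < 16; i++) { s += x[i]; w += (16 - i) * x[i]; } // w = 16*x[0] + 15*x[1] + ... + 1*x[15]
+
+ applying the byte-at-a-time update (adler += x[i]; sum2 += adler;) 16 times yields exactly
+ adler_new = adler + s;
+ sum2_new = sum2 + 16*adler + w;
+*/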
+
+/*
+ MOD implementation:
+ adler%BASE = adler - floor(adler*(1/BASE))*BASE; where (1/BASE) = 0x80078071 in Q47
+ 1. vmull.u32 q2,(adler,sum2),(1/BASE) // *(1/BASE) in Q47
+ 2. vshr.u64 q2,q2,#47 // floor function
+ 3. vpadd.u32 d4,d4,d5 // merge into a double word in d4
+ 4. vmls.u32 (adler,sum2),d4,d3[0] // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE
+
+*/
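+
+/*
+ Illustrative C equivalent of the MOD steps above (a sketch, not part of the build);
+ 0x80078071 = ceil(2^47/BASE), and floor((x*0x80078071)>>47) == x/BASE holds for every
+ 32-bit x, which covers the adler/sum2 values seen here:
+
+ static unsigned int mod_base(unsigned int x)
+ {
+     unsigned int q = (unsigned int)(((unsigned long long)x * 0x80078071ULL) >> 47); // floor(x/BASE)
+     return x - q * 65521U; // x % BASE
+ }
+*/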
+
+#if defined _ARM_ARCH_6 // this file would be used only for armv6 or above
+
+
+ .text
+ .align 2
+ .globl _adler32_vec
+_adler32_vec:
+
+#if (!KERNEL_SUPPORT_NEON) || (!defined _ARM_ARCH_7) // for armv6 or armv7 without neon support
+
+
+ #define adler r0
+ #define sum2 r1
+ #define buf r2
+ #define len r3
+ #define one_by_base r4
+ #define base r5
+ #define nmax r6
+ #define t r12
+ #define vecs lr
+ #define x0 r8
+ #define x1 r10
+ #define x2 r11
+ #define x3 r12
+ #define zero r9
+
+ // this macro performs adler/sum2 update for 4 input bytes
+
+ .macro DO4
+ add sum2, adler, lsl #2 // sum2 += 4*adler;
+ ldr x0,[buf] // 4 bytes in 1 32-bit word
+ usada8 adler, x0, zero, adler // adler += sum(x0:x3)
+ ldrb x0,[buf], #4 // x0
+ ldrb x2,[buf,#-2] // x2
+ ldrb x1,[buf,#-3] // x1
+ ldrb x3,[buf,#-1] // x3
+ add sum2, x0, lsl #2 // sum2 += 4*x0
+ add x3, x3, x1, lsl #1 // x3+2*x1
+ add sum2, x2, lsl #1 // sum2 += 2*x2
+ add x3, x1 // x3+3*x1
+ add sum2, x3 // sum2 += x3+3*x1
+ .endm
+
+ // the following macro cascades 4 DO4 into an adler/sum2 update for 16 bytes
+ .macro DO16
+ DO4 // adler/sum2 update for 4 input bytes
+ DO4 // adler/sum2 update for 4 input bytes
+ DO4 // adler/sum2 update for 4 input bytes
+ DO4 // adler/sum2 update for 4 input bytes
+ .endm
+
+ // the following macro performs adler sum2 modulo BASE
+ .macro modulo_base
+ umull x0,x1,adler,one_by_base // adler/BASE in Q47
+ umull x2,x3,sum2,one_by_base // sum2/BASE in Q47
+ lsr x1, #15 // x1 >> 15 = floor(adler/BASE)
+ lsr x3, #15 // x3 >> 15 = floor(sum2/BASE)
+ mla adler, x1, base, adler // adler %= base;
+ mla sum2, x3, base, sum2 // sum2 %= base;
+ .endm
+
+ adr t, coeffs
+ push {r4-r6, r8-r11, lr}
+ ldmia t, {one_by_base, base, nmax} // load up coefficients
+
+ subs len, nmax // pre-subtract len by NMAX
+ eor zero, zero // a dummy zero register to use usada8 instruction
+ blt len_lessthan_NMAX // if (len < NMAX) skip the while loop
+
+while_lengenmax_loop: // do {
+ lsr vecs, nmax, #4 // vecs = NMAX/16;
+
+len16_loop: // do {
+
+ DO16
+
+ subs vecs, #1 // vecs--;
+ bgt len16_loop // } while (vec>0);
+
+ modulo_base // adler sum2 modulo BASE
+
+ subs len, nmax // len -= NMAX
+ bge while_lengenmax_loop // } while (len >= NMAX);
+
+len_lessthan_NMAX:
+ adds len, nmax // post-subtract len by NMAX
+
+ subs len, #16 // pre-decrement len by 16
+ blt len_lessthan_16
+
+len16_loop2:
+
+ DO16
+
+ subs len, #16
+ bge len16_loop2
+
+len_lessthan_16:
+ adds len, #16 // post-increment len by 16
+ beq len_is_zero
+
+remaining_buf:
+ ldrb x0, [buf], #1
+ subs len, #1
+ add adler, x0
+ add sum2, adler
+ bgt remaining_buf
+
+len_is_zero:
+
+ modulo_base // adler sum2 modulo BASE
+
+ add r0, adler, sum2, lsl #16 // to return sum2<<16 | adler
+
+ pop {r4-r6, r8-r11, pc}
+
+ .align 2
+coeffs:
+ .long -2146992015
+ .long -BASE
+ .long NMAX
+
+#else // KERNEL_SUPPORT_NEON
+
+
+
+ #define adler r0
+ #define sum2 r1
+ #define buf r2
+ #define len r3
+ #define nmax r4
+ #define vecs lr // vecs = NMAX/16
+ #define n r5
+
+ #define t r12
+
+ #define sum2_coeff q0
+ #define sum2_coeff0 d0
+ #define sum2_coeff1 d1
+ #define alder_coeff q1
+ #define ones d2
+ #define x0_x15 q2
+ #define x0_x7 d4
+ #define x8_x15 d5
+ #define adlersum2 d6
+ #define adler16 d25
+
+#if defined _ARM_ARCH_7
+
+ adr t, vec_table // address to vec_table[]
+ stmfd sp!, {r4, r5, lr}
+
+ vld1.32 {q0-q1},[t,:128]! // loading up coefficients for adler/sum2 computation
+ vld1.32 {q15},[t,:128]! // for sum2 computation
+ ldr nmax, [t] // NMAX
+
+ vmov adlersum2, sum2, adler // pack up adler/sum2 into a double register
+
+ cmp len, nmax // len vs NMAX
+ lsr vecs, nmax, #4 // vecs = NMAX/16;
+ blt len_lessthan_NMAX // if (len < NMAX) skip the while loop
+
+ sub len, nmax // pre-decrement len by NMAX
+
+while_len_ge_NMAX_loop: // while (len>=NMAX) {
+
+ mov n, vecs, lsr #1 // n = NMAX/16;
+
+do_loop: // do {
+
+ vshll.u32 q12, adlersum2, #5 // d25 = (0,32*adler) to be added into (adler,sum2)
+ vld1.32 {x0_x15},[buf,:128]! // 16-byte input x0:x15
+ vmull.u8 q8, x0_x7, ones // 16-bit x0-x7
+ vld1.32 {q14}, [buf,:128]! // x16:x31
+ vmull.u8 q9, x8_x15, ones // 16-bit x8-x15
+ vadd.u32 adlersum2,adler16 // sum2 += old adler*32;
+ vmull.u8 q12, d28, ones // 16-bit x16-x23
+ vmull.u8 q13, d29, ones // 16-bit x24-x31
+ vmull.u8 q10, d28, sum2_coeff0 // 16-bit x16*16, x17*15, ..., x23*9
+ vmull.u8 q11, d29, sum2_coeff1 // 16-bit x24*8, x25*7, ..., x31*1
+ vadd.u16 q8, q8, q9 // q8 = (x0+x8):(x7+x15) 8 16-bit elements for adler
+ vmull.u8 q9, x0_x7, d30 // 16-bit x0*32,...,x7*25
+ vmull.u8 q14, x8_x15, d31 // 16-bit x8*24,...,x15*17
+ vadd.u16 q12, q12, q13 // q12 = (x16+x24):(x23+x31) 8 16-bit elements for adler
+ vadd.u16 q10, q11 // 8 16-bit elements for sum2
+ vadd.u16 q8, q12 // 8 16-bit elements for adler
+ vadd.u16 q9, q14 // 8 16-bit elements for sum2
+ vadd.u16 q10, q9 // 8 16-bit elements for sum2
+ vpaddl.u16 q8, q8 // 4 32-bit elements for adler
+ vpaddl.u16 q10, q10 // 4 32-bit elements for sum2
+ vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler
+ vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2
+ subs n, #1 // --n
+ vpadd.u32 d4,d17,d16 // s8 : 32-bit elements for sum2, s9 : 32-bit element for adler
+ vadd.u32 adlersum2,d4 // update adler/sum2 with the new 16 bytes input
+
+ bgt do_loop // } while (--n);
+
+ vshll.u32 q12, adlersum2, #4 // d25 = (0,16*adler) to be added into (adler,sum2)
+
+ vld1.32 {x0_x15},[buf,:128]! // 16-byte input
+
+ vmull.u8 q8, x0_x7, ones // 16-bit x0-x7
+ vmull.u8 q9, x8_x15, ones // 16-bit x8-x15
+ vmull.u8 q10, x0_x7, sum2_coeff0 // 16-bit x0*16, x1*15, ..., x7*9
+ vmull.u8 q11, x8_x15, sum2_coeff1 // 16-bit x8*8, x9*7, ..., x15*1
+
+ vadd.u16 q8, q8, q9 // 8 16-bit elements for adler
+ vadd.u16 q10, q10, q11 // 8 16-bit elements for sum2
+ vpaddl.u16 q8, q8 // 4 32-bit elements for adler
+ vpaddl.u16 q10, q10 // 4 32-bit elements for sum2
+ vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler
+ vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2
+ vadd.u32 adlersum2,adler16 // sum2 += old adler;
+ vpadd.u32 d4,d17,d16 // s8 : 32-bit elements for sum2, s9 : 32-bit element for adler
+ vadd.u32 adlersum2,d4 // update adler/sum2 with the new 16 bytes input
+
+ // mod(adler,BASE); mod(sum2,BASE);
+ vmull.u32 q2,adlersum2,d3[1] // adler/BASE, sum2/BASE in Q47
+ vshr.u64 q2,q2,#47 // take the integer part
+ vpadd.u32 d4,d4,d5 // merge into a double word in d4
+ vmls.u32 adlersum2,d4,d3[0] // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE
+
+ subs len, nmax // len -= NMAX;
+ bge while_len_ge_NMAX_loop // repeat while len >= NMAX
+
+ add len, nmax // post-increment len by NMAX
+
+len_lessthan_NMAX:
+
+ cmp len, #0
+ beq len_is_zero // if len==0, branch to skip the following
+
+
+ subs len, #32 // pre-decrement len by 32
+ blt len_lessthan_32 // if len < 32, branch to len16_loop
+
+len32_loop:
+
+ vshll.u32 q12, adlersum2, #5 // d25 = (0,32*adler) to be added into (adler,sum2)
+ vld1.32 {x0_x15},[buf,:128]! // 16-byte input x0:x15
+ vmull.u8 q8, x0_x7, ones // 16-bit x0-x7
+ vld1.32 {q14}, [buf,:128]! // x16:x31
+ vmull.u8 q9, x8_x15, ones // 16-bit x8-x15
+ vadd.u32 adlersum2,adler16 // sum2 += old adler*32;
+ vmull.u8 q12, d28, ones // 16-bit x16-x23
+ vmull.u8 q13, d29, ones // 16-bit x24-x31
+ vmull.u8 q10, d28, sum2_coeff0 // 16-bit x16*16, x17*15, ..., x23*9
+ vmull.u8 q11, d29, sum2_coeff1 // 16-bit x24*8, x25*7, ..., x31*1
+ vadd.u16 q8, q8, q9 // q8 = (x0+x8):(x7+x15) 8 16-bit elements for adler
+ vmull.u8 q9, x0_x7, d30 // 16-bit x0*32,...,x7*25
+ vmull.u8 q14, x8_x15, d31 // 16-bit x8*24,...,x15*17
+ vadd.u16 q12, q12, q13 // q12 = (x16+x24):(x23+x31) 8 16-bit elements for adler
+ vadd.u16 q10, q11 // 8 16-bit elements for sum2
+ vadd.u16 q8, q12 // 8 16-bit elements for adler
+ vadd.u16 q9, q14 // 8 16-bit elements for sum2
+ vadd.u16 q10, q9 // 8 16-bit elements for sum2
+ vpaddl.u16 q8, q8 // 4 32-bit elements for adler
+ vpaddl.u16 q10, q10 // 4 32-bit elements for sum2
+ vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler
+ vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2
+ subs len, #32 // len -= 32;
+ vpadd.u32 d4,d17,d16 // s8 : 32-bit elements for sum2, s9 : 32-bit element for adler
+ vadd.u32 adlersum2,d4 // update adler/sum2 with the new 16 bytes input
+
+ bge len32_loop
+
+len_lessthan_32:
+
+ adds len, #(32-16) // post-increment len by 32, then pre-decrement by 16
+ blt len_lessthan_16 // if len < 16, branch to len_lessthan_16
+
+ vshll.u32 q12, adlersum2, #4 // d25 = (0,16*adler) to be added into (adler,sum2)
+
+ vld1.32 {x0_x15},[buf,:128]! // 16-byte input
+
+
+ vmull.u8 q8, x0_x7, ones // 16-bit x0-x7
+ vmull.u8 q9, x8_x15, ones // 16-bit x8-x15
+ vmull.u8 q10, x0_x7, sum2_coeff0 // 16-bit x0*16, x1*15, ..., x7*9
+ vmull.u8 q11, x8_x15, sum2_coeff1 // 16-bit x8*8, x9*7, ..., x15*1
+
+ vadd.u16 q8, q8, q9 // 8 16-bit elements for adler
+ vadd.u16 q10, q10, q11 // 8 16-bit elements for sum2
+ vpaddl.u16 q8, q8 // 4 32-bit elements for adler
+ vpaddl.u16 q10, q10 // 4 32-bit elements for sum2
+ vpadd.u32 d16,d16,d17 // 2 32-bit elements for adler
+ vpadd.u32 d17,d20,d21 // 2 32-bit elements for sum2
+ subs len, #16 // decrement len by 16
+ vadd.u32 adlersum2,adler16 // sum2 += old adler;
+ vpadd.u32 d4,d17,d16 // s8 : 32-bit elements for sum2, s9 : 32-bit element for adler
+ vadd.u32 adlersum2,d4 // update adler/sum2 with the new 16 bytes input
+
+len_lessthan_16:
+ adds len, #16 // post-increment len by 16
+ beq len_is_zero_internal // if len==0, branch to len_is_zero_internal
+
+ // restore adler/sum2 into general registers for remaining (<16) bytes
+
+ vmov sum2, adler, adlersum2
+remaining_len_loop:
+ ldrb t, [buf], #1 // *buf++;
+ subs len, #1 // len--;
+ add adler,t // adler += *buf
+ add sum2,adler // sum2 += adler
+ bgt remaining_len_loop // break if len<=0
+
+ vmov adlersum2, sum2, adler // move to double register for modulo operation
+
+len_is_zero_internal:
+
+ // mod(adler,BASE); mod(sum2,BASE);
+
+ vmull.u32 q2,adlersum2,d3[1] // adler/BASE, sum2/BASE in Q47
+ vshr.u64 q2,q2,#47 // take the integer part
+ vpadd.u32 d4,d4,d5 // merge into a double word in d4
+ vmls.u32 adlersum2,d4,d3[0] // (adler,sum2) -= floor[(adler,sum2)/BASE]*BASE
+
+len_is_zero:
+
+ vmov sum2, adler, adlersum2 // restore adler/sum2 from (s12=sum2, s13=adler)
+ add r0, adler, sum2, lsl #16 // to return adler | (sum2 << 16);
+ ldmfd sp!, {r4, r5, pc} // restore registers and return
+
+
+ // constants to be loaded into q registers
+ .align 4 // 16 byte aligned
+
+vec_table:
+
+ // coefficients for computing sum2
+ .long 0x0d0e0f10 // s0
+ .long 0x090a0b0c // s1
+ .long 0x05060708 // s2
+ .long 0x01020304 // s3
+
+ // coefficients for computing adler
+ .long 0x01010101 // s4/d2
+ .long 0x01010101 // s5
+
+ .long BASE // s6 : BASE
+ .long 0x80078071 // s7 : 1/BASE in Q47
+
+ // q15 : d30.d31
+ .long 0x1d1e1f20 // s0
+ .long 0x191a1b1c // s1
+ .long 0x15161718 // s2
+ .long 0x11121314 // s3
+
+NMAX_loc:
+ .long NMAX // NMAX
+
+#endif // _ARM_ARCH_7
+
+#endif // (!KERNEL_SUPPORT_NEON) || (!defined _ARM_ARCH_7)
+
+#endif // _ARM_ARCH_6
+
--- /dev/null
+#include <arm/arch.h>
+
+// the following assembly code is hard-wired for the case where POSTINC is not defined.
+
+#if 0 // #ifdef POSTINC
+# define OFF 0
+# define PUP(a) *(a)++
+#else
+# define OFF 1
+# define PUP(a) *++(a)
+#endif
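+
+// Illustrative note (a sketch, not part of the original comments): with OFF defined as 1,
+// the C code keeps its pointers one byte behind the next datum, so PUP(in) expands to
+// *++(in); the pre-indexed loads with writeback used below, e.g.
+// ldrhib r3, [in, #1]! // r3 = *++in when the condition holds, i.e. PUP(in)
+// are the assembly counterpart of that macro (and likewise strb r3, [out, #1]! for PUP(out)).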
+
+// the code uses r9, therefore, it does not meet the register protocol for armv5 and below
+// the code can only be used for armv6 and above
+
+#if defined _ARM_ARCH_6
+
+ .cstring
+ .align 2
+LC0:
+ .ascii "invalid distance too far back\0"
+ .align 2
+LC1:
+ .ascii "invalid distance code\0"
+ .align 2
+LC2:
+ .ascii "invalid literal/length code\0"
+
+ // renaming the register and stack memory use
+
+ #define out r0
+ #define strm r10
+ #define state r5
+ #define in r11
+ #define write r9
+ #define distcode r8
+ #define bits lr
+ #define hold r4
+
+ // stack memory allocation
+
+ #define window_loc [sp,#0]
+ #define last_loc [sp,#4]
+ #define beg_loc [sp,#8]
+ #define end_loc [sp,#12]
+ #define wsize_loc [sp,#16]
+ #define whave_loc [sp,#20]
+ #define windowm1_loc [sp,#28]
+ #define lmask_loc [sp,#32]
+ #define dmask_loc [sp,#36]
+ #define dist_loc [sp,#48]
+
+ #define local_size 52
+
+ // the following defines the variable offset in the inflate_state structure (in inflate.h)
+
+ #define state_mode [state, #0]
+ #define state_last [state, #4]
+ #define state_wrap [state, #8]
+ #define state_havedict [state, #12]
+ #define state_flags [state, #16]
+ #define state_dmax [state, #20]
+ #define state_wbits [state, #36]
+ #define state_wsize [state, #40]
+ #define state_whave [state, #44]
+ #define state_write [state, #48]
+ #define state_window [state, #52]
+ #define state_hold [state, #56]
+ #define state_bits [state, #60]
+ #define state_lencode [state, #76]
+ #define state_distcode [state, #80]
+ #define state_lenbits [state, #84]
+ #define state_distbits [state, #88]
+
+
+// void inflate_fast(z_streamp strm, unsigned start)
+// input :
+// r0 = strm, (move to r10)
+// r1 = start
+
+ .text
+ .align 2
+ .globl _inflate_fast
+_inflate_fast:
+
+ stmfd sp!, {r4-r6,r8-r11,lr}
+ sub sp, sp, #local_size
+
+#if defined(_ARM_ARCH_5)
+ ldrd r2,r3,[r0, #0] // r2 = strm->next_in, r3 = strm->avail_in
+#else
+ ldmia r0, {r2-r3}
+#endif
+
+ sub in, r2, #OFF // in = strm->next_in - OFF;
+ sub r2, #(OFF+5) // next_in -= (OFF+5);
+ ldr state, [r0, #28] // state = (struct inflate_state FAR *)strm->state;
+ add r3, r3, r2 // last = next_in - OFF + (avail_in - 5); next_in already updated
+ mov strm, r0
+ str r3, last_loc // store last to release r3
+
+ ldr r3, [r0, #12] // next_out
+ ldr r2, [strm, #16] // avail_out
+
+ sub out, r3, #OFF // out = strm->next_out - OFF; r0 is used as out from this point on
+
+ sub r3, r3, #256 // next_out - 256
+ rsb r1, r2, r1 // start - avail_out
+ sub r3, r3, #(1+OFF) // next_out-OFF-257
+ add r3, r3, r2 // r3 = end = avail_out + (next_out-OFF) - 257 = avail_out + out - 257
+ rsb r2, r1, out // r2 = beg = out - (start - avail_out);
+#if defined(_ARM_ARCH_5)
+ strd r2,r3, beg_loc // store beg/end
+ ldrd r2,r3, state_wsize // wsize/whave
+ strd r2,r3, wsize_loc // store wsize/whave
+ //ldrd r6,hold, state_window // window/hold, hold use r7
+ ldr r6, state_window // state->window
+ ldr hold, state_hold // state->hold
+ nop
+#else
+ // for architecture < armv5, ldrd/strd is not available
+ str r2, beg_loc // store beg
+ str r3, end_loc // store end
+ ldr r2, state_wsize // state->wsize
+ ldr r3, state_whave // state->whave
+ str r2, wsize_loc // store wsize
+ str r3, whave_loc // store whave
+ ldr r6, state_window // state->window
+ ldr hold, state_hold // state->hold
+#endif
+
+ ldr ip, state_lencode // lencode
+ mov r3, #1 // used to derive lmask and dmask
+ ldr write, state_write // write (r9 from this point on) : window write index
+ nop
+ str ip, [sp, #40] // save lencode
+ sub ip, r6, #1 // window-1
+ str r6, window_loc // store window
+ str ip, windowm1_loc // store window-1
+ ldr r2, state_lenbits // lenbits
+ ldr bits, state_bits // bits, use lr from this point on
+ ldr distcode, state_distcode// distcode, use r8
+ mov r2, r3, asl r2 // (1<<lensbits)
+ ldr r12, state_distbits // distbits
+ sub r2, r2, #1 // lmask = (1U << state->lenbits) - 1;
+ mov r3, r3, asl r12 // (1U << state->distbits)
+ sub r3, r3, #1 // dmask = (1U << state->distbits) - 1;
+
+#if defined(_ARM_ARCH_5)
+ strd r2, r3, lmask_loc // store lmask/dmask
+#else
+ str r2, lmask_loc // lmask
+ str r3, dmask_loc // dmask
+#endif
+
+ // start the do loop decoding literals and length/distances
+ // until end-of-block or not enough input data or output space
+
+do_loop:
+ cmp bits, #15 // bits vs 15
+ ldr r1, lmask_loc // lmask
+ bge bitsge15 // if bits >= 15, skip loading new 16 bits
+
+ // this is a shortcut that relies on the processor reading data in little-endian mode
+ ldrh r3, [in,#1] // read 2 bytes
+ add in, #2 // in pointer += 2
+ add hold, hold, r3, asl bits // deposit the new 2 bytes into hold
+ add bits, #16 // bits count += 16
+
+bitsge15:
+ ldr ip, [sp, #40] // restore lencode
+ and r3, hold, r1 // r3 = hold & lmask
+ b dolen
+
+op_not_zero:
+
+ tst r2, #16 // if (op&16)
+ bne length_base // branch to length_base
+
+ tst r2, #64 // else if (op&64)
+ bne end_of_block // branch to end_of_block processing
+
+ // 2nd-level length code, this is the part where if ((op & 64) == 0) { ... }
+
+ // this.val + (hold & ((1U << op) - 1));
+ // r3 = r1 + hold & ((1<<r2)-1);
+
+ rsb r12, r2, #32 // r12 = (32-op)
+ ror r3, hold, r2 // rotate the op least significant bits of hold to MSB
+ add r3, r1, r3, lsr r12 // r3 = r1 + (op LSBs in hold) = r1 + hold & ((1<<r2)-1);
+
+ ldr ip, [sp, #40] // restore lencode
+
+dolen:
+
+ // code -> 8-bit code, 8-bit bits, 16-bit val
+ ldrb r2, [ip,r3,asl #2] // op = (unsigned)(this.bits);
+ add r3, ip, r3, asl #2 // r3 = this
+ ldrb ip, [r3, #1] // ip = this.bits
+ ldrh r1, [r3, #2] // r1 = this.value
+ cmp r2, #0 // op == 0 ?
+
+ mov hold, hold, lsr ip // hold >>= this.bits
+ rsb bits, ip, bits // bits -= this.bits
+ bne op_not_zero // branch to op_not_zero if this.op != 0
+
+ strb r1, [out, #1]! // PUP(out) = (unsigned char)(this.val);
+
+do_loop_while:
+ ldr r1, last_loc // last
+ ldr r2, end_loc // end
+ cmp in, r1 // compare in vs last
+ cmpcc out, r2 // if in < last, compare out vs end
+ bcc do_loop // if (in < last && out < end) go back to do_loop
+
+update_state_and_return:
+
+ sub r2, in, bits, lsr #3 // r2 = in - (bits>>3)
+
+ add r3, r2, #OFF // r3 = (in - (bits>>3)) + OFF
+ str r3, [strm, #0] // strm->next_in = in + OFF;
+
+ add r3, out, #OFF // out + OFF
+ str r3, [strm, #12] // strm->next_out = out + OFF;
+
+ ldr r3, last_loc // r3 = last
+ ldr ip, end_loc // ip = end
+
+ cmp r3, r2 // compare last vs in
+ addhi r3, r3, #5 // if last > in, last +=5
+ movls r6, r3 // o.w., r6 = last
+ rsbls r3, r6, r2 // r3 = in-last
+ rsbhi r3, r2, r3 // r3 = (last+5) - in
+ rsbls r3, r3, #5 // r3 = 5 - (in-last);
+ cmp out, ip // compare out vs end
+ str r3, [strm, #4] // strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last));
+ movcs r2, ip // if out<end, r2=end
+ addcc r3, ip, #256 // if out>=end, r3 = end+256
+ rsbcs r3, r2, out // if out<end, r3 = out-end
+ addcc r3, r3, #1 // if out>=end, r3 = end+257
+ rsbcs r3, r3, #256 // if out<end, r3 = 256-(out-end) = 256 + (end-out)
+ and bits, #7 // this is equivalent to bits -= (bits>>3) << 3;
+ rsbcc r3, out, r3 // if out<end, r3 = 257+end-out
+ addcs r3, r3, #1 // if out>=end, r3 = 257 + (end-out)
+ str r3, [strm, #16] // strm->avail_out = (unsigned)(out < end ? 257 + (end - out) : 257 - (out - end));
+
+ // hold &= (1U << bits) - 1;
+
+ rsb ip, bits, #32 // 32-bits
+ ror hold, hold, bits // this is equivalent to hold<<(32-bits)
+ lsr hold, hold, ip // logical shift right by (32-bits), hold now only keeps the bits LSBs
+
+ str bits, state_bits // state->bits = bits;
+ str hold, state_hold // state->hold = hold;
+
+ add sp, #local_size // pop out stack memory
+ ldmfd sp!,{r4-r6,r8-r11,pc} // restore registers and return
+
+length_base: // r2=op, r1=lmask
+ ands r2, r2, #15 // op&=15;
+ mov r6, r1 // len = (unsigned) this.val;
+ beq op_is_zero // if op==0, branch to op_is_zero
+ cmp r2, bits // op vs bits
+ ldrhib r3, [in, #1]! // if (op>bits) r3 = (PUP(in));
+ addhi hold, hold, r3, asl bits // if (op>bits) hold += (unsigned long)(PUP(in)) << bits;
+
+ rsb ip, r2, #32 // 32-op
+ ror r3, hold, r2 // (hold<<(32-op))
+ add r6, r1, r3, lsr ip // len += (unsigned)hold & ((1U << op) - 1);
+
+ addhi bits, bits, #8 // if (op>bits) bits += 8;
+
+ mov hold, hold, lsr r2 // hold >>= op;
+ rsb bits, r2, bits // bits -= op;
+
+op_is_zero:
+ cmp bits, #14
+ ldrh r3,[in,#1] // if (bits < 15) { read 2 bytes at once (2x PUP(in)); the load itself is unconditional for better performance
+ addls in, #2 // in+=2;
+ addls hold, hold, r3, asl bits // twice hold += (unsigned long)(PUP(in)) << bits;
+ addls bits, #16 // 2 bits += 8; }
+
+dodist:
+
+ ldr r2, dmask_loc // r2 = dmask
+ and r3, hold, r2 // r3 = hold & dmask
+ mov r2, r3, asl #2
+ add r3, r2, distcode // &dcode[hold&dmask];
+ ldrb ip, [r2, distcode] // op
+ ldrh r1, [r3, #2] // dist = (unsigned)(this.val);
+ tst ip, #16 // op vs 16
+ ldrb r3, [r3, #1] // this.bits
+ mov hold, hold, lsr r3 // hold >>= this.bits;
+ rsb bits, r3, bits // bits -= this.bits;
+ bne distance_base // if (op&16) { distance base processing }
+ tst ip, #64 //
+ beq second_distance_code // else if ((op&64)==0) branch to 2nd level distance code
+
+ b invalide_distance_code
+
+check_2nd_level_distance_code:
+
+ tst r2, #64 // check for esle if ((op & 64) == 0) for 2nd level distance code
+ bne invalide_distance_code
+
+second_distance_code:
+
+ rsb r2, ip, #32 // 32-op
+ ror r3, hold, ip // hold<<(32-op)
+ add r3, r1, r3, lsr r2 // this.val + (hold & ((1U << op) - 1))
+
+ mov r2, r3, asl #2
+ add r3, r2, distcode // this = dcode[this.val + (hold & ((1U << op) - 1))];
+ ldrb r2, [r2, distcode] // this.op
+ ldrh r1, [r3, #2] // this.val
+
+ tst r2, #16 // op&16
+ ldrb r3, [r3, #1] // this.bits
+ mov ip, r2 // op
+ mov hold, hold, lsr r3 // hold >> = this.bits
+ rsb bits, r3, bits // bits -= this.bits
+ beq check_2nd_level_distance_code
+
+distance_base: // this is invoked from if ((op&16)!=0)
+
+ and r2, ip, #15 // op &= 15;
+ cmp r2, bits // op vs bits
+ ldrhib r3, [in, #1]! // if (op > bits) (PUP(in))
+ addhi hold, hold, r3, asl bits // hold += (unsigned long)(PUP(in)) << bits;
+ addhi bits, bits, #8 // bits += 8;
+ cmphi r2, bits // internal (bits < op)
+ ldrhib r3, [in, #1]! // if (op > bits) (PUP(in))
+ addhi hold, hold, r3, asl bits // hold += (unsigned long)(PUP(in)) << bits;
+
+ rsb ip, r2, #32 // (32-op)
+ ror r3, hold, r2 // hold<<(32-op)
+ add r3, r1, r3, lsr ip // dist += (unsigned)hold & ((1U << op) - 1);
+
+ ldr ip, beg_loc // beg
+
+#ifdef INFLATE_STRICT
+ ldr r1, state_dmax // r1 = dmax
+#endif
+
+ str r3, dist_loc // save dist
+
+#ifdef INFLATE_STRICT
+ cmp r3, r1 // dist vs dmax
+ bgt invalid_distance_too_far_back // if dist > dmax, set up msg/mode = bad and break
+#endif
+
+ ldr r1, dist_loc // dist
+ rsb r3, ip, out // (out - beg);
+ addhi bits, bits, #8 // this is the internal bits += 8 from above
+
+ cmp r1, r3 // dist vs (out - beg)
+
+ mov hold, hold, lsr r2 // hold >>= op ;
+ rsb bits, r2, bits // bits -= op;
+ rsbls r2, r1, out // if (dist<=op) r2 = from = out-dist
+ bls copy_direct_from_output // if (dist<=op) branch to copy_direct_from_output
+
+ ldr r2, whave_loc // whave
+ rsb r1, r3, r1 // op = dist-op
+ cmp r2, r1 // whave vs op
+ nop // pad dummy for better performance
+ bcc invalid_distance_too_far_back // if whave < op, message invalid distance too far back, and break
+
+ cmp write, #0 // write
+ bne non_very_common_case // if (write ==0) non_very_common_case
+
+ // the following : if (write == 0) { /* very common case */ }
+ nop // pad dummy for better performance
+ ldr ip, wsize_loc // wsize
+ cmp r6, r1 // len vs op
+ rsb r3, r1, ip // wsize - op
+ ldr ip, windowm1_loc // window - 1
+ add r2, ip, r3 // from = window - 1 + wsize - op : setup for using PUP(from)
+ movhi r3, r1 // if len > op, r3 = op
+ movhi r1, out // if len > op, r1 = out
+ bhi some_from_window // if (len > op), branch to some_from_window
+
+finish_copy:
+
+ // while (len > 2) {
+ // PUP(out) = PUP(from);
+ // PUP(out) = PUP(from);
+ // PUP(out) = PUP(from);
+ // len -= 3;
+ // }
+ // if (len) {
+ // PUP(out) = PUP(from);
+ // if (len > 1)
+ // PUP(out) = PUP(from);
+ // }
+
+ cmp r6, #2 // len > 2 ?
+ movls r1, r6 // if (len<=2) r1 = len
+ bls lenle2 // if (len<=2) branch to lenle2
+ mov r1, r6
+fcopy_per3bytes:
+ ldrb r3, [r2, #1] // 1st PUP(from)
+ sub r1, r1, #3 // len-=3
+ cmp r1, #2 // len > 2 ?
+ strb r3, [out, #1] // 1st PUP(out) = PUP(from);
+ ldrb r3, [r2, #2] // 2nd PUP(from)
+ add r2, r2, #3 // from+=3
+ strb r3, [out, #2] // 2nd PUP(out) = PUP(from);
+ ldrb r3, [r2, #0] // 3rd PUP(from)
+ add out, out, #3 // out+=3
+ strb r3, [out, #0] // 3rd PUP(out) = PUP(from);
+ bgt fcopy_per3bytes // while (len>3) back to loop head
+lenle2:
+ cmp r1, #0 // len
+ beq do_loop_while // back to while loop head if len==0
+ ldrb r3, [r2, #1] // PUP(from)
+ cmp r1, #2 // check whether len==2
+ strb r3, [out, #1]! // PUP(out) = PUP(from);
+ bne do_loop_while // back to while loop head if len==1
+ ldrb r3, [r2, #2] // 2nd PUP(from)
+ strb r3, [out, #1]! // 2nd PUP(out) = PUP(from);
+ b do_loop_while // back to while loop head
+
+end_of_block:
+ tst r2, #32 // if (op&32)
+ movne r3, #11 // TYPE?
+ strne r3, state_mode // state-mode = TYPE
+ bne update_state_and_return // break the do loop and branch to get ready to return
+ ldr r3, messages // "invalid literal/length code" message
+L75:
+ add r3, pc, r3
+ str r3, [strm, #24] // strm->msg = (char *)"invalid literal/length code";
+ mov r3, #27 // BAD?
+ str r3, state_mode // state->mode = BAD;
+ b update_state_and_return // break the do loop and branch to get ready to return
+
+//Read_2_bytes:
+// ldrh r3,[in,#1] // 2 (PUP(in)) together
+// add in, #2 // 2 in++
+// add hold, hold, r3, asl bits // twice hold += (unsigned long)(PUP(in)) << bits;
+// add bits, #16 // 2 bits += 8;
+// b dodist // branch to dodist
+ nop // a pad dummy instruction to give better performance
+
+copy_direct_from_output: // r2 = from = out - dist ;
+
+ // do {
+ ldrb r3, [r2, #1] // 1st PUP(from)
+ sub r6, r6, #3 // len-=3
+ cmp r6, #2 // len vs 2
+ strb r3, [out, #1] // 1st PUP(out) = PUP(from);
+ ldrb r3, [r2, #2] // 2nd PUP(from)
+ add r2, r2, #3 // update from+=3
+ strb r3, [out, #2] // 2nd PUP(out) = PUP(from);
+ ldrb r3, [r2, #0] // 3rd PUP(from);
+ add out, out, #3 // update out+=3
+ strb r3, [out, #0] // 3rd PUP(out) = PUP(from);
+ bhi copy_direct_from_output // while (len>2);
+
+ // len in r6 can now be 0 1 or 2
+
+ subs r6,#1 // len--;
+ ldrb r3, [r2, #1] // PUP(from)
+ blt do_loop_while // if len<0 back to while loop head
+ strb r3, [out, #1]! // PUP(out) = PUP(from);
+ subs r6, #1 // len--;
+ ldrb r3, [r2, #2] // 2nd PUP(from)
+ blt do_loop_while // if len<0 back to while loop head
+ strb r3, [out, #1]! // 2nd PUP(out) = PUP(from);
+ b do_loop_while // back to while loop head
+
+
+invalide_distance_code:
+ ldr r3, messages+4 // "invalid distance code"
+L72:
+ add r3, pc, r3
+ str r3, [strm, #24] // strm->msg = (char *)"invalid distance code";
+ mov r3, #27
+ str r3, state_mode // state->mode = BAD;
+ b update_state_and_return // break, restore registers, and return
+
+
+some_from_window:
+ add out, r3, out // out += op
+ rsb r6, r3, r6 // len -= op
+some_from_window_loop: // do {
+ ldrb ip, [r2, #1]! // PUP(from);
+ subs r3, r3, #1 // --op
+ strb ip, [r1, #1]! // PUP(out) = PUP(from);
+ bne some_from_window_loop // } while(op);
+ ldr r3, dist_loc // dist
+ rsb r2, r3, out // from = out - dist;
+ b finish_copy
+
+non_very_common_case:
+ cmp write, r1 // write vs op
+ nop // pad dummy for better performance
+ bcs contiguous_in_window // if (write >= op) branch to contiguous_in_window
+
+ /* wrap around window */
+
+ ldr r2, wsize_loc // wsize
+ ldr ip, windowm1_loc // window-1
+ add r3, write, r2 // r3 = wsize+write
+ rsb r3, r1, r3 // r3 = wsize+write-op
+ add r2, ip, r3 // r2 = from = wsize+write-op+window-1;
+ rsb r1, write, r1 // op -= write;
+
+ cmp r6, r1 // len vs op
+ bls finish_copy // if (len <= op) branch to finish_copy
+ rsb r6, r1, r6 // len -= op
+waw_loop: // do {
+ ldrb r3, [r2, #1]! // PUP(from)
+ subs r1, r1, #1 // --op;
+ strb r3, [out, #1]! // PUP(out) = PUP(from);
+ bne waw_loop // } while (op);
+
+ cmp write, r6 // write vs len
+ ldrcs r2, windowm1_loc // if (write>=len) r2 = from = window-1;
+ bcs finish_copy // if (write>=len) branch to finish_copy
+
+ // some from start of window
+
+ mov r1, write // op = write
+ sub r6, write // len -= op
+ sub ip, out
+ add ip, #1 // out+ip -> from
+sow_loop: // do {
+ ldrb r3,[out, ip] // PUP(from)
+ subs r1, #1 // --op;
+ strb r3, [out,#1]! // PUP(out) = PUP(from);
+ bne sow_loop // } while (op);
+
+ ldr r2, dist_loc // dist
+ sub r6, r6, write // len -= write
+ rsb r2, r2, out // r2 = from = out-dist
+ b finish_copy // continue to finish_copy
+
+
+contiguous_in_window:
+ ldr ip, windowm1_loc // window-1
+ cmp r6, r1 // len vs op
+ rsb r3, r1, write // r3 = write-op
+ add r2, ip, r3 // r2 = from = window+write-op-1
+ bls finish_copy // if (len <= op) branch to finish_copy
+ rsb r6, r1, r6 // len -= op
+ ldr r3, dist_loc // dist
+ciw_loop:
+ ldrb ip, [r2, #1]! // PUP(from)
+ subs r1, r1, #1 // op--
+ strb ip, [out, #1]! // PUP(out) = PUP(from);
+ bne ciw_loop // while (--op);
+ rsb r2, r3, out // from = out - dist;
+ b finish_copy
+
+invalid_distance_too_far_back:
+ ldr r3, messages+8 // "invalid distance too far back"
+L42:
+ add r3, pc, r3
+ str r3, [strm, #24] // strm->msg = (char *)"invalid distance too far back";
+ mov r3, #27
+ str r3, state_mode // state->mode = BAD;
+ b update_state_and_return // break, restore registers, and return
+
+ .align 2
+messages:
+ .long LC2-8-(L75)
+ .long LC1-8-(L72)
+ .long LC0-8-(L42)
+
+#endif // defined _ARM_ARCH_6
* For conditions of distribution and use, see copyright notice in zlib.h
*/
+
+#if defined _ARM_ARCH_6
+
+ // dummy definition; for armv6 or above, the code is compiled from inffastS.s instead
+ typedef char DummyDefinition;
+
+#else // architecture
+
#include "zutil.h"
#include "inftrees.h"
#include "inflate.h"
*/
#endif /* !ASMINF */
+
+#endif // architecture
CFLAGS_ARM += -mthumb
endif
ifeq (-arch armv5,$(ARCH_FLAGS_ARM))
-CFLAGS_ARM += -mthumb
+CFLAGS_ARM += -mno-thumb
endif
ifeq (-arch xscale,$(ARCH_FLAGS_ARM))
CFLAGS_ARM += -mthumb
-Wl,-new_linker \
-Wl,-pagezero_size,0x0 \
-Wl,-segaddr,__HIB,0xC0000000 \
- -Wl,-image_base,0xC0008000
+ -Wl,-image_base,0xC0008000 \
+ -Wl,-exported_symbols_list,$(TARGET)/kernel-kpi.exp
export LDFLAGS_KERNEL = $(LDFLAGS_KERNEL_GEN) \
$(TARGET)/mach_kernel: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST), $(addprefix $(component)/$(firstword $($(addsuffix _KERNEL_CONFIG, $(shell printf $(component) | tr a-z A-Z))) $(KERNEL_CONFIG))/, $(addsuffix .o, $(component))))) lastkernelconstructor.o
$(_v)${MAKE} version.o
+ $(_v)${MAKE} build_mach_kernel_exports
@echo LD mach_kernel.sys
$(_v)$(CAT) $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST), $(addprefix $(component)/$(firstword $($(addsuffix _KERNEL_CONFIG, $(shell printf $(component) | tr a-z A-Z))) $(KERNEL_CONFIG))/, $(addsuffix .o, $(component))))) > mach_kernel.filelist
$(_v)$(LD) $(LDFLAGS_KERNEL) -filelist mach_kernel.filelist version.o lastkernelconstructor.o `if [ -e $(STATIC_KMODS) ]; then echo $(STATIC_KMODS); fi` \
$(TARGET)/kgmacros: $(SRCROOT)/kgmacros
$(_v)$(INSTALL) $(INSTALL_FLAGS) $? $@
+.PHONY: build_mach_kernel_exports
+build_mach_kernel_exports:
+ $(_v)${MAKE} \
+ MAKEFILES=${SOURCE}/config/Makefile \
+ SOURCE=${SOURCE}/config \
+ TARGET=$${TARGET} \
+ build_mach_kernel_exports;
+
# Special rules to install machine configuration variants
$(DSTROOT)$(INSTALL_FILE_DIR)mach.$(KERNEL_CONFIG_LC).$(MACHINE_CONFIG_LC): $(TARGET)/mach_kernel force_file_install
#
options CONFIG_ENFORCE_SIGNED_CODE # <config_embedded>
+# support dynamic signing of code
+#
+options CONFIG_DYNAMIC_CODE_SIGNING # <dynamic_codesigning>
+
# vc_progress_white - make the progress gear white instead of black
options CONFIG_VC_PROGRESS_WHITE # <vc_progress_white>
#include <sys/errno.h>
#include <string.h>
#include <machine/machlimits.h>
+#include <pexpert/pexpert.h>
extern struct vc_info vinfo;
extern boolean_t panicDialogDesired;
static int pixels_needed_to_blit_digit( int digit );
static void blit_digit( int digit );
static const char * strnstr(const char * s, const char * find, size_t slen);
-void dim_screen(void);
static void panic_blit_rect(unsigned int x, unsigned int y, unsigned int width,
unsigned int height, int transparent,
const unsigned char * dataPtr);
}
-void
-dim_screen(void)
-{
- unsigned int *p, *endp, *row;
- int col, rowline, rowlongs;
- register unsigned int mask;
-
- if(!vinfo.v_depth)
- return;
-
- if ( vinfo.v_depth == 32 )
- mask = 0x007F7F7F;
- else if ( vinfo.v_depth == 30 )
- mask = (0x1ff<<20) | (0x1ff<<10) | 0x1ff;
- else if ( vinfo.v_depth == 16 )
- mask = 0x3DEF3DEF;
- else
- return;
-
- rowline = (int)(vinfo.v_rowscanbytes / 4);
- rowlongs = (int)(vinfo.v_rowbytes / 4);
-
- p = (unsigned int*) vinfo.v_baseaddr;
- endp = p + (rowlongs * vinfo.v_height);
-
- for (row = p ; row < endp ; row += rowlongs) {
- for (p = &row[0], col = 0; col < rowline; col++) {
- *p = (*p >> 1) & mask;
- ++p;
- }
- }
-}
-
-
/* From user mode Libc - this ought to be in a library */
static const char *
strnstr(const char * s, const char * find, size_t slen)
#endif /* GRATEFULDEBUGGER */
}
+void
+dim_screen(void)
+{
+ unsigned int *p, *endp, *row;
+ int col, rowline, rowlongs;
+ register unsigned int mask;
+
+ if(!vinfo.v_depth)
+ return;
+
+ if ( vinfo.v_depth == 32 )
+ mask = 0x007F7F7F;
+ else if ( vinfo.v_depth == 30 )
+ mask = (0x1ff<<20) | (0x1ff<<10) | 0x1ff;
+ else if ( vinfo.v_depth == 16 )
+ mask = 0x3DEF3DEF;
+ else
+ return;
+
+ rowline = (int)(vinfo.v_rowscanbytes / 4);
+ rowlongs = (int)(vinfo.v_rowbytes / 4);
+
+ p = (unsigned int*) vinfo.v_baseaddr;
+ endp = p + (rowlongs * vinfo.v_height);
+
+ for (row = p ; row < endp ; row += rowlongs) {
+ for (p = &row[0], col = 0; col < rowline; col++) {
+ *p = (*p >> 1) & mask;
+ ++p;
+ }
+ }
+}
+
void vcattach(void); /* XXX gcc 4 warning cleanup */
void
static unsigned panic_io_port;
static unsigned commit_paniclog_to_nvram;
-int debug_boot_arg;
+unsigned int debug_boot_arg;
void
machine_startup(void)
halt_in_debugger = halt_in_debugger ? 0 : 1;
#endif
- if (PE_parse_boot_argn("debug", &boot_arg, sizeof (boot_arg))) {
- if (boot_arg & DB_HALT) halt_in_debugger=1;
- if (boot_arg & DB_PRT) disable_debug_output=FALSE;
- if (boot_arg & DB_SLOG) systemLogDiags=TRUE;
- if (boot_arg & DB_NMI) panicDebugging=TRUE;
- if (boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE;
- debug_boot_arg = boot_arg;
+ if (PE_parse_boot_argn("debug", &debug_boot_arg, sizeof (debug_boot_arg))) {
+ if (debug_boot_arg & DB_HALT) halt_in_debugger=1;
+ if (debug_boot_arg & DB_PRT) disable_debug_output=FALSE;
+ if (debug_boot_arg & DB_SLOG) systemLogDiags=TRUE;
+ if (debug_boot_arg & DB_NMI) panicDebugging=TRUE;
+ if (debug_boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE;
+ } else {
+ debug_boot_arg = 0;
}
if (!PE_parse_boot_argn("nvram_paniclog", &commit_paniclog_to_nvram, sizeof (commit_paniclog_to_nvram)))
/* For use with the MP rendezvous mechanism
*/
-#if !CONFIG_EMBEDDED
static void
machine_halt_cpu(__unused void *arg) {
panic_io_port_read();
pmCPUHalt(PM_HALT_DEBUG);
}
-#endif
void
Debugger(
#endif
/* Print backtrace - callee is internally synchronized */
- panic_i386_backtrace(stackptr, 20, NULL, FALSE, NULL);
+ panic_i386_backtrace(stackptr, 32, NULL, FALSE, NULL);
/* everything should be printed now so copy to NVRAM
*/
}
}
}
- draw_panic_dialog();
+
+ /* If the user won't be able to read the dialog,
+ * don't bother trying to show it
+ */
+ if (!PE_reboot_on_panic())
+ draw_panic_dialog();
if (!panicDebugging) {
/* Clear the MP rendezvous function lock, in the event
* that a panic occurred while in that codepath.
*/
mp_rendezvous_break_lock();
-#if CONFIG_EMBEDDED
- PEHaltRestart(kPEPanicRestartCPU);
-#else
+ if (PE_reboot_on_panic()) {
+ PEHaltRestart(kPEPanicRestartCPU);
+ }
+
/* Force all CPUs to disable interrupts and HLT.
* We've panicked, and shouldn't depend on the
* PEHaltRestart() mechanism, which relies on several
* bits of infrastructure.
*/
mp_rendezvous_no_intrs(machine_halt_cpu, NULL);
-#endif
/* NOT REACHED */
}
}
#define k64Bit 0x00000200 /* processor supports EM64T (not what mode you're running in) */
#define kHasSSE4_1 0x00000400
#define kHasSSE4_2 0x00000800
-#define kHasAES 0x00001000
#define kInOrderPipeline 0x00002000 /* in-order execution */
#define kSlow 0x00004000 /* tsc < nanosecond */
#define kUP 0x00008000 /* set if (kNumCPUs == 1) */
cpuid_fn(6, reg);
ctp->sensor = bitfield32(reg[eax], 0, 0);
ctp->dynamic_acceleration = bitfield32(reg[eax], 1, 1);
+ ctp->invariant_APIC_timer = bitfield32(reg[eax], 2, 2);
ctp->thresholds = bitfield32(reg[ebx], 3, 0);
ctp->ACNT_MCNT = bitfield32(reg[ecx], 0, 0);
info_p->cpuid_thermal_leafp = ctp;
extfeature_map[] = {
{CPUID_EXTFEATURE_SYSCALL, "SYSCALL"},
{CPUID_EXTFEATURE_XD, "XD"},
+ {CPUID_EXTFEATURE_RDTSCP, "RDTSCP"},
{CPUID_EXTFEATURE_EM64T, "EM64T"},
{CPUID_EXTFEATURE_LAHF, "LAHF"},
- {CPUID_EXTFEATURE_RDTSCP, "RDTSCP"},
{CPUID_EXTFEATURE_TSCI, "TSCI"},
{0, 0}
};
#define CPUID_FEATURE_PBE _Bit(31) /* Pend Break Enable */
#define CPUID_FEATURE_SSE3 _HBit(0) /* Streaming SIMD extensions 3 */
+
#define CPUID_FEATURE_MONITOR _HBit(3) /* Monitor/mwait */
#define CPUID_FEATURE_DSCPL _HBit(4) /* Debug Store CPL */
#define CPUID_FEATURE_VMX _HBit(5) /* VMX */
#define CPUID_FEATURE_CX16 _HBit(13) /* CmpXchg16b instruction */
#define CPUID_FEATURE_xTPR _HBit(14) /* Send Task PRiority msgs */
#define CPUID_FEATURE_PDCM _HBit(15) /* Perf/Debug Capability MSR */
+
#define CPUID_FEATURE_DCA _HBit(18) /* Direct Cache Access */
#define CPUID_FEATURE_SSE4_1 _HBit(19) /* Streaming SIMD extensions 4.1 */
#define CPUID_FEATURE_SSE4_2 _HBit(20) /* Streaming SIMD extensions 4.2 */
*/
#define CPUID_EXTFEATURE_SYSCALL _Bit(11) /* SYSCALL/sysret */
#define CPUID_EXTFEATURE_XD _Bit(20) /* eXecute Disable */
+
#define CPUID_EXTFEATURE_RDTSCP _Bit(27) /* RDTSCP */
#define CPUID_EXTFEATURE_EM64T _Bit(29) /* Extended Mem 64 Technology */
-#define CPUID_EXTFEATURE_LAHF _HBit(20) /* LAFH/SAHF instructions */
+#define CPUID_EXTFEATURE_LAHF _HBit(0) /* LAHF/SAHF instructions */
/*
* The CPUID_EXTFEATURE_XXX values define 64-bit values
#define CPUID_MODEL_MEROM 15
#define CPUID_MODEL_PENRYN 23
#define CPUID_MODEL_NEHALEM 26
-#define CPUID_MODEL_ATOM 28
#define CPUID_MODEL_FIELDS 30 /* Lynnfield, Clarksfield, Jasper */
#define CPUID_MODEL_DALES 31 /* Havendale, Auburndale */
#define CPUID_MODEL_NEHALEM_EX 46
typedef struct {
boolean_t sensor;
boolean_t dynamic_acceleration;
+ boolean_t invariant_APIC_timer;
uint32_t thresholds;
boolean_t ACNT_MCNT;
} cpuid_thermal_leaf_t;
#include <kern/cpu_data.h>
#include <kern/assert.h>
#include <kern/machine.h>
+#include <kern/debug.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
static unsigned lapic_error_count_threshold = 5;
static boolean_t lapic_dont_panic = FALSE;
-extern int debug_boot_arg;
-
/* Base vector for local APIC interrupt sources */
int lapic_interrupt_base = LAPIC_DEFAULT_INTERRUPT_BASE;
-int lapic_to_cpu[MAX_CPUS];
+#define MAX_LAPICIDS (LAPIC_ID_MAX+1)
+int lapic_to_cpu[MAX_LAPICIDS];
int cpu_to_lapic[MAX_CPUS];
static void
{
int i;
- for (i = 0; i < MAX_CPUS; i++) {
- lapic_to_cpu[i] = -1;
+ for (i = 0; i < MAX_CPUS; i++)
cpu_to_lapic[i] = -1;
- }
+ for (i = 0; i < MAX_LAPICIDS; i++)
+ lapic_to_cpu[i] = -1;
}
void
lapic_cpu_map(int apic_id, int cpu)
{
+ assert(apic_id < MAX_LAPICIDS);
+ assert(cpu < MAX_CPUS);
cpu_to_lapic[cpu] = apic_id;
lapic_to_cpu[apic_id] = cpu;
}
uint32_t
ml_get_cpuid(uint32_t lapic_index)
{
- if(lapic_index >= (uint32_t)MAX_CPUS)
+ if(lapic_index >= (uint32_t)MAX_LAPICIDS)
return 0xFFFFFFFF; /* Return -1 if cpu too big */
/* Return the cpu ID (or -1 if not configured) */
kprintf("cpu_to_lapic[%d]: %d\n",
i, cpu_to_lapic[i]);
}
- for (i = 0; i < MAX_CPUS; i++) {
+ for (i = 0; i < MAX_LAPICIDS; i++) {
if (lapic_to_cpu[i] == -1)
continue;
kprintf("lapic_to_cpu[%d]: %d\n",
}
pmap = thread->map->pmap;
-#if CONFIG_DTRACE
- thread->machine.specFlags |= CopyIOActive;
-#endif /* CONFIG_DTRACE */
-
if (pmap == kernel_pmap || use_kernel_map) {
kern_vaddr = (vm_offset_t)user_addr;
KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)kern_vaddr,
(unsigned)kernel_addr, (unsigned)nbytes,
error | 0x80000000, 0);
+ return (error);
+ }
#if CONFIG_DTRACE
- thread->machine.specFlags &= ~CopyIOActive;
+ thread->machine.specFlags |= CopyIOActive;
#endif /* CONFIG_DTRACE */
- return (error);
+ if ((nbytes && (user_addr + nbytes <= user_addr)) || ((user_addr + nbytes) > vm_map_max(thread->map))) {
+ error = EFAULT;
+ goto done;
}
+
user_base = user_addr & ~((user_addr_t)(NBPDE - 1));
user_offset = (vm_offset_t)(user_addr & (NBPDE - 1));
}
window_offset = (char *)((uint32_t)paddr & (PAGE_SIZE - 1));
+ assert(!((current_thread()->machine.specFlags & CopyIOActive) && ((which & cppvKmap) == 0)));
+
if (current_thread()->machine.physwindow_busy) {
pt_entry_t old_pentry;
if (pmInitDone
&& pmDispatch != NULL
- && pmDispatch->cstateMachineIdle != NULL)
- (*pmDispatch->cstateMachineIdle)(0x7FFFFFFFFFFFFFFFULL);
+ && pmDispatch->MachineIdle != NULL)
+ (*pmDispatch->MachineIdle)(0x7FFFFFFFFFFFFFFFULL);
else {
/*
* If no power management, re-enable interrupts and halt.
}
boolean_t
-machine_cpu_is_inactive(int cpu)
+machine_processor_is_inactive(processor_t processor)
{
+ int cpu = processor->cpu_id;
+
if (pmDispatch != NULL
&& pmDispatch->pmIsCPUUnAvailable != NULL)
return(pmDispatch->pmIsCPUUnAvailable(cpu_to_lcpu(cpu)));
return(FALSE);
}
+processor_t
+machine_choose_processor(processor_set_t pset,
+ processor_t preferred)
+{
+ int startCPU;
+ int endCPU;
+ int preferredCPU;
+ int chosenCPU;
+
+ if (!pmInitDone)
+ return(preferred);
+
+ if (pset == NULL) {
+ startCPU = -1;
+ endCPU = -1;
+ } else {
+ startCPU = pset->cpu_set_low;
+ endCPU = pset->cpu_set_hi;
+ }
+
+ if (preferred == NULL)
+ preferredCPU = -1;
+ else
+ preferredCPU = preferred->cpu_id;
+
+ if (pmDispatch != NULL
+ && pmDispatch->pmChooseCPU != NULL) {
+ chosenCPU = pmDispatch->pmChooseCPU(startCPU, endCPU, preferredCPU);
+
+ if (chosenCPU == -1)
+ return(NULL);
+ return(cpu_datap(chosenCPU)->cpu_processor);
+ }
+
+ return(preferred);
+}
+
static uint32_t
pmGetSavedRunCount(void)
{
 * This value should be changed each time that pmDispatch_t or pmCallBacks_t
* changes.
*/
-#define PM_DISPATCH_VERSION 18
+#define PM_DISPATCH_VERSION 19
/*
* Dispatch table for functions that get installed when the power
{
int (*pmCPUStateInit)(void);
void (*cstateInit)(void);
- uint64_t (*cstateMachineIdle)(uint64_t maxIdleDuration);
+ uint64_t (*MachineIdle)(uint64_t maxIdleDuration);
uint64_t (*GetDeadline)(x86_lcpu_t *lcpu);
uint64_t (*SetDeadline)(x86_lcpu_t *lcpu, uint64_t);
void (*Deadline)(x86_lcpu_t *lcpu);
void (*markAllCPUsOff)(void);
void (*pmSetRunCount)(uint32_t count);
boolean_t (*pmIsCPUUnAvailable)(x86_lcpu_t *lcpu);
+ int (*pmChooseCPU)(int startCPU, int endCPU, int preferredCPU);
int (*pmIPIHandler)(void *state);
} pmDispatch_t;
*/
#include <string.h>
-#include <norma_vm.h>
#include <mach_kdb.h>
#include <mach_ldebug.h>
uint64_t max_preemption_latency_tsc = 0;
-/*
- * Private data structures.
- */
-
-/*
- * For each vm_page_t, there is a list of all currently
- * valid virtual mappings of that page. An entry is
- * a pv_rooted_entry_t; the list is the pv_table.
- *
- * N.B. with the new combo rooted/hashed scheme it is
- * only possibly to remove individual non-rooted entries
- * if they are found via the hashed chains as there is no
- * way to unlink the singly linked hashed entries if navigated to
- * via the queue list off the rooted entries. Think of it as
- * hash/walk/pull, keeping track of the prev pointer while walking
- * the singly linked hash list. All of this is to save memory and
- * keep both types of pv_entries as small as possible.
- */
-
-/*
-
-PV HASHING Changes - JK 1/2007
-
-Pve's establish physical to virtual mappings. These are used for aliasing of a
-physical page to (potentially many) virtual addresses within pmaps. In the previous
-implementation the structure of the pv_entries (each 16 bytes in size) was
-
-typedef struct pv_entry {
- struct pv_entry_t next;
- pmap_t pmap;
- vm_map_offset_t va;
-} *pv_entry_t;
-
-An initial array of these is created at boot time, one per physical page of memory,
-indexed by the physical page number. Additionally, a pool of entries is created from a
-pv_zone to be used as needed by pmap_enter() when it is creating new mappings.
-Originally, we kept this pool around because the code in pmap_enter() was unable to
-block if it needed an entry and none were available - we'd panic. Some time ago I
-restructured the pmap_enter() code so that for user pmaps it can block while zalloc'ing
-a pv structure and restart, removing a panic from the code (in the case of the kernel
-pmap we cannot block and still panic, so, we keep a separate hot pool for use only on
-kernel pmaps). The pool has not been removed since there is a large performance gain
-keeping freed pv's around for reuse and not suffering the overhead of zalloc for every new pv we need.
-
-As pmap_enter() created new mappings it linked the new pve's for them off the fixed
-pv array for that ppn (off the next pointer). These pve's are accessed for several
-operations, one of them being address space teardown. In that case, we basically do this
-
- for (every page/pte in the space) {
- calc pve_ptr from the ppn in the pte
- for (every pv in the list for the ppn) {
- if (this pv is for this pmap/vaddr) {
- do housekeeping
- unlink/free the pv
- }
- }
- }
-
-The problem arose when we were running, say 8000 (or even 2000) apache or other processes
-and one or all terminate. The list hanging off each pv array entry could have thousands of
-entries. We were continuously linearly searching each of these lists as we stepped through
-the address space we were tearing down. Because of the locks we hold, likely taking a cache
-miss for each node, and interrupt disabling for MP issues the system became completely
-unresponsive for many seconds while we did this.
-
-Realizing that pve's are accessed in two distinct ways (linearly running the list by ppn
-for operations like pmap_page_protect and finding and modifying/removing a single pve as
-part of pmap_enter processing) has led to modifying the pve structures and databases.
-
-There are now two types of pve structures. A "rooted" structure which is basically the
-original structure accessed in an array by ppn, and a ''hashed'' structure accessed on a
-hash list via a hash of [pmap, vaddr]. These have been designed with the two goals of
-minimizing wired memory and making the lookup of a ppn faster. Since a vast majority of
-pages in the system are not aliased and hence represented by a single pv entry I've kept
-the rooted entry size as small as possible because there is one of these dedicated for
-every physical page of memory. The hashed pve's are larger due to the addition of the hash
-link and the ppn entry needed for matching while running the hash list to find the entry we
-are looking for. This way, only systems that have lots of aliasing (like 2000+ httpd procs)
-will pay the extra memory price. Both structures have the same first three fields allowing
-some simplification in the code.
-
-They have these shapes
-
-typedef struct pv_rooted_entry {
- queue_head_t qlink;
- vm_map_offset_t va;
- pmap_t pmap;
-} *pv_rooted_entry_t;
-
-
-typedef struct pv_hashed_entry {
- queue_head_t qlink;
- vm_map_offset_t va;
- pmap_t pmap;
- ppnum_t ppn;
- struct pv_hashed_entry *nexth;
-} *pv_hashed_entry_t;
-
-The main flow difference is that the code is now aware of the rooted entry and the hashed
-entries. Code that runs the pv list still starts with the rooted entry and then continues
-down the qlink onto the hashed entries. Code that is looking up a specific pv entry first
-checks the rooted entry and then hashes and runs the hash list for the match. The hash list
-lengths are much smaller than the original pv lists that contained all aliases for the specific ppn.
-
-*/
-
-typedef struct pv_rooted_entry { /* first three entries must match pv_hashed_entry_t */
- queue_head_t qlink;
- vm_map_offset_t va; /* virtual address for mapping */
- pmap_t pmap; /* pmap where mapping lies */
-} *pv_rooted_entry_t;
-
-#define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0)
-
-pv_rooted_entry_t pv_head_table; /* array of entries, one per page */
-
-typedef struct pv_hashed_entry { /* first three entries must match pv_rooted_entry_t */
- queue_head_t qlink;
- vm_map_offset_t va;
- pmap_t pmap;
- ppnum_t ppn;
- struct pv_hashed_entry *nexth;
-} *pv_hashed_entry_t;
-
-#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
-
-#define NPVHASH 4095 /* MUST BE 2^N - 1 */
pv_hashed_entry_t *pv_hash_table; /* hash lists */
uint32_t npvhash = 0;
-/* #define PV_DEBUG 1 uncomment to enable some PV debugging code */
-#ifdef PV_DEBUG
-#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
-#else
-#define CHK_NPVHASH()
-#endif
/*
* pv_list entries are kept on a list that can only be accessed
int pv_hashed_free_count = 0;
int pv_kern_free_count = 0;
int pv_hashed_kern_free_count = 0;
-#define PV_HASHED_LOW_WATER_MARK 5000
-#define PV_HASHED_KERN_LOW_WATER_MARK 100
-#define PV_HASHED_ALLOC_CHUNK 2000
-#define PV_HASHED_KERN_ALLOC_CHUNK 50
-thread_call_t mapping_adjust_call;
-static thread_call_data_t mapping_adjust_call_data;
-uint32_t mappingrecurse = 0;
-
-#define PV_HASHED_ALLOC(pvh_e) { \
- simple_lock(&pv_hashed_free_list_lock); \
- if ((pvh_e = pv_hashed_free_list) != 0) { \
- pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
- pv_hashed_free_count--; \
- if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \
- if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
- thread_call_enter(mapping_adjust_call); \
- } \
- simple_unlock(&pv_hashed_free_list_lock); \
-}
-
-#define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
- simple_lock(&pv_hashed_free_list_lock); \
- pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; \
- pv_hashed_free_list = pvh_eh; \
- pv_hashed_free_count += pv_cnt; \
- simple_unlock(&pv_hashed_free_list_lock); \
-}
-
-#define PV_HASHED_KERN_ALLOC(pvh_e) { \
- simple_lock(&pv_hashed_kern_free_list_lock); \
- if ((pvh_e = pv_hashed_kern_free_list) != 0) { \
- pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
- pv_hashed_kern_free_count--; \
- if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) \
- if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
- thread_call_enter(mapping_adjust_call); \
- } \
- simple_unlock(&pv_hashed_kern_free_list_lock); \
-}
-
-#define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
- simple_lock(&pv_hashed_kern_free_list_lock); \
- pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; \
- pv_hashed_kern_free_list = pvh_eh; \
- pv_hashed_kern_free_count += pv_cnt; \
- simple_unlock(&pv_hashed_kern_free_list_lock); \
-}
zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
static struct vm_object kptobj_object_store;
static vm_object_t kptobj;
-/*
- * Index into pv_head table, its lock bits, and the modify/reference and managed bits
- */
-
-#define pa_index(pa) (i386_btop(pa))
-#define ppn_to_pai(ppn) ((int)ppn)
-
-#define pai_to_pvh(pai) (&pv_head_table[pai])
-#define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table)
-#define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table)
-
-#define pvhashidx(pmap, va) (((uint32_t)pmap ^ ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) & npvhash)
-#define pvhash(idx) (&pv_hash_table[idx])
-
-#define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table)
-#define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table)
-
/*
 * Array of physical page attributes for managed pages.
* One byte per physical page.
pt_entry_t *DMAP1, *DMAP2;
caddr_t DADDR1;
caddr_t DADDR2;
-
-static inline
-void pmap_pvh_unlink(pv_hashed_entry_t pv);
-
-/*
- * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
- * properly deals with the anchor.
- * must be called with the hash locked, does not unlock it
- */
-
-static inline
-void pmap_pvh_unlink(pv_hashed_entry_t pvh)
-{
- pv_hashed_entry_t curh;
- pv_hashed_entry_t *pprevh;
- int pvhash_idx;
-
- CHK_NPVHASH();
- pvhash_idx = pvhashidx(pvh->pmap, pvh->va);
-
- pprevh = pvhash(pvhash_idx);
-
-#if PV_DEBUG
- if (NULL == *pprevh) panic("pvh_unlink null anchor"); /* JK DEBUG */
-#endif
- curh = *pprevh;
-
- while (PV_HASHED_ENTRY_NULL != curh) {
- if (pvh == curh)
- break;
- pprevh = &curh->nexth;
- curh = curh->nexth;
- }
- if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
- *pprevh = pvh->nexth;
- return;
-}
-
/*
* for legacy, returns the address of the pde entry.
* for 64 bit, causes the pdpt page containing the pde entry to be mapped,
va = (vm_offset_t)p->dirbase;
p->pdirbase = kvtophys(va);
- template = cpu_64bit ? INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF : INTEL_PTE_VALID;
+ template = INTEL_PTE_VALID;
for (i = 0; i< NPGPTD; i++, pdpt++ ) {
pmap_paddr_t pa;
pa = (pmap_paddr_t) kvtophys((vm_offset_t)(va + i386_ptob(i)));
/* uber space points to uber mapped kernel */
s = splhigh();
pml4p = pmap64_pml4(p, 0ULL);
- pmap_store_pte((pml4p+KERNEL_UBER_PML4_INDEX),*kernel_pmap->pm_pml4);
+ pmap_store_pte((pml4p+KERNEL_UBER_PML4_INDEX), *kernel_pmap->pm_pml4);
if (!is_64bit) {
}
}
-/*
- * Remove a range of hardware page-table entries.
- * The entries given are the first (inclusive)
- * and last (exclusive) entries for the VM pages.
- * The virtual address is the va for the first pte.
- *
- * The pmap must be locked.
- * If the pmap is not the kernel pmap, the range must lie
- * entirely within one pte-page. This is NOT checked.
- * Assumes that the pte-page exists.
- */
-
-void
-pmap_remove_range(
- pmap_t pmap,
- vm_map_offset_t start_vaddr,
- pt_entry_t *spte,
- pt_entry_t *epte)
-{
- register pt_entry_t *cpte;
- pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
- pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
- pv_hashed_entry_t pvh_e;
- int pvh_cnt = 0;
- int num_removed, num_unwired, num_found;
- int pai;
- pmap_paddr_t pa;
- vm_map_offset_t vaddr;
- int pvhash_idx;
- uint32_t pv_cnt;
-
- num_removed = 0;
- num_unwired = 0;
- num_found = 0;
-
- if (pmap != kernel_pmap &&
- pmap->pm_task_map == TASK_MAP_32BIT &&
- start_vaddr >= HIGH_MEM_BASE) {
- /*
- * The range is in the "high_shared_pde" which is shared
- * between the kernel and all 32-bit tasks. It holds
- * the 32-bit commpage but also the trampolines, GDT, etc...
- * so we can't let user tasks remove anything from it.
- */
- return;
- }
-
- /* invalidate the PTEs first to "freeze" them */
- for (cpte = spte, vaddr = start_vaddr;
- cpte < epte;
- cpte++, vaddr += PAGE_SIZE_64) {
-
- pa = pte_to_pa(*cpte);
- if (pa == 0)
- continue;
- num_found++;
-
- if (iswired(*cpte))
- num_unwired++;
-
- pai = pa_index(pa);
-
- if (!managed_page(pai)) {
- /*
- * Outside range of managed physical memory.
- * Just remove the mappings.
- */
- pmap_store_pte(cpte, 0);
- continue;
- }
-
- /* invalidate the PTE */
- pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID));
- }
-
- if (num_found == 0) {
- /* nothing was changed: we're done */
- goto update_counts;
- }
-
- /* propagate the invalidates to other CPUs */
-
- PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);
-
- for (cpte = spte, vaddr = start_vaddr;
- cpte < epte;
- cpte++, vaddr += PAGE_SIZE_64) {
-
- pa = pte_to_pa(*cpte);
- if (pa == 0)
- continue;
-
- pai = pa_index(pa);
-
- LOCK_PVH(pai);
-
- pa = pte_to_pa(*cpte);
- if (pa == 0) {
- UNLOCK_PVH(pai);
- continue;
- }
-
- num_removed++;
-
- /*
- * Get the modify and reference bits, then
- * nuke the entry in the page table
- */
- /* remember reference and change */
- pmap_phys_attributes[pai] |=
- (char)(*cpte & (PHYS_MODIFIED | PHYS_REFERENCED));
- /* completely invalidate the PTE */
- pmap_store_pte(cpte, 0);
-
- /*
- * Remove the mapping from the pvlist for
- * this physical page.
- */
- {
- pv_rooted_entry_t pv_h;
- pv_hashed_entry_t *pprevh;
- ppnum_t ppn = (ppnum_t)pai;
-
- pv_h = pai_to_pvh(pai);
- pvh_e = PV_HASHED_ENTRY_NULL;
- if (pv_h->pmap == PMAP_NULL)
- panic("pmap_remove_range: null pv_list!");
-
- if (pv_h->va == vaddr && pv_h->pmap == pmap) { /* rooted or not */
- /*
- * Header is the pv_rooted_entry. We can't free that. If there is a queued
- * entry after this one we remove that
- * from the ppn queue, we remove it from the hash chain
- * and copy it to the rooted entry. Then free it instead.
- */
-
- pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
- if (pv_h != (pv_rooted_entry_t)pvh_e) { /* any queued after rooted? */
- CHK_NPVHASH();
- pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
- LOCK_PV_HASH(pvhash_idx);
- remque(&pvh_e->qlink);
- {
- pprevh = pvhash(pvhash_idx);
- if (PV_HASHED_ENTRY_NULL == *pprevh) {
- panic("pmap_remove_range empty hash removing rooted pv");
- }
- }
- pmap_pvh_unlink(pvh_e);
- UNLOCK_PV_HASH(pvhash_idx);
- pv_h->pmap = pvh_e->pmap;
- pv_h->va = pvh_e->va; /* dispose of pvh_e */
- } else { /* none queued after rooted */
- pv_h->pmap = PMAP_NULL;
- pvh_e = PV_HASHED_ENTRY_NULL;
- } /* any queued after rooted */
-
- } else { /* rooted or not */
- /* not removing rooted pv. find it on hash chain, remove from ppn queue and
- * hash chain and free it */
- CHK_NPVHASH();
- pvhash_idx = pvhashidx(pmap,vaddr);
- LOCK_PV_HASH(pvhash_idx);
- pprevh = pvhash(pvhash_idx);
- if (PV_HASHED_ENTRY_NULL == *pprevh) {
- panic("pmap_remove_range empty hash removing hashed pv");
- }
- pvh_e = *pprevh;
- pmap_pv_hashlist_walks++;
- pv_cnt = 0;
- while (PV_HASHED_ENTRY_NULL != pvh_e) {
- pv_cnt++;
- if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn) break;
- pprevh = &pvh_e->nexth;
- pvh_e = pvh_e->nexth;
- }
- pmap_pv_hashlist_cnts += pv_cnt;
- if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt;
- if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pmap_remove_range pv not on hash");
- *pprevh = pvh_e->nexth;
- remque(&pvh_e->qlink);
- UNLOCK_PV_HASH(pvhash_idx);
-
- } /* rooted or not */
-
- UNLOCK_PVH(pai);
-
- if (pvh_e != PV_HASHED_ENTRY_NULL) {
- pvh_e->qlink.next = (queue_entry_t)pvh_eh;
- pvh_eh = pvh_e;
-
- if (pvh_et == PV_HASHED_ENTRY_NULL) {
- pvh_et = pvh_e;
- }
-
- pvh_cnt++;
- }
-
- } /* removing mappings for this phy page */
- } /* for loop */
-
- if (pvh_eh != PV_HASHED_ENTRY_NULL) {
- PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
- }
-
-update_counts:
- /*
- * Update the counts
- */
-#if TESTING
- if (pmap->stats.resident_count < num_removed)
- panic("pmap_remove_range: resident_count");
-#endif
- assert(pmap->stats.resident_count >= num_removed);
- OSAddAtomic(-num_removed, &pmap->stats.resident_count);
-
-#if TESTING
- if (pmap->stats.wired_count < num_unwired)
- panic("pmap_remove_range: wired_count");
-#endif
- assert(pmap->stats.wired_count >= num_unwired);
- OSAddAtomic(-num_unwired, &pmap->stats.wired_count);
-
- return;
-}
/*
* Remove phys addr if mapped in specified map
}
-/*
- * Remove the given range of addresses
- * from the specified map.
- *
- * It is assumed that the start and end are properly
- * rounded to the hardware page size.
- */
-
-
-void
-pmap_remove(
- pmap_t map,
- addr64_t s64,
- addr64_t e64)
-{
- pt_entry_t *pde;
- pt_entry_t *spte, *epte;
- addr64_t l64;
- addr64_t orig_s64;
- uint64_t deadline;
-
- pmap_intr_assert();
-
- if (map == PMAP_NULL || s64 == e64)
- return;
-
- PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
- (int) map,
- (int) (s64>>32), (int) s64,
- (int) (e64>>32), (int) e64);
-
- PMAP_LOCK(map);
-
-#if 0
- /*
- * Check that address range in the kernel does not overlap the stacks.
- * We initialize local static min/max variables once to avoid making
- * 2 function calls for every remove. Note also that these functions
- * both return 0 before kernel stacks have been initialized, and hence
- * the panic is not triggered in this case.
- */
- if (map == kernel_pmap) {
- static vm_offset_t kernel_stack_min = 0;
- static vm_offset_t kernel_stack_max = 0;
-
- if (kernel_stack_min == 0) {
- kernel_stack_min = min_valid_stack_address();
- kernel_stack_max = max_valid_stack_address();
- }
- if ((kernel_stack_min <= s64 && s64 < kernel_stack_max) ||
- (kernel_stack_min < e64 && e64 <= kernel_stack_max))
- panic("pmap_remove() attempted in kernel stack");
- }
-#else
-
- /*
- * The values of kernel_stack_min and kernel_stack_max are no longer
- * relevant now that we allocate kernel stacks anywhere in the kernel map,
- * so the old code above no longer applies. If we wanted to check that
- * we weren't removing a mapping of a page in a kernel stack we'd have to
- * mark the PTE with an unused bit and check that here.
- */
-
-#endif
-
- deadline = rdtsc64() + max_preemption_latency_tsc;
-
- orig_s64 = s64;
-
- while (s64 < e64) {
- l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size-1);
- if (l64 > e64)
- l64 = e64;
- pde = pmap_pde(map, s64);
-
- if (pde && (*pde & INTEL_PTE_VALID)) {
- spte = (pt_entry_t *)pmap_pte(map, (s64 & ~(pde_mapped_size-1)));
- spte = &spte[ptenum(s64)];
- epte = &spte[intel_btop(l64-s64)];
-
- pmap_remove_range(map, s64, spte, epte);
- }
- s64 = l64;
- pde++;
-
- if (s64 < e64 && rdtsc64() >= deadline) {
- PMAP_UNLOCK(map)
- PMAP_LOCK(map)
-
- deadline = rdtsc64() + max_preemption_latency_tsc;
- }
-
- }
-
- PMAP_UNLOCK(map);
-
- PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END,
- (int) map, 0, 0, 0, 0);
-
-}
-
-/*
- * Routine: pmap_page_protect
- *
- * Function:
- * Lower the permission for all mappings to a given
- * page.
- */
-void
-pmap_page_protect(
- ppnum_t pn,
- vm_prot_t prot)
-{
- pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
- pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
- pv_hashed_entry_t nexth;
- int pvh_cnt = 0;
- pv_rooted_entry_t pv_h;
- pv_rooted_entry_t pv_e;
- pv_hashed_entry_t pvh_e;
- pt_entry_t *pte;
- int pai;
- register pmap_t pmap;
- boolean_t remove;
- int pvhash_idx;
-
- pmap_intr_assert();
- assert(pn != vm_page_fictitious_addr);
- if (pn == vm_page_guard_addr)
- return;
-
- pai = ppn_to_pai(pn);
-
- if (!managed_page(pai)) {
- /*
- * Not a managed page.
- */
- return;
- }
-
- PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START,
- (int) pn, (int) prot, 0, 0, 0);
-
- /*
- * Determine the new protection.
- */
- switch (prot) {
- case VM_PROT_READ:
- case VM_PROT_READ|VM_PROT_EXECUTE:
- remove = FALSE;
- break;
- case VM_PROT_ALL:
- return; /* nothing to do */
- default:
- remove = TRUE;
- break;
- }
-
- pv_h = pai_to_pvh(pai);
-
- LOCK_PVH(pai);
-
-
- /*
- * Walk down PV list, changing or removing all mappings.
- */
- if (pv_h->pmap != PMAP_NULL) {
-
- pv_e = pv_h;
- pvh_e = (pv_hashed_entry_t)pv_e; /* cheat */
-
- do {
- register vm_map_offset_t vaddr;
- pmap = pv_e->pmap;
-
- vaddr = pv_e->va;
- pte = pmap_pte(pmap, vaddr);
-
- if (0 == pte) {
- panic("pmap_page_protect: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx, prot: %d kernel_pmap: %p", pmap, pn, vaddr, prot, kernel_pmap);
- }
-
- nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink); /* if there is one */
-
- /*
- * Remove the mapping if new protection is NONE
- * or if write-protecting a kernel mapping.
- */
- if (remove || pmap == kernel_pmap) {
- /*
- * Remove the mapping, collecting any modify bits.
- */
- pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
-
- PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
-
- pmap_phys_attributes[pai] |= *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
-
- pmap_store_pte(pte, 0);
-
-#if TESTING
- if (pmap->stats.resident_count < 1)
- panic("pmap_page_protect: resident_count");
-#endif
- assert(pmap->stats.resident_count >= 1);
- OSAddAtomic(-1, &pmap->stats.resident_count);
-
- /*
- * Deal with the pv_rooted_entry.
- */
-
- if (pv_e == pv_h) {
- /*
- * Fix up head later.
- */
- pv_h->pmap = PMAP_NULL;
- }
- else {
- /*
- * Delete this entry.
- */
- CHK_NPVHASH();
- pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
- LOCK_PV_HASH(pvhash_idx);
- remque(&pvh_e->qlink);
- pmap_pvh_unlink(pvh_e);
- UNLOCK_PV_HASH(pvhash_idx);
-
- pvh_e->qlink.next = (queue_entry_t)pvh_eh;
- pvh_eh = pvh_e;
-
- if (pvh_et == PV_HASHED_ENTRY_NULL)
- pvh_et = pvh_e;
- pvh_cnt++;
- }
- } else {
- /*
- * Write-protect.
- */
- pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WRITE));
- PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
- }
-
- pvh_e = nexth;
- } while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
-
-
- /*
- * If pv_head mapping was removed, fix it up.
- */
-
- if (pv_h->pmap == PMAP_NULL) {
- pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
-
- if (pvh_e != (pv_hashed_entry_t)pv_h) {
- CHK_NPVHASH();
- pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
- LOCK_PV_HASH(pvhash_idx);
- remque(&pvh_e->qlink);
- pmap_pvh_unlink(pvh_e);
- UNLOCK_PV_HASH(pvhash_idx);
- pv_h->pmap = pvh_e->pmap;
- pv_h->va = pvh_e->va;
- pvh_e->qlink.next = (queue_entry_t)pvh_eh;
- pvh_eh = pvh_e;
-
- if (pvh_et == PV_HASHED_ENTRY_NULL)
- pvh_et = pvh_e;
- pvh_cnt++;
- }
- }
- }
- if (pvh_eh != PV_HASHED_ENTRY_NULL) {
- PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
- }
-
- UNLOCK_PVH(pai);
-
- PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END,
- 0, 0, 0, 0, 0);
-
-}
-
-
/*
* Routine:
* pmap_disconnect
}
-/*
- * Insert the given physical page (p) at
- * the specified virtual address (v) in the
- * target physical map with the protection requested.
- *
- * If specified, the page will be wired down, meaning
- * that the related pte cannot be reclaimed.
- *
- * NB: This is the only routine which MAY NOT lazy-evaluate
- * or lose information. That is, this routine must actually
- * insert this page into the given map NOW.
- */
-void
-pmap_enter(
- register pmap_t pmap,
- vm_map_offset_t vaddr,
- ppnum_t pn,
- vm_prot_t prot,
- unsigned int flags,
- boolean_t wired)
-{
- register pt_entry_t *pte;
- register pv_rooted_entry_t pv_h;
- register int pai;
- pv_hashed_entry_t pvh_e;
- pv_hashed_entry_t pvh_new;
- pv_hashed_entry_t *hashp;
- pt_entry_t template;
- pmap_paddr_t old_pa;
- pmap_paddr_t pa = (pmap_paddr_t)i386_ptob(pn);
- boolean_t need_tlbflush = FALSE;
- boolean_t set_NX;
- char oattr;
- int pvhash_idx;
- uint32_t pv_cnt;
- boolean_t old_pa_locked;
-
- pmap_intr_assert();
- assert(pn != vm_page_fictitious_addr);
- if (pmap_debug)
- printf("pmap(%qx, %x)\n", vaddr, pn);
- if (pmap == PMAP_NULL)
- return;
- if (pn == vm_page_guard_addr)
- return;
-
- PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
- (int) pmap,
- (int) (vaddr>>32), (int) vaddr,
- (int) pn, prot);
-
- if ( (prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled )
- set_NX = FALSE;
- else
- set_NX = TRUE;
-
- /*
- * Must allocate a new pvlist entry while we're unlocked;
- * zalloc may cause pageout (which will lock the pmap system).
- * If we determine we need a pvlist entry, we will unlock
- * and allocate one. Then we will retry, throughing away
- * the allocated entry later (if we no longer need it).
- */
-
- pvh_new = PV_HASHED_ENTRY_NULL;
-Retry:
- pvh_e = PV_HASHED_ENTRY_NULL;
-
- PMAP_LOCK(pmap);
-
- /*
- * Expand pmap to include this pte. Assume that
- * pmap is always expanded to include enough hardware
- * pages to map one VM page.
- */
-
- while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
- /*
- * Must unlock to expand the pmap.
- */
- PMAP_UNLOCK(pmap);
- pmap_expand(pmap, vaddr); /* going to grow pde level page(s) */
- PMAP_LOCK(pmap);
- }
-
- old_pa = pte_to_pa(*pte);
- pai = pa_index(old_pa);
- old_pa_locked = FALSE;
-
- /*
- * if we have a previous managed page, lock the pv entry now. after
- * we lock it, check to see if someone beat us to the lock and if so
- * drop the lock
- */
-
- if ((0 != old_pa) && managed_page(pai)) {
- LOCK_PVH(pai);
- old_pa_locked = TRUE;
- old_pa = pte_to_pa(*pte);
- if (0 == old_pa) {
- UNLOCK_PVH(pai); /* some other path beat us to it */
- old_pa_locked = FALSE;
- }
- }
-
-
- /*
- * Special case if the incoming physical page is already mapped
- * at this address.
- */
- if (old_pa == pa) {
-
- /*
- * May be changing its wired attribute or protection
- */
-
- template = pa_to_pte(pa) | INTEL_PTE_VALID;
-
- if(VM_MEM_NOT_CACHEABLE == (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) {
- if(!(flags & VM_MEM_GUARDED))
- template |= INTEL_PTE_PTA;
- template |= INTEL_PTE_NCACHE;
- }
-
- if (pmap != kernel_pmap)
- template |= INTEL_PTE_USER;
- if (prot & VM_PROT_WRITE)
- template |= INTEL_PTE_WRITE;
-
- if (set_NX == TRUE)
- template |= INTEL_PTE_NX;
-
- if (wired) {
- template |= INTEL_PTE_WIRED;
- if (!iswired(*pte))
- OSAddAtomic(+1, &pmap->stats.wired_count);
- }
- else {
- if (iswired(*pte)) {
- assert(pmap->stats.wired_count >= 1);
- OSAddAtomic(-1, &pmap->stats.wired_count);
- }
- }
-
- /* store modified PTE and preserve RC bits */
- pmap_update_pte(pte, *pte, template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD)));
- if (old_pa_locked) {
- UNLOCK_PVH(pai);
- old_pa_locked = FALSE;
- }
- need_tlbflush = TRUE;
- goto Done;
- }
-
- /*
- * Outline of code from here:
- * 1) If va was mapped, update TLBs, remove the mapping
- * and remove old pvlist entry.
- * 2) Add pvlist entry for new mapping
- * 3) Enter new mapping.
- *
- * If the old physical page is not managed step 1) is skipped
- * (except for updating the TLBs), and the mapping is
- * overwritten at step 3). If the new physical page is not
- * managed, step 2) is skipped.
- */
-
- if (old_pa != (pmap_paddr_t) 0) {
-
- /*
- * Don't do anything to pages outside valid memory here.
- * Instead convince the code that enters a new mapping
- * to overwrite the old one.
- */
-
- /* invalidate the PTE */
- pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
- /* propagate invalidate everywhere */
- PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
- /* remember reference and change */
- oattr = (char)(*pte & (PHYS_MODIFIED | PHYS_REFERENCED));
- /* completely invalidate the PTE */
- pmap_store_pte(pte, 0);
-
- if (managed_page(pai)) {
-#if TESTING
- if (pmap->stats.resident_count < 1)
- panic("pmap_enter: resident_count");
-#endif
- assert(pmap->stats.resident_count >= 1);
- OSAddAtomic(-1, &pmap->stats.resident_count);
-
- if (iswired(*pte)) {
-
-#if TESTING
- if (pmap->stats.wired_count < 1)
- panic("pmap_enter: wired_count");
-#endif
- assert(pmap->stats.wired_count >= 1);
- OSAddAtomic(-1, &pmap->stats.wired_count);
- }
-
- pmap_phys_attributes[pai] |= oattr;
- /*
- * Remove the mapping from the pvlist for
- * this physical page.
- * We'll end up with either a rooted pv or a
- * hashed pv
- */
- {
-
- pv_h = pai_to_pvh(pai);
-
- if (pv_h->pmap == PMAP_NULL) {
- panic("pmap_enter: null pv_list!");
- }
-
- if (pv_h->va == vaddr && pv_h->pmap == pmap) {
- /*
- * Header is the pv_rooted_entry.
- * If there is a next one, copy it to the
- * header and free the next one (we cannot
- * free the header)
- */
- pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
- if (pvh_e != (pv_hashed_entry_t)pv_h) {
- pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
- LOCK_PV_HASH(pvhash_idx);
- remque(&pvh_e->qlink);
- pmap_pvh_unlink(pvh_e);
- UNLOCK_PV_HASH(pvhash_idx);
- pv_h->pmap = pvh_e->pmap;
- pv_h->va = pvh_e->va;
- }
- else {
- pv_h->pmap = PMAP_NULL;
- pvh_e = PV_HASHED_ENTRY_NULL;
- }
- }
- else {
- pv_hashed_entry_t *pprevh;
- ppnum_t old_ppn;
- /* wasn't the rooted pv - hash, find it, and unlink it */
- old_ppn = (ppnum_t)pa_index(old_pa);
- CHK_NPVHASH();
- pvhash_idx = pvhashidx(pmap,vaddr);
- LOCK_PV_HASH(pvhash_idx);
- pprevh = pvhash(pvhash_idx);
-#if PV_DEBUG
- if (NULL==pprevh)panic("pmap enter 1");
-#endif
- pvh_e = *pprevh;
- pmap_pv_hashlist_walks++;
- pv_cnt = 0;
- while (PV_HASHED_ENTRY_NULL != pvh_e) {
- pv_cnt++;
- if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == old_ppn) break;
- pprevh = &pvh_e->nexth;
- pvh_e = pvh_e->nexth;
- }
- pmap_pv_hashlist_cnts += pv_cnt;
- if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt;
- if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pmap_enter: pv not in hash list");
- if(NULL==pprevh)panic("pmap enter 2");
- *pprevh = pvh_e->nexth;
- remque(&pvh_e->qlink);
- UNLOCK_PV_HASH(pvhash_idx);
- }
- }
- }
- else {
- /*
- * old_pa is not managed.
- * Do removal part of accounting.
- */
-
- if (iswired(*pte)) {
- assert(pmap->stats.wired_count >= 1);
- OSAddAtomic(-1, &pmap->stats.wired_count);
- }
- }
- }
-
- /*
- * if we had a previously managed paged locked, unlock it now
- */
-
- if (old_pa_locked) {
- UNLOCK_PVH(pai);
- old_pa_locked = FALSE;
- }
-
- pai = pa_index(pa); /* now working with new incoming phys page */
- if (managed_page(pai)) {
-
- /*
- * Step 2) Enter the mapping in the PV list for this
- * physical page.
- */
- pv_h = pai_to_pvh(pai);
-
- LOCK_PVH(pai);
-
- if (pv_h->pmap == PMAP_NULL) {
- /*
- * No mappings yet, use rooted pv
- */
- pv_h->va = vaddr;
- pv_h->pmap = pmap;
- queue_init(&pv_h->qlink);
- }
- else {
- /*
- * Add new pv_hashed_entry after header.
- */
- if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
- pvh_e = pvh_new;
- pvh_new = PV_HASHED_ENTRY_NULL; /* show we used it */
- } else if (PV_HASHED_ENTRY_NULL == pvh_e) {
- PV_HASHED_ALLOC(pvh_e);
- if (PV_HASHED_ENTRY_NULL == pvh_e) {
- /* the pv list is empty.
- * if we are on the kernel pmap we'll use one of the special private
- * kernel pv_e's, else, we need to unlock everything, zalloc a pv_e,
- * and restart bringing in the pv_e with us.
- */
- if (kernel_pmap == pmap) {
- PV_HASHED_KERN_ALLOC(pvh_e);
- } else {
- UNLOCK_PVH(pai);
- PMAP_UNLOCK(pmap);
- pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
- goto Retry;
- }
- }
- }
-
- if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pvh_e exhaustion");
- pvh_e->va = vaddr;
- pvh_e->pmap = pmap;
- pvh_e->ppn = pn;
- CHK_NPVHASH();
- pvhash_idx = pvhashidx(pmap,vaddr);
- LOCK_PV_HASH(pvhash_idx);
- insque(&pvh_e->qlink, &pv_h->qlink);
- hashp = pvhash(pvhash_idx);
-#if PV_DEBUG
- if(NULL==hashp)panic("pmap_enter 4");
-#endif
- pvh_e->nexth = *hashp;
- *hashp = pvh_e;
- UNLOCK_PV_HASH(pvhash_idx);
-
- /*
- * Remember that we used the pvlist entry.
- */
- pvh_e = PV_HASHED_ENTRY_NULL;
- }
-
- /*
- * only count the mapping
- * for 'managed memory'
- */
- OSAddAtomic(+1, &pmap->stats.resident_count);
- if (pmap->stats.resident_count > pmap->stats.resident_max) {
- pmap->stats.resident_max = pmap->stats.resident_count;
- }
- }
-
- /*
- * Step 3) Enter the mapping.
- *
- * Build a template to speed up entering -
- * only the pfn changes.
- */
- template = pa_to_pte(pa) | INTEL_PTE_VALID;
-
- if (flags & VM_MEM_NOT_CACHEABLE) {
- if(!(flags & VM_MEM_GUARDED))
- template |= INTEL_PTE_PTA;
- template |= INTEL_PTE_NCACHE;
- }
-
- if (pmap != kernel_pmap)
- template |= INTEL_PTE_USER;
- if (prot & VM_PROT_WRITE)
- template |= INTEL_PTE_WRITE;
-
- if (set_NX == TRUE)
- template |= INTEL_PTE_NX;
-
- if (wired) {
- template |= INTEL_PTE_WIRED;
- OSAddAtomic(+1, &pmap->stats.wired_count);
- }
- pmap_store_pte(pte, template);
-
- /* if this was a managed page we delayed unlocking the pv until here
- * to prevent pmap_page_protect et al from finding it until the pte
- * has been stored */
-
- if (managed_page(pai)) {
- UNLOCK_PVH(pai);
- }
-
-Done:
- if (need_tlbflush == TRUE)
- PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
-
- if (pvh_e != PV_HASHED_ENTRY_NULL) {
- PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
- }
-
- if (pvh_new != PV_HASHED_ENTRY_NULL) {
- PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
- }
-
- PMAP_UNLOCK(pmap);
- PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
-}
-
/*
* Routine: pmap_change_wiring
* Function: Change the wiring attribute for a map/virtual-address
return TRUE;
}
-void
-mapping_free_prime(void)
-{
- int i;
- pv_hashed_entry_t pvh_e;
- pv_hashed_entry_t pvh_eh;
- pv_hashed_entry_t pvh_et;
- int pv_cnt;
-
- pv_cnt = 0;
- pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
- for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) {
- pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
-
- pvh_e->qlink.next = (queue_entry_t)pvh_eh;
- pvh_eh = pvh_e;
-
- if (pvh_et == PV_HASHED_ENTRY_NULL)
- pvh_et = pvh_e;
- pv_cnt++;
- }
- PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
-
- pv_cnt = 0;
- pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
- for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
- pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
-
- pvh_e->qlink.next = (queue_entry_t)pvh_eh;
- pvh_eh = pvh_e;
-
- if (pvh_et == PV_HASHED_ENTRY_NULL)
- pvh_et = pvh_e;
- pv_cnt++;
- }
- PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
-
-}
-
-void
-mapping_adjust(void)
-{
- pv_hashed_entry_t pvh_e;
- pv_hashed_entry_t pvh_eh;
- pv_hashed_entry_t pvh_et;
- int pv_cnt;
- int i;
-
- if (mapping_adjust_call == NULL) {
- thread_call_setup(&mapping_adjust_call_data,
- (thread_call_func_t) mapping_adjust,
- (thread_call_param_t) NULL);
- mapping_adjust_call = &mapping_adjust_call_data;
- }
-
- pv_cnt = 0;
- pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
- if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) {
- for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
- pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
-
- pvh_e->qlink.next = (queue_entry_t)pvh_eh;
- pvh_eh = pvh_e;
-
- if (pvh_et == PV_HASHED_ENTRY_NULL)
- pvh_et = pvh_e;
- pv_cnt++;
- }
- PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
- }
-
- pv_cnt = 0;
- pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
- if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) {
- for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) {
- pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
-
- pvh_e->qlink.next = (queue_entry_t)pvh_eh;
- pvh_eh = pvh_e;
-
- if (pvh_et == PV_HASHED_ENTRY_NULL)
- pvh_et = pvh_e;
- pv_cnt++;
- }
- PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
- }
- mappingrecurse = 0;
-}
-
void
pmap_commpage32_init(vm_offset_t kernel_commpage, vm_offset_t user_commpage, int cnt)
{
#define INTEL_PTE_NX (1ULL << 63)
#define INTEL_PTE_INVALID 0
-
+/* Mask of PTE bits treated as reserved here (bits 8-11 and 54-62). Conservative, but suffices */
+#define INTEL_PTE_RSVD ((1ULL << 8) | (1ULL << 9) | (1ULL << 10) | (1ULL << 11) | (0x1FFULL << 54))
#define pa_to_pte(a) ((a) & INTEL_PTE_PFN) /* XXX */
#define pte_to_pa(p) ((p) & INTEL_PTE_PFN) /* XXX */
#define pte_increment_pa(p) ((p) += INTEL_OFFMASK+1)
#endif
extern void pt_fake_zone_info(int *, vm_size_t *, vm_size_t *, vm_size_t *, vm_size_t *, int *, int *);
-
+extern void pmap_pagetable_corruption_msg_log(int (*)(const char * fmt, ...)__printflike(1,2));
/*
#include <vm/pmap.h>
#include <sys/kdebug.h>
+#include <kern/debug.h>
#ifdef MACH_KERNEL_PRIVATE
simple_unlock(&(pmap)->lock); \
}
-extern void pmap_flush_tlbs(pmap_t pmap);
#define PMAP_UPDATE_TLBS(pmap, s, e) \
pmap_flush_tlbs(pmap)
void pmap_expand_pdpt(
pmap_t map,
vm_map_offset_t v);
+extern void pmap_flush_tlbs(pmap_t pmap);
+
#if defined(__x86_64__)
extern const boolean_t cpu_64bit;
#else
extern boolean_t cpu_64bit;
#endif
+/*
+ * Private data structures.
+ */
+
+/*
+ * For each vm_page_t, there is a list of all currently
+ * valid virtual mappings of that page. An entry is
+ * a pv_rooted_entry_t; the list is the pv_table.
+ *
+ * N.B. with the new combo rooted/hashed scheme it is
+ * only possible to remove individual non-rooted entries
+ * if they are found via the hashed chains as there is no
+ * way to unlink the singly linked hashed entries if navigated to
+ * via the queue list off the rooted entries. Think of it as
+ * hash/walk/pull, keeping track of the prev pointer while walking
+ * the singly linked hash list. All of this is to save memory and
+ * keep both types of pv_entries as small as possible.
+ */
+
+/*
+
+PV HASHING Changes - JK 1/2007
+
+Pve's establish physical to virtual mappings. These are used for aliasing of a
+physical page to (potentially many) virtual addresses within pmaps. In the previous
+implementation the structure of the pv_entries (each 16 bytes in size) was
+
+typedef struct pv_entry {
+ struct pv_entry_t next;
+ pmap_t pmap;
+ vm_map_offset_t va;
+} *pv_entry_t;
+
+An initial array of these is created at boot time, one per physical page of memory,
+indexed by the physical page number. Additionally, a pool of entries is created from a
+pv_zone to be used as needed by pmap_enter() when it is creating new mappings.
+Originally, we kept this pool around because the code in pmap_enter() was unable to
+block if it needed an entry and none were available - we'd panic. Some time ago I
+restructured the pmap_enter() code so that for user pmaps it can block while zalloc'ing
+a pv structure and restart, removing a panic from the code (in the case of the kernel
+pmap we cannot block and still panic, so, we keep a separate hot pool for use only on
+kernel pmaps). The pool has not been removed since there is a large performance gain
+keeping freed pv's around for reuse and not suffering the overhead of zalloc for every new pv we need.
+
+As pmap_enter() created new mappings it linked the new pve's for them off the fixed
+pv array for that ppn (off the next pointer). These pve's are accessed for several
+operations, one of them being address space teardown. In that case, we basically do this
+
+ for (every page/pte in the space) {
+ calc pve_ptr from the ppn in the pte
+ for (every pv in the list for the ppn) {
+ if (this pv is for this pmap/vaddr) {
+ do housekeeping
+ unlink/free the pv
+ }
+ }
+ }
+
+The problem arose when we were running, say 8000 (or even 2000) apache or other processes
+and one or all terminate. The list hanging off each pv array entry could have thousands of
+entries. We were continuously linearly searching each of these lists as we stepped through
+the address space we were tearing down. Because of the locks we hold, likely taking a cache
+miss for each node, and interrupt disabling for MP issues the system became completely
+unresponsive for many seconds while we did this.
+
+Realizing that pve's are accessed in two distinct ways (linearly running the list by ppn
+for operations like pmap_page_protect and finding and modifying/removing a single pve as
+part of pmap_enter processing) has led to modifying the pve structures and databases.
+
+There are now two types of pve structures. A "rooted" structure which is basically the
+original structure accessed in an array by ppn, and a "hashed" structure accessed on a
+hash list via a hash of [pmap, vaddr]. These have been designed with the two goals of
+minimizing wired memory and making the lookup of a ppn faster. Since a vast majority of
+pages in the system are not aliased and hence represented by a single pv entry I've kept
+the rooted entry size as small as possible because there is one of these dedicated for
+every physical page of memory. The hashed pve's are larger due to the addition of the hash
+link and the ppn entry needed for matching while running the hash list to find the entry we
+are looking for. This way, only systems that have lots of aliasing (like 2000+ httpd procs)
+will pay the extra memory price. Both structures have the same first three fields allowing
+some simplification in the code.
+
+They have these shapes
+
+typedef struct pv_rooted_entry {
+ queue_head_t qlink;
+ vm_map_offset_t va;
+ pmap_t pmap;
+} *pv_rooted_entry_t;
+
+
+typedef struct pv_hashed_entry {
+ queue_head_t qlink;
+ vm_map_offset_t va;
+ pmap_t pmap;
+ ppnum_t ppn;
+ struct pv_hashed_entry *nexth;
+} *pv_hashed_entry_t;
+
+The main flow difference is that the code is now aware of the rooted entry and the hashed
+entries. Code that runs the pv list still starts with the rooted entry and then continues
+down the qlink onto the hashed entries. Code that is looking up a specific pv entry first
+checks the rooted entry and then hashes and runs the hash list for the match. The hash list
+lengths are much smaller than the original pv lists that contained all aliases for the specific ppn.
+
+*/
+
+typedef struct pv_rooted_entry { /* first three entries must match pv_hashed_entry_t */
+ queue_head_t qlink;
+ vm_map_offset_t va; /* virtual address for mapping */
+ pmap_t pmap; /* pmap where mapping lies */
+} *pv_rooted_entry_t;
+
+#define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0)
+
+
+typedef struct pv_hashed_entry { /* first three entries must match pv_rooted_entry_t */
+ queue_head_t qlink;
+ vm_map_offset_t va;
+ pmap_t pmap;
+ ppnum_t ppn;
+ struct pv_hashed_entry *nexth;
+} *pv_hashed_entry_t;
+
+#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
+
+/* #define PV_DEBUG 1 uncomment to enable some PV debugging code */
+#ifdef PV_DEBUG
+#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
+#else
+#define CHK_NPVHASH()
+#endif
+
+#define NPVHASH 4095 /* MUST BE 2^N - 1 */
+#define PV_HASHED_LOW_WATER_MARK 5000
+#define PV_HASHED_KERN_LOW_WATER_MARK 400
+#define PV_HASHED_ALLOC_CHUNK 2000
+#define PV_HASHED_KERN_ALLOC_CHUNK 200
+
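+/*
+ * The allocation macros below pop an entry off the corresponding free list.
+ * When a list drops under its low-water mark they schedule mapping_adjust()
+ * (via mapping_adjust_call) to replenish it from pv_hashed_list_zone.
+ */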
+#define PV_HASHED_ALLOC(pvh_e) { \
+ simple_lock(&pv_hashed_free_list_lock); \
+ if ((pvh_e = pv_hashed_free_list) != 0) { \
+ pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
+ pv_hashed_free_count--; \
+ if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \
+ if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
+ thread_call_enter(mapping_adjust_call); \
+ } \
+ simple_unlock(&pv_hashed_free_list_lock); \
+}
+
+#define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
+ simple_lock(&pv_hashed_free_list_lock); \
+ pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; \
+ pv_hashed_free_list = pvh_eh; \
+ pv_hashed_free_count += pv_cnt; \
+ simple_unlock(&pv_hashed_free_list_lock); \
+}
+
+#define PV_HASHED_KERN_ALLOC(pvh_e) { \
+ simple_lock(&pv_hashed_kern_free_list_lock); \
+ if ((pvh_e = pv_hashed_kern_free_list) != 0) { \
+ pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
+ pv_hashed_kern_free_count--; \
+ if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) \
+ if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
+ thread_call_enter(mapping_adjust_call); \
+ } \
+ simple_unlock(&pv_hashed_kern_free_list_lock); \
+}
+
+#define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
+ simple_lock(&pv_hashed_kern_free_list_lock); \
+ pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; \
+ pv_hashed_kern_free_list = pvh_eh; \
+ pv_hashed_kern_free_count += pv_cnt; \
+ simple_unlock(&pv_hashed_kern_free_list_lock); \
+}
+
+/*
+ * Index into pv_head table, its lock bits, and the modify/reference and managed bits
+ */
+
+#define pa_index(pa) (i386_btop(pa))
+#define ppn_to_pai(ppn) ((int)ppn)
+
+#define pai_to_pvh(pai) (&pv_head_table[pai])
+#define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table)
+#define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table)
+#define pvhash(idx) (&pv_hash_table[idx])
+
+#define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table)
+#define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table)
+
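+/*
+ * A physical page is "managed" when its index lies within the tracked range
+ * and its attribute byte has PHYS_MANAGED set.
+ */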
+#define IS_MANAGED_PAGE(x) \
+ ((unsigned int)(x) <= last_managed_page && \
+ (pmap_phys_attributes[x] & PHYS_MANAGED))
+
+/*
+ * Physical page attributes. Copy bits from PTE definition.
+ */
+#define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */
+#define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */
+#define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */
+
+/*
+ * Amount of virtual memory mapped by one
+ * page-directory entry.
+ */
+#define PDE_MAPPED_SIZE (pdetova(1))
+
+
+/*
+ * Locking and TLB invalidation
+ */
+
+/*
+ * Locking Protocols: (changed 2/2007 JK)
+ *
+ * There are two structures in the pmap module that need locking:
+ * the pmaps themselves, and the per-page pv_lists (which are locked
+ * by locking the pv_lock_table entry that corresponds to the pv_head
+ * for the list in question.) Most routines want to lock a pmap and
+ * then do operations in it that require pv_list locking -- however
+ * pmap_remove_all and pmap_copy_on_write operate on a physical page
+ * basis and want to do the locking in the reverse order, i.e. lock
+ * a pv_list and then go through all the pmaps referenced by that list.
+ *
+ * The system wide pmap lock has been removed. Now, paths take a lock
+ * on the pmap before changing its 'shape' and the reverse order lockers
+ * (coming in by phys ppn) take a lock on the corresponding pv and then
+ * retest to be sure nothing changed during the window before they locked
+ * and can then run up/down the pv lists holding the list lock. This also
+ * lets the pmap layer run (nearly completely) interrupt enabled, unlike
+ * previously.
+ */
+
+/*
+ * PV locking
+ */
+
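+/*
+ * The per-page PV lock is a bit in pv_lock_table indexed by the physical page
+ * index; preemption stays disabled while it is held.
+ */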
+#define LOCK_PVH(index) { \
+ mp_disable_preemption(); \
+ lock_pvh_pai(index); \
+}
+
+#define UNLOCK_PVH(index) { \
+ unlock_pvh_pai(index); \
+ mp_enable_preemption(); \
+}
+/*
+ * PV hash locking
+ */
+
+#define LOCK_PV_HASH(hash) lock_hash_hash(hash)
+#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash)
+extern uint32_t npvhash;
+extern pv_hashed_entry_t *pv_hash_table; /* hash lists */
+extern pv_hashed_entry_t pv_hashed_free_list;
+extern pv_hashed_entry_t pv_hashed_kern_free_list;
+decl_simple_lock_data(extern, pv_hashed_free_list_lock)
+decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
+decl_simple_lock_data(extern, pv_hash_table_lock)
+
+extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
+
+extern int pv_hashed_free_count;
+extern int pv_hashed_kern_free_count;
+#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
+#define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
+extern char *pv_lock_table; /* pointer to array of bits */
+
+extern char *pv_hash_lock_table;
+extern pv_rooted_entry_t pv_head_table; /* array of entries, one
+ * per page */
+extern uint64_t pde_mapped_size;
+
+extern char *pmap_phys_attributes;
+extern unsigned int last_managed_page;
+
+/*
+ * when spinning through pmap_remove
+ * ensure that we don't spend too much
+ * time with preemption disabled.
+ * I'm setting the current threshold
+ * to 20us
+ */
+#define MAX_PREEMPTION_LATENCY_NS 20000
+extern uint64_t max_preemption_latency_tsc;
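+/*
+ * pmap_remove() compares rdtsc64() against a deadline derived from
+ * max_preemption_latency_tsc and briefly drops and retakes the pmap lock
+ * whenever the deadline passes.
+ */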
+
+/* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */
+#ifdef DEBUGINTERRUPTS
+#define pmap_intr_assert() { \
+ if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \
+ panic("pmap interrupt assert %s, %d",__FILE__, __LINE__); \
+}
+#else
+#define pmap_intr_assert()
+#endif
+
+extern int nx_enabled;
+extern unsigned int inuse_ptepages_count;
+
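+/*
+ * Hash a (pmap, va) pair to a pv_hash_table bucket index. npvhash is of the
+ * form 2^n - 1, so the final mask just selects the low-order bits.
+ */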
+static inline uint32_t
+pvhashidx(pmap_t pmap, vm_map_offset_t va)
+{
+ return ((uint32_t)(uintptr_t)pmap ^
+ ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
+ npvhash;
+}
+
+/*
+ * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
+ * properly deals with the anchor.
+ * must be called with the hash locked, does not unlock it
+ */
+
+static inline void
+pmap_pvh_unlink(pv_hashed_entry_t pvh)
+{
+ pv_hashed_entry_t curh;
+ pv_hashed_entry_t *pprevh;
+ int pvhash_idx;
+
+ CHK_NPVHASH();
+ pvhash_idx = pvhashidx(pvh->pmap, pvh->va);
+
+ pprevh = pvhash(pvhash_idx);
+
+#if PV_DEBUG
+ if (NULL == *pprevh)
+ panic("pvh_unlink null anchor"); /* JK DEBUG */
+#endif
+ curh = *pprevh;
+
+ while (PV_HASHED_ENTRY_NULL != curh) {
+ if (pvh == curh)
+ break;
+ pprevh = &curh->nexth;
+ curh = curh->nexth;
+ }
+ if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
+ *pprevh = pvh->nexth;
+ return;
+}
+
+static inline void
+pv_hash_add(pv_hashed_entry_t pvh_e,
+ pv_rooted_entry_t pv_h)
+{
+ pv_hashed_entry_t *hashp;
+ int pvhash_idx;
+
+ CHK_NPVHASH();
+ pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
+ LOCK_PV_HASH(pvhash_idx);
+ insque(&pvh_e->qlink, &pv_h->qlink);
+ hashp = pvhash(pvhash_idx);
+#if PV_DEBUG
+ if (NULL==hashp)
+ panic("pv_hash_add(%p) null hash bucket", pvh_e);
+#endif
+ pvh_e->nexth = *hashp;
+ *hashp = pvh_e;
+ UNLOCK_PV_HASH(pvhash_idx);
+}
+
+static inline void
+pv_hash_remove(pv_hashed_entry_t pvh_e)
+{
+ int pvhash_idx;
+
+ CHK_NPVHASH();
+ pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
+ LOCK_PV_HASH(pvhash_idx);
+ remque(&pvh_e->qlink);
+ pmap_pvh_unlink(pvh_e);
+ UNLOCK_PV_HASH(pvhash_idx);
+}
+
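+/*
+ * TRUE when 'distance' has at most one bit set. Callers pass an XOR of two
+ * values, so this detects fields that differ by a single flipped bit.
+ */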
+static inline boolean_t popcnt1(uint64_t distance) {
+ return ((distance & (distance - 1)) == 0);
+}
+
+/*
+ * Routines to handle suppression of/recovery from some forms of pagetable corruption
+ * incidents observed in the field. These can be either software induced (wild
+ * stores to the mapwindows where applicable, use after free errors
+ * (typically of pages addressed physically), mis-directed DMAs etc., or due
+ * to DRAM/memory hierarchy/interconnect errors. Given the theoretical rarity of these errors,
+ * the recording mechanism is deliberately not MP-safe. The overarching goal is to
+ * still assert on potential software races, but attempt recovery from incidents
+ * identifiable as occurring due to issues beyond the control of the pmap module.
+ * The latter includes single-bit errors and malformed pagetable entries.
+ * We currently limit ourselves to recovery/suppression of one incident per
+ * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
+ * are logged.
+ * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
+ */
+
+typedef enum {
+ PTE_VALID = 0x0,
+ PTE_INVALID = 0x1,
+ PTE_RSVD = 0x2,
+ PTE_SUPERVISOR = 0x4,
+ PTE_BITFLIP = 0x8,
+ PV_BITFLIP = 0x10,
+ PTE_INVALID_CACHEABILITY = 0x20
+} pmap_pagetable_corruption_t;
+
+typedef enum {
+ ROOT_PRESENT = 0,
+ ROOT_ABSENT = 1
+} pmap_pv_assertion_t;
+
+typedef enum {
+ PMAP_ACTION_IGNORE = 0x0,
+ PMAP_ACTION_ASSERT = 0x1,
+ PMAP_ACTION_RETRY = 0x2,
+ PMAP_ACTION_RETRY_RELOCK = 0x4
+} pmap_pagetable_corruption_action_t;
+
+#define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
+extern uint64_t pmap_pagetable_corruption_interval_abstime;
+
+extern uint32_t pmap_pagetable_corruption_incidents;
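+/*
+ * Corruption records form a small ring buffer; the running incident count is
+ * used modulo this size as the write index (see pmap_pagetable_corruption_log).
+ */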
+#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
+typedef struct {
+ pmap_pv_assertion_t incident;
+ pmap_pagetable_corruption_t reason;
+ pmap_pagetable_corruption_action_t action;
+ pmap_t pmap;
+ vm_map_offset_t vaddr;
+ pt_entry_t pte;
+ ppnum_t ppn;
+ pmap_t pvpmap;
+ vm_map_offset_t pvva;
+ uint64_t abstime;
+} pmap_pagetable_corruption_record_t;
+
+extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
+extern uint64_t pmap_pagetable_corruption_last_abstime;
+extern thread_call_t pmap_pagetable_corruption_log_call;
+extern boolean_t pmap_pagetable_corruption_timeout;
+
+static inline void
+pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
+ uint32_t pmap_pagetable_corruption_log_index;
+ pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
+ pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
+ pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
+ pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
+ pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
+ pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
+ pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
+ pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
+ pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
+ pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
+ pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
+ /* Asynchronously log */
+ thread_call_enter(pmap_pagetable_corruption_log_call);
+}
+
+static inline pmap_pagetable_corruption_action_t
+pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
+	pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT;
+ pmap_pagetable_corruption_t suppress_reason = PTE_VALID;
+ ppnum_t suppress_ppn = 0;
+ pt_entry_t cpte = *ptep;
+ ppnum_t cpn = pa_index(pte_to_pa(cpte));
+ ppnum_t ppn = *ppnp;
+ pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
+ pv_rooted_entry_t pv_e = pv_h;
+ uint32_t bitdex;
+ pmap_t pvpmap = pv_h->pmap;
+ vm_map_offset_t pvva = pv_h->va;
+ boolean_t ppcd = FALSE;
+
+ /* Ideally, we'd consult the Mach VM here to definitively determine
+ * the nature of the mapping for this address space and address.
+ * As that would be a layering violation in this context, we
+ * use various heuristics to recover from single bit errors,
+ * malformed pagetable entries etc. These are not intended
+ * to be comprehensive.
+ */
+
+ /* As a precautionary measure, mark A+D */
+ pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
+
+ /*
+ * Correct potential single bit errors in either (but not both) element
+ * of the PV
+ */
+ do {
+ if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
+ (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
+ pv_e->pmap = pmap;
+ pv_e->va = vaddr;
+ suppress_reason = PV_BITFLIP;
+ action = PMAP_ACTION_RETRY;
+ goto pmap_cpc_exit;
+ }
+ } while((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink)) != pv_h);
+
+ /* Discover root entries with a Hamming
+ * distance of 1 from the supplied
+ * physical page frame.
+ */
+ for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
+ ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
+ if (IS_MANAGED_PAGE(npn)) {
+ pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
+ if (npv_h->va == vaddr && npv_h->pmap == pmap) {
+ suppress_reason = PTE_BITFLIP;
+ suppress_ppn = npn;
+ action = PMAP_ACTION_RETRY_RELOCK;
+ UNLOCK_PVH(ppn_to_pai(ppn));
+ *ppnp = npn;
+ goto pmap_cpc_exit;
+ }
+ }
+ }
+
+ if (pmap == kernel_pmap) {
+ action = PMAP_ACTION_ASSERT;
+ goto pmap_cpc_exit;
+ }
+
+ /* Check for malformed/inconsistent entries */
+
+ if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) {
+ action = PMAP_ACTION_IGNORE;
+ suppress_reason = PTE_INVALID_CACHEABILITY;
+ }
+ else if (cpte & INTEL_PTE_RSVD) {
+ action = PMAP_ACTION_IGNORE;
+ suppress_reason = PTE_RSVD;
+ }
+ else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) {
+ action = PMAP_ACTION_IGNORE;
+ suppress_reason = PTE_SUPERVISOR;
+ }
+pmap_cpc_exit:
+ PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));
+
+ if (debug_boot_arg && !ppcd) {
+ action = PMAP_ACTION_ASSERT;
+ }
+
+ if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
+ action = PMAP_ACTION_ASSERT;
+ pmap_pagetable_corruption_timeout = TRUE;
+ }
+ else
+ {
+ pmap_pagetable_corruption_last_abstime = mach_absolute_time();
+ }
+ pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
+ return action;
+}
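+
+/*
+ * Callers act on the returned code: PMAP_ACTION_RETRY re-runs the PV lookup,
+ * PMAP_ACTION_RETRY_RELOCK re-takes the PV head lock first, PMAP_ACTION_IGNORE
+ * abandons the operation, and PMAP_ACTION_ASSERT panics (see pmap_pv_remove).
+ */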
+/*
+ * Remove pv list entry.
+ * Called with pv_head_table entry locked.
+ * Returns pv entry to be freed (or NULL).
+ */
+
+static inline __attribute__((always_inline)) pv_hashed_entry_t
+pmap_pv_remove( pmap_t pmap,
+ vm_map_offset_t vaddr,
+ ppnum_t *ppnp,
+ pt_entry_t *pte)
+{
+ pv_hashed_entry_t pvh_e;
+ pv_rooted_entry_t pv_h;
+ pv_hashed_entry_t *pprevh;
+ int pvhash_idx;
+ uint32_t pv_cnt;
+ ppnum_t ppn;
+
+pmap_pv_remove_retry:
+ ppn = *ppnp;
+ pvh_e = PV_HASHED_ENTRY_NULL;
+ pv_h = pai_to_pvh(ppn_to_pai(ppn));
+
+ if (pv_h->pmap == PMAP_NULL) {
+ pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
+ if (pac == PMAP_ACTION_IGNORE)
+ goto pmap_pv_remove_exit;
+ else if (pac == PMAP_ACTION_ASSERT)
+ panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): null pv_list!", pmap, vaddr, ppn, *pte);
+ else if (pac == PMAP_ACTION_RETRY_RELOCK) {
+ LOCK_PVH(ppn_to_pai(*ppnp));
+ pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
+ goto pmap_pv_remove_retry;
+ }
+ else if (pac == PMAP_ACTION_RETRY)
+ goto pmap_pv_remove_retry;
+ }
+
+ if (pv_h->va == vaddr && pv_h->pmap == pmap) {
+ /*
+ * Header is the pv_rooted_entry.
+ * We can't free that. If there is a queued
+ * entry after this one we remove that
+ * from the ppn queue, we remove it from the hash chain
+ * and copy it to the rooted entry. Then free it instead.
+ */
+ pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
+ if (pv_h != (pv_rooted_entry_t) pvh_e) {
+ /*
+ * Entry queued to root, remove this from hash
+ * and install as new root.
+ */
+ CHK_NPVHASH();
+ pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
+ LOCK_PV_HASH(pvhash_idx);
+ remque(&pvh_e->qlink);
+ pprevh = pvhash(pvhash_idx);
+ if (PV_HASHED_ENTRY_NULL == *pprevh) {
+ panic("pmap_pv_remove(%p,0x%llx,0x%x): "
+ "empty hash, removing rooted",
+ pmap, vaddr, ppn);
+ }
+ pmap_pvh_unlink(pvh_e);
+ UNLOCK_PV_HASH(pvhash_idx);
+ pv_h->pmap = pvh_e->pmap;
+ pv_h->va = pvh_e->va; /* dispose of pvh_e */
+ } else {
+ /* none queued after rooted */
+ pv_h->pmap = PMAP_NULL;
+ pvh_e = PV_HASHED_ENTRY_NULL;
+ }
+ } else {
+ /*
+ * not removing rooted pv. find it on hash chain, remove from
+ * ppn queue and hash chain and free it
+ */
+ CHK_NPVHASH();
+ pvhash_idx = pvhashidx(pmap, vaddr);
+ LOCK_PV_HASH(pvhash_idx);
+ pprevh = pvhash(pvhash_idx);
+ if (PV_HASHED_ENTRY_NULL == *pprevh) {
+ panic("pmap_pv_remove(%p,0x%llx,0x%x): empty hash", pmap, vaddr, ppn);
+ }
+ pvh_e = *pprevh;
+ pmap_pv_hashlist_walks++;
+ pv_cnt = 0;
+ while (PV_HASHED_ENTRY_NULL != pvh_e) {
+ pv_cnt++;
+ if (pvh_e->pmap == pmap &&
+ pvh_e->va == vaddr &&
+ pvh_e->ppn == ppn)
+ break;
+ pprevh = &pvh_e->nexth;
+ pvh_e = pvh_e->nexth;
+ }
+ if (PV_HASHED_ENTRY_NULL == pvh_e) {
+ pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);
+
+ if (pac == PMAP_ACTION_ASSERT)
+ panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, pv_h->pmap, pv_h->va);
+ else {
+ UNLOCK_PV_HASH(pvhash_idx);
+ if (pac == PMAP_ACTION_RETRY_RELOCK) {
+ LOCK_PVH(ppn_to_pai(*ppnp));
+ pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
+ goto pmap_pv_remove_retry;
+ }
+ else if (pac == PMAP_ACTION_RETRY) {
+ goto pmap_pv_remove_retry;
+ }
+ else if (pac == PMAP_ACTION_IGNORE) {
+ goto pmap_pv_remove_exit;
+ }
+ }
+ }
+ pmap_pv_hashlist_cnts += pv_cnt;
+ if (pmap_pv_hashlist_max < pv_cnt)
+ pmap_pv_hashlist_max = pv_cnt;
+ *pprevh = pvh_e->nexth;
+ remque(&pvh_e->qlink);
+ UNLOCK_PV_HASH(pvhash_idx);
+ }
+pmap_pv_remove_exit:
+ return pvh_e;
+}
+
#endif /* MACH_KERNEL_PRIVATE */
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <i386/pmap_internal.h>
+
+
+void pmap_remove_range(
+ pmap_t pmap,
+ vm_map_offset_t va,
+ pt_entry_t *spte,
+ pt_entry_t *epte);
+
+pv_rooted_entry_t pv_head_table; /* array of entries, one per
+ * page */
+thread_call_t mapping_adjust_call;
+static thread_call_data_t mapping_adjust_call_data;
+uint32_t mappingrecurse = 0;
+
+pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[PMAP_PAGETABLE_CORRUPTION_MAX_LOG];
+uint32_t pmap_pagetable_corruption_incidents;
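+/*
+ * Initialized to the largest signed 64-bit value so the unsigned subtraction
+ * in the rate-limit check wraps to a huge delta and the first incident is
+ * never suppressed.
+ */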
+uint64_t pmap_pagetable_corruption_last_abstime = (~(0ULL) >> 1);
+uint64_t pmap_pagetable_corruption_interval_abstime;
+thread_call_t pmap_pagetable_corruption_log_call;
+static thread_call_data_t pmap_pagetable_corruption_log_call_data;
+boolean_t pmap_pagetable_corruption_timeout = FALSE;
+
/*
* The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time,
* on a NBPDE boundary.
return ppn;
}
+/*
+ * Insert the given physical page (p) at
+ * the specified virtual address (v) in the
+ * target physical map with the protection requested.
+ *
+ * If specified, the page will be wired down, meaning
+ * that the related pte cannot be reclaimed.
+ *
+ * NB: This is the only routine which MAY NOT lazy-evaluate
+ * or lose information. That is, this routine must actually
+ * insert this page into the given map NOW.
+ */
+void
+pmap_enter(
+ register pmap_t pmap,
+ vm_map_offset_t vaddr,
+ ppnum_t pn,
+ vm_prot_t prot,
+ unsigned int flags,
+ boolean_t wired)
+{
+ pt_entry_t *pte;
+ pv_rooted_entry_t pv_h;
+ int pai;
+ pv_hashed_entry_t pvh_e;
+ pv_hashed_entry_t pvh_new;
+ pt_entry_t template;
+ pmap_paddr_t old_pa;
+ pmap_paddr_t pa = (pmap_paddr_t) i386_ptob(pn);
+ boolean_t need_tlbflush = FALSE;
+ boolean_t set_NX;
+ char oattr;
+ boolean_t old_pa_locked;
+ /* 2MiB mappings are confined to x86_64 by VM */
+ boolean_t superpage = flags & VM_MEM_SUPERPAGE;
+ vm_object_t delpage_pm_obj = NULL;
+ int delpage_pde_index = 0;
+ pt_entry_t old_pte;
+
+ pmap_intr_assert();
+ assert(pn != vm_page_fictitious_addr);
+
+ if (pmap == PMAP_NULL)
+ return;
+ if (pn == vm_page_guard_addr)
+ return;
+
+ PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
+ pmap,
+ (uint32_t) (vaddr >> 32), (uint32_t) vaddr,
+ pn, prot);
+
+ if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
+ set_NX = FALSE;
+ else
+ set_NX = TRUE;
+
+ /*
+ * Must allocate a new pvlist entry while we're unlocked;
+ * zalloc may cause pageout (which will lock the pmap system).
+ * If we determine we need a pvlist entry, we will unlock
+ * and allocate one. Then we will retry, throwing away
+ * the allocated entry later (if we no longer need it).
+ */
+
+ pvh_new = PV_HASHED_ENTRY_NULL;
+Retry:
+ pvh_e = PV_HASHED_ENTRY_NULL;
+
+ PMAP_LOCK(pmap);
+
+ /*
+ * Expand pmap to include this pte. Assume that
+ * pmap is always expanded to include enough hardware
+ * pages to map one VM page.
+ */
+ if(superpage) {
+ while ((pte = pmap64_pde(pmap, vaddr)) == PD_ENTRY_NULL) {
+ /* need room for another pde entry */
+ PMAP_UNLOCK(pmap);
+ pmap_expand_pdpt(pmap, vaddr);
+ PMAP_LOCK(pmap);
+ }
+ } else {
+ while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
+ /*
+ * Must unlock to expand the pmap;
+ * we are going to grow the pde-level page(s).
+ */
+ PMAP_UNLOCK(pmap);
+ pmap_expand(pmap, vaddr);
+ PMAP_LOCK(pmap);
+ }
+ }
+
+ if (superpage && *pte && !(*pte & INTEL_PTE_PS)) {
+ /*
+ * There is still an empty page table mapped that
+ * was used for a previous base page mapping.
+ * Remember the PDE and the PDE index, so that we
+ * can free the page at the end of this function.
+ */
+ delpage_pde_index = (int)pdeidx(pmap, vaddr);
+ delpage_pm_obj = pmap->pm_obj;
+ *pte = 0;
+ }
+
+
+ old_pa = pte_to_pa(*pte);
+ pai = pa_index(old_pa);
+ old_pa_locked = FALSE;
+
+ /*
+ * if we have a previous managed page, lock the pv entry now. after
+ * we lock it, check to see if someone beat us to the lock and if so
+ * drop the lock
+ */
+ if ((0 != old_pa) && IS_MANAGED_PAGE(pai)) {
+ LOCK_PVH(pai);
+ old_pa_locked = TRUE;
+ old_pa = pte_to_pa(*pte);
+ if (0 == old_pa) {
+ UNLOCK_PVH(pai); /* another path beat us to it */
+ old_pa_locked = FALSE;
+ }
+ }
+
+ /*
+ * Special case if the incoming physical page is already mapped
+ * at this address.
+ */
+ if (old_pa == pa) {
+
+ /*
+ * May be changing its wired attribute or protection
+ */
+
+ template = pa_to_pte(pa) | INTEL_PTE_VALID;
+
+ if (VM_MEM_NOT_CACHEABLE ==
+ (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) {
+ if (!(flags & VM_MEM_GUARDED))
+ template |= INTEL_PTE_PTA;
+ template |= INTEL_PTE_NCACHE;
+ }
+ if (pmap != kernel_pmap)
+ template |= INTEL_PTE_USER;
+ if (prot & VM_PROT_WRITE)
+ template |= INTEL_PTE_WRITE;
+
+ if (set_NX)
+ template |= INTEL_PTE_NX;
+
+ if (wired) {
+ template |= INTEL_PTE_WIRED;
+ if (!iswired(*pte))
+ OSAddAtomic(+1,
+ &pmap->stats.wired_count);
+ } else {
+ if (iswired(*pte)) {
+ assert(pmap->stats.wired_count >= 1);
+ OSAddAtomic(-1,
+ &pmap->stats.wired_count);
+ }
+ }
+ if (superpage) /* this path cannot be used */
+ template |= INTEL_PTE_PS; /* to change the page size! */
+
+ /* store modified PTE and preserve RC bits */
+ pmap_update_pte(pte, *pte,
+ template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD)));
+ if (old_pa_locked) {
+ UNLOCK_PVH(pai);
+ old_pa_locked = FALSE;
+ }
+ need_tlbflush = TRUE;
+ goto Done;
+ }
+
+ /*
+ * Outline of code from here:
+ * 1) If va was mapped, update TLBs, remove the mapping
+ * and remove old pvlist entry.
+ * 2) Add pvlist entry for new mapping
+ * 3) Enter new mapping.
+ *
+ * If the old physical page is not managed step 1) is skipped
+ * (except for updating the TLBs), and the mapping is
+ * overwritten at step 3). If the new physical page is not
+ * managed, step 2) is skipped.
+ */
+
+ if (old_pa != (pmap_paddr_t) 0) {
+
+ /*
+ * Don't do anything to pages outside valid memory here.
+ * Instead convince the code that enters a new mapping
+ * to overwrite the old one.
+ */
+
+ /* invalidate the PTE */
+ pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
+ /* propagate invalidate everywhere */
+ PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
+ /* remember reference and change */
+ old_pte = *pte;
+ oattr = (char) (old_pte & (PHYS_MODIFIED | PHYS_REFERENCED));
+ /* completely invalidate the PTE */
+ pmap_store_pte(pte, 0);
+
+ if (IS_MANAGED_PAGE(pai)) {
+#if TESTING
+ if (pmap->stats.resident_count < 1)
+ panic("pmap_enter: resident_count");
+#endif
+ assert(pmap->stats.resident_count >= 1);
+ OSAddAtomic(-1,
+ &pmap->stats.resident_count);
+
+ if (iswired(*pte)) {
+#if TESTING
+ if (pmap->stats.wired_count < 1)
+ panic("pmap_enter: wired_count");
+#endif
+ assert(pmap->stats.wired_count >= 1);
+ OSAddAtomic(-1,
+ &pmap->stats.wired_count);
+ }
+ pmap_phys_attributes[pai] |= oattr;
+
+ /*
+ * Remove the mapping from the pvlist for
+ * this physical page.
+ * We'll end up with either a rooted pv or a
+ * hashed pv
+ */
+ pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, &old_pte);
+
+ } else {
+
+ /*
+ * old_pa is not managed.
+ * Do removal part of accounting.
+ */
+
+ if (iswired(*pte)) {
+ assert(pmap->stats.wired_count >= 1);
+ OSAddAtomic(-1,
+ &pmap->stats.wired_count);
+ }
+ }
+ }
+
+ /*
+ * if we had a previously managed page locked, unlock it now
+ */
+ if (old_pa_locked) {
+ UNLOCK_PVH(pai);
+ old_pa_locked = FALSE;
+ }
+
+ pai = pa_index(pa); /* now working with new incoming phys page */
+ if (IS_MANAGED_PAGE(pai)) {
+
+ /*
+ * Step 2) Enter the mapping in the PV list for this
+ * physical page.
+ */
+ pv_h = pai_to_pvh(pai);
+
+ LOCK_PVH(pai);
+
+ if (pv_h->pmap == PMAP_NULL) {
+ /*
+ * No mappings yet, use rooted pv
+ */
+ pv_h->va = vaddr;
+ pv_h->pmap = pmap;
+ queue_init(&pv_h->qlink);
+ } else {
+ /*
+ * Add new pv_hashed_entry after header.
+ */
+ if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
+ pvh_e = pvh_new;
+ pvh_new = PV_HASHED_ENTRY_NULL;
+ } else if (PV_HASHED_ENTRY_NULL == pvh_e) {
+ PV_HASHED_ALLOC(pvh_e);
+ if (PV_HASHED_ENTRY_NULL == pvh_e) {
+ /*
+ * the pv free list is empty. If we are
+ * on the kernel pmap we'll use one of
+ * the special private kernel pv_e's;
+ * otherwise, we need to unlock
+ * everything, zalloc a pv_e, and
+ * restart, bringing the pv_e in
+ * with us.
+ */
+ if (kernel_pmap == pmap) {
+ PV_HASHED_KERN_ALLOC(pvh_e);
+ } else {
+ UNLOCK_PVH(pai);
+ PMAP_UNLOCK(pmap);
+ pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
+ goto Retry;
+ }
+ }
+ }
+
+ if (PV_HASHED_ENTRY_NULL == pvh_e)
+ panic("Mapping alias chain exhaustion, possibly induced by numerous kernel virtual double mappings");
+
+ pvh_e->va = vaddr;
+ pvh_e->pmap = pmap;
+ pvh_e->ppn = pn;
+ pv_hash_add(pvh_e, pv_h);
+
+ /*
+ * Remember that we used the pvlist entry.
+ */
+ pvh_e = PV_HASHED_ENTRY_NULL;
+ }
+
+ /*
+ * only count the mapping
+ * for 'managed memory'
+ */
+ OSAddAtomic(+1, & pmap->stats.resident_count);
+ if (pmap->stats.resident_count > pmap->stats.resident_max) {
+ pmap->stats.resident_max = pmap->stats.resident_count;
+ }
+ }
+ /*
+ * Step 3) Enter the mapping.
+ *
+ * Build a template to speed up entering -
+ * only the pfn changes.
+ */
+ template = pa_to_pte(pa) | INTEL_PTE_VALID;
+
+ if (flags & VM_MEM_NOT_CACHEABLE) {
+ if (!(flags & VM_MEM_GUARDED))
+ template |= INTEL_PTE_PTA;
+ template |= INTEL_PTE_NCACHE;
+ }
+ if (pmap != kernel_pmap)
+ template |= INTEL_PTE_USER;
+ if (prot & VM_PROT_WRITE)
+ template |= INTEL_PTE_WRITE;
+ if (set_NX)
+ template |= INTEL_PTE_NX;
+ if (wired) {
+ template |= INTEL_PTE_WIRED;
+ OSAddAtomic(+1, & pmap->stats.wired_count);
+ }
+ if (superpage)
+ template |= INTEL_PTE_PS;
+ pmap_store_pte(pte, template);
+
+ /*
+ * if this was a managed page we delayed unlocking the pv until here
+ * to prevent pmap_page_protect et al from finding it until the pte
+ * has been stored
+ */
+ if (IS_MANAGED_PAGE(pai)) {
+ UNLOCK_PVH(pai);
+ }
+Done:
+ if (need_tlbflush == TRUE)
+ PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
+
+ if (pvh_e != PV_HASHED_ENTRY_NULL) {
+ PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
+ }
+ if (pvh_new != PV_HASHED_ENTRY_NULL) {
+ PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
+ }
+ PMAP_UNLOCK(pmap);
+
+ if (delpage_pm_obj) {
+ vm_page_t m;
+
+ vm_object_lock(delpage_pm_obj);
+ m = vm_page_lookup(delpage_pm_obj, delpage_pde_index);
+ if (m == VM_PAGE_NULL)
+ panic("pmap_enter: pte page not in object");
+ VM_PAGE_FREE(m);
+ OSAddAtomic(-1, &inuse_ptepages_count);
+ vm_object_unlock(delpage_pm_obj);
+ }
+
+ PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
+}
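
Editorial sketch, not part of the patch: a hypothetical caller wiring one 4K page read/write into the kernel pmap with default cacheability would invoke the routine above as follows (vaddr and pn are placeholders); this takes the Step 2/3 path, installing a rooted pv entry for a managed page and storing a PTE template with INTEL_PTE_WIRED set.

    pmap_enter(kernel_pmap,                  /* target pmap */
               vaddr,                        /* page-aligned virtual address */
               pn,                           /* physical page number */
               VM_PROT_READ | VM_PROT_WRITE, /* protection */
               0,                            /* flags: default cacheability, no superpage */
               TRUE);                        /* wired */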
+
+/*
+ * Remove a range of hardware page-table entries.
+ * The entries given are the first (inclusive)
+ * and last (exclusive) entries for the VM pages.
+ * The virtual address is the va for the first pte.
+ *
+ * The pmap must be locked.
+ * If the pmap is not the kernel pmap, the range must lie
+ * entirely within one pte-page. This is NOT checked.
+ * Assumes that the pte-page exists.
+ */
+
+void
+pmap_remove_range(
+ pmap_t pmap,
+ vm_map_offset_t start_vaddr,
+ pt_entry_t *spte,
+ pt_entry_t *epte)
+{
+ pt_entry_t *cpte;
+ pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
+ pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
+ pv_hashed_entry_t pvh_e;
+ int pvh_cnt = 0;
+ int num_removed, num_unwired, num_found, num_invalid;
+ int pai;
+ pmap_paddr_t pa;
+ vm_map_offset_t vaddr;
+
+ num_removed = 0;
+ num_unwired = 0;
+ num_found = 0;
+ num_invalid = 0;
+#if defined(__i386__)
+ if (pmap != kernel_pmap &&
+ pmap->pm_task_map == TASK_MAP_32BIT &&
+ start_vaddr >= HIGH_MEM_BASE) {
+ /*
+ * The range is in the "high_shared_pde" which is shared
+ * between the kernel and all 32-bit tasks. It holds
+ * the 32-bit commpage but also the trampolines, GDT, etc...
+ * so we can't let user tasks remove anything from it.
+ */
+ return;
+ }
+#endif
+ /* invalidate the PTEs first to "freeze" them */
+ for (cpte = spte, vaddr = start_vaddr;
+ cpte < epte;
+ cpte++, vaddr += PAGE_SIZE_64) {
+ pt_entry_t p = *cpte;
+
+ pa = pte_to_pa(p);
+ if (pa == 0)
+ continue;
+ num_found++;
+
+ if (iswired(p))
+ num_unwired++;
+
+ pai = pa_index(pa);
+
+ if (!IS_MANAGED_PAGE(pai)) {
+ /*
+ * Outside range of managed physical memory.
+ * Just remove the mappings.
+ */
+ pmap_store_pte(cpte, 0);
+ continue;
+ }
+
+ if ((p & INTEL_PTE_VALID) == 0)
+ num_invalid++;
+
+ /* invalidate the PTE */
+ pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID));
+ }
+
+ if (num_found == 0) {
+ /* nothing was changed: we're done */
+ goto update_counts;
+ }
+
+ /* propagate the invalidates to other CPUs */
+
+ PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);
+
+ for (cpte = spte, vaddr = start_vaddr;
+ cpte < epte;
+ cpte++, vaddr += PAGE_SIZE_64) {
+
+ pa = pte_to_pa(*cpte);
+ if (pa == 0)
+ continue;
+
+ pai = pa_index(pa);
+
+ LOCK_PVH(pai);
+
+ pa = pte_to_pa(*cpte);
+ if (pa == 0) {
+ UNLOCK_PVH(pai);
+ continue;
+ }
+ num_removed++;
+
+ /*
+ * Get the modify and reference bits, then
+ * nuke the entry in the page table
+ */
+ /* remember reference and change */
+ pmap_phys_attributes[pai] |=
+ (char) (*cpte & (PHYS_MODIFIED | PHYS_REFERENCED));
+
+ /*
+ * Remove the mapping from the pvlist for this physical page.
+ */
+ pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, cpte);
+
+ /* completely invalidate the PTE */
+ pmap_store_pte(cpte, 0);
+
+ UNLOCK_PVH(pai);
+
+ if (pvh_e != PV_HASHED_ENTRY_NULL) {
+ pvh_e->qlink.next = (queue_entry_t) pvh_eh;
+ pvh_eh = pvh_e;
+
+ if (pvh_et == PV_HASHED_ENTRY_NULL) {
+ pvh_et = pvh_e;
+ }
+ pvh_cnt++;
+ }
+ } /* for loop */
+
+ if (pvh_eh != PV_HASHED_ENTRY_NULL) {
+ PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
+ }
+update_counts:
+ /*
+ * Update the counts
+ */
+#if TESTING
+ if (pmap->stats.resident_count < num_removed)
+ panic("pmap_remove_range: resident_count");
+#endif
+ assert(pmap->stats.resident_count >= num_removed);
+ OSAddAtomic(-num_removed, &pmap->stats.resident_count);
+
+#if TESTING
+ if (pmap->stats.wired_count < num_unwired)
+ panic("pmap_remove_range: wired_count");
+#endif
+ assert(pmap->stats.wired_count >= num_unwired);
+ OSAddAtomic(-num_unwired, &pmap->stats.wired_count);
+
+ return;
+}
+
+
+/*
+ * Remove the given range of addresses
+ * from the specified map.
+ *
+ * It is assumed that the start and end are properly
+ * rounded to the hardware page size.
+ */
+void
+pmap_remove(
+ pmap_t map,
+ addr64_t s64,
+ addr64_t e64)
+{
+ pt_entry_t *pde;
+ pt_entry_t *spte, *epte;
+ addr64_t l64;
+ uint64_t deadline;
+
+ pmap_intr_assert();
+
+ if (map == PMAP_NULL || s64 == e64)
+ return;
+
+ PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
+ map,
+ (uint32_t) (s64 >> 32), s64,
+ (uint32_t) (e64 >> 32), e64);
+
+
+ PMAP_LOCK(map);
+
+#if 0
+ /*
+ * Check that address range in the kernel does not overlap the stacks.
+ * We initialize local static min/max variables once to avoid making
+ * 2 function calls for every remove. Note also that these functions
+ * both return 0 before kernel stacks have been initialized, and hence
+ * the panic is not triggered in this case.
+ */
+ if (map == kernel_pmap) {
+ static vm_offset_t kernel_stack_min = 0;
+ static vm_offset_t kernel_stack_max = 0;
+
+ if (kernel_stack_min == 0) {
+ kernel_stack_min = min_valid_stack_address();
+ kernel_stack_max = max_valid_stack_address();
+ }
+ if ((kernel_stack_min <= s64 && s64 < kernel_stack_max) ||
+ (kernel_stack_min < e64 && e64 <= kernel_stack_max))
+ panic("pmap_remove() attempted in kernel stack");
+ }
+#else
+
+ /*
+ * The values of kernel_stack_min and kernel_stack_max are no longer
+ * relevant now that we allocate kernel stacks in the kernel map,
+ * so the old code above no longer applies. If we wanted to check that
+ * we weren't removing a mapping of a page in a kernel stack we'd
+ * mark the PTE with an unused bit and check that here.
+ */
+
+#endif
+
+ deadline = rdtsc64() + max_preemption_latency_tsc;
+
+ while (s64 < e64) {
+ l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size - 1);
+ if (l64 > e64)
+ l64 = e64;
+ pde = pmap_pde(map, s64);
+
+ if (pde && (*pde & INTEL_PTE_VALID)) {
+ if (*pde & INTEL_PTE_PS) {
+ /*
+ * If we're removing a superpage, pmap_remove_range()
+ * must work on level 2 instead of level 1; and we're
+ * only passing a single level 2 entry instead of a
+ * level 1 range.
+ */
+ spte = pde;
+ epte = spte+1; /* excluded */
+ } else {
+ spte = pmap_pte(map, (s64 & ~(pde_mapped_size - 1)));
+ spte = &spte[ptenum(s64)];
+ epte = &spte[intel_btop(l64 - s64)];
+ }
+ pmap_remove_range(map, s64, spte, epte);
+ }
+ s64 = l64;
+
+ if (s64 < e64 && rdtsc64() >= deadline) {
+ PMAP_UNLOCK(map)
+ PMAP_LOCK(map)
+ deadline = rdtsc64() + max_preemption_latency_tsc;
+ }
+ }
+
+ PMAP_UNLOCK(map);
+
+ PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END,
+ map, 0, 0, 0, 0);
+
+}
+
+/*
+ * Routine: pmap_page_protect
+ *
+ * Function:
+ * Lower the permission for all mappings to a given
+ * page.
+ */
+void
+pmap_page_protect(
+ ppnum_t pn,
+ vm_prot_t prot)
+{
+ pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
+ pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
+ pv_hashed_entry_t nexth;
+ int pvh_cnt = 0;
+ pv_rooted_entry_t pv_h;
+ pv_rooted_entry_t pv_e;
+ pv_hashed_entry_t pvh_e;
+ pt_entry_t *pte;
+ int pai;
+ pmap_t pmap;
+ boolean_t remove;
+
+ pmap_intr_assert();
+ assert(pn != vm_page_fictitious_addr);
+ if (pn == vm_page_guard_addr)
+ return;
+
+ pai = ppn_to_pai(pn);
+
+ if (!IS_MANAGED_PAGE(pai)) {
+ /*
+ * Not a managed page.
+ */
+ return;
+ }
+ PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START,
+ pn, prot, 0, 0, 0);
+
+ /*
+ * Determine the new protection.
+ */
+ switch (prot) {
+ case VM_PROT_READ:
+ case VM_PROT_READ | VM_PROT_EXECUTE:
+ remove = FALSE;
+ break;
+ case VM_PROT_ALL:
+ return; /* nothing to do */
+ default:
+ remove = TRUE;
+ break;
+ }
+
+ pv_h = pai_to_pvh(pai);
+
+ LOCK_PVH(pai);
+
+
+ /*
+ * Walk down PV list, if any, changing or removing all mappings.
+ */
+ if (pv_h->pmap == PMAP_NULL)
+ goto done;
+
+ pv_e = pv_h;
+ pvh_e = (pv_hashed_entry_t) pv_e; /* cheat */
+
+ do {
+ vm_map_offset_t vaddr;
+
+ pmap = pv_e->pmap;
+ vaddr = pv_e->va;
+ pte = pmap_pte(pmap, vaddr);
+
+#if DEBUG
+ if (pa_index(pte_to_pa(*pte)) != pn)
+ panic("pmap_page_protect: PTE mismatch, pn: 0x%x, pmap: %p, vaddr: 0x%llx, pte: 0x%llx", pn, pmap, vaddr, *pte);
+#endif
+ if (0 == pte) {
+ panic("pmap_page_protect() "
+ "pmap=%p pn=0x%x vaddr=0x%llx\n",
+ pmap, pn, vaddr);
+ }
+ nexth = (pv_hashed_entry_t) queue_next(&pvh_e->qlink);
+
+ /*
+ * Remove the mapping if new protection is NONE
+ * or if write-protecting a kernel mapping.
+ */
+ if (remove || pmap == kernel_pmap) {
+ /*
+ * Remove the mapping, collecting dirty bits.
+ */
+ pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_VALID);
+ PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE);
+ pmap_phys_attributes[pai] |=
+ *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
+ pmap_store_pte(pte, 0);
+
+#if TESTING
+ if (pmap->stats.resident_count < 1)
+ panic("pmap_page_protect: resident_count");
+#endif
+ assert(pmap->stats.resident_count >= 1);
+ OSAddAtomic(-1, &pmap->stats.resident_count);
+
+ /*
+ * Deal with the pv_rooted_entry.
+ */
+
+ if (pv_e == pv_h) {
+ /*
+ * Fix up head later.
+ */
+ pv_h->pmap = PMAP_NULL;
+ } else {
+ /*
+ * Delete this entry.
+ */
+ pv_hash_remove(pvh_e);
+ pvh_e->qlink.next = (queue_entry_t) pvh_eh;
+ pvh_eh = pvh_e;
+
+ if (pvh_et == PV_HASHED_ENTRY_NULL)
+ pvh_et = pvh_e;
+ pvh_cnt++;
+ }
+ } else {
+ /*
+ * Write-protect.
+ */
+ pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_WRITE);
+ PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE);
+ }
+ pvh_e = nexth;
+ } while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h);
+
+
+ /*
+ * If pv_head mapping was removed, fix it up.
+ */
+ if (pv_h->pmap == PMAP_NULL) {
+ pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
+
+ if (pvh_e != (pv_hashed_entry_t) pv_h) {
+ pv_hash_remove(pvh_e);
+ pv_h->pmap = pvh_e->pmap;
+ pv_h->va = pvh_e->va;
+ pvh_e->qlink.next = (queue_entry_t) pvh_eh;
+ pvh_eh = pvh_e;
+
+ if (pvh_et == PV_HASHED_ENTRY_NULL)
+ pvh_et = pvh_e;
+ pvh_cnt++;
+ }
+ }
+ if (pvh_eh != PV_HASHED_ENTRY_NULL) {
+ PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
+ }
+done:
+ UNLOCK_PVH(pai);
+
+ PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END,
+ 0, 0, 0, 0, 0);
+}
+
+__private_extern__ void
+pmap_pagetable_corruption_msg_log(int (*log_func)(const char * fmt, ...)__printflike(1,2)) {
+ if (pmap_pagetable_corruption_incidents > 0) {
+ int i, e = MIN(pmap_pagetable_corruption_incidents, PMAP_PAGETABLE_CORRUPTION_MAX_LOG);
+ (*log_func)("%u pagetable corruption incident(s) detected, timeout: %u\n", pmap_pagetable_corruption_incidents, pmap_pagetable_corruption_timeout);
+ for (i = 0; i < e; i++) {
+ (*log_func)("Incident 0x%x, reason: 0x%x, action: 0x%x, time: 0x%llx\n", pmap_pagetable_corruption_records[i].incident, pmap_pagetable_corruption_records[i].reason, pmap_pagetable_corruption_records[i].action, pmap_pagetable_corruption_records[i].abstime);
+ }
+ }
+}
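
Editorial note, not part of the patch: the logger takes any printf-style sink, so the same routine backs both the deferred thread call set up below (which passes &printf) and the panic path added later in this patch, which calls it directly:

    pmap_pagetable_corruption_msg_log(&kdb_printf);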
+
+void
+mapping_free_prime(void)
+{
+ int i;
+ pv_hashed_entry_t pvh_e;
+ pv_hashed_entry_t pvh_eh;
+ pv_hashed_entry_t pvh_et;
+ int pv_cnt;
+
+ pv_cnt = 0;
+ pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
+ for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) {
+ pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
+
+ pvh_e->qlink.next = (queue_entry_t)pvh_eh;
+ pvh_eh = pvh_e;
+
+ if (pvh_et == PV_HASHED_ENTRY_NULL)
+ pvh_et = pvh_e;
+ pv_cnt++;
+ }
+ PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
+
+ pv_cnt = 0;
+ pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
+ for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
+ pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
+
+ pvh_e->qlink.next = (queue_entry_t)pvh_eh;
+ pvh_eh = pvh_e;
+
+ if (pvh_et == PV_HASHED_ENTRY_NULL)
+ pvh_et = pvh_e;
+ pv_cnt++;
+ }
+ PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
+
+}
+
+static inline void
+pmap_pagetable_corruption_log_setup(void) {
+ if (pmap_pagetable_corruption_log_call == NULL) {
+ nanotime_to_absolutetime(PMAP_PAGETABLE_CORRUPTION_INTERVAL, 0, &pmap_pagetable_corruption_interval_abstime);
+ thread_call_setup(&pmap_pagetable_corruption_log_call_data,
+ (thread_call_func_t) pmap_pagetable_corruption_msg_log,
+ (thread_call_param_t) &printf);
+ pmap_pagetable_corruption_log_call = &pmap_pagetable_corruption_log_call_data;
+ }
+}
+
+void
+mapping_adjust(void)
+{
+ pv_hashed_entry_t pvh_e;
+ pv_hashed_entry_t pvh_eh;
+ pv_hashed_entry_t pvh_et;
+ int pv_cnt;
+ int i;
+
+ if (mapping_adjust_call == NULL) {
+ thread_call_setup(&mapping_adjust_call_data,
+ (thread_call_func_t) mapping_adjust,
+ (thread_call_param_t) NULL);
+ mapping_adjust_call = &mapping_adjust_call_data;
+ }
+
+ pmap_pagetable_corruption_log_setup();
+
+ pv_cnt = 0;
+ pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
+ if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) {
+ for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
+ pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
+
+ pvh_e->qlink.next = (queue_entry_t)pvh_eh;
+ pvh_eh = pvh_e;
+
+ if (pvh_et == PV_HASHED_ENTRY_NULL)
+ pvh_et = pvh_e;
+ pv_cnt++;
+ }
+ PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
+ }
+
+ pv_cnt = 0;
+ pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
+ if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) {
+ for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) {
+ pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
+
+ pvh_e->qlink.next = (queue_entry_t)pvh_eh;
+ pvh_eh = pvh_e;
+
+ if (pvh_et == PV_HASHED_ENTRY_NULL)
+ pvh_et = pvh_e;
+ pv_cnt++;
+ }
+ PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
+ }
+ mappingrecurse = 0;
+}
+
mach_msg_size_t max_desc = (mach_msg_size_t)(((size - sizeof(mach_msg_base_t)) /
sizeof(mach_msg_ool_descriptor32_t)) *
DESC_SIZE_ADJUSTMENT);
- if (msg_and_trailer_size >= MACH_MSG_SIZE_MAX - max_desc)
+ if (msg_and_trailer_size > MACH_MSG_SIZE_MAX - max_desc)
return IKM_NULL;
max_expanded_size = msg_and_trailer_size + max_desc;
assert(i <= IKM_STASH);
kmsg = cache->entries[--i];
cache->avail = i;
- ikm_check_init(kmsg, max_expanded_size);
enable_preemption();
- kmsg->ikm_header = (mach_msg_header_t *)
- ((vm_offset_t)(kmsg + 1) +
- max_expanded_size -
- msg_and_trailer_size);
+ ikm_check_init(kmsg, max_expanded_size);
+ ikm_set_header(kmsg, msg_and_trailer_size);
return (kmsg);
}
enable_preemption();
if (kmsg != IKM_NULL) {
ikm_init(kmsg, max_expanded_size);
- kmsg->ikm_header = (mach_msg_header_t *)
- ((vm_offset_t)(kmsg + 1) +
- max_expanded_size -
- msg_and_trailer_size);
+ ikm_set_header(kmsg, msg_and_trailer_size);
}
return(kmsg);
IP_CLEAR_PREALLOC(port, kmsg);
}
+/*
+ * Routine: ipc_kmsg_prealloc
+ * Purpose:
+ * Wrapper to ipc_kmsg_alloc() to account for
+ * header expansion requirements.
+ */
+ipc_kmsg_t
+ipc_kmsg_prealloc(mach_msg_size_t size)
+{
+#if defined(__LP64__)
+ if (size > MACH_MSG_SIZE_MAX - LEGACY_HEADER_SIZE_DELTA)
+ return IKM_NULL;
+
+ size += LEGACY_HEADER_SIZE_DELTA;
+#endif
+ return ipc_kmsg_alloc(size);
+}
/*
* clients. These are set up for those kernel clients
* which cannot afford to wait.
*/
-#ifndef __LP64__
- /* LP64todo - does the prealloc kmsg need ikm_header padding?
- */
if (IP_PREALLOC(dest_port)) {
+ mach_msg_size_t max_desc = 0;
+
ip_lock(dest_port);
if (!ip_active(dest_port)) {
ip_unlock(dest_port);
}
assert(IP_PREALLOC(dest_port));
kmsg = dest_port->ip_premsg;
- if (msg_and_trailer_size > kmsg->ikm_size) {
- ip_unlock(dest_port);
- return MACH_SEND_TOO_LARGE;
- }
if (ikm_prealloc_inuse(kmsg)) {
ip_unlock(dest_port);
return MACH_SEND_NO_BUFFER;
}
+#if !defined(__LP64__)
+ if (msg->msgh_bits & MACH_MSGH_BITS_COMPLEX) {
+ assert(size > sizeof(mach_msg_base_t));
+ max_desc = ((mach_msg_base_t *)msg)->body.msgh_descriptor_count *
+ DESC_SIZE_ADJUSTMENT;
+ }
+#endif
+ if (msg_and_trailer_size > kmsg->ikm_size - max_desc) {
+ ip_unlock(dest_port);
+ return MACH_SEND_TOO_LARGE;
+ }
ikm_prealloc_set_inuse(kmsg, dest_port);
+ ikm_set_header(kmsg, msg_and_trailer_size);
ip_unlock(dest_port);
}
else
-#endif /* !__LP64__ */
{
kmsg = ipc_kmsg_alloc(msg_and_trailer_size);
if (kmsg == IKM_NULL)
assert((kmsg)->ikm_next == IKM_BOGUS); \
MACRO_END
+#define ikm_set_header(kmsg, mtsize) \
+MACRO_BEGIN \
+ (kmsg)->ikm_header = (mach_msg_header_t *) \
+ ((vm_offset_t)((kmsg) + 1) + (kmsg)->ikm_size - (mtsize)); \
+MACRO_END
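
Editorial sketch, not part of the patch: ikm_set_header() right-aligns the message header inside the kmsg buffer, so any slack from max_expanded_size sits between the ipc_kmsg bookkeeping and the header, leaving room for in-place descriptor expansion. Assuming a hypothetical kmsg whose ikm_size is 1024 bytes and a 256-byte message-plus-trailer:

    ikm_set_header(kmsg, 256);
    assert((vm_offset_t)kmsg->ikm_header ==
           (vm_offset_t)(kmsg + 1) + kmsg->ikm_size - 256);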
+
struct ipc_kmsg_queue {
struct ipc_kmsg *ikmq_base;
};
extern void ipc_kmsg_destroy_dest(
ipc_kmsg_t kmsg);
-
/* Preallocate a kernel message buffer */
+extern ipc_kmsg_t ipc_kmsg_prealloc(
+ mach_msg_size_t size);
+
+/* Bind a preallocated message buffer to a port */
extern void ipc_kmsg_set_prealloc(
ipc_kmsg_t kmsg,
ipc_port_t port);
-/* Clear a kernel message buffer */
+/* Clear preallocated message buffer binding */
extern void ipc_kmsg_clear_prealloc(
ipc_kmsg_t kmsg,
ipc_port_t port);
lck_mtx_ext_t ipc_port_multiple_lock_data_ext;
lck_mtx_ext_t ipc_port_timestamp_lock_data_ext;
ipc_port_timestamp_t ipc_port_timestamp_data;
+int ipc_portbt;
#if MACH_ASSERT
void ipc_port_init_debug(
{
queue_init(&port_alloc_queue);
lck_mtx_init_ext(&port_alloc_queue_lock, &port_alloc_queue_lock_ext, &ipc_lck_grp, &ipc_lck_attr);
+
+ if (!PE_parse_boot_argn("ipc_portbt", &ipc_portbt, sizeof (ipc_portbt)))
+ ipc_portbt = 0;
}
+#ifdef MACH_BSD
+extern int proc_pid(struct proc*);
+#endif /* MACH_BSD */
/*
* Initialize all of the debugging state in a port.
for (i = 0; i < IP_NSPARES; ++i)
port->ip_spares[i] = 0;
+#ifdef MACH_BSD
+ task_t task = current_task();
+ if (task != TASK_NULL) {
+ struct proc* proc = (struct proc*) get_bsdtask_info(task);
+ if (proc)
+ port->ip_spares[0] = proc_pid(proc);
+ }
+#endif /* MACH_BSD */
+
/*
* Machine-dependent routine to fill in an
* array with up to IP_CALLSTACK_MAX levels
* of return pc information.
*/
- machine_callstack(&port->ip_callstack[0], IP_CALLSTACK_MAX);
+ if (ipc_portbt)
+ machine_callstack(&port->ip_callstack[0], IP_CALLSTACK_MAX);
#if 0
lck_mtx_lock(&port_alloc_queue_lock);
#endif
#if MACH_ASSERT
-#define IP_NSPARES 10
-#define IP_CALLSTACK_MAX 10
+#define IP_NSPARES 4
+#define IP_CALLSTACK_MAX 16
queue_chain_t ip_port_links; /* all allocated ports */
thread_t ip_thread; /* who made me? thread context */
unsigned long ip_timetrack; /* give an idea of "when" created */
return KERN_RESOURCE_SHORTAGE;
} else {
mach_msg_size_t size = qosp->len + MAX_TRAILER_SIZE;
+
if (right != MACH_PORT_RIGHT_RECEIVE)
return (KERN_INVALID_VALUE);
- kmsg = (ipc_kmsg_t)ipc_kmsg_alloc(size);
+
+ kmsg = (ipc_kmsg_t)ipc_kmsg_prealloc(size);
if (kmsg == IKM_NULL)
return (KERN_RESOURCE_SHORTAGE);
}
#include <mach/mach_types.h>
#include <sys/appleapiopts.h>
#include <kern/debug.h>
+#include <uuid/uuid.h>
#include <kdp/kdp_internal.h>
#include <kdp/kdp_private.h>
#include <kdp/kdp_core.h>
+#include <kdp/kdp_dyld.h>
#include <libsa/types.h>
extern unsigned int return_on_panic;
typedef struct thread_snapshot *thread_snapshot_t;
+typedef struct task_snapshot *task_snapshot_t;
extern int
machine_trace_thread(thread_t thread, char *tracepos, char *tracebound, int nframes, boolean_t user_p);
int
-kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, unsigned trace_options, uint32_t *pbytesTraced);
+kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_flags, uint32_t dispatch_offset, uint32_t *pbytesTraced);
boolean_t kdp_copyin(pmap_t, uint64_t, void *, size_t);
extern void bcopy_phys(addr64_t, addr64_t, vm_size_t);
}
int
-kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, unsigned trace_options, uint32_t *pbytesTraced)
+kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size, uint32_t trace_flags, uint32_t dispatch_offset, uint32_t *pbytesTraced)
{
char *tracepos = (char *) tracebuf;
char *tracebound = tracepos + tracebuf_size;
task_t task = TASK_NULL;
thread_t thread = THREAD_NULL;
- int nframes = trace_options;
thread_snapshot_t tsnap = NULL;
unsigned framesize = 2 * sizeof(vm_offset_t);
- boolean_t dispatch_p = ((trace_options & STACKSHOT_GET_DQ) != 0);
- uint16_t dispatch_offset = (trace_options & STACKSHOT_DISPATCH_OFFSET_MASK) >> STACKSHOT_DISPATCH_OFFSET_SHIFT;
struct task ctask;
struct thread cthread;
-
- if ((nframes <= 0) || nframes > MAX_FRAMES)
- nframes = MAX_FRAMES;
+
+ boolean_t dispatch_p = ((trace_flags & STACKSHOT_GET_DQ) != 0);
+ boolean_t save_loadinfo_p = ((trace_flags & STACKSHOT_SAVE_LOADINFO) != 0);
queue_iterate(&tasks, task, task_t, tasks) {
+ int task_pid = pid_from_task(task);
+ boolean_t task64 = task_has_64BitAddr(task);
+
if ((task == NULL) || (ml_nofault_copy((vm_offset_t) task, (vm_offset_t) &ctask, sizeof(struct task)) != sizeof(struct task)))
goto error_exit;
+
/* Trace everything, unless a process was specified */
- if ((pid == -1) || (pid == pid_from_task(task)))
+ if ((pid == -1) || (pid == task_pid)) {
+ task_snapshot_t task_snap;
+ uint32_t uuid_info_count;
+ mach_vm_address_t uuid_info_addr;
+
+ if (save_loadinfo_p && task_pid > 0) {
+ // Read the dyld_all_image_infos struct from the task memory to get UUID array count and location
+ if (task64) {
+ struct dyld_all_image_infos64 task_image_infos;
+ if (!kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct dyld_all_image_infos64)))
+ goto error_exit;
+ uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount;
+ uuid_info_addr = task_image_infos.uuidArray;
+ } else {
+ struct dyld_all_image_infos task_image_infos;
+ if (!kdp_copyin(task->map->pmap, task->all_image_info_addr, &task_image_infos, sizeof(struct dyld_all_image_infos)))
+ goto error_exit;
+ uuid_info_count = task_image_infos.uuidArrayCount;
+ uuid_info_addr = task_image_infos.uuidArray;
+ }
+ } else {
+ uuid_info_count = 0;
+ uuid_info_addr = 0;
+ }
+
+ if (tracepos + sizeof(struct task_snapshot) > tracebound) {
+ error = -1;
+ goto error_exit;
+ }
+
+ task_snap = (task_snapshot_t) tracepos;
+ task_snap->snapshot_magic = STACKSHOT_TASK_SNAPSHOT_MAGIC;
+ task_snap->pid = task_pid;
+ task_snap->nloadinfos = uuid_info_count;
+ /* Add the BSD process identifiers */
+ if (task_pid != -1)
+ proc_name_kdp(task, task_snap->p_comm, sizeof(task_snap->p_comm));
+ else
+ task_snap->p_comm[0] = '\0';
+ task_snap->ss_flags = 0;
+ if (task64)
+ task_snap->ss_flags |= kUser64_p;
+
+ tracepos += sizeof(struct task_snapshot);
+
+ if (task_pid > 0 && uuid_info_count > 0) {
+ uint32_t uuid_info_size = (uint32_t)(task64 ? sizeof(struct dyld_uuid_info64) : sizeof(struct dyld_uuid_info));
+ uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size;
+
+ if (tracepos + uuid_info_array_size > tracebound) {
+ error = -1;
+ goto error_exit;
+ }
+
+ // Copy in the UUID info array
+ if (!kdp_copyin(task->map->pmap, uuid_info_addr, tracepos, uuid_info_array_size))
+ goto error_exit;
+
+ tracepos += uuid_info_array_size;
+ }
+
queue_iterate(&task->threads, thread, thread_t, task_threads){
if ((thread == NULL) || (ml_nofault_copy((vm_offset_t) thread, (vm_offset_t) &cthread, sizeof(struct thread)) != sizeof(struct thread)))
goto error_exit;
+
if (((tracepos + 4 * sizeof(struct thread_snapshot)) > tracebound)) {
error = -1;
goto error_exit;
}
-/* Populate the thread snapshot header */
+ /* Populate the thread snapshot header */
tsnap = (thread_snapshot_t) tracepos;
tsnap->thread_id = (uint64_t) (uintptr_t)thread;
tsnap->state = thread->state;
tsnap->wait_event = thread->wait_event;
tsnap->continuation = (uint64_t) (uintptr_t) thread->continuation;
-/* Add the BSD process identifiers */
- if ((tsnap->pid = pid_from_task(task)) != -1)
- proc_name_kdp(task, tsnap->p_comm, sizeof(tsnap->p_comm));
- else
- tsnap->p_comm[0] = '\0';
- tsnap->snapshot_magic = 0xfeedface;
+ tsnap->snapshot_magic = STACKSHOT_THREAD_SNAPSHOT_MAGIC;
tracepos += sizeof(struct thread_snapshot);
tsnap->ss_flags = 0;
if (dispatch_p && (task != kernel_task) && (task->active) && (task->map)) {
uint64_t dqkeyaddr = thread_dispatchqaddr(thread);
if (dqkeyaddr != 0) {
- boolean_t task64 = task_has_64BitAddr(task);
uint64_t dqaddr = 0;
if (kdp_copyin(task->map->pmap, dqkeyaddr, &dqaddr, (task64 ? 8 : 4)) && (dqaddr != 0)) {
uint64_t dqserialnumaddr = dqaddr + dispatch_offset;
*/
if (thread->kernel_stack != 0) {
#if defined(__LP64__)
- tracebytes = machine_trace_thread64(thread, tracepos, tracebound, nframes, FALSE);
+ tracebytes = machine_trace_thread64(thread, tracepos, tracebound, MAX_FRAMES, FALSE);
tsnap->ss_flags |= kKernel64_p;
framesize = 16;
#else
- tracebytes = machine_trace_thread(thread, tracepos, tracebound, nframes, FALSE);
+ tracebytes = machine_trace_thread(thread, tracepos, tracebound, MAX_FRAMES, FALSE);
framesize = 8;
#endif
}
tsnap->nkern_frames = tracebytes/framesize;
tracepos += tracebytes;
tracebytes = 0;
-/* Trace user stack, if any */
+ /* Trace user stack, if any */
if (thread->task->map != kernel_map) {
/* 64-bit task? */
if (task_has_64BitAddr(thread->task)) {
- tracebytes = machine_trace_thread64(thread, tracepos, tracebound, nframes, TRUE);
+ tracebytes = machine_trace_thread64(thread, tracepos, tracebound, MAX_FRAMES, TRUE);
tsnap->ss_flags |= kUser64_p;
framesize = 16;
}
else {
- tracebytes = machine_trace_thread(thread, tracepos, tracebound, nframes, TRUE);
+ tracebytes = machine_trace_thread(thread, tracepos, tracebound, MAX_FRAMES, TRUE);
framesize = 8;
}
}
tracepos += tracebytes;
tracebytes = 0;
}
+ }
}
error_exit:
--- /dev/null
+/*
+ * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ * Data structure definitions copied from dyld so that we can read dyld's saved UUID information
+ * for each binary image not loaded from the shared cache during stackshots.
+ */
+
+/* From dyld/include/dyld_images.h */
+
+struct dyld_uuid_info {
+ user32_addr_t imageLoadAddress; /* base address image is mapped into */
+ uuid_t imageUUID; /* UUID of image */
+};
+
+struct dyld_uuid_info64 {
+ user64_addr_t imageLoadAddress; /* base address image is mapped into */
+ uuid_t imageUUID; /* UUID of image */
+};
+
+// FIXME: dyld is in C++, and some of the fields in dyld_all_image_infos are C++
+// native booleans. There must be a better way...
+typedef uint8_t bool;
+
+struct dyld_all_image_infos {
+ uint32_t version;
+ uint32_t infoArrayCount;
+ user32_addr_t infoArray;
+ user32_addr_t notification;
+ bool processDetachedFromSharedRegion;
+ bool libSystemInitialized;
+ user32_addr_t dyldImageLoadAddress;
+ user32_addr_t jitInfo;
+ user32_addr_t dyldVersion;
+ user32_addr_t errorMessage;
+ user32_addr_t terminationFlags;
+ user32_addr_t coreSymbolicationShmPage;
+ user32_addr_t systemOrderFlag;
+ user32_size_t uuidArrayCount; // dyld defines this as a uintptr_t despite it being a count
+ user32_addr_t uuidArray;
+};
+
+struct dyld_all_image_infos64 {
+ uint32_t version;
+ uint32_t infoArrayCount;
+ user64_addr_t infoArray;
+ user64_addr_t notification;
+ bool processDetachedFromSharedRegion;
+ bool libSystemInitialized;
+ user64_addr_t dyldImageLoadAddress;
+ user64_addr_t jitInfo;
+ user64_addr_t dyldVersion;
+ user64_addr_t errorMessage;
+ user64_addr_t terminationFlags;
+ user64_addr_t coreSymbolicationShmPage;
+ user64_addr_t systemOrderFlag;
+ user64_size_t uuidArrayCount; // dyld defines this as a uintptr_t despite it being a count
+ user64_addr_t uuidArray;
+};
static void *stack_snapshot_buf;
static uint32_t stack_snapshot_bufsize;
static int stack_snapshot_pid;
-static uint32_t stack_snapshot_options;
+static uint32_t stack_snapshot_flags;
+static uint32_t stack_snapshot_dispatch_offset;
static unsigned int old_debugger;
void
kdp_snapshot_preflight(int pid, void * tracebuf, uint32_t tracebuf_size,
- uint32_t options);
+ uint32_t flags, uint32_t dispatch_offset);
void
kdp_snapshot_postflight(void);
extern int
kdp_stackshot(int pid, void *tracebuf, uint32_t tracebuf_size,
- unsigned trace_options, uint32_t *pbytesTraced);
+ uint32_t flags, uint32_t dispatch_offset, uint32_t *pbytesTraced);
int
kdp_stack_snapshot_geterror(void);
/* Cache stack snapshot parameters in preparation for a trace */
void
-kdp_snapshot_preflight(int pid, void * tracebuf, uint32_t tracebuf_size, uint32_t options)
+kdp_snapshot_preflight(int pid, void * tracebuf, uint32_t tracebuf_size, uint32_t flags, uint32_t dispatch_offset)
{
stack_snapshot_pid = pid;
stack_snapshot_buf = tracebuf;
stack_snapshot_bufsize = tracebuf_size;
- stack_snapshot_options = options;
+ stack_snapshot_flags = flags;
+ stack_snapshot_dispatch_offset = dispatch_offset;
kdp_snapshot++;
/* Mark this debugger as active, since the polled mode driver that
* ordinarily does this may not be enabled (yet), or since KDB may be
if (kdp_snapshot && (!panic_active()) && (panic_caller == 0)) {
stack_snapshot_ret = kdp_stackshot(stack_snapshot_pid,
stack_snapshot_buf, stack_snapshot_bufsize,
- stack_snapshot_options, &stack_snapshot_bytes_traced);
+ stack_snapshot_flags, stack_snapshot_dispatch_offset,
+ &stack_snapshot_bytes_traced);
return;
}
extern char osversion[];
__private_extern__ void panic_display_system_configuration(void) {
- static boolean_t config_displayed = FALSE;
+ static volatile boolean_t config_displayed = FALSE;
panic_display_process_name();
if (config_displayed == FALSE) {
+ config_displayed = TRUE;
kdb_printf("\nMac OS version:\n%s\n",
(osversion[0] != 0) ? osversion : "Not yet set");
kdb_printf("\nKernel version:\n%s\n",version);
panic_display_model_name();
panic_display_uptime();
- config_displayed = TRUE;
+#if defined(__i386__) || defined(__x86_64__)
+ pmap_pagetable_corruption_msg_log(&kdb_printf);
+#endif /* i386 || x86_64 */
panic_display_zprint();
kext_dump_panic_lists(&kdb_log);
}
uint32_t snapshot_magic;
uint32_t nkern_frames;
uint32_t nuser_frames;
- int32_t pid;
uint64_t wait_event;
uint64_t continuation;
uint64_t thread_id;
int32_t state;
char ss_flags;
+} __attribute__ ((packed));
+
+struct task_snapshot {
+ uint32_t snapshot_magic;
+ int32_t pid;
+ uint32_t nloadinfos;
+ char ss_flags;
/* We restrict ourselves to a statically defined
* (current as of 2009) length for the
* p_comm string, due to scoping issues (osfmk/bsd and user/kernel
kHasDispatchSerial = 0x4
};
-enum {STACKSHOT_GET_DQ = 1};
-#define STACKSHOT_DISPATCH_OFFSET_MASK 0xffff0000
-#define STACKSHOT_DISPATCH_OFFSET_SHIFT 16
+enum {
+ STACKSHOT_GET_DQ = 0x1,
+ STACKSHOT_SAVE_LOADINFO = 0x2
+};
+
+#define STACKSHOT_THREAD_SNAPSHOT_MAGIC 0xfeedface
+#define STACKSHOT_TASK_SNAPSHOT_MAGIC 0xdecafbad
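
Editorial sketch, not part of the patch: with the packed trace_options word gone, a hypothetical caller now passes the flag bits and the dispatch-queue serial-number offset as separate arguments (pid, tracebuf, tracebuf_size and dq_serialno_offset are placeholders); the old STACKSHOT_DISPATCH_OFFSET_MASK/SHIFT packing no longer applies.

    uint32_t flags = STACKSHOT_GET_DQ | STACKSHOT_SAVE_LOADINFO;

    kdp_snapshot_preflight(pid, tracebuf, tracebuf_size,
                           flags, dq_serialno_offset);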
#endif /* __APPLE_API_UNSTABLE */
#endif /* __APPLE_API_PRIVATE */
extern unsigned int systemLogDiags;
extern char debug_buf[];
+extern unsigned int debug_boot_arg;
#ifdef MACH_KERNEL_PRIVATE
processor_data_init(processor);
processor->processor_list = NULL;
+ pset_lock(pset);
+ if (pset->cpu_set_count++ == 0)
+ pset->cpu_set_low = pset->cpu_set_hi = cpu_id;
+ else {
+ pset->cpu_set_low = (cpu_id < pset->cpu_set_low)? cpu_id: pset->cpu_set_low;
+ pset->cpu_set_hi = (cpu_id > pset->cpu_set_hi)? cpu_id: pset->cpu_set_hi;
+ }
+ pset_unlock(pset);
+
simple_lock(&processor_list_lock);
if (processor_list == NULL)
processor_list = processor;
queue_init(&pset->idle_queue);
pset->processor_count = 0;
pset->low_pri = pset->low_count = PROCESSOR_NULL;
+ pset->cpu_set_low = pset->cpu_set_hi = 0;
+ pset->cpu_set_count = 0;
pset_lock_init(pset);
pset->pset_self = IP_NULL;
pset->pset_name_self = IP_NULL;
int processor_count;
+ int cpu_set_low, cpu_set_hi;
+ int cpu_set_count;
+
decl_simple_lock_data(,sched_lock) /* lock for above */
struct ipc_port * pset_self; /* port for operations */
#define pset_deallocate(x)
#define pset_reference(x)
-extern void machine_run_count(
- uint32_t count);
+extern void machine_run_count(
+ uint32_t count);
+
+extern boolean_t machine_processor_is_inactive(
+ processor_t processor);
-extern boolean_t machine_cpu_is_inactive(
- int cpu_id);
+extern processor_t machine_choose_processor(
+ processor_set_t pset,
+ processor_t processor);
#else /* MACH_KERNEL_PRIVATE */
#define BASEPRI_FOREGROUND (BASEPRI_DEFAULT + 16) /* 47 */
#define BASEPRI_BACKGROUND (BASEPRI_DEFAULT + 15) /* 46 */
#define BASEPRI_DEFAULT (MAXPRI_USER - (NRQS / 4)) /* 31 */
+#define MAXPRI_THROTTLE (MINPRI + 4) /* 4 */
#define MINPRI_USER MINPRI /* 0 */
/*
pset_lock(pset);
- inactive_state = processor->state != PROCESSOR_SHUTDOWN && machine_cpu_is_inactive(processor->cpu_id);
+ inactive_state = processor->state != PROCESSOR_SHUTDOWN && machine_processor_is_inactive(processor);
simple_lock(&rt_lock);
thread->realtime.deadline = UINT64_MAX;
thread->reason |= AST_QUANTUM;
}
- }
- else {
+ } else {
/*
* For non-realtime threads treat a tiny
* remaining quantum as an expired quantum
/*
* Waiting.
*/
+ boolean_t should_terminate = FALSE;
+
+ /* Only the first call to thread_dispatch
+ * after explicit termination should add
+ * the thread to the termination queue
+ */
+ if ((thread->state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) {
+ should_terminate = TRUE;
+ thread->state |= TH_TERMINATE2;
+ }
+
thread->state &= ~TH_RUN;
if (thread->sched_mode & TH_MODE_TIMESHARE)
sched_share_decr();
sched_run_decr();
+ (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
+
if (thread->wake_active) {
thread->wake_active = FALSE;
thread_unlock(thread);
wake_unlock(thread);
- (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
-
- if (thread->state & TH_TERMINATE)
+ if (should_terminate)
thread_terminate_enqueue(thread);
}
}
* choose_processor:
*
* Choose a processor for the thread, beginning at
+ * the pset. Accepts an optional processor hint in
* the pset.
*
* Returns a processor, possibly from a different pset.
static processor_t
choose_processor(
processor_set_t pset,
+ processor_t processor,
thread_t thread)
{
processor_set_t nset, cset = pset;
- processor_t processor = thread->last_processor;
processor_meta_t pmeta = PROCESSOR_META_NULL;
/*
- * Prefer the last processor, when appropriate.
+ * Prefer the hinted processor, when appropriate.
*/
if (processor != PROCESSOR_NULL) {
+ processor_t mprocessor;
+
if (processor->processor_meta != PROCESSOR_META_NULL)
processor = processor->processor_meta->primary;
+ mprocessor = machine_choose_processor(pset, processor);
+ if (mprocessor != PROCESSOR_NULL)
+ processor = mprocessor;
+
if (processor->processor_set != pset || processor->state == PROCESSOR_INACTIVE ||
processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)
processor = PROCESSOR_NULL;
if (processor->state == PROCESSOR_IDLE)
return (processor);
}
+ else {
+ processor = machine_choose_processor(pset, processor);
+
+ if (processor != PROCESSOR_NULL) {
+ if (processor->processor_set != pset || processor->state == PROCESSOR_INACTIVE ||
+ processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE)
+ processor = PROCESSOR_NULL;
+ else
+ if (processor->state == PROCESSOR_IDLE)
+ return (processor);
+ }
+ }
/*
* Iterate through the processor sets to locate
pset = thread->affinity_set->aset_pset;
pset_lock(pset);
- processor = choose_processor(pset, thread);
+ processor = choose_processor(pset, PROCESSOR_NULL, thread);
}
else
if (thread->last_processor != PROCESSOR_NULL) {
*/
if (thread->sched_pri <= processor->current_pri ||
thread->realtime.deadline >= processor->deadline)
- processor = choose_processor(pset, thread);
+ processor = choose_processor(pset, PROCESSOR_NULL, thread);
}
else
- processor = choose_processor(pset, thread);
+ processor = choose_processor(pset, processor, thread);
}
else {
/*
pset = choose_next_pset(pset);
pset_lock(pset);
- processor = choose_processor(pset, thread);
+ processor = choose_processor(pset, PROCESSOR_NULL, thread);
task->pset_hint = processor->processor_set;
}
}
processor->processor_meta->primary != processor)
return (AST_PREEMPT);
- if (machine_cpu_is_inactive(processor->cpu_id))
+ if (machine_processor_is_inactive(processor))
return (AST_PREEMPT);
if (processor->active_thread->state & TH_SUSP)
(void)splsched();
- if (processor->state == PROCESSOR_INACTIVE && !machine_cpu_is_inactive(processor->cpu_id))
+ if (processor->state == PROCESSOR_INACTIVE && !machine_processor_is_inactive(processor))
break;
}
extern kern_return_t idle_thread_create(
processor_t processor);
-/* Start thread running */
-extern void thread_bootstrap_return(void);
-
/* Continuation return from syscall */
extern void thread_syscall_return(
kern_return_t ret);
thread_t thread,
wait_result_t result);
+/* Start thread running */
+extern void thread_bootstrap_return(void);
+
/* Return from exception (BSD-visible interface) */
extern void thread_exception_return(void) __dead2;
task->role = info->role;
}
}
+ else
+ if (info->role == TASK_THROTTLE_APPLICATION) {
+ task_priority(task, MAXPRI_THROTTLE, MAXPRI_THROTTLE);
+ task->role = info->role;
+ }
+ else
+ if (info->role == TASK_DEFAULT_APPLICATION) {
+ task_priority(task, BASEPRI_DEFAULT, MAXPRI_USER);
+ task->role = info->role;
+ }
else
result = KERN_INVALID_ARGUMENT;
kern_return_t
thread_create_workq(
task_t task,
+ thread_continue_t thread_return,
thread_t *new_thread)
{
kern_return_t result;
if (task == TASK_NULL || task == kernel_task)
return (KERN_INVALID_ARGUMENT);
- result = thread_create_internal(task, -1, (thread_continue_t)thread_bootstrap_return,
- TH_OPTION_NOCRED | TH_OPTION_NOSUSP, &thread);
+ result = thread_create_internal(task, -1, thread_return, TH_OPTION_NOCRED | TH_OPTION_NOSUSP, &thread);
if (result != KERN_SUCCESS)
return (result);
#define TH_RUN 0x04 /* running or on runq */
#define TH_UNINT 0x08 /* waiting uninteruptibly */
#define TH_TERMINATE 0x10 /* halted at termination */
+#define TH_TERMINATE2 0x20 /* added to termination queue */
#define TH_IDLE 0x80 /* idling processor */
extern kern_return_t thread_create_workq(
task_t task,
+ thread_continue_t thread_return,
thread_t *new_thread);
extern void thread_yield_internal(
timer_call_data_t delayed_timer;
struct wait_queue idle_wqueue;
+ struct wait_queue daemon_wqueue;
uint32_t idle_count, active_count;
};
timer_call_setup(&group->delayed_timer, thread_call_delayed_timer, group);
wait_queue_init(&group->idle_wqueue, SYNC_POLICY_FIFO);
+ wait_queue_init(&group->daemon_wqueue, SYNC_POLICY_FIFO);
queue_init(&thread_call_internal_queue);
for (
else
if (!thread_call_daemon_awake) {
thread_call_daemon_awake = TRUE;
- thread_wakeup_one(&thread_call_daemon_awake);
+ wait_queue_wakeup_one(&group->daemon_wqueue, NULL, THREAD_AWAKENED);
}
}
simple_lock(&thread_call_lock);
}
- thread_call_daemon_awake = FALSE;
- assert_wait(&thread_call_daemon_awake, THREAD_UNINT);
+ thread_call_daemon_awake = FALSE;
+ wait_queue_assert_wait(&group->daemon_wqueue, NULL, THREAD_UNINT, 0);
simple_unlock(&thread_call_lock);
(void) spllo();
TASK_FOREGROUND_APPLICATION,
TASK_BACKGROUND_APPLICATION,
TASK_CONTROL_APPLICATION,
- TASK_GRAPHICS_SERVER
+ TASK_GRAPHICS_SERVER,
+ TASK_THROTTLE_APPLICATION,
+ TASK_DEFAULT_APPLICATION
};
typedef enum task_role task_role_t;
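
Editorial sketch, not part of the patch: a hypothetical client would select the new throttled role through the existing TASK_CATEGORY_POLICY flavor (task_category_policy_data_t, task_policy_set() and TASK_CATEGORY_POLICY_COUNT are assumed from the surrounding task policy interfaces, not shown in this patch); per the earlier hunk this clamps the task to MAXPRI_THROTTLE.

    task_category_policy_data_t info;

    info.role = TASK_THROTTLE_APPLICATION;
    task_policy_set(task, TASK_CATEGORY_POLICY,
                    (task_policy_t)&info, TASK_CATEGORY_POLICY_COUNT);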
#define VM_PROT_WANTS_COPY ((vm_prot_t) 0x10)
+/*
+ * The caller wants this memory region treated as if it had a valid
+ * code signature.
+ */
+
+#define VM_PROT_TRUSTED ((vm_prot_t) 0x20)
+
+
#endif /* _MACH_VM_PROT_H_ */
}
boolean_t
-machine_cpu_is_inactive(__unused int num)
+machine_processor_is_inactive(__unused processor_t processor)
{
return(FALSE);
}
+processor_t
+machine_choose_processor(__unused processor_set_t pset, processor_t processor)
+{
+ return (processor);
+}
+
vm_offset_t ml_stack_remaining(void)
{
uintptr_t local = (uintptr_t) &local;
/* Page might have been tainted before or not; now it
* definitively is. If the page wasn't tainted, we must
* disconnect it from all pmaps later. */
- must_disconnect = ~m->cs_tainted;
+ must_disconnect = !m->cs_tainted;
m->cs_tainted = TRUE;
cs_enter_tainted_accepted++;
}
submap_end = offset + (end - start);
submap_start = offset;
+
+ vm_map_lock_read(sub_map);
if(vm_map_lookup_entry(sub_map, offset, &entry)) {
remove_size = (entry->vme_end - entry->vme_start);
}
}
entry = entry->vme_next;
- }
+ }
+ vm_map_unlock_read(sub_map);
return;
}
map->switch_protect=val;
vm_map_unlock(map);
}
+
+/* Add (generate) code signature for memory range */
+#if CONFIG_DYNAMIC_CODE_SIGNING
+kern_return_t vm_map_sign(vm_map_t map,
+ vm_map_offset_t start,
+ vm_map_offset_t end)
+{
+ vm_map_entry_t entry;
+ vm_page_t m;
+ vm_object_t object;
+
+ /*
+ * Vet all the input parameters and current type and state of the
+ * underlying object. Return with an error if anything is amiss.
+ */
+ if (map == VM_MAP_NULL)
+ return(KERN_INVALID_ARGUMENT);
+
+ vm_map_lock_read(map);
+
+ if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
+ /*
+ * Must pass a valid non-submap address.
+ */
+ vm_map_unlock_read(map);
+ return(KERN_INVALID_ADDRESS);
+ }
+
+ if((entry->vme_start > start) || (entry->vme_end < end)) {
+ /*
+ * Map entry doesn't cover the requested range. Not handling
+ * this situation currently.
+ */
+ vm_map_unlock_read(map);
+ return(KERN_INVALID_ARGUMENT);
+ }
+
+ object = entry->object.vm_object;
+ if (object == VM_OBJECT_NULL) {
+ /*
+ * Object must already be present or we can't sign.
+ */
+ vm_map_unlock_read(map);
+ return KERN_INVALID_ARGUMENT;
+ }
+
+ vm_object_lock(object);
+ vm_map_unlock_read(map);
+
+ while(start < end) {
+ uint32_t refmod;
+
+ m = vm_page_lookup(object, start - entry->vme_start + entry->offset );
+ if (m==VM_PAGE_NULL) {
+ /* should we try to fault a page here? we can probably
+ * demand it exists and is locked for this request */
+ vm_object_unlock(object);
+ return KERN_FAILURE;
+ }
+ /* deal with special page status */
+ if (m->busy ||
+ (m->unusual && (m->error || m->restart || m->private || m->absent))) {
+ vm_object_unlock(object);
+ return KERN_FAILURE;
+ }
+
+ /* Page is OK... now "validate" it */
+ /* This is the place where we'll call out to create a code
+ * directory, later */
+ m->cs_validated = TRUE;
+
+ /* The page is now "clean" for codesigning purposes. That means
+ * we don't consider it as modified (wpmapped) anymore. But
+ * we'll disconnect the page so we note any future modification
+ * attempts. */
+ m->wpmapped = FALSE;
+ refmod = pmap_disconnect(m->phys_page);
+
+ /* Pull the dirty status from the pmap, since we cleared the
+ * wpmapped bit */
+ if ((refmod & VM_MEM_MODIFIED) && !m->dirty) {
+ m->dirty = TRUE;
+ }
+
+ /* On to the next page */
+ start += PAGE_SIZE;
+ }
+ vm_object_unlock(object);
+
+ return KERN_SUCCESS;
+}
+#endif
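
Editorial sketch, not part of the patch: a hypothetical CONFIG_DYNAMIC_CODE_SIGNING caller would hand vm_map_sign() a page-aligned range lying within a single, already-populated map entry (current_map(), vm_map_trunc_page()/vm_map_round_page(), addr and len are assumed helpers and placeholders, not shown in this patch):

    kern_return_t kr;

    kr = vm_map_sign(current_map(),
                     vm_map_trunc_page(addr),
                     vm_map_round_page(addr + len));
    if (kr != KERN_SUCCESS)
        return kr;   /* range spans entries, or a page was absent or busy */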
int *flags,
int force_data_sync);
+#if CONFIG_DYNAMIC_CODE_SIGNING
+extern kern_return_t vm_map_sign(vm_map_t map,
+ vm_map_offset_t start,
+ vm_map_offset_t end);
+#endif
+
__END_DECLS
#endif /* KERNEL_PRIVATE */
pmap = thread->map->pmap;
+
+ assert((vm_offset_t)kernel_addr >= VM_MIN_KERNEL_AND_KEXT_ADDRESS ||
+ copy_type == COPYINPHYS || copy_type == COPYOUTPHYS);
+
/* Sanity and security check for addresses to/from a user */
- if ((copy_type == COPYIN ||
- copy_type == COPYINSTR ||
- copy_type == COPYOUT) &&
- (pmap != kernel_pmap) &&
- ((vm_offset_t)kernel_addr < VM_MIN_KERNEL_AND_KEXT_ADDRESS ||
- !IS_USERADDR64_CANONICAL(user_addr))) {
- error = EACCES;
+
+ if (((pmap != kernel_pmap) && (use_kernel_map == 0)) &&
+ ((nbytes && (user_addr+nbytes <= user_addr)) || ((user_addr + nbytes) > vm_map_max(thread->map)))) {
+ error = EFAULT;
goto out;
}
*/
#include <string.h>
-#include <norma_vm.h>
#include <mach_kdb.h>
#include <mach_ldebug.h>
#include <i386/mp_desc.h>
-/* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */
-#ifdef DEBUGINTERRUPTS
-#define pmap_intr_assert() { \
- if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \
- panic("pmap interrupt assert %s, %d",__FILE__, __LINE__); \
-}
-#else
-#define pmap_intr_assert()
-#endif
#ifdef IWANTTODEBUG
#undef DEBUG
* Forward declarations for internal functions.
*/
-void pmap_remove_range(
- pmap_t pmap,
- vm_map_offset_t va,
- pt_entry_t *spte,
- pt_entry_t *epte);
void phys_attribute_clear(
ppnum_t phys,
const boolean_t cpu_64bit = TRUE; /* Mais oui! */
-/*
- * when spinning through pmap_remove
- * ensure that we don't spend too much
- * time with preemption disabled.
- * I'm setting the current threshold
- * to 20us
- */
-#define MAX_PREEMPTION_LATENCY_NS 20000
-
uint64_t max_preemption_latency_tsc = 0;
-
-/*
- * Private data structures.
- */
-
-/*
- * For each vm_page_t, there is a list of all currently
- * valid virtual mappings of that page. An entry is
- * a pv_rooted_entry_t; the list is the pv_table.
- *
- * N.B. with the new combo rooted/hashed scheme it is
- * only possibly to remove individual non-rooted entries
- * if they are found via the hashed chains as there is no
- * way to unlink the singly linked hashed entries if navigated to
- * via the queue list off the rooted entries. Think of it as
- * hash/walk/pull, keeping track of the prev pointer while walking
- * the singly linked hash list. All of this is to save memory and
- * keep both types of pv_entries as small as possible.
- */
-
-/*
-
-PV HASHING Changes - JK 1/2007
-
-Pve's establish physical to virtual mappings. These are used for aliasing of a
-physical page to (potentially many) virtual addresses within pmaps. In the
-previous implementation the structure of the pv_entries (each 16 bytes in size) was
-
-typedef struct pv_entry {
- struct pv_entry_t next;
- pmap_t pmap;
- vm_map_offset_t va;
-} *pv_entry_t;
-
-An initial array of these is created at boot time, one per physical page of
-memory, indexed by the physical page number. Additionally, a pool of entries
-is created from a pv_zone to be used as needed by pmap_enter() when it is
-creating new mappings. Originally, we kept this pool around because the code
-in pmap_enter() was unable to block if it needed an entry and none were
-available - we'd panic. Some time ago I restructured the pmap_enter() code
-so that for user pmaps it can block while zalloc'ing a pv structure and restart,
-removing a panic from the code (in the case of the kernel pmap we cannot block
-and still panic, so, we keep a separate hot pool for use only on kernel pmaps).
-The pool has not been removed since there is a large performance gain keeping
-freed pv's around for reuse and not suffering the overhead of zalloc for every
-new pv we need.
-
-As pmap_enter() created new mappings it linked the new pve's for them off the
-fixed pv array for that ppn (off the next pointer). These pve's are accessed
-for several operations, one of them being address space teardown. In that case,
-we basically do this
-
- for (every page/pte in the space) {
- calc pve_ptr from the ppn in the pte
- for (every pv in the list for the ppn) {
- if (this pv is for this pmap/vaddr) {
- do housekeeping
- unlink/free the pv
- }
- }
- }
-
-The problem arose when we were running, say 8000 (or even 2000) apache or
-other processes and one or all terminate. The list hanging off each pv array
-entry could have thousands of entries. We were continuously linearly searching
-each of these lists as we stepped through the address space we were tearing
-down. Because of the locks we hold, the likely cache miss for each node,
-and the interrupt disabling needed for MP safety, the system became completely unresponsive
-for many seconds while we did this.
-
-Realizing that pve's are accessed in two distinct ways (linearly running the
-list by ppn for operations like pmap_page_protect and finding and
-modifying/removing a single pve as part of pmap_enter processing) has led to
-modifying the pve structures and databases.
-
-There are now two types of pve structures. A "rooted" structure which is
-basically the original structure accessed in an array by ppn, and a ''hashed''
-structure accessed on a hash list via a hash of [pmap, vaddr]. These have been
-designed with the two goals of minimizing wired memory and making the lookup of
-a ppn faster. Since a vast majority of pages in the system are not aliased
-and hence represented by a single pv entry I've kept the rooted entry size as
-small as possible because there is one of these dedicated for every physical
-page of memory. The hashed pve's are larger due to the addition of the hash
-link and the ppn entry needed for matching while running the hash list to find
-the entry we are looking for. This way, only systems that have lots of
-aliasing (like 2000+ httpd procs) will pay the extra memory price. Both
-structures have the same first three fields allowing some simplification in
-the code.
-
-They have these shapes
-
-typedef struct pv_rooted_entry {
- queue_head_t qlink;
- vm_map_offset_t va;
- pmap_t pmap;
-} *pv_rooted_entry_t;
-
-
-typedef struct pv_hashed_entry {
- queue_head_t qlink;
- vm_map_offset_t va;
- pmap_t pmap;
- ppnum_t ppn;
- struct pv_hashed_entry *nexth;
-} *pv_hashed_entry_t;
-
-The main flow difference is that the code is now aware of the rooted entry and
-the hashed entries. Code that runs the pv list still starts with the rooted
-entry and then continues down the qlink onto the hashed entries. Code that is
-looking up a specific pv entry first checks the rooted entry and then hashes
-and runs the hash list for the match. The hash list lengths are much smaller
-than the original pv lists that contained all aliases for the specific ppn.
-
-*/
-
-typedef struct pv_rooted_entry {
- /* first three entries must match pv_hashed_entry_t */
- queue_head_t qlink;
- vm_map_offset_t va; /* virtual address for mapping */
- pmap_t pmap; /* pmap where mapping lies */
-} *pv_rooted_entry_t;
-
-#define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0)
-
-pv_rooted_entry_t pv_head_table; /* array of entries, one per page */
-
-typedef struct pv_hashed_entry {
- /* first three entries must match pv_rooted_entry_t */
- queue_head_t qlink;
- vm_map_offset_t va;
- pmap_t pmap;
- ppnum_t ppn;
- struct pv_hashed_entry *nexth;
-} *pv_hashed_entry_t;
-
-#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
-
-#define NPVHASH 4095 /* MUST BE 2^N - 1 */
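To make the rooted/hashed split described above concrete, here is a minimal lookup sketch (not part of the diff): it checks the rooted entry for the page first, then hashes [pmap, vaddr] and walks the short chain. It uses the structures declared above and the pvhashidx()/pvhash() helpers defined further below; locking (LOCK_PVH / LOCK_PV_HASH) is omitted for brevity.

/*
 * Lookup sketch only: rooted entry first, then the hash chain.
 */
static pv_hashed_entry_t
pv_lookup_sketch(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t ppn)
{
	pv_rooted_entry_t	pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_hashed_entry_t	pvh_e;

	/* the common, unaliased case: the rooted entry is the mapping */
	if (pv_h->pmap == pmap && pv_h->va == vaddr)
		return (pv_hashed_entry_t) pv_h;	/* same first three fields */

	/* aliased page: hash [pmap, vaddr] and run the short hash chain */
	for (pvh_e = *pvhash(pvhashidx(pmap, vaddr));
	     pvh_e != PV_HASHED_ENTRY_NULL;
	     pvh_e = pvh_e->nexth) {
		if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn)
			return pvh_e;
	}
	return PV_HASHED_ENTRY_NULL;
}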
pv_hashed_entry_t *pv_hash_table; /* hash lists */
uint32_t npvhash = 0;
-//#define PV_DEBUG 1 /* uncomment to enable some PV debugging code */
-#ifdef PV_DEBUG
-#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
-#else
-#define CHK_NPVHASH(x)
-#endif
-
pv_hashed_entry_t pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
pv_hashed_entry_t pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
decl_simple_lock_data(,pv_hashed_free_list_lock)
int pv_hashed_free_count = 0;
int pv_hashed_kern_free_count = 0;
-#define PV_HASHED_LOW_WATER_MARK 5000
-#define PV_HASHED_KERN_LOW_WATER_MARK 100
-#define PV_HASHED_ALLOC_CHUNK 2000
-#define PV_HASHED_KERN_ALLOC_CHUNK 50
-thread_call_t mapping_adjust_call;
-static thread_call_data_t mapping_adjust_call_data;
-uint32_t mappingrecurse = 0;
-
-#define PV_HASHED_ALLOC(pvh_e) { \
- simple_lock(&pv_hashed_free_list_lock); \
- if ((pvh_e = pv_hashed_free_list) != 0) { \
- pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
- pv_hashed_free_count--; \
- if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \
- if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
- thread_call_enter(mapping_adjust_call); \
- } \
- simple_unlock(&pv_hashed_free_list_lock); \
-}
-
-#define PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
- simple_lock(&pv_hashed_free_list_lock); \
- pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; \
- pv_hashed_free_list = pvh_eh; \
- pv_hashed_free_count += pv_cnt; \
- simple_unlock(&pv_hashed_free_list_lock); \
-}
-
-#define PV_HASHED_KERN_ALLOC(pvh_e) { \
- simple_lock(&pv_hashed_kern_free_list_lock); \
- if ((pvh_e = pv_hashed_kern_free_list) != 0) { \
- pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next; \
- pv_hashed_kern_free_count--; \
- if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK)\
- if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
- thread_call_enter(mapping_adjust_call); \
- } \
- simple_unlock(&pv_hashed_kern_free_list_lock); \
-}
-#define PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) { \
- simple_lock(&pv_hashed_kern_free_list_lock); \
- pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; \
- pv_hashed_kern_free_list = pvh_eh; \
- pv_hashed_kern_free_count += pv_cnt; \
- simple_unlock(&pv_hashed_kern_free_list_lock); \
-}
zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry structures */
*/
char *pv_lock_table; /* pointer to array of bits */
-#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
+
char *pv_hash_lock_table;
-#define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE)
+
/*
* First and last physical addresses that we maintain any information
static struct vm_object kpdptobj_object_store;
/*
- * Index into pv_head table, its lock bits, and the modify/reference and managed bits
- */
-
-#define pa_index(pa) (i386_btop(pa))
-#define ppn_to_pai(ppn) ((int)ppn)
-
-#define pai_to_pvh(pai) (&pv_head_table[pai])
-#define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table)
-#define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table)
-
-static inline uint32_t
-pvhashidx(pmap_t pmap, vm_offset_t va)
-{
- return ((uint32_t)(uint64_t)pmap ^
- ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
- npvhash;
-}
-#define pvhash(idx) (&pv_hash_table[idx])
-
-#define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table)
-#define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table)
-
-/*
- * Array of physical page attribites for managed pages.
+ * Array of physical page attributes for managed pages.
* One byte per physical page.
*/
char *pmap_phys_attributes;
unsigned int last_managed_page = 0;
-#define IS_MANAGED_PAGE(x) \
- ((unsigned int)(x) <= last_managed_page && \
- (pmap_phys_attributes[x] & PHYS_MANAGED))
-
-/*
- * Physical page attributes. Copy bits from PTE definition.
- */
-#define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */
-#define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */
-#define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */
-
-/*
- * Amount of virtual memory mapped by one
- * page-directory entry.
- */
-#define PDE_MAPPED_SIZE (pdetova(1))
uint64_t pde_mapped_size = PDE_MAPPED_SIZE;
-/*
- * Locking and TLB invalidation
- */
-
-/*
- * Locking Protocols: (changed 2/2007 JK)
- *
- * There are two structures in the pmap module that need locking:
- * the pmaps themselves, and the per-page pv_lists (which are locked
- * by locking the pv_lock_table entry that corresponds to the pv_head
- * for the list in question.) Most routines want to lock a pmap and
- * then do operations in it that require pv_list locking -- however
- * pmap_remove_all and pmap_copy_on_write operate on a physical page
- * basis and want to do the locking in the reverse order, i.e. lock
- * a pv_list and then go through all the pmaps referenced by that list.
- *
- * The system wide pmap lock has been removed. Now, paths take a lock
- * on the pmap before changing its 'shape' and the reverse order lockers
- * (coming in by phys ppn) take a lock on the corresponding pv and then
- * retest to be sure nothing changed during the window before they locked
- * and can then run up/down the pv lists holding the list lock. This also
- * lets the pmap layer run (nearly completely) interrupt enabled, unlike
- * previously.
- */
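A minimal sketch (not part of the diff) of the reverse-order discipline described above, as the ppn-based paths below apply it: take the pv lock for the physical page, then re-read the PTE to confirm the mapping did not change during the unlocked window. Managed-page checks and the rest of the bookkeeping are omitted.

/*
 * Retest-after-lock sketch only; mirrors the pattern used in
 * pmap_remove_range() and pmap_enter() below.
 */
static void
pv_locked_retest_sketch(pt_entry_t *cpte)
{
	pmap_paddr_t	pa;
	int		pai;

	pa = pte_to_pa(*cpte);
	if (pa == 0)
		return;				/* no mapping to examine */
	pai = pa_index(pa);

	LOCK_PVH(pai);				/* pv lock for this physical page */

	if (pte_to_pa(*cpte) != pa) {		/* retest: changed while unlocked */
		UNLOCK_PVH(pai);
		return;
	}
	/* ... safe to run/modify the pv list for pai here ... */

	UNLOCK_PVH(pai);
}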
-
-/*
- * PV locking
- */
-
-#define LOCK_PVH(index) { \
- mp_disable_preemption(); \
- lock_pvh_pai(index); \
-}
-
-#define UNLOCK_PVH(index) { \
- unlock_pvh_pai(index); \
- mp_enable_preemption(); \
-}
-/*
- * PV hash locking
- */
-
-#define LOCK_PV_HASH(hash) lock_hash_hash(hash)
-#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash)
-
unsigned pmap_memory_region_count;
unsigned pmap_memory_region_current;
struct zone *pmap_zone; /* zone of pmap structures */
-int pmap_debug = 0; /* flag for debugging prints */
-
unsigned int inuse_ptepages_count = 0;
addr64_t kernel64_cr3;
pt_entry_t *DMAP1, *DMAP2;
caddr_t DADDR1;
caddr_t DADDR2;
-
-/*
- * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
- * properly deals with the anchor.
- * must be called with the hash locked, does not unlock it
- */
-
-static inline void
-pmap_pvh_unlink(pv_hashed_entry_t pvh)
-{
- pv_hashed_entry_t curh;
- pv_hashed_entry_t *pprevh;
- int pvhash_idx;
-
- CHK_NPVHASH();
- pvhash_idx = pvhashidx(pvh->pmap, pvh->va);
-
- pprevh = pvhash(pvhash_idx);
-
-#if PV_DEBUG
- if (NULL == *pprevh)
- panic("pvh_unlink null anchor"); /* JK DEBUG */
-#endif
- curh = *pprevh;
-
- while (PV_HASHED_ENTRY_NULL != curh) {
- if (pvh == curh)
- break;
- pprevh = &curh->nexth;
- curh = curh->nexth;
- }
- if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
- *pprevh = pvh->nexth;
- return;
-}
-
-static inline void
-pv_hash_add(pv_hashed_entry_t pvh_e,
- pv_rooted_entry_t pv_h)
-{
- pv_hashed_entry_t *hashp;
- int pvhash_idx;
-
- CHK_NPVHASH();
- pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
- LOCK_PV_HASH(pvhash_idx);
- insque(&pvh_e->qlink, &pv_h->qlink);
- hashp = pvhash(pvhash_idx);
-#if PV_DEBUG
- if (NULL==hashp)
- panic("pv_hash_add(%p) null hash bucket", pvh_e);
-#endif
- pvh_e->nexth = *hashp;
- *hashp = pvh_e;
- UNLOCK_PV_HASH(pvhash_idx);
-}
-
-static inline void
-pv_hash_remove(pv_hashed_entry_t pvh_e)
-{
- int pvhash_idx;
-
- CHK_NPVHASH();
- pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
- LOCK_PV_HASH(pvhash_idx);
- remque(&pvh_e->qlink);
- pmap_pvh_unlink(pvh_e);
- UNLOCK_PV_HASH(pvhash_idx);
-}
-
-/*
- * Remove pv list entry.
- * Called with pv_head_table entry locked.
- * Returns pv entry to be freed (or NULL).
- */
-static inline pv_hashed_entry_t
-pmap_pv_remove(pmap_t pmap,
- vm_map_offset_t vaddr,
- ppnum_t ppn)
-{
- pv_hashed_entry_t pvh_e;
- pv_rooted_entry_t pv_h;
- pv_hashed_entry_t *pprevh;
- int pvhash_idx;
- uint32_t pv_cnt;
-
- pvh_e = PV_HASHED_ENTRY_NULL;
- pv_h = pai_to_pvh(ppn_to_pai(ppn));
- if (pv_h->pmap == PMAP_NULL)
- panic("pmap_pv_remove(%p,%llu,%u): null pv_list!",
- pmap, vaddr, ppn);
-
- if (pv_h->va == vaddr && pv_h->pmap == pmap) {
- /*
- * Header is the pv_rooted_entry.
- * We can't free that. If there is a queued
- * entry after this one we remove that
- * from the ppn queue, we remove it from the hash chain
- * and copy it to the rooted entry. Then free it instead.
- */
- pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
- if (pv_h != (pv_rooted_entry_t) pvh_e) {
- /*
- * Entry queued to root, remove this from hash
- * and install as new root.
- */
- CHK_NPVHASH();
- pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
- LOCK_PV_HASH(pvhash_idx);
- remque(&pvh_e->qlink);
- pprevh = pvhash(pvhash_idx);
- if (PV_HASHED_ENTRY_NULL == *pprevh) {
- panic("pmap_pv_remove(%p,%llu,%u): "
- "empty hash, removing rooted",
- pmap, vaddr, ppn);
- }
- pmap_pvh_unlink(pvh_e);
- UNLOCK_PV_HASH(pvhash_idx);
- pv_h->pmap = pvh_e->pmap;
- pv_h->va = pvh_e->va; /* dispose of pvh_e */
- } else {
- /* none queued after rooted */
- pv_h->pmap = PMAP_NULL;
- pvh_e = PV_HASHED_ENTRY_NULL;
- }
- } else {
- /*
- * not removing rooted pv. find it on hash chain, remove from
- * ppn queue and hash chain and free it
- */
- CHK_NPVHASH();
- pvhash_idx = pvhashidx(pmap, vaddr);
- LOCK_PV_HASH(pvhash_idx);
- pprevh = pvhash(pvhash_idx);
- if (PV_HASHED_ENTRY_NULL == *pprevh) {
- panic("pmap_pv_remove(%p,%llu,%u): empty hash",
- pmap, vaddr, ppn);
- }
- pvh_e = *pprevh;
- pmap_pv_hashlist_walks++;
- pv_cnt = 0;
- while (PV_HASHED_ENTRY_NULL != pvh_e) {
- pv_cnt++;
- if (pvh_e->pmap == pmap &&
- pvh_e->va == vaddr &&
- pvh_e->ppn == ppn)
- break;
- pprevh = &pvh_e->nexth;
- pvh_e = pvh_e->nexth;
- }
- if (PV_HASHED_ENTRY_NULL == pvh_e)
- panic("pmap_pv_remove(%p,%llu,%u): pv not on hash",
- pmap, vaddr, ppn);
- pmap_pv_hashlist_cnts += pv_cnt;
- if (pmap_pv_hashlist_max < pv_cnt)
- pmap_pv_hashlist_max = pv_cnt;
- *pprevh = pvh_e->nexth;
- remque(&pvh_e->qlink);
- UNLOCK_PV_HASH(pvhash_idx);
- }
-
- return pvh_e;
-}
-
/*
* for legacy, returns the address of the pde entry.
* for 64 bit, causes the pdpt page containing the pde entry to be mapped,
}
}
-/*
- * Remove a range of hardware page-table entries.
- * The entries given are the first (inclusive)
- * and last (exclusive) entries for the VM pages.
- * The virtual address is the va for the first pte.
- *
- * The pmap must be locked.
- * If the pmap is not the kernel pmap, the range must lie
- * entirely within one pte-page. This is NOT checked.
- * Assumes that the pte-page exists.
- */
-
-void
-pmap_remove_range(
- pmap_t pmap,
- vm_map_offset_t start_vaddr,
- pt_entry_t *spte,
- pt_entry_t *epte)
-{
- pt_entry_t *cpte;
- pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
- pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
- pv_hashed_entry_t pvh_e;
- int pvh_cnt = 0;
- int num_removed, num_unwired, num_found;
- int pai;
- pmap_paddr_t pa;
- vm_map_offset_t vaddr;
-
- num_removed = 0;
- num_unwired = 0;
- num_found = 0;
-
- /* invalidate the PTEs first to "freeze" them */
- for (cpte = spte, vaddr = start_vaddr;
- cpte < epte;
- cpte++, vaddr += PAGE_SIZE_64) {
-
- pa = pte_to_pa(*cpte);
- if (pa == 0)
- continue;
- num_found++;
-
- if (iswired(*cpte))
- num_unwired++;
-
- pai = pa_index(pa);
-
- if (!IS_MANAGED_PAGE(pai)) {
- /*
- * Outside range of managed physical memory.
- * Just remove the mappings.
- */
- pmap_store_pte(cpte, 0);
- continue;
- }
-
- /* invalidate the PTE */
- pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID));
- }
-
- if (num_found == 0) {
- /* nothing was changed: we're done */
- goto update_counts;
- }
-
- /* propagate the invalidates to other CPUs */
-
- PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);
-
- for (cpte = spte, vaddr = start_vaddr;
- cpte < epte;
- cpte++, vaddr += PAGE_SIZE_64) {
-
- pa = pte_to_pa(*cpte);
- if (pa == 0)
- continue;
-
- pai = pa_index(pa);
-
- LOCK_PVH(pai);
-
- pa = pte_to_pa(*cpte);
- if (pa == 0) {
- UNLOCK_PVH(pai);
- continue;
- }
- num_removed++;
-
- /*
- * Get the modify and reference bits, then
- * nuke the entry in the page table
- */
- /* remember reference and change */
- pmap_phys_attributes[pai] |=
- (char) (*cpte & (PHYS_MODIFIED | PHYS_REFERENCED));
- /* completely invalidate the PTE */
- pmap_store_pte(cpte, 0);
-
- /*
- * Remove the mapping from the pvlist for this physical page.
- */
- pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t) pai);
-
- UNLOCK_PVH(pai);
-
- if (pvh_e != PV_HASHED_ENTRY_NULL) {
- pvh_e->qlink.next = (queue_entry_t) pvh_eh;
- pvh_eh = pvh_e;
-
- if (pvh_et == PV_HASHED_ENTRY_NULL) {
- pvh_et = pvh_e;
- }
- pvh_cnt++;
- }
- } /* for loop */
-
- if (pvh_eh != PV_HASHED_ENTRY_NULL) {
- PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
- }
-update_counts:
- /*
- * Update the counts
- */
-#if TESTING
- if (pmap->stats.resident_count < num_removed)
- panic("pmap_remove_range: resident_count");
-#endif
- assert(pmap->stats.resident_count >= num_removed);
- OSAddAtomic(-num_removed, &pmap->stats.resident_count);
-
-#if TESTING
- if (pmap->stats.wired_count < num_unwired)
- panic("pmap_remove_range: wired_count");
-#endif
- assert(pmap->stats.wired_count >= num_unwired);
- OSAddAtomic(-num_unwired, &pmap->stats.wired_count);
-
- return;
-}
-
/*
* Remove phys addr if mapped in specified map
*
}
-/*
- * Remove the given range of addresses
- * from the specified map.
- *
- * It is assumed that the start and end are properly
- * rounded to the hardware page size.
- */
-void
-pmap_remove(
- pmap_t map,
- addr64_t s64,
- addr64_t e64)
-{
- pt_entry_t *pde;
- pt_entry_t *spte, *epte;
- addr64_t l64;
- uint64_t deadline;
-
- pmap_intr_assert();
-
- if (map == PMAP_NULL || s64 == e64)
- return;
-
- PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
- map,
- (uint32_t) (s64 >> 32), s64,
- (uint32_t) (e64 >> 32), e64);
-
-
- PMAP_LOCK(map);
-
-#if 0
- /*
- * Check that address range in the kernel does not overlap the stacks.
- * We initialize local static min/max variables once to avoid making
- * 2 function calls for every remove. Note also that these functions
- * both return 0 before kernel stacks have been initialized, and hence
- * the panic is not triggered in this case.
- */
- if (map == kernel_pmap) {
- static vm_offset_t kernel_stack_min = 0;
- static vm_offset_t kernel_stack_max = 0;
-
- if (kernel_stack_min == 0) {
- kernel_stack_min = min_valid_stack_address();
- kernel_stack_max = max_valid_stack_address();
- }
- if ((kernel_stack_min <= s64 && s64 < kernel_stack_max) ||
- (kernel_stack_min < e64 && e64 <= kernel_stack_max))
- panic("pmap_remove() attempted in kernel stack");
- }
-#else
-
- /*
- * The values of kernel_stack_min and kernel_stack_max are no longer
- * relevant now that we allocate kernel stacks in the kernel map,
- * so the old code above no longer applies. If we wanted to check that
- * we weren't removing a mapping of a page in a kernel stack we'd
- * mark the PTE with an unused bit and check that here.
- */
-
-#endif
-
- deadline = rdtsc64() + max_preemption_latency_tsc;
-
- while (s64 < e64) {
- l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size - 1);
- if (l64 > e64)
- l64 = e64;
- pde = pmap_pde(map, s64);
-
- if (pde && (*pde & INTEL_PTE_VALID)) {
- if (*pde & INTEL_PTE_PS) {
- /*
- * If we're removing a superpage, pmap_remove_range()
- * must work on level 2 instead of level 1; and we're
- * only passing a single level 2 entry instead of a
- * level 1 range.
- */
- spte = pde;
- epte = spte+1; /* excluded */
- } else {
- spte = pmap_pte(map, (s64 & ~(pde_mapped_size - 1)));
- spte = &spte[ptenum(s64)];
- epte = &spte[intel_btop(l64 - s64)];
- }
- pmap_remove_range(map, s64, spte, epte);
- }
- s64 = l64;
- pde++;
-
- if (s64 < e64 && rdtsc64() >= deadline) {
- PMAP_UNLOCK(map)
- PMAP_LOCK(map)
- deadline = rdtsc64() + max_preemption_latency_tsc;
- }
- }
-
- PMAP_UNLOCK(map);
-
- PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END,
- map, 0, 0, 0, 0);
-
-}
-
-/*
- * Routine: pmap_page_protect
- *
- * Function:
- * Lower the permission for all mappings to a given
- * page.
- */
-void
-pmap_page_protect(
- ppnum_t pn,
- vm_prot_t prot)
-{
- pv_hashed_entry_t pvh_eh = PV_HASHED_ENTRY_NULL;
- pv_hashed_entry_t pvh_et = PV_HASHED_ENTRY_NULL;
- pv_hashed_entry_t nexth;
- int pvh_cnt = 0;
- pv_rooted_entry_t pv_h;
- pv_rooted_entry_t pv_e;
- pv_hashed_entry_t pvh_e;
- pt_entry_t *pte;
- int pai;
- pmap_t pmap;
- boolean_t remove;
-
- pmap_intr_assert();
- assert(pn != vm_page_fictitious_addr);
- if (pn == vm_page_guard_addr)
- return;
-
- pai = ppn_to_pai(pn);
-
- if (!IS_MANAGED_PAGE(pai)) {
- /*
- * Not a managed page.
- */
- return;
- }
- PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START,
- pn, prot, 0, 0, 0);
-
- /*
- * Determine the new protection.
- */
- switch (prot) {
- case VM_PROT_READ:
- case VM_PROT_READ | VM_PROT_EXECUTE:
- remove = FALSE;
- break;
- case VM_PROT_ALL:
- return; /* nothing to do */
- default:
- remove = TRUE;
- break;
- }
-
- pv_h = pai_to_pvh(pai);
-
- LOCK_PVH(pai);
-
-
- /*
- * Walk down PV list, if any, changing or removing all mappings.
- */
- if (pv_h->pmap == PMAP_NULL)
- goto done;
-
- pv_e = pv_h;
- pvh_e = (pv_hashed_entry_t) pv_e; /* cheat */
-
- do {
- vm_map_offset_t vaddr;
-
- pmap = pv_e->pmap;
- vaddr = pv_e->va;
- pte = pmap_pte(pmap, vaddr);
- if (0 == pte) {
- panic("pmap_page_protect() "
- "pmap=%p pn=0x%x vaddr=0x%llx\n",
- pmap, pn, vaddr);
- }
- nexth = (pv_hashed_entry_t) queue_next(&pvh_e->qlink);
-
- /*
- * Remove the mapping if new protection is NONE
- * or if write-protecting a kernel mapping.
- */
- if (remove || pmap == kernel_pmap) {
- /*
- * Remove the mapping, collecting dirty bits.
- */
- pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_VALID);
- PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE);
- pmap_phys_attributes[pai] |=
- *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
- pmap_store_pte(pte, 0);
-
-#if TESTING
- if (pmap->stats.resident_count < 1)
- panic("pmap_page_protect: resident_count");
-#endif
- assert(pmap->stats.resident_count >= 1);
- OSAddAtomic(-1, &pmap->stats.resident_count);
-
- /*
- * Deal with the pv_rooted_entry.
- */
-
- if (pv_e == pv_h) {
- /*
- * Fix up head later.
- */
- pv_h->pmap = PMAP_NULL;
- } else {
- /*
- * Delete this entry.
- */
- pv_hash_remove(pvh_e);
- pvh_e->qlink.next = (queue_entry_t) pvh_eh;
- pvh_eh = pvh_e;
-
- if (pvh_et == PV_HASHED_ENTRY_NULL)
- pvh_et = pvh_e;
- pvh_cnt++;
- }
- } else {
- /*
- * Write-protect.
- */
- pmap_update_pte(pte, *pte, *pte & ~INTEL_PTE_WRITE);
- PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE);
- }
- pvh_e = nexth;
- } while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h);
-
-
- /*
- * If pv_head mapping was removed, fix it up.
- */
- if (pv_h->pmap == PMAP_NULL) {
- pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
-
- if (pvh_e != (pv_hashed_entry_t) pv_h) {
- pv_hash_remove(pvh_e);
- pv_h->pmap = pvh_e->pmap;
- pv_h->va = pvh_e->va;
- pvh_e->qlink.next = (queue_entry_t) pvh_eh;
- pvh_eh = pvh_e;
-
- if (pvh_et == PV_HASHED_ENTRY_NULL)
- pvh_et = pvh_e;
- pvh_cnt++;
- }
- }
- if (pvh_eh != PV_HASHED_ENTRY_NULL) {
- PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
- }
-done:
- UNLOCK_PVH(pai);
-
- PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END,
- 0, 0, 0, 0, 0);
-}
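Usage note (not from this diff): per the protection switch at the top of pmap_page_protect() above, a read-only request downgrades every mapping of the page, while anything outside read/execute falls to the default case and removes the mappings entirely; pn below is a placeholder page number.

	pmap_page_protect(pn, VM_PROT_READ);	/* write-protect all mappings of pn */
	pmap_page_protect(pn, VM_PROT_NONE);	/* default case: remove all mappings */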
-
/*
* Routine:
}
}
-
-/*
- * Insert the given physical page (p) at
- * the specified virtual address (v) in the
- * target physical map with the protection requested.
- *
- * If specified, the page will be wired down, meaning
- * that the related pte cannot be reclaimed.
- *
- * NB: This is the only routine which MAY NOT lazy-evaluate
- * or lose information. That is, this routine must actually
- * insert this page into the given map NOW.
- */
-void
-pmap_enter(
- register pmap_t pmap,
- vm_map_offset_t vaddr,
- ppnum_t pn,
- vm_prot_t prot,
- unsigned int flags,
- boolean_t wired)
-{
- pt_entry_t *pte;
- pv_rooted_entry_t pv_h;
- int pai;
- pv_hashed_entry_t pvh_e;
- pv_hashed_entry_t pvh_new;
- pt_entry_t template;
- pmap_paddr_t old_pa;
- pmap_paddr_t pa = (pmap_paddr_t) i386_ptob(pn);
- boolean_t need_tlbflush = FALSE;
- boolean_t set_NX;
- char oattr;
- boolean_t old_pa_locked;
- boolean_t superpage = flags & VM_MEM_SUPERPAGE;
- vm_object_t delpage_pm_obj = NULL;
- int delpage_pde_index = 0;
-
-
- pmap_intr_assert();
- assert(pn != vm_page_fictitious_addr);
- if (pmap_debug)
- kprintf("pmap_enter(%p,%llu,%u)\n", pmap, vaddr, pn);
- if (pmap == PMAP_NULL)
- return;
- if (pn == vm_page_guard_addr)
- return;
-
- PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
- pmap,
- (uint32_t) (vaddr >> 32), (uint32_t) vaddr,
- pn, prot);
-
- if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
- set_NX = FALSE;
- else
- set_NX = TRUE;
-
- /*
- * Must allocate a new pvlist entry while we're unlocked;
- * zalloc may cause pageout (which will lock the pmap system).
- * If we determine we need a pvlist entry, we will unlock
- * and allocate one. Then we will retry, throwing away
- * the allocated entry later (if we no longer need it).
- */
-
- pvh_new = PV_HASHED_ENTRY_NULL;
-Retry:
- pvh_e = PV_HASHED_ENTRY_NULL;
-
- PMAP_LOCK(pmap);
-
- /*
- * Expand pmap to include this pte. Assume that
- * pmap is always expanded to include enough hardware
- * pages to map one VM page.
- */
- if(superpage) {
- while ((pte = pmap64_pde(pmap, vaddr)) == PD_ENTRY_NULL) {
- /* need room for another pde entry */
- PMAP_UNLOCK(pmap);
- pmap_expand_pdpt(pmap, vaddr);
- PMAP_LOCK(pmap);
- }
- } else {
- while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
- /*
- * Must unlock to expand the pmap
- * going to grow pde level page(s)
- */
- PMAP_UNLOCK(pmap);
- pmap_expand(pmap, vaddr);
- PMAP_LOCK(pmap);
- }
- }
-
- if (superpage && *pte && !(*pte & INTEL_PTE_PS)) {
- /*
- * There is still an empty page table mapped that
- * was used for a previous base page mapping.
- * Remember the PDE and the PDE index, so that we
- * can free the page at the end of this function.
- */
- delpage_pde_index = (int)pdeidx(pmap, vaddr);
- delpage_pm_obj = pmap->pm_obj;
- *pte = 0;
- }
-
- old_pa = pte_to_pa(*pte);
- pai = pa_index(old_pa);
- old_pa_locked = FALSE;
-
- /*
- * if we have a previous managed page, lock the pv entry now. after
- * we lock it, check to see if someone beat us to the lock and if so
- * drop the lock
- */
- if ((0 != old_pa) && IS_MANAGED_PAGE(pai)) {
- LOCK_PVH(pai);
- old_pa_locked = TRUE;
- old_pa = pte_to_pa(*pte);
- if (0 == old_pa) {
- UNLOCK_PVH(pai); /* another path beat us to it */
- old_pa_locked = FALSE;
- }
- }
-
- /*
- * Special case if the incoming physical page is already mapped
- * at this address.
- */
- if (old_pa == pa) {
-
- /*
- * May be changing its wired attribute or protection
- */
-
- template = pa_to_pte(pa) | INTEL_PTE_VALID;
-
- if (VM_MEM_NOT_CACHEABLE ==
- (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) {
- if (!(flags & VM_MEM_GUARDED))
- template |= INTEL_PTE_PTA;
- template |= INTEL_PTE_NCACHE;
- }
- if (pmap != kernel_pmap)
- template |= INTEL_PTE_USER;
- if (prot & VM_PROT_WRITE)
- template |= INTEL_PTE_WRITE;
-
- if (set_NX)
- template |= INTEL_PTE_NX;
-
- if (wired) {
- template |= INTEL_PTE_WIRED;
- if (!iswired(*pte))
- OSAddAtomic(+1,
- &pmap->stats.wired_count);
- } else {
- if (iswired(*pte)) {
- assert(pmap->stats.wired_count >= 1);
- OSAddAtomic(-1,
- &pmap->stats.wired_count);
- }
- }
- if (superpage) /* this path can not be used */
- template |= INTEL_PTE_PS; /* to change the page size! */
-
- /* store modified PTE and preserve RC bits */
- pmap_update_pte(pte, *pte,
- template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD)));
- if (old_pa_locked) {
- UNLOCK_PVH(pai);
- old_pa_locked = FALSE;
- }
- need_tlbflush = TRUE;
- goto Done;
- }
-
- /*
- * Outline of code from here:
- * 1) If va was mapped, update TLBs, remove the mapping
- * and remove old pvlist entry.
- * 2) Add pvlist entry for new mapping
- * 3) Enter new mapping.
- *
- * If the old physical page is not managed step 1) is skipped
- * (except for updating the TLBs), and the mapping is
- * overwritten at step 3). If the new physical page is not
- * managed, step 2) is skipped.
- */
-
- if (old_pa != (pmap_paddr_t) 0) {
-
- /*
- * Don't do anything to pages outside valid memory here.
- * Instead convince the code that enters a new mapping
- * to overwrite the old one.
- */
-
- /* invalidate the PTE */
- pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
- /* propagate invalidate everywhere */
- PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
- /* remember reference and change */
- oattr = (char) (*pte & (PHYS_MODIFIED | PHYS_REFERENCED));
- /* completely invalidate the PTE */
- pmap_store_pte(pte, 0);
-
- if (IS_MANAGED_PAGE(pai)) {
-#if TESTING
- if (pmap->stats.resident_count < 1)
- panic("pmap_enter: resident_count");
-#endif
- assert(pmap->stats.resident_count >= 1);
- OSAddAtomic(-1,
- &pmap->stats.resident_count);
-
- if (iswired(*pte)) {
-#if TESTING
- if (pmap->stats.wired_count < 1)
- panic("pmap_enter: wired_count");
-#endif
- assert(pmap->stats.wired_count >= 1);
- OSAddAtomic(-1,
- &pmap->stats.wired_count);
- }
- pmap_phys_attributes[pai] |= oattr;
-
- /*
- * Remove the mapping from the pvlist for
- * this physical page.
- * We'll end up with either a rooted pv or a
- * hashed pv
- */
- pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t) pai);
-
- } else {
-
- /*
- * old_pa is not managed.
- * Do removal part of accounting.
- */
-
- if (iswired(*pte)) {
- assert(pmap->stats.wired_count >= 1);
- OSAddAtomic(-1,
- &pmap->stats.wired_count);
- }
- }
- }
-
- /*
- * if we had a previously managed paged locked, unlock it now
- */
- if (old_pa_locked) {
- UNLOCK_PVH(pai);
- old_pa_locked = FALSE;
- }
-
- pai = pa_index(pa); /* now working with new incoming phys page */
- if (IS_MANAGED_PAGE(pai)) {
-
- /*
- * Step 2) Enter the mapping in the PV list for this
- * physical page.
- */
- pv_h = pai_to_pvh(pai);
-
- LOCK_PVH(pai);
-
- if (pv_h->pmap == PMAP_NULL) {
- /*
- * No mappings yet, use rooted pv
- */
- pv_h->va = vaddr;
- pv_h->pmap = pmap;
- queue_init(&pv_h->qlink);
- } else {
- /*
- * Add new pv_hashed_entry after header.
- */
- if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
- pvh_e = pvh_new;
- pvh_new = PV_HASHED_ENTRY_NULL;
- } else if (PV_HASHED_ENTRY_NULL == pvh_e) {
- PV_HASHED_ALLOC(pvh_e);
- if (PV_HASHED_ENTRY_NULL == pvh_e) {
- /*
- * the pv list is empty. if we are on
- * the kernel pmap we'll use one of
- * the special private kernel pv_e's,
- * else, we need to unlock
- * everything, zalloc a pv_e, and
- * restart bringing in the pv_e with
- * us.
- */
- if (kernel_pmap == pmap) {
- PV_HASHED_KERN_ALLOC(pvh_e);
- } else {
- UNLOCK_PVH(pai);
- PMAP_UNLOCK(pmap);
- pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
- goto Retry;
- }
- }
- }
- if (PV_HASHED_ENTRY_NULL == pvh_e)
- panic("pvh_e exhaustion");
-
- pvh_e->va = vaddr;
- pvh_e->pmap = pmap;
- pvh_e->ppn = pn;
- pv_hash_add(pvh_e, pv_h);
-
- /*
- * Remember that we used the pvlist entry.
- */
- pvh_e = PV_HASHED_ENTRY_NULL;
- }
-
- /*
- * only count the mapping
- * for 'managed memory'
- */
- OSAddAtomic(+1, & pmap->stats.resident_count);
- if (pmap->stats.resident_count > pmap->stats.resident_max) {
- pmap->stats.resident_max = pmap->stats.resident_count;
- }
- }
- /*
- * Step 3) Enter the mapping.
- *
- * Build a template to speed up entering -
- * only the pfn changes.
- */
- template = pa_to_pte(pa) | INTEL_PTE_VALID;
-
- if (flags & VM_MEM_NOT_CACHEABLE) {
- if (!(flags & VM_MEM_GUARDED))
- template |= INTEL_PTE_PTA;
- template |= INTEL_PTE_NCACHE;
- }
- if (pmap != kernel_pmap)
- template |= INTEL_PTE_USER;
- if (prot & VM_PROT_WRITE)
- template |= INTEL_PTE_WRITE;
- if (set_NX)
- template |= INTEL_PTE_NX;
- if (wired) {
- template |= INTEL_PTE_WIRED;
- OSAddAtomic(+1, & pmap->stats.wired_count);
- }
- if (superpage)
- template |= INTEL_PTE_PS;
- pmap_store_pte(pte, template);
-
- /*
- * if this was a managed page we delayed unlocking the pv until here
- * to prevent pmap_page_protect et al from finding it until the pte
- * has been stored
- */
- if (IS_MANAGED_PAGE(pai)) {
- UNLOCK_PVH(pai);
- }
-Done:
- if (need_tlbflush == TRUE)
- PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
-
- if (pvh_e != PV_HASHED_ENTRY_NULL) {
- PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
- }
- if (pvh_new != PV_HASHED_ENTRY_NULL) {
- PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
- }
- PMAP_UNLOCK(pmap);
-
- if (delpage_pm_obj) {
- vm_page_t m;
-
- vm_object_lock(delpage_pm_obj);
- m = vm_page_lookup(delpage_pm_obj, delpage_pde_index);
- if (m == VM_PAGE_NULL)
- panic("pmap_enter: pte page not in object");
- VM_PAGE_FREE(m);
- OSAddAtomic(-1, &inuse_ptepages_count);
- vm_object_unlock(delpage_pm_obj);
- }
-
- PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
-}
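For illustration only (not part of the diff), a minimal caller sketch for the pmap_enter() contract documented above it: wiring one read/write kernel mapping for physical page pn at vaddr. The function name and both arguments are placeholders.

/*
 * Caller sketch only: enter one wired, read/write kernel mapping.
 */
static void
map_one_kernel_page_sketch(vm_map_offset_t vaddr, ppnum_t pn)
{
	pmap_enter(kernel_pmap, vaddr, pn,
		   VM_PROT_READ | VM_PROT_WRITE,
		   0,			/* flags: default cacheability */
		   TRUE);		/* wired: the pte may not be reclaimed */
}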
-
/*
* Routine: pmap_change_wiring
* Function: Change the wiring attribute for a map/virtual-address
return TRUE;
}
-void
-mapping_free_prime(void)
-{
- int i;
- pv_hashed_entry_t pvh_e;
- pv_hashed_entry_t pvh_eh;
- pv_hashed_entry_t pvh_et;
- int pv_cnt;
-
- pv_cnt = 0;
- pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
- for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) {
- pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
-
- pvh_e->qlink.next = (queue_entry_t)pvh_eh;
- pvh_eh = pvh_e;
-
- if (pvh_et == PV_HASHED_ENTRY_NULL)
- pvh_et = pvh_e;
- pv_cnt++;
- }
- PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
-
- pv_cnt = 0;
- pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
- for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
- pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
-
- pvh_e->qlink.next = (queue_entry_t)pvh_eh;
- pvh_eh = pvh_e;
-
- if (pvh_et == PV_HASHED_ENTRY_NULL)
- pvh_et = pvh_e;
- pv_cnt++;
- }
- PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
-
-}
-
-void
-mapping_adjust(void)
-{
- pv_hashed_entry_t pvh_e;
- pv_hashed_entry_t pvh_eh;
- pv_hashed_entry_t pvh_et;
- int pv_cnt;
- int i;
-
- if (mapping_adjust_call == NULL) {
- thread_call_setup(&mapping_adjust_call_data,
- (thread_call_func_t) mapping_adjust,
- (thread_call_param_t) NULL);
- mapping_adjust_call = &mapping_adjust_call_data;
- }
-
- pv_cnt = 0;
- pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
- if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) {
- for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
- pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
-
- pvh_e->qlink.next = (queue_entry_t)pvh_eh;
- pvh_eh = pvh_e;
-
- if (pvh_et == PV_HASHED_ENTRY_NULL)
- pvh_et = pvh_e;
- pv_cnt++;
- }
- PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
- }
-
- pv_cnt = 0;
- pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
- if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) {
- for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) {
- pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
-
- pvh_e->qlink.next = (queue_entry_t)pvh_eh;
- pvh_eh = pvh_e;
-
- if (pvh_et == PV_HASHED_ENTRY_NULL)
- pvh_et = pvh_e;
- pv_cnt++;
- }
- PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
- }
- mappingrecurse = 0;
-}
-
-
void
pmap_switch(pmap_t tpmap)
{
*val = 1;
return (NUM);
}
+
+boolean_t
+PE_imgsrc_mount_supported()
+{
+ return TRUE;
+}
/* Hack! FIXME.. */
outb(0x21, 0xff); /* Maskout all interrupts Pic1 */
outb(0xa1, 0xff); /* Maskout all interrupts Pic2 */
-
if (PE_state.deviceTreeHead) {
DTInit(PE_state.deviceTreeHead);
- }
+ }
pe_identify_machine(args);
} else {
pe_init_debug();
}
+
}
void PE_create_console( void )
int (*PE_poll_input)(unsigned int options, char * c)
= PE_stub_poll_input;
-
-
+boolean_t
+PE_reboot_on_panic(void)
+{
+ return FALSE;
+}
extern void initialize_screen(PE_Video *, unsigned int);
+extern void dim_screen(void);
+
extern int PE_current_console(
PE_Video *info);
extern void pe_init_debug(void);
+extern boolean_t PE_imgsrc_mount_supported(void);
+
+
+#if KERNEL_PRIVATE
+boolean_t PE_reboot_on_panic(void);
+#endif
__END_DECLS