From b4c24cb9d3df001f2892dc4ed451bc769ff28a9f Mon Sep 17 00:00:00 2001 From: Apple Date: Tue, 12 Aug 2003 21:04:55 +0000 Subject: [PATCH] xnu-344.12.2.tar.gz --- bsd/conf/files | 3 + bsd/conf/version.minor | 2 +- bsd/hfs/hfs.h | 63 +- bsd/hfs/hfs_attrlist.c | 93 +- bsd/hfs/hfs_btreeio.c | 234 ++- bsd/hfs/hfs_catalog.c | 104 +- bsd/hfs/hfs_cnode.c | 44 +- bsd/hfs/hfs_format.h | 28 +- bsd/hfs/hfs_link.c | 73 +- bsd/hfs/hfs_lookup.c | 3 +- bsd/hfs/hfs_mount.h | 5 + bsd/hfs/hfs_readwrite.c | 220 +- bsd/hfs/hfs_search.c | 28 + bsd/hfs/hfs_vfsops.c | 536 ++++- bsd/hfs/hfs_vfsutils.c | 583 +++++- bsd/hfs/hfs_vnops.c | 598 +++++- bsd/hfs/hfscommon/BTree/BTree.c | 137 +- bsd/hfs/hfscommon/BTree/BTreeAllocate.c | 20 +- bsd/hfs/hfscommon/BTree/BTreeMiscOps.c | 18 +- bsd/hfs/hfscommon/BTree/BTreeScanner.c | 12 +- bsd/hfs/hfscommon/BTree/BTreeTreeOps.c | 62 +- bsd/hfs/hfscommon/Catalog/FileIDsServices.c | 3 + bsd/hfs/hfscommon/Misc/FileExtentMapping.c | 80 +- bsd/hfs/hfscommon/Misc/VolumeAllocation.c | 54 +- bsd/hfs/hfscommon/headers/BTreesInternal.h | 7 +- bsd/hfs/hfscommon/headers/BTreesPrivate.h | 4 + bsd/kern/kern_mman.c | 4 + bsd/kern/qsort.c | 3 +- bsd/kern/ubc_subr.c | 129 +- bsd/miscfs/specfs/spec_vnops.c | 3 +- bsd/nfs/nfs_bio.c | 7 +- bsd/nfs/nfs_socket.c | 6 +- bsd/nfs/nfs_vnops.c | 6 +- bsd/sys/buf.h | 11 +- bsd/sys/disk.h | 4 + bsd/sys/malloc.h | 10 +- bsd/sys/mount.h | 3 +- bsd/sys/ubc.h | 3 + bsd/vfs/Makefile | 2 +- bsd/vfs/vfs_bio.c | 139 +- bsd/vfs/vfs_cluster.c | 455 ++-- bsd/vfs/vfs_journal.c | 2067 +++++++++++++++++++ bsd/vfs/vfs_journal.h | 238 +++ bsd/vfs/vfs_subr.c | 26 +- iokit/IOKit/IOKitKeys.h | 12 +- iokit/KernelConfigTables.cpp | 20 +- iokit/conf/version.minor | 2 +- libkern/conf/version.minor | 2 +- libsa/conf/version.minor | 2 +- osfmk/conf/kernelversion.minor | 2 +- osfmk/conf/version.minor | 2 +- osfmk/i386/loose_ends.c | 43 + osfmk/ppc/cswtch.s | 4 +- osfmk/ppc/mappings.c | 47 +- osfmk/ppc/pmap.c | 3 + osfmk/vm/vm_init.c | 6 +- osfmk/vm/vm_kern.c | 147 +- pexpert/conf/version.minor | 2 +- 58 files changed, 5937 insertions(+), 487 deletions(-) create mode 100644 bsd/vfs/vfs_journal.c create mode 100644 bsd/vfs/vfs_journal.h diff --git a/bsd/conf/files b/bsd/conf/files index 7012205fa..817d99f42 100644 --- a/bsd/conf/files +++ b/bsd/conf/files @@ -137,6 +137,7 @@ bsd/vfs/vfs_support.c standard bsd/vfs/vfs_utfconv.c standard bsd/vfs/vfs_vnops.c standard bsd/vfs/vnode_if.c standard +bsd/vfs/vfs_journal.c standard bsd/miscfs/deadfs/dead_vnops.c standard bsd/miscfs/fdesc/fdesc_vfsops.c optional fdesc @@ -501,6 +502,8 @@ bsd/kern/mach_header.c standard bsd/kern/mach_loader.c standard bsd/kern/posix_sem.c standard bsd/kern/posix_shm.c standard +# XXXdbg - I need this in the journaling and block cache code +bsd/kern/qsort.c standard bsd/vm/vnode_pager.c standard bsd/vm/vm_unix.c standard diff --git a/bsd/conf/version.minor b/bsd/conf/version.minor index d00491fd7..0cfbf0888 100644 --- a/bsd/conf/version.minor +++ b/bsd/conf/version.minor @@ -1 +1 @@ -1 +2 diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h index 9086981b0..b82adcf2e 100644 --- a/bsd/hfs/hfs.h +++ b/bsd/hfs/hfs.h @@ -36,6 +36,8 @@ #include #include +#include + #include #include #include @@ -108,6 +110,7 @@ struct vcb_t { int16_t vcbAtrb; int16_t vcbFlags; int16_t vcbspare; + u_int32_t vcbJinfoBlock; u_int32_t vcbCrDate; u_int32_t vcbLsMod; @@ -180,6 +183,7 @@ typedef struct hfsmount { u_int8_t hfs_fs_ronly; /* Whether this was mounted as read-initially */ u_int8_t hfs_unknownpermissions; /* Whether this was mounted with 
MNT_UNKNOWNPERMISSIONS */ u_int8_t hfs_media_writeable; + u_int8_t hfs_orphans_cleaned; /* Physical Description */ u_long hfs_phys_block_count; /* Num of PHYSICAL blocks of volume */ @@ -211,10 +215,55 @@ typedef struct hfsmount { unicode_to_hfs_func_t hfs_get_hfsname; struct quotafile hfs_qfiles[MAXQUOTAS]; /* quota files */ + + // XXXdbg + void *jnl; // the journal for this volume (if one exists) + struct vnode *jvp; // device where the journal lives (may be equal to devvp) + u_int32_t jnl_start; // start block of the journal file (so we don't delete it) + u_int32_t hfs_jnlfileid; + u_int32_t hfs_jnlinfoblkid; + volatile int readers; + volatile int blocker; } hfsmount_t; #define hfs_private_metadata_dir hfs_privdir_desc.cd_cnid +#define hfs_global_shared_lock_acquire(hfsmp) \ + do { \ + if (hfsmp->blocker) { \ + tsleep((caddr_t)&hfsmp->blocker, PRIBIO, "journal_blocker", 0); \ + continue; \ + } \ + hfsmp->readers++; \ + break; \ + } while (1) + +#define hfs_global_shared_lock_release(hfsmp) \ + do { \ + hfsmp->readers--; \ + if (hfsmp->readers == 0) { \ + wakeup((caddr_t)&hfsmp->readers); \ + } \ + } while (0) + +#define hfs_global_exclusive_lock_acquire(hfsmp) \ + do { \ + if (hfsmp->blocker) { \ + tsleep((caddr_t)&hfsmp->blocker, PRIBIO, "journal_blocker", 0); \ + continue; \ + } \ + if (hfsmp->readers != 0) { \ + tsleep((caddr_t)&hfsmp->readers, PRIBIO, "journal_enable/disble", 0); \ + continue; \ + } \ + hfsmp->blocker = 1; \ + break; \ + } while (1) + +#define hfs_global_exclusive_lock_release(hfsmp) \ + hfsmp->blocker = 0; \ + wakeup((caddr_t)&hfsmp->blocker) + #define MAXHFSVNODELEN 31 @@ -325,6 +374,7 @@ enum { kdirentMaxNameBytes = NAME_MAX }; #define VTOHFS(VP) ((struct hfsmount *)((VP)->v_mount->mnt_data)) #define VFSTOHFS(MP) ((struct hfsmount *)(MP)->mnt_data) #define VCBTOHFS(VCB) (((struct vfsVCB *)(VCB))->vcb_hfsmp) +#define FCBTOHFS(FCB) ((struct hfsmount *)(FCB)->ff_cp->c_vp->v_mount->mnt_data) /* * Various ways to acquire a VCB pointer: @@ -332,6 +382,7 @@ enum { kdirentMaxNameBytes = NAME_MAX }; #define VTOVCB(VP) (&(((struct hfsmount *)((VP)->v_mount->mnt_data))->hfs_vcb.vcb_vcb)) #define VFSTOVCB(MP) (&(((struct hfsmount *)(MP)->mnt_data)->hfs_vcb.vcb_vcb)) #define HFSTOVCB(HFSMP) (&(HFSMP)->hfs_vcb.vcb_vcb) +#define FCBTOVCB(FCB) (&(((struct hfsmount *)((FCB)->ff_cp->c_vp->v_mount->mnt_data))->hfs_vcb.vcb_vcb)) #define E_NONE 0 @@ -376,6 +427,8 @@ extern int hfs_metafilelocking(struct hfsmount *hfsmp, u_long fileID, u_int flag extern u_int32_t hfs_freeblks(struct hfsmount * hfsmp, int wantreserve); +extern void hfs_remove_orphans(struct hfsmount *); + short MacToVFSError(OSErr err); @@ -388,6 +441,8 @@ u_long FindMetaDataDirectory(ExtendedVCB *vcb); #define HFS_SYNCTRANS 1 extern int hfs_btsync(struct vnode *vp, int sync_transaction); +// used as a callback by the journaling code +extern void hfs_sync_metadata(void *arg); short make_dir_entry(FCB **fileptr, char *name, u_int32_t fileID); @@ -399,7 +454,13 @@ unsigned long BestBlockSizeFit(unsigned long allocationBlockSize, OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, struct proc *p); OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, - off_t embeddedOffset, u_int64_t disksize, struct proc *p); + off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args); + +extern int hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, + void *_args, int embeddedOffset, int mdb_offset, + HFSMasterDirectoryBlock *mdbp, struct ucred *cred); 
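The reader/blocker macros above form the volume-wide gate used throughout this patch: normal operations take the shared side before opening a journal transaction, while journal enable/disable takes the exclusive side, which stops new readers and then drains the existing ones. As a rough userland illustration of that protocol (not kernel code: the macros' tsleep()/wakeup() on the field addresses are replaced here by a pthread mutex and condition variable, and all names in this sketch are hypothetical):

#include <pthread.h>

/* Mirrors the readers/blocker fields added to struct hfsmount above. */
struct hfs_gate {
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;
	int		readers;	/* hfsmp->readers */
	int		blocker;	/* hfsmp->blocker */
};

/* hfs_global_shared_lock_acquire: wait out any blocker, then count in. */
static void
gate_shared_acquire(struct hfs_gate *g)
{
	pthread_mutex_lock(&g->mtx);
	while (g->blocker)
		pthread_cond_wait(&g->cv, &g->mtx);
	g->readers++;
	pthread_mutex_unlock(&g->mtx);
}

/* hfs_global_shared_lock_release: the last reader out wakes a waiting blocker. */
static void
gate_shared_release(struct hfs_gate *g)
{
	pthread_mutex_lock(&g->mtx);
	if (--g->readers == 0)
		pthread_cond_broadcast(&g->cv);
	pthread_mutex_unlock(&g->mtx);
}

/* hfs_global_exclusive_lock_acquire: same retry order as the macro --
 * re-check the blocker, then the reader count, before claiming the gate. */
static void
gate_exclusive_acquire(struct hfs_gate *g)
{
	pthread_mutex_lock(&g->mtx);
	for (;;) {
		if (g->blocker) {
			pthread_cond_wait(&g->cv, &g->mtx);
			continue;
		}
		if (g->readers != 0) {
			pthread_cond_wait(&g->cv, &g->mtx);
			continue;
		}
		g->blocker = 1;
		break;
	}
	pthread_mutex_unlock(&g->mtx);
}

/* hfs_global_exclusive_lock_release: clear the blocker and wake everyone. */
static void
gate_exclusive_release(struct hfs_gate *g)
{
	pthread_mutex_lock(&g->mtx);
	g->blocker = 0;
	pthread_cond_broadcast(&g->cv);
	pthread_mutex_unlock(&g->mtx);
}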
+extern u_long GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, char *name, + struct cat_attr *fattr, struct cat_fork *forkinfo); int hfs_getconverter(u_int32_t encoding, hfs_to_unicode_func_t *get_unicode, unicode_to_hfs_func_t *get_hfsname); diff --git a/bsd/hfs/hfs_attrlist.c b/bsd/hfs/hfs_attrlist.c index 650d6fa4a..f53d05e3f 100644 --- a/bsd/hfs/hfs_attrlist.c +++ b/bsd/hfs/hfs_attrlist.c @@ -194,15 +194,35 @@ hfs_getattrlist(ap) if ((error = hfs_write_access(vp, ap->a_cred, ap->a_p, false)) != 0) return (error); + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + hfs_global_shared_lock_release(hfsmp); + return error; + } + } + /* Lock catalog b-tree */ error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, ap->a_p); - if (error) - return (error); + if (error) { + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + return (error); + } error = cat_insertfilethread(hfsmp, &cp->c_desc); /* Unlock catalog b-tree */ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, ap->a_p); + + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + if (error) return (error); } @@ -350,6 +370,17 @@ hfs_setattrlist(ap) } if (cp->c_flag & (C_NOEXISTS | C_DELETED)) return (ENOENT); + + // XXXdbg - don't allow modifying the journal or journal_info_block + if (hfsmp->jnl && cp->c_datafork) { + struct HFSPlusExtentDescriptor *extd; + + extd = &cp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { + return EPERM; + } + } + /* * Ownership of a file is required in one of two classes of calls: * @@ -447,14 +478,12 @@ hfs_setattrlist(ap) * If any cnode attributes changed then do an update. 
*/ if (alist->volattr == 0) { - struct timeval atime, mtime; + struct timeval tv; - atime.tv_sec = cp->c_atime; - atime.tv_usec = 0; - mtime.tv_sec = cp->c_mtime; - mtime.tv_usec = cp->c_mtime_nsec / 1000; cp->c_flag |= C_MODIFIED; - if ((error = VOP_UPDATE(vp, &atime, &mtime, 1))) + tv = time; + CTIMES(cp, &tv, &tv); + if ((error = VOP_UPDATE(vp, &tv, &tv, 1))) goto ErrorExit; } /* Volume Rename */ @@ -482,9 +511,28 @@ hfs_setattrlist(ap) to_desc.cd_cnid = cp->c_cnid; to_desc.cd_flags = CD_ISDIR; + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if (journal_start_transaction(hfsmp->jnl) != 0) { + hfs_global_shared_lock_release(hfsmp); + error = EINVAL; + /* Restore the old name in the VCB */ + copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL); + vcb->vcbFlags |= 0xFF00; + goto ErrorExit; + } + } + + /* Lock catalog b-tree */ error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); if (error) { + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + /* Restore the old name in the VCB */ copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL); vcb->vcbFlags |= 0xFF00; @@ -495,7 +543,12 @@ hfs_setattrlist(ap) /* Unlock the Catalog */ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - + + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + if (error) { /* Restore the old name in the VCB */ copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL); @@ -601,12 +654,17 @@ hfs_readdirattr(ap) int error = 0; int depleted = 0; int index, startindex; - int i; + int i, dir_entries; struct cat_desc *lastdescp = NULL; struct cat_desc prevdesc; char * prevnamebuf = NULL; struct cat_entrylist *ce_list = NULL; + dir_entries = dcp->c_entries; + if (dcp->c_attr.ca_fileid == kHFSRootFolderID && hfsmp->jnl) { + dir_entries -= 3; + } + *(ap->a_actualcount) = 0; *(ap->a_eofflag) = 0; @@ -639,7 +697,7 @@ hfs_readdirattr(ap) /* Convert uio_offset into a directory index. */ startindex = index = uio->uio_offset / sizeof(struct dirent); - if ((index + 1) > dcp->c_entries) { + if ((index + 1) > dir_entries) { *(ap->a_eofflag) = 1; error = 0; goto exit; @@ -781,7 +839,7 @@ hfs_readdirattr(ap) /* Termination checks */ if ((--maxcount <= 0) || (uio->uio_resid < (fixedblocksize + HFS_AVERAGE_NAME_SIZE)) || - (index >= dcp->c_entries)) { + (index >= dir_entries)) { depleted = 1; break; } @@ -789,7 +847,7 @@ hfs_readdirattr(ap) } /* for each catalog entry */ /* If there are more entries then save the last name. 
*/ - if (index < dcp->c_entries + if (index < dir_entries && !(*(ap->a_eofflag)) && lastdescp != NULL) { if (prevnamebuf == NULL) @@ -1408,9 +1466,12 @@ packdirattr( if (ATTR_DIR_ENTRYCOUNT & attr) { u_long entries = cattrp->ca_entries; - if ((descp->cd_parentcnid == kRootParID) && - (hfsmp->hfs_private_metadata_dir != 0)) - --entries; /* hide private dir */ + if (descp->cd_parentcnid == kRootParID) { + if (hfsmp->hfs_private_metadata_dir != 0) + --entries; /* hide private dir */ + if (hfsmp->jnl) + entries -= 2; /* hide the journal files */ + } *((u_long *)attrbufptr)++ = entries; } diff --git a/bsd/hfs/hfs_btreeio.c b/bsd/hfs/hfs_btreeio.c index 6947a695a..a70290d05 100644 --- a/bsd/hfs/hfs_btreeio.c +++ b/bsd/hfs/hfs_btreeio.c @@ -68,7 +68,7 @@ OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions option if (options & kGetEmptyBlock) bp = getblk(vp, blockNum, block->blockSize, 0, 0, BLK_META); else - retval = meta_bread(vp, blockNum, block->blockSize, NOCRED, &bp); + retval = meta_bread(vp, blockNum, block->blockSize, NOCRED, &bp); DBG_ASSERT(bp != NULL); DBG_ASSERT(bp->b_data != NULL); @@ -83,6 +83,9 @@ OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions option block->buffer = bp->b_data; block->blockReadFromDisk = (bp->b_flags & B_CACHE) == 0; /* not found in cache ==> came from disk */ + // XXXdbg + block->isModified = 0; + #if BYTE_ORDER == LITTLE_ENDIAN /* Endian swap B-Tree node (only if it's a valid block) */ if (!(options & kGetEmptyBlock)) { @@ -116,9 +119,31 @@ OSStatus GetBTreeBlock(FileReference vp, UInt32 blockNum, GetBlockOptions option } +__private_extern__ +void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr) +{ + struct hfsmount *hfsmp = VTOHFS(vp); + struct buf *bp = NULL; + + if (hfsmp->jnl == NULL) { + return; + } + + bp = (struct buf *) blockPtr->blockHeader; + if (bp == NULL) { + panic("ModifyBlockStart: null bp for blockdescptr 0x%x?!?\n", blockPtr); + return; + } + + journal_modify_block_start(hfsmp->jnl, bp); + blockPtr->isModified = 1; +} + + __private_extern__ OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlockOptions options) { + struct hfsmount *hfsmp = VTOHFS(vp); extern int bdwrite_internal(struct buf *, int); OSStatus retval = E_NONE; struct buf *bp = NULL; @@ -131,16 +156,25 @@ OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlock } if (options & kTrashBlock) { - bp->b_flags |= B_INVAL; - brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */ + bp->b_flags |= B_INVAL; + if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) { + journal_kill_block(hfsmp->jnl, bp); + } else { + brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */ + } } else { if (options & kForceWriteBlock) { - retval = VOP_BWRITE(bp); + if (hfsmp->jnl) { + if (blockPtr->isModified == 0) { + panic("hfs: releaseblock: modified is 0 but forcewrite set! bp 0x%x\n", bp); + } + retval = journal_modify_block_end(hfsmp->jnl, bp); + blockPtr->isModified = 0; + } else { + retval = VOP_BWRITE(bp); + } } else if (options & kMarkBlockDirty) { -#if FORCESYNCBTREEWRITES - VOP_BWRITE(bp); -#else - if (options & kLockTransaction) { + if ((options & kLockTransaction) && hfsmp->jnl == NULL) { /* * * Set the B_LOCKED flag and unlock the buffer, causing brelse to move @@ -156,24 +190,44 @@ OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlock /* Rollback sync time to cause a sync on lock release... 
*/ (void) BTSetLastSync(VTOF(vp), time.tv_sec - (kMaxSecsForFsync + 1)); } - bp->b_flags |= B_LOCKED; - } + + bp->b_flags |= B_LOCKED; + } + /* * Delay-write this block. * If the maximum delayed buffers has been exceeded then * free up some buffers and fall back to an asynchronous write. */ - if (bdwrite_internal(bp, 1) != 0) { + if (hfsmp->jnl) { + if (blockPtr->isModified == 0) { + panic("hfs: releaseblock: modified is 0 but markdirty set! bp 0x%x\n", bp); + } + retval = journal_modify_block_end(hfsmp->jnl, bp); + blockPtr->isModified = 0; + } else if (bdwrite_internal(bp, 1) != 0) { hfs_btsync(vp, 0); /* Rollback sync time to cause a sync on lock release... */ (void) BTSetLastSync(VTOF(vp), time.tv_sec - (kMaxSecsForFsync + 1)); bp->b_flags &= ~B_LOCKED; bawrite(bp); } - -#endif } else { - brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */ + // check if we had previously called journal_modify_block_start() + // on this block and if so, abort it (which will call brelse()). + if (hfsmp->jnl && blockPtr->isModified) { + // XXXdbg - I don't want to call modify_block_abort() + // because I think it may be screwing up the + // journal and blowing away a block that has + // valid data in it. + // + // journal_modify_block_abort(hfsmp->jnl, bp); + //panic("hfs: releaseblock called for 0x%x but mod_block_start previously called.\n", bp); + journal_modify_block_end(hfsmp->jnl, bp); + blockPtr->isModified = 0; + } else { + brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */ + } }; }; @@ -187,17 +241,16 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) { #pragma unused (maxEOF) - OSStatus retval; - UInt64 actualBytesAdded; + OSStatus retval, ret; + UInt64 actualBytesAdded, origSize; UInt64 bytesToAdd; - UInt32 extendFlags; u_int32_t startAllocation; u_int32_t fileblocks; BTreeInfoRec btInfo; ExtendedVCB *vcb; FCB *filePtr; struct proc *p = NULL; - + UInt64 trim = 0; filePtr = GetFileControlBlock(vp); @@ -225,13 +278,14 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) { p = current_proc(); /* lock extents b-tree (also protects volume bitmap) */ - retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, p); + retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, p); if (retval) return (retval); } (void) BTGetInformation(filePtr, 0, &btInfo); +#if 0 // XXXdbg /* * The b-tree code expects nodes to be contiguous. So when * the allocation block size is less than the b-tree node @@ -241,14 +295,38 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) extendFlags = 0; } else { /* Ensure that all b-tree nodes are contiguous on disk */ - extendFlags = kEFAllMask | kEFContigMask; + extendFlags = kEFContigMask; } +#endif + origSize = filePtr->fcbEOF; fileblocks = filePtr->ff_blocks; startAllocation = vcb->nextAllocation; - retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0, extendFlags, &actualBytesAdded); - + // loop trying to get a contiguous chunk that's an integer multiple + // of the btree node size. if we can't get a contiguous chunk that + // is at least the node size then we break out of the loop and let + // the error propagate back up. 
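+	// For example: if the request is for 1 MB more and nodeSize is 8 KB,
+	// a dskFulErr with nothing added retries at 512 KB, 256 KB, and so on
+	// (each rounded down to a nodeSize multiple), bottoming out at a
+	// single 8 KB node before the error is finally returned.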
+ do { + retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0, kEFContigMask, &actualBytesAdded); + if (retval == dskFulErr && actualBytesAdded == 0) { + + if (bytesToAdd == btInfo.nodeSize || bytesToAdd < (minEOF - origSize)) { + // if we're here there's nothing else to try, we're out + // of space so we break and bail out. + break; + } else { + bytesToAdd >>= 1; + if (bytesToAdd < btInfo.nodeSize) { + bytesToAdd = btInfo.nodeSize; + } else if ((bytesToAdd % btInfo.nodeSize) != 0) { + // make sure it's an integer multiple of the nodeSize + bytesToAdd -= (bytesToAdd % btInfo.nodeSize); + } + } + } + } while (retval == dskFulErr && actualBytesAdded == 0); + /* * If a new extent was added then move the roving allocator * reference forward by the current b-tree file size so @@ -260,25 +338,74 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) vcb->nextAllocation += fileblocks; } + filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize; + + // XXXdbg ExtendFileC() could have returned an error even though + // it grew the file to be big enough for our needs. If this is + // the case, we don't care about retval so we blow it away. + // + if (filePtr->fcbEOF >= minEOF && retval != 0) { + retval = 0; + } + + // XXXdbg if the file grew but isn't large enough or isn't an + // even multiple of the nodeSize then trim things back. if + // the file isn't large enough we trim back to the original + // size. otherwise we trim back to be an even multiple of the + // btree node size. + // + if ((filePtr->fcbEOF < minEOF) || (actualBytesAdded % btInfo.nodeSize) != 0) { + + if (filePtr->fcbEOF < minEOF) { + retval = dskFulErr; + + if (filePtr->fcbEOF < origSize) { + panic("hfs: btree file eof %lld less than orig size %lld!\n", + filePtr->fcbEOF, origSize); + } + + trim = filePtr->fcbEOF - origSize; + if (trim != actualBytesAdded) { + panic("hfs: trim == %lld but actualBytesAdded == %lld\n", + trim, actualBytesAdded); + } + } else { + trim = (actualBytesAdded % btInfo.nodeSize); + } + + ret = TruncateFileC(vcb, filePtr, filePtr->fcbEOF - trim, 0); + filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize; + + // XXXdbg - panic if the file didn't get trimmed back properly + if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) { + panic("hfs: truncate file didn't! fcbEOF %lld nsize %d fcb 0x%x\n", + filePtr->fcbEOF, btInfo.nodeSize, filePtr); + } + + if (ret) { + // XXXdbg - this probably doesn't need to be a panic() + panic("hfs: error truncating btree files (sz 0x%llx, trim %lld, ret %d)\n", + filePtr->fcbEOF, trim, ret); + return ret; + } + actualBytesAdded -= trim; + } + if(VTOC(vp)->c_fileid != kHFSExtentsFileID) { /* * Get any extents overflow b-tree changes to disk ASAP! 
*/ - if (retval == 0) { - (void) BTFlushPath(VTOF(vcb->extentsRefNum)); - (void) VOP_FSYNC(vcb->extentsRefNum, NOCRED, MNT_WAIT, p); - } + (void) BTFlushPath(VTOF(vcb->extentsRefNum)); + (void) VOP_FSYNC(vcb->extentsRefNum, NOCRED, MNT_WAIT, p); + (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, p); } - if (retval) - return (retval); - - filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize; - retval = ClearBTNodes(vp, btInfo.nodeSize, filePtr->fcbEOF - actualBytesAdded, actualBytesAdded); - if (retval) - return (retval); - + if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) { + panic("hfs: extendbtree: fcb 0x%x has eof 0x%llx not a multiple of 0x%x (trim %llx)\n", + filePtr, filePtr->fcbEOF, btInfo.nodeSize, trim); + } + /* * Update the Alternate MDB or Alternate VolumeHeader */ @@ -287,8 +414,12 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) (VTOC(vp)->c_fileid == kHFSAttributesFileID) ) { MarkVCBDirty( vcb ); - retval = hfs_flushvolumeheader(VCBTOHFS(vcb), MNT_WAIT, HFS_ALTFLUSH); + ret = hfs_flushvolumeheader(VCBTOHFS(vcb), MNT_WAIT, HFS_ALTFLUSH); } + + ret = ClearBTNodes(vp, btInfo.nodeSize, filePtr->fcbEOF - actualBytesAdded, actualBytesAdded); + if (ret) + return (ret); return retval; } @@ -300,6 +431,7 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) static int ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount) { + struct hfsmount *hfsmp = VTOHFS(vp); struct buf *bp = NULL; daddr_t blk; daddr_t blkcnt; @@ -311,14 +443,36 @@ ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount) bp = getblk(vp, blk, blksize, 0, 0, BLK_META); if (bp == NULL) continue; + + // XXXdbg + if (hfsmp->jnl) { + // XXXdbg -- skipping this for now since it makes a transaction + // become *way* too large + //journal_modify_block_start(hfsmp->jnl, bp); + } + bzero((char *)bp->b_data, blksize); bp->b_flags |= B_AGE; - /* wait/yield every 32 blocks so we don't hog all the buffers */ - if ((blk % 32) == 0) - VOP_BWRITE(bp); - else - bawrite(bp); + // XXXdbg + if (hfsmp->jnl) { + // XXXdbg -- skipping this for now since it makes a transaction + // become *way* too large + //journal_modify_block_end(hfsmp->jnl, bp); + + // XXXdbg - remove this once we decide what to do with the + // writes to the journal + if ((blk % 32) == 0) + VOP_BWRITE(bp); + else + bawrite(bp); + } else { + /* wait/yield every 32 blocks so we don't hog all the buffers */ + if ((blk % 32) == 0) + VOP_BWRITE(bp); + else + bawrite(bp); + } --blkcnt; ++blk; } diff --git a/bsd/hfs/hfs_catalog.c b/bsd/hfs/hfs_catalog.c index 7d6999e65..769576d7e 100644 --- a/bsd/hfs/hfs_catalog.c +++ b/bsd/hfs/hfs_catalog.c @@ -261,6 +261,11 @@ cat_insertfilethread(struct hfsmount *hfsmp, struct cat_desc *descp) if (result) goto exit; + // XXXdbg - preflight all btree operations to make sure there's enough space + result = BTCheckFreeSpace(fcb); + if (result) + goto exit; + BDINIT(file_data, &file_rec); result = BTSearchRecord(fcb, &iterator[0], &file_data, &datasize, &iterator[0]); if (result) @@ -288,6 +293,7 @@ cat_insertfilethread(struct hfsmount *hfsmp, struct cat_desc *descp) (void) BTFlushPath(fcb); } exit: + (void) BTFlushPath(fcb); FREE(iterator, M_TEMP); return MacToVFSError(result); @@ -426,6 +432,15 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, u_long hint, int wantr encoding = getencoding(recp); hint = iterator->hint.nodeNum; + /* Hide the journal files (if any) */ + if (hfsmp->jnl && + ((cnid == 
hfsmp->hfs_jnlfileid) || + (cnid == hfsmp->hfs_jnlinfoblkid))) { + + result = ENOENT; + goto exit; + } + /* * When a hardlink link is encountered, auto resolve it */ @@ -529,6 +544,11 @@ cat_create(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr hfs_setencodingbits(hfsmp, encoding); } + // XXXdbg - preflight all btree operations to make sure there's enough space + result = BTCheckFreeSpace(fcb); + if (result) + goto exit; + /* * Insert the thread record first */ @@ -617,9 +637,8 @@ cat_create(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr vcb->vcbNxtCNID = nextCNID; vcb->vcbFlags |= 0xFF00; - (void) BTFlushPath(fcb); - exit: + (void) BTFlushPath(fcb); FREE(bto, M_TEMP); return MacToVFSError(result); @@ -678,6 +697,11 @@ cat_rename ( if ((result = buildkey(hfsmp, to_cdp, (HFSPlusCatalogKey *)&to_iterator->key, 0))) goto exit; + // XXXdbg - preflight all btree operations to make sure there's enough space + result = BTCheckFreeSpace(fcb); + if (result) + goto exit; + to_key = (HFSPlusCatalogKey *)&to_iterator->key; MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK); BDINIT(btdata, recp); @@ -781,7 +805,17 @@ cat_rename ( result = BTInsertRecord(fcb, to_iterator, &btdata, datasize); if (result) { /* Try and restore original before leaving */ + // XXXdbg + #if 1 + { + int err; + err = BTInsertRecord(fcb, from_iterator, &btdata, datasize); + if (err) + panic("cat_create: could not undo (BTInsert = %d)", err); + } + #else (void) BTInsertRecord(fcb, from_iterator, &btdata, datasize); + #endif goto exit; } sourcegone = 1; @@ -794,7 +828,17 @@ cat_rename ( result = BTDeleteRecord(fcb, from_iterator); if (result) { /* Try and delete new record before leaving */ + // XXXdbg + #if 1 + { + int err; + err = BTDeleteRecord(fcb, to_iterator); + if (err) + panic("cat_create: could not undo (BTDelete = %d)", err); + } + #else (void) BTDeleteRecord(fcb, to_iterator); + #endif goto exit; } } @@ -834,8 +878,8 @@ cat_rename ( FREE(pluskey, M_TEMP); } } - (void) BTFlushPath(fcb); exit: + (void) BTFlushPath(fcb); if (from_iterator) FREE(from_iterator, M_TEMP); if (to_iterator) @@ -874,7 +918,6 @@ cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr * A directory must be empty * A file must be zero length (no blocks) */ - if (descp->cd_cnid < kHFSFirstUserCatalogNodeID || descp->cd_parentcnid == kRootParID) return (EINVAL); @@ -899,6 +942,11 @@ cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr if (result) goto exit; + // XXXdbg - preflight all btree operations to make sure there's enough space + result = BTCheckFreeSpace(fcb); + if (result) + goto exit; + /* Delete record */ result = BTDeleteRecord(fcb, iterator); if (result) @@ -910,8 +958,8 @@ cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr TrashCatalogIterator(vcb, descp->cd_parentcnid); - (void) BTFlushPath(fcb); exit: + (void) BTFlushPath(fcb); FREE(iterator, M_TEMP); return MacToVFSError(result); @@ -973,9 +1021,8 @@ cat_update(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr /* Update the node hint. */ descp->cd_hint = iterator->hint.nodeNum; - (void) BTFlushPath(fcb); - exit: + (void) BTFlushPath(fcb); FREE(iterator, M_TEMP); return MacToVFSError(result); @@ -1242,13 +1289,22 @@ catrec_readattr(const CatalogKey *key, const CatalogRecord *rec, return (0); /* stop */ } - /* Hide the private meta data directory. 
*/ - if (parentcnid == kRootDirID && - rec->recordType == kHFSPlusFolderRecord && - rec->hfsPlusFolder.folderID == hfsmp->hfs_private_metadata_dir) { - return (1); /* continue */ + /* Hide the private meta data directory and journal files */ + if (parentcnid == kRootDirID) { + if ((rec->recordType == kHFSPlusFolderRecord) && + (rec->hfsPlusFolder.folderID == hfsmp->hfs_private_metadata_dir)) { + return (1); /* continue */ + } + if (hfsmp->jnl && + (rec->recordType == kHFSPlusFileRecord) && + ((rec->hfsPlusFile.fileID == hfsmp->hfs_jnlfileid) || + (rec->hfsPlusFile.fileID == hfsmp->hfs_jnlinfoblkid))) { + + return (1); /* continue */ + } } + cep = &list->entry[list->realentries++]; if (state->stdhfs) { @@ -1408,6 +1464,8 @@ exit: struct read_state { u_int32_t cbs_parentID; u_int32_t cbs_hiddenDirID; + u_int32_t cbs_hiddenJournalID; + u_int32_t cbs_hiddenInfoBlkID; off_t cbs_lastoffset; struct uio * cbs_uio; ExtendedVCB * cbs_vcb; @@ -1517,6 +1575,15 @@ lastitem: catent.d_type == DT_DIR) goto lastitem; + /* Hide the journal files */ + if ((curID == kRootDirID) && + (catent.d_type == DT_REG) && + ((catent.d_fileno == state->cbs_hiddenJournalID) || + (catent.d_fileno == state->cbs_hiddenInfoBlkID))) { + + return (1); /* skip and continue */ + } + state->cbs_lastoffset = state->cbs_uio->uio_offset; /* if this entry won't fit then we're done */ @@ -1565,6 +1632,11 @@ cat_getdirentries(struct hfsmount *hfsmp, struct cat_desc *descp, goto cleanup; state.cbs_hiddenDirID = hfsmp->hfs_private_metadata_dir; + if (hfsmp->jnl) { + state.cbs_hiddenJournalID = hfsmp->hfs_jnlfileid; + state.cbs_hiddenInfoBlkID = hfsmp->hfs_jnlinfoblkid; + } + state.cbs_lastoffset = cip->currentOffset; state.cbs_vcb = vcb; state.cbs_uio = uio; @@ -2203,7 +2275,11 @@ getcnid(const CatalogRecord *crp) case kHFSPlusFileRecord: cnid = crp->hfsPlusFile.fileID; break; + default: + panic("hfs: getcnid: unknown recordType (crp @ 0x%x)\n", crp); + break; } + return (cnid); } @@ -2225,7 +2301,11 @@ getparentcnid(const CatalogRecord *recp) case kHFSPlusFolderThreadRecord: cnid = recp->hfsPlusThread.parentID; break; + default: + panic("hfs: getparentcnid: unknown recordType (crp @ 0x%x)\n", recp); + break; } + return (cnid); } diff --git a/bsd/hfs/hfs_cnode.c b/bsd/hfs/hfs_cnode.c index d59163ab5..65617595f 100644 --- a/bsd/hfs/hfs_cnode.c +++ b/bsd/hfs/hfs_cnode.c @@ -62,6 +62,7 @@ hfs_inactive(ap) int recycle = 0; int forkcount = 0; int truncated = 0; + int started_tr = 0, grabbed_lock = 0; if (prtactive && vp->v_usecount != 0) vprint("hfs_inactive: pushing active", vp); @@ -85,9 +86,11 @@ hfs_inactive(ap) vp->v_type == VREG && (VTOF(vp)->ff_blocks != 0)) { error = VOP_TRUNCATE(vp, (off_t)0, IO_NDELAY, NOCRED, p); - if (error) goto out; truncated = 1; + // have to do this to prevent the lost ubc_info panic + SET(cp->c_flag, C_TRANSIT); recycle = 1; + if (error) goto out; } /* @@ -103,6 +106,17 @@ hfs_inactive(ap) cp->c_flag &= ~C_DELETED; cp->c_rdev = 0; + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + grabbed_lock = 1; + if (hfsmp->jnl) { + if (journal_start_transaction(hfsmp->jnl) != 0) { + error = EINVAL; + goto out; + } + started_tr = 1; + } + /* Lock catalog b-tree */ error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); if (error) goto out; @@ -148,11 +162,21 @@ hfs_inactive(ap) if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord) cp->c_flag |= C_MODIFIED; } - if (cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) { - tv = time; - VOP_UPDATE(vp, &tv, &tv, 0); - } + + if (cp->c_flag & (C_ACCESS | 
C_CHANGE | C_MODIFIED | C_UPDATE)) { + tv = time; + VOP_UPDATE(vp, &tv, &tv, 0); + } out: + // XXXdbg - have to do this because a goto could have come here + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + started_tr = 0; + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + VOP_UNLOCK(vp, 0, p); /* * If we are done with the vnode, reclaim it @@ -313,6 +337,16 @@ hfs_getcnode(struct hfsmount *hfsmp, cnid_t cnid, struct cat_desc *descp, int wa retval = ENOENT; goto exit; } + + /* Hide private journal files */ + if (hfsmp->jnl && + (cp->c_parentcnid == kRootDirID) && + ((cp->c_cnid == hfsmp->hfs_jnlfileid) || + (cp->c_cnid == hfsmp->hfs_jnlinfoblkid))) { + retval = ENOENT; + goto exit; + } + if (wantrsrc && rvp != NULL) { vp = rvp; rvp = NULL; diff --git a/bsd/hfs/hfs_format.h b/bsd/hfs/hfs_format.h index ffbef0fb9..a8833dedd 100644 --- a/bsd/hfs/hfs_format.h +++ b/bsd/hfs/hfs_format.h @@ -45,9 +45,11 @@ extern "C" { enum { kHFSSigWord = 0x4244, /* 'BD' in ASCII */ kHFSPlusSigWord = 0x482B, /* 'H+' in ASCII */ + kHFSJSigWord = 0x484a, /* 'HJ' in ASCII */ kHFSPlusVersion = 0x0004, /* will change as format changes */ /* version 4 shipped with Mac OS 8.1 */ - kHFSPlusMountVersion = 0x31302E30 /* '10.0' for Mac OS X */ + kHFSPlusMountVersion = 0x31302E30, /* '10.0' for Mac OS X */ + kHFSJMountVersion = 0x4846534a /* 'HFSJ' for journaled HFS+ on OS X */ }; @@ -452,7 +454,8 @@ enum { kHFSVolumeNoCacheRequiredBit = 10, /* don't cache volume blocks (i.e. RAM or ROM disk) */ kHFSBootVolumeInconsistentBit = 11, /* boot volume is inconsistent (System 7.6 and later) */ kHFSCatalogNodeIDsReusedBit = 12, - /* Bits 13-14 are reserved for future use */ + kHFSVolumeJournaledBit = 13, /* this volume has a journal on it */ + /* Bit 14 is reserved for future use */ kHFSVolumeSoftwareLockBit = 15, /* volume is locked by software */ kHFSVolumeHardwareLockMask = 1 << kHFSVolumeHardwareLockBit, @@ -461,6 +464,7 @@ enum { kHFSVolumeNoCacheRequiredMask = 1 << kHFSVolumeNoCacheRequiredBit, kHFSBootVolumeInconsistentMask = 1 << kHFSBootVolumeInconsistentBit, kHFSCatalogNodeIDsReusedMask = 1 << kHFSCatalogNodeIDsReusedBit, + kHFSVolumeJournaledMask = 1 << kHFSVolumeJournaledBit, kHFSVolumeSoftwareLockMask = 1 << kHFSVolumeSoftwareLockBit, kHFSMDBAttributesMask = 0x8380 }; @@ -509,7 +513,8 @@ struct HFSPlusVolumeHeader { u_int16_t version; /* == kHFSPlusVersion */ u_int32_t attributes; /* volume attributes */ u_int32_t lastMountedVersion; /* implementation version which last mounted volume */ - u_int32_t reserved; /* reserved - initialized as zero */ +//XXXdbg u_int32_t reserved; /* reserved - initialized as zero */ + u_int32_t journalInfoBlock; /* block addr of journal info (if volume is journaled, zero otherwise) */ u_int32_t createDate; /* date and time of volume creation */ u_int32_t modifyDate; /* date and time of last modification */ @@ -601,6 +606,23 @@ enum { kBTVariableIndexKeysMask = 0x00000004 /* keys in index nodes are variable length */ }; +/* JournalInfoBlock - Structure that describes where our journal lives */ +struct JournalInfoBlock { + u_int32_t flags; + u_int32_t device_signature[8]; // signature used to locate our device. 
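+	                                      // (when kJIJournalOnOtherDeviceMask is set
+	                                      //  below, this signature identifies the
+	                                      //  external device holding the journal)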
+ u_int64_t offset; // byte offset to the journal on the device + u_int64_t size; // size in bytes of the journal + u_int32_t reserved[32]; +}; +typedef struct JournalInfoBlock JournalInfoBlock; + +enum { + kJIJournalInFSMask = 0x00000001, + kJIJournalOnOtherDeviceMask = 0x00000002, + kJIJournalNeedInitMask = 0x00000004 +}; + + #pragma options align=reset #ifdef __cplusplus diff --git a/bsd/hfs/hfs_link.c b/bsd/hfs/hfs_link.c index 6a78cd752..97a36516c 100644 --- a/bsd/hfs/hfs_link.c +++ b/bsd/hfs/hfs_link.c @@ -72,12 +72,25 @@ createindirectlink(struct hfsmount *hfsmp, u_int32_t linknum, fip->fdCreator = SWAP_BE32 (kHFSPlusCreator); /* 'hfs+' */ fip->fdFlags = SWAP_BE16 (kHasBeenInited); + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if (journal_start_transaction(hfsmp->jnl) != 0) { + hfs_global_shared_lock_release(hfsmp); + return EINVAL; + } + } + /* Create the indirect link directly in the catalog */ result = cat_create(hfsmp, &desc, &attr, NULL); - if (linkcnid != NULL) + if (result == 0 && linkcnid != NULL) *linkcnid = attr.ca_fileid; + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + return (result); } @@ -111,8 +124,9 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, /* Lock catalog b-tree */ retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); - if (retval) - return retval; + if (retval) { + return retval; + } /* * If this is a new hardlink then we need to create the data @@ -123,6 +137,7 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, bzero(&to_desc, sizeof(to_desc)); to_desc.cd_parentcnid = hfsmp->hfs_privdir_desc.cd_cnid; to_desc.cd_cnid = cp->c_fileid; + do { /* get a unique indirect node number */ indnodeno = ((random() & 0x3fffffff) + 100); @@ -144,7 +159,17 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, cp->c_desc.cd_nameptr, &cp->c_desc.cd_cnid); if (retval) { /* put it source file back */ + // XXXdbg + #if 1 + { + int err; + err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL); + if (err) + panic("hfs_makelink: error %d from cat_rename backout 1", err); + } + #else (void) cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL); + #endif goto out; } cp->c_rdev = indnodeno; @@ -161,7 +186,17 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, (void) cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); /* Put the source file back */ + // XXXdbg + #if 1 + { + int err; + err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL); + if (err) + panic("hfs_makelink: error %d from cat_rename backout 2", err); + } + #else (void) cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL); + #endif goto out; } @@ -205,6 +240,7 @@ hfs_link(ap) struct componentname *a_cnp; } */ *ap; { + struct hfsmount *hfsmp; struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; @@ -214,6 +250,8 @@ hfs_link(ap) struct timeval tv; int error; + hfsmp = VTOHFS(vp); + #if HFS_DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("hfs_link: no name"); @@ -226,7 +264,7 @@ hfs_link(ap) if (VTOVCB(tdvp)->vcbSigWord != kHFSPlusSigWord) return err_link(ap); /* hfs disks don't support hard links */ - if (VTOHFS(vp)->hfs_private_metadata_dir == 0) + if (hfsmp->hfs_private_metadata_dir == 0) return err_link(ap); /* no private metadata dir, no links possible */ if (tdvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE, p))) { @@ -252,12 +290,22 @@ 
hfs_link(ap) goto out1; } + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if (journal_start_transaction(hfsmp->jnl) != 0) { + hfs_global_shared_lock_release(hfsmp); + return EINVAL; + } + } + cp->c_nlink++; cp->c_flag |= C_CHANGE; tv = time; + error = VOP_UPDATE(vp, &tv, &tv, 1); - if (!error) - error = hfs_makelink(VTOHFS(vp), cp, tdcp, cnp); + if (!error) { + error = hfs_makelink(hfsmp, cp, tdcp, cnp); + } if (error) { cp->c_nlink--; cp->c_flag |= C_CHANGE; @@ -268,10 +316,21 @@ hfs_link(ap) tdcp->c_flag |= C_CHANGE | C_UPDATE; tv = time; (void) VOP_UPDATE(tdvp, &tv, &tv, 0); - hfs_volupdate(VTOHFS(vp), VOL_MKFILE, + + hfs_volupdate(hfsmp, VOL_MKFILE, (tdcp->c_cnid == kHFSRootFolderID)); } + + // XXXdbg - need to do this here as well because cp could have changed + error = VOP_UPDATE(vp, &tv, &tv, 1); + FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); + + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + out1: if (tdvp != vp) VOP_UNLOCK(vp, 0, p); diff --git a/bsd/hfs/hfs_lookup.c b/bsd/hfs/hfs_lookup.c index 824f615dc..db88b99c0 100644 --- a/bsd/hfs/hfs_lookup.c +++ b/bsd/hfs/hfs_lookup.c @@ -261,8 +261,9 @@ notfound: * creation of files in the directory. */ retval = VOP_ACCESS(dvp, VWRITE, cred, cnp->cn_proc); - if (retval) + if (retval) { goto exit; + } cnp->cn_flags |= SAVENAME; if (!(flags & LOCKPARENT)) diff --git a/bsd/hfs/hfs_mount.h b/bsd/hfs/hfs_mount.h index 06afe6df8..502926a42 100644 --- a/bsd/hfs/hfs_mount.h +++ b/bsd/hfs/hfs_mount.h @@ -52,10 +52,15 @@ struct hfs_mount_args { u_long hfs_encoding; /* encoding for this volume (standard HFS only) */ struct timezone hfs_timezone; /* user time zone info (standard HFS only) */ int flags; /* mounting flags, see below */ + int journal_tbuffer_size; /* size in bytes of the journal transaction buffer */ + int journal_flags; /* flags to pass to journal_open/create */ + int journal_disable; /* don't use journaling (potentially dangerous) */ }; #define HFSFSMNT_NOXONFILES 0x1 /* disable execute permissions for files */ #define HFSFSMNT_WRAPPER 0x2 /* mount HFS wrapper (if it exists) */ +#define HFSFSMNT_EXTENDED_ARGS 0x4 /* indicates new fields after "flags" are valid */ + #endif /* __APPLE_API_UNSTABLE */ #endif /* ! 
_HFS_MOUNT_H_ */ diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c index 4544a7685..6f0311411 100644 --- a/bsd/hfs/hfs_readwrite.c +++ b/bsd/hfs/hfs_readwrite.c @@ -267,6 +267,8 @@ hfs_write(ap) int retval; off_t filebytes; u_long fileblocks; + struct hfsmount *hfsmp; + int started_tr = 0, grabbed_lock = 0; ioflag = ap->a_ioflag; @@ -288,6 +290,16 @@ hfs_write(ap) if ((cp->c_flags & APPEND) && uio->uio_offset != fp->ff_size) return (EPERM); + // XXXdbg - don't allow modification of the journal or journal_info_block + if (VTOHFS(vp)->jnl && cp->c_datafork) { + struct HFSPlusExtentDescriptor *extd; + + extd = &cp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) { + return EPERM; + } + } + writelimit = uio->uio_offset + uio->uio_resid; /* @@ -333,13 +345,26 @@ hfs_write(ap) if(writelimit > filebytes) { bytesToAdd = writelimit - filebytes; - retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, fp->ff_clumpsize)), + retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, vcb->blockSize)), ap->a_cred, 0); if (retval) return (retval); } #endif /* QUOTA */ + hfsmp = VTOHFS(vp); + if (writelimit > filebytes) { + hfs_global_shared_lock_acquire(hfsmp); + grabbed_lock = 1; + } + if (hfsmp->jnl && (writelimit > filebytes)) { + if (journal_start_transaction(hfsmp->jnl) != 0) { + hfs_global_shared_lock_release(hfsmp); + return EINVAL; + } + started_tr = 1; + } + while (writelimit > filebytes) { bytesToAdd = writelimit - filebytes; @@ -364,6 +389,17 @@ hfs_write(ap) (int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size, (int)filebytes, 0); } + // XXXdbg + if (started_tr) { + hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); + journal_end_transaction(hfsmp->jnl); + started_tr = 0; + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + grabbed_lock = 0; + } + if (UBCISVALID(vp) && retval == E_NONE) { off_t filesize; off_t zero_off; @@ -952,6 +988,7 @@ hfs_cmap(ap) struct proc *p = NULL; struct rl_entry *invalid_range; enum rl_overlaptype overlaptype; + int started_tr = 0, grabbed_lock = 0; /* * Check for underlying vnode requests and ensure that logical @@ -960,12 +997,37 @@ hfs_cmap(ap) if (ap->a_bpn == NULL) return (0); - if (overflow_extents(fp) || fp->ff_unallocblocks) { + p = current_proc(); + if (fp->ff_unallocblocks) { lockExtBtree = 1; - p = current_proc(); + + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + grabbed_lock = 1; + + if (hfsmp->jnl) { + if (journal_start_transaction(hfsmp->jnl) != 0) { + hfs_global_shared_lock_release(hfsmp); + return EINVAL; + } else { + started_tr = 1; + } + } + if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) { + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } return (retval); - } + } + } else if (overflow_extents(fp)) { + lockExtBtree = 1; + if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) { + return retval; + } } /* @@ -1007,9 +1069,16 @@ hfs_cmap(ap) } if (retval) { - (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p); - return (retval); - } + (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p); + if (started_tr) { + hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + return (retval); + } VTOC(ap->a_vp)->c_flag |= C_MODIFIED; } @@ -1024,6 +1093,17 @@ 
hfs_cmap(ap) if (lockExtBtree) (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p); + // XXXdbg + if (started_tr) { + hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); + journal_end_transaction(hfsmp->jnl); + started_tr = 0; + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + grabbed_lock = 0; + } + if (retval == E_NONE) { /* Adjust the mapping information for invalid file ranges: */ overlaptype = rl_scan(&fp->ff_invalidranges, @@ -1153,6 +1233,11 @@ hfs_strategy_fragmented(struct buf *bp) } frag->b_vp = NULL; + // + // XXXdbg - in the case that this is a meta-data block, it won't affect + // the journal because this bp is for a physical disk block, + // not a logical block that is part of the catalog or extents + // files. SET(frag->b_flags, B_INVAL); brelse(frag); @@ -1291,6 +1376,7 @@ int hfs_truncate(ap) off_t filebytes; u_long fileblocks; int blksize; + struct hfsmount *hfsmp; if (vp->v_type != VREG && vp->v_type != VLNK) return (EISDIR); /* cannot truncate an HFS directory! */ @@ -1309,6 +1395,7 @@ int hfs_truncate(ap) if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE)) return (EFBIG); + hfsmp = VTOHFS(vp); tv = time; retval = E_NONE; @@ -1329,7 +1416,7 @@ int hfs_truncate(ap) */ if (length > fp->ff_size) { #if QUOTA - retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, fp->ff_clumpsize)), + retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)), ap->a_cred, 0); if (retval) goto Err_Exit; @@ -1347,10 +1434,25 @@ int hfs_truncate(ap) if (suser(ap->a_cred, NULL) != 0) eflags |= kEFReserveMask; /* keep a reserve */ + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if (journal_start_transaction(hfsmp->jnl) != 0) { + retval = EINVAL; + goto Err_Exit; + } + } + /* lock extents b-tree (also protects volume bitmap) */ retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p); - if (retval) + if (retval) { + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + goto Err_Exit; + } while ((length > filebytes) && (retval == E_NONE)) { bytesToAdd = length - filebytes; @@ -1368,7 +1470,16 @@ int hfs_truncate(ap) break; } } /* endwhile */ + (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p); + + // XXXdbg + if (hfsmp->jnl) { + hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + if (retval) goto Err_Exit; @@ -1484,16 +1595,38 @@ int hfs_truncate(ap) #if QUOTA off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize); #endif /* QUOTA */ + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if (journal_start_transaction(hfsmp->jnl) != 0) { + retval = EINVAL; + goto Err_Exit; + } + } + /* lock extents b-tree (also protects volume bitmap) */ retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p); - if (retval) + if (retval) { + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); goto Err_Exit; + } if (fp->ff_unallocblocks == 0) retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, false)); (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p); + + // XXXdbg + if (hfsmp->jnl) { + hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + filebytes = (off_t)fp->ff_blocks * (off_t)blksize; if (retval) goto Err_Exit; @@ -1564,6 
+1697,9 @@ int hfs_allocate(ap) int retval, retval2; UInt32 blockHint; UInt32 extendFlags =0; /* For call to ExtendFileC */ + struct hfsmount *hfsmp; + + hfsmp = VTOHFS(vp); *(ap->a_bytesallocated) = 0; fileblocks = fp->ff_blocks; @@ -1610,15 +1746,31 @@ int hfs_allocate(ap) moreBytesRequested = length - filebytes; #if QUOTA - retval = hfs_chkdq(cp, (int64_t)(roundup(moreBytesRequested, fp->ff_clumpsize)), + retval = hfs_chkdq(cp, + (int64_t)(roundup(moreBytesRequested, VTOVCB(vp)->blockSize)), ap->a_cred, 0); if (retval) return (retval); #endif /* QUOTA */ + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if (journal_start_transaction(hfsmp->jnl) != 0) { + retval = EINVAL; + goto Err_Exit; + } + } + /* lock extents b-tree (also protects volume bitmap) */ retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p); - if (retval) goto Err_Exit; + if (retval) { + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + goto Err_Exit; + } retval = MacToVFSError(ExtendFileC(VTOVCB(vp), (FCB*)fp, @@ -1629,8 +1781,16 @@ int hfs_allocate(ap) *(ap->a_bytesallocated) = actualBytesAdded; filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize; + (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p); + // XXXdbg + if (hfsmp->jnl) { + hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + /* * if we get an error and no changes were made then exit * otherwise we must do the VOP_UPDATE to reflect the changes @@ -1661,9 +1821,25 @@ int hfs_allocate(ap) (void) vinvalbuf(vp, vflags, ap->a_cred, ap->a_p, 0, 0); } + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if (journal_start_transaction(hfsmp->jnl) != 0) { + retval = EINVAL; + goto Err_Exit; + } + } + /* lock extents b-tree (also protects volume bitmap) */ retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p); - if (retval) goto Err_Exit; + if (retval) { + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + + goto Err_Exit; + } retval = MacToVFSError( TruncateFileC( @@ -1673,6 +1849,14 @@ int hfs_allocate(ap) false)); (void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p); filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize; + + if (hfsmp->jnl) { + hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0); + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + + /* * if we get an error and no changes were made then exit * otherwise we must do the VOP_UPDATE to reflect the changes @@ -1794,9 +1978,9 @@ hfs_bwrite(ap) } */ *ap; { int retval = 0; -#if BYTE_ORDER == LITTLE_ENDIAN register struct buf *bp = ap->a_bp; register struct vnode *vp = bp->b_vp; +#if BYTE_ORDER == LITTLE_ENDIAN BlockDescriptor block; /* Trap B-Tree writes */ @@ -1820,8 +2004,12 @@ hfs_bwrite(ap) } #endif /* This buffer shouldn't be locked anymore but if it is clear it */ - if (ISSET(ap->a_bp->b_flags, B_LOCKED)) { - CLR(ap->a_bp->b_flags, B_LOCKED); + if (ISSET(bp->b_flags, B_LOCKED)) { + // XXXdbg + if (VTOHFS(vp)->jnl) { + panic("hfs: CLEARING the lock bit on bp 0x%x\n", bp); + } + CLR(bp->b_flags, B_LOCKED); printf("hfs_bwrite: called with lock bit set\n"); } retval = vn_bwrite (ap); diff --git a/bsd/hfs/hfs_search.c b/bsd/hfs/hfs_search.c index 0c7638fbe..84aecbb01 100644 --- a/bsd/hfs/hfs_search.c +++ 
b/bsd/hfs/hfs_search.c @@ -193,6 +193,8 @@ hfs_search( ap ) CatalogRecord * myCurrentDataPtr; CatPosition * myCatPositionPtr; BTScanState myBTScanState; + void *user_start = NULL; + int user_len; /* XXX Parameter check a_searchattrs? */ @@ -223,6 +225,20 @@ hfs_search( ap ) MALLOC( attributesBuffer, void *, eachReturnBufferSize, M_TEMP, M_WAITOK ); variableBuffer = (void*)((char*) attributesBuffer + fixedBlockSize); + // XXXdbg - have to lock the user's buffer so we don't fault + // while holding the shared catalog file lock. see the comment + // in hfs_readdir() for more details. + // + if (VTOHFS(ap->a_vp)->jnl && ap->a_uio->uio_segflg == UIO_USERSPACE) { + user_start = ap->a_uio->uio_iov->iov_base; + user_len = ap->a_uio->uio_iov->iov_len; + + if ((err = vslock(user_start, user_len)) != 0) { + user_start = NULL; + goto ExitThisRoutine; + } + } + /* Lock catalog b-tree */ err = hfs_metafilelocking(VTOHFS(ap->a_vp), kHFSCatalogFileID, LK_SHARED, p); if (err) @@ -383,6 +399,10 @@ QuickExit: ExitThisRoutine: FREE( attributesBuffer, M_TEMP ); + if (VTOHFS(ap->a_vp)->jnl && user_start) { + vsunlock(user_start, user_len, TRUE); + } + return (MacToVFSError(err)); } @@ -858,6 +878,14 @@ InsertMatch( struct vnode *root_vp, struct uio *a_uio, CatalogRecord *rec, goto exit; } + /* Hide the private journal files */ + if (VTOHFS(root_vp)->jnl && + ((c_attr.ca_fileid == VTOHFS(root_vp)->hfs_jnlfileid) || + (c_attr.ca_fileid == VTOHFS(root_vp)->hfs_jnlinfoblkid))) { + err = 0; + goto exit; + } + if (returnAttrList->commonattr & ATTR_CMN_NAME) { cat_convertkey(VTOHFS(root_vp), key, rec, &c_desc); } else { diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c index c92af136d..cff8a45ec 100644 --- a/bsd/hfs/hfs_vfsops.c +++ b/bsd/hfs/hfs_vfsops.c @@ -77,6 +77,9 @@ #include #include +// XXXdbg +#include + #include #include @@ -259,6 +262,8 @@ hfs_mount(mp, path, data, ndp, p) (HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord)) { /* setup private/hidden directory for unlinked files */ hfsmp->hfs_private_metadata_dir = FindMetaDataDirectory(HFSTOVCB(hfsmp)); + if (hfsmp->jnl) + hfs_remove_orphans(hfsmp); } if (args.fspec == 0) { @@ -325,7 +330,6 @@ hfs_mount(mp, path, data, ndp, p) goto error_exit; } - /* Set the mount flag to indicate that we support volfs */ mp->mnt_flag |= MNT_DOVOLFS; if (VFSTOVCB(mp)->vcbSigWord == kHFSSigWord) { @@ -333,6 +337,7 @@ hfs_mount(mp, path, data, ndp, p) mp->mnt_flag |= MNT_FIXEDSCRIPTENCODING; } (void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN-1, &size); + bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size); (void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); @@ -641,6 +646,7 @@ loop: vcb->vcbLsMod = to_bsd_time(SWAP_BE32(vhp->modifyDate)); vcb->vcbAtrb = (UInt16) SWAP_BE32 (vhp->attributes); /* VCB only uses lower 16 bits */ + vcb->vcbJinfoBlock = SWAP_BE32(vhp->journalInfoBlock); vcb->vcbClpSiz = SWAP_BE32 (vhp->rsrcClumpSize); vcb->vcbNxtCNID = SWAP_BE32 (vhp->nextCatalogID); vcb->vcbVolBkUp = to_bsd_time(SWAP_BE32(vhp->backupDate)); @@ -720,6 +726,84 @@ loop: } +static int +get_raw_device(char *fspec, int is_user, int ronly, struct vnode **rvp, struct ucred *cred, struct proc *p) +{ + char *rawbuf; + char *dp; + size_t namelen; + struct nameidata nd; + int retval; + + *rvp = NULL; + + MALLOC(rawbuf, char *, MAXPATHLEN, M_HFSMNT, M_WAITOK); + if (rawbuf == NULL) { + retval = ENOMEM; + goto error_exit; + } + + if (is_user) { + retval = copyinstr(fspec, rawbuf, MAXPATHLEN - 
1, &namelen); + if (retval != E_NONE) { + FREE(rawbuf, M_HFSMNT); + goto error_exit; + } + } else { + strcpy(rawbuf, fspec); + namelen = strlen(rawbuf); + } + + /* make sure it's null terminated */ + rawbuf[MAXPATHLEN-1] = '\0'; + + dp = &rawbuf[namelen-1]; + while(dp >= rawbuf && *dp != '/') { + dp--; + } + + if (dp != NULL) { + dp++; + } else { + dp = rawbuf; + } + + /* make room for and insert the 'r' for the raw device */ + memmove(dp+1, dp, strlen(dp)+1); + *dp = 'r'; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, rawbuf, p); + retval = namei(&nd); + if (retval != E_NONE) { + DBG_ERR(("hfs_mountfs: can't open raw device for journal: %s, %x\n", rawbuf, nd.ni_vp->v_rdev)); + FREE(rawbuf, M_HFSMNT); + goto error_exit; + } + + *rvp = nd.ni_vp; + if ((retval = VOP_OPEN(*rvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p))) { + *rvp = NULL; + goto error_exit; + } + + // don't need this any more + FREE(rawbuf, M_HFSMNT); + + return 0; + + error_exit: + if (*rvp) { + (void)VOP_CLOSE(*rvp, ronly ? FREAD : FREAD|FWRITE, cred, p); + } + + if (rawbuf) { + FREE(rawbuf, M_HFSMNT); + } + return retval; +} + + + /* * Common code for mount and mountroot */ @@ -741,6 +825,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, u_int32_t blksize; u_int32_t minblksize; u_int32_t iswritable; + daddr_t mdb_offset; dev = devvp->v_rdev; cred = p ? p->p_ucred : NOCRED; @@ -825,6 +910,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, return (retval); } + mdb_offset = HFS_PRI_SECTOR(blksize); if ((retval = meta_bread(devvp, HFS_PRI_SECTOR(blksize), blksize, cred, &bp))) { goto error_exit; } @@ -837,7 +923,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, bzero(hfsmp, sizeof(struct hfsmount)); simple_lock_init(&hfsmp->hfs_renamelock); - + /* * Init the volume information structure */ @@ -932,6 +1018,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, } else /* Mount an HFS Plus disk */ { HFSPlusVolumeHeader *vhp; off_t embeddedOffset; + int jnl_disable = 0; /* Get the embedded Volume Header */ if (SWAP_BE16(mdbp->drEmbedSigWord) == kHFSPlusSigWord) { @@ -973,8 +1060,8 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, hfsmp->hfs_phys_block_count = disksize / blksize; - retval = meta_bread(devvp, (embeddedOffset / blksize) + - HFS_PRI_SECTOR(blksize), blksize, cred, &bp); + mdb_offset = (embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize); + retval = meta_bread(devvp, mdb_offset, blksize, cred, &bp); if (retval) goto error_exit; bcopy(bp->b_data + HFS_PRI_OFFSET(blksize), mdbp, 512); @@ -987,9 +1074,42 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, vhp = (HFSPlusVolumeHeader*) mdbp; } + // XXXdbg + // + hfsmp->jnl = NULL; + hfsmp->jvp = NULL; + if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS) && args->journal_disable) { + jnl_disable = 1; + } + + // + // We only initialize the journal here if the last person + // to mount this volume was journaling aware. Otherwise + // we delay journal initialization until later at the end + // of hfs_MountHFSPlusVolume() because the last person who + // mounted it could have messed things up behind our back + // (so we need to go find the .journal file, make sure it's + // the right size, re-sync up if it was moved, etc). + // + if ( (SWAP_BE32(vhp->lastMountedVersion) == kHFSJMountVersion) + && (SWAP_BE32(vhp->attributes) & kHFSVolumeJournaledMask) + && !jnl_disable) { + + // if we're able to init the journal, mark the mount + // point as journaled. 
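+	// (For reference: kHFSJMountVersion is 'HFSJ', 0x4846534a, while a
+	//  non-journaling HFS+ implementation writes kHFSPlusMountVersion,
+	//  '10.0' -- see the hfs_format.h hunk above.)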
+ // + if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) { + mp->mnt_flag |= MNT_JOURNALED; + } else { + retval = EINVAL; + goto error_exit; + } + } + // XXXdbg + (void) hfs_getconverter(0, &hfsmp->hfs_get_unicode, &hfsmp->hfs_get_hfsname); - retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p); + retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args); /* * If the backend didn't like our physical blocksize * then retry with physical blocksize of 512. @@ -1012,7 +1132,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p, hfsmp->hfs_phys_block_size = blksize; /* Try again with a smaller block size... */ - retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p); + retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args); } if (retval) (void) hfs_relconverter(0); @@ -1039,6 +1159,10 @@ error_exit: if (mdbp) FREE(mdbp, M_TEMP); (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, cred, p); + if (hfsmp && hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) { + (void)VOP_CLOSE(hfsmp->jvp, ronly ? FREAD : FREAD|FWRITE, cred, p); + hfsmp->jvp = NULL; + } if (hfsmp) { FREE(hfsmp, M_HFSMNT); mp->mnt_data = (qaddr_t)0; @@ -1075,6 +1199,7 @@ hfs_unmount(mp, mntflags, p) int retval = E_NONE; int flags; int force; + int started_tr = 0, grabbed_lock = 0; flags = 0; force = 0; @@ -1090,17 +1215,33 @@ hfs_unmount(mp, mntflags, p) * Flush out the b-trees, volume bitmap and Volume Header */ if (hfsmp->hfs_fs_ronly == 0) { + hfs_global_shared_lock_acquire(hfsmp); + grabbed_lock = 1; + if (hfsmp->jnl) { + journal_start_transaction(hfsmp->jnl); + started_tr = 1; + } + retval = VOP_FSYNC(HFSTOVCB(hfsmp)->catalogRefNum, NOCRED, MNT_WAIT, p); if (retval && !force) - return (retval); - + goto err_exit; + retval = VOP_FSYNC(HFSTOVCB(hfsmp)->extentsRefNum, NOCRED, MNT_WAIT, p); if (retval && !force) - return (retval); + goto err_exit; + + // if we have an allocation file, sync it too so we don't leave dirty + // blocks around + if (HFSTOVCB(hfsmp)->allocationsRefNum) { + if (retval = VOP_FSYNC(HFSTOVCB(hfsmp)->allocationsRefNum, NOCRED, MNT_WAIT, p)) { + if (!force) + goto err_exit; + } + } if (retval = VOP_FSYNC(hfsmp->hfs_devvp, NOCRED, MNT_WAIT, p)) { if (!force) - return (retval); + goto err_exit; } /* See if this volume is damaged, is so do not unmount cleanly */ @@ -1110,14 +1251,27 @@ hfs_unmount(mp, mntflags, p) HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask; } - retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); + retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1); if (retval) { HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask; if (!force) - return (retval); /* could not flush everything */ + goto err_exit; /* could not flush everything */ + } + + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + started_tr = 0; + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + grabbed_lock = 0; } } + if (hfsmp->jnl) { + journal_flush(hfsmp->jnl); + } + /* * Invalidate our caches and release metadata vnodes */ @@ -1126,6 +1280,19 @@ hfs_unmount(mp, mntflags, p) if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) (void) hfs_relconverter(hfsmp->hfs_encoding); + // XXXdbg + if (hfsmp->jnl) { + journal_close(hfsmp->jnl); + } + + if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) { + retval = VOP_CLOSE(hfsmp->jvp, hfsmp->hfs_fs_ronly ? 
FREAD : FREAD|FWRITE, + NOCRED, p); + vrele(hfsmp->jvp); + hfsmp->jvp = NULL; + } + // XXXdbg + hfsmp->hfs_devvp->v_specflags &= ~SI_MOUNTEDON; retval = VOP_CLOSE(hfsmp->hfs_devvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE, @@ -1137,6 +1304,15 @@ hfs_unmount(mp, mntflags, p) FREE(hfsmp, M_HFSMNT); mp->mnt_data = (qaddr_t)0; return (0); + + err_exit: + if (hfsmp->jnl && started_tr) { + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + return retval; } @@ -1241,6 +1417,8 @@ hfs_quotactl(mp, cmds, uid, arg, p) } + + /* * Get file system statistics. */ @@ -1276,6 +1454,70 @@ hfs_statfs(mp, sbp, p) } +// +// XXXdbg -- this is a callback to be used by the journal to +// get meta data blocks flushed out to disk. +// +// XXXdbg -- be smarter and don't flush *every* block on each +// call. try to only flush some so we don't wind up +// being too synchronous. +// +__private_extern__ +void +hfs_sync_metadata(void *arg) +{ + struct mount *mp = (struct mount *)arg; + struct cnode *cp; + struct hfsmount *hfsmp; + ExtendedVCB *vcb; + struct vnode *meta_vp[3]; + struct buf *bp; + int i, sectorsize, priIDSector, altIDSector, retval; + int error, allerror = 0; + + hfsmp = VFSTOHFS(mp); + vcb = HFSTOVCB(hfsmp); + + bflushq(BQ_META, mp); + + +#if 1 // XXXdbg - I do not believe this is necessary... + // but if I pull it out, then the journal + // does not seem to get flushed properly + // when it is closed.... + + // now make sure the super block is flushed + sectorsize = hfsmp->hfs_phys_block_size; + priIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) + + HFS_PRI_SECTOR(sectorsize); + retval = meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp); + if (retval != 0) { + panic("hfs: sync_metadata: can't read super-block?! (retval 0x%x, priIDSector)\n", + retval, priIDSector); + } + + if (retval == 0 && (bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) { + bwrite(bp); + } else if (bp) { + brelse(bp); + } + + // the alternate super block... + // XXXdbg - we probably don't need to do this each and every time. + // hfs_btreeio.c:FlushAlternate() should flag when it was + // written... + altIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) + + HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count); + retval = meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &bp); + if (retval == 0 && (bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) { + bwrite(bp); + } else if (bp) { + brelse(bp); + } +#endif + +} + /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; @@ -1310,6 +1552,17 @@ hfs_sync(mp, waitfor, cred, p) panic("update: rofs mod"); }; +#if 0 + // XXXdbg first go through and flush out any modified + // meta data blocks so they go out in order... + bflushq(BQ_META, mp); + bflushq(BQ_LRU, mp); + // only flush locked blocks if we're not doing journaling + if (hfsmp->jnl == NULL) { + bflushq(BQ_LOCKED, mp); + } +#endif + /* * Write back each 'modified' vnode */ @@ -1326,10 +1579,19 @@ loop: simple_unlock(&mntvnode_slock); goto loop; } + simple_lock(&vp->v_interlock); nvp = vp->v_mntvnodes.le_next; + cp = VTOC(vp); + // restart our whole search if this guy is locked + // or being reclaimed. 
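+	// (a NULL cnode or a VXLOCK/VORECLAIM vnode is mid-recycle and
+	// its fields can't be trusted, so drop the interlock and move
+	// on to the next vnode from the mount list.)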
+ if (cp == NULL || vp->v_flag & (VXLOCK|VORECLAIM)) { + simple_unlock(&vp->v_interlock); + continue; + } + if ((vp->v_flag & VSYSTEM) || (vp->v_type == VNON) || (((cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) == 0) && (vp->v_dirtyblkhd.lh_first == NULL) && !(vp->v_flag & VHASDIRTY))) { @@ -1372,6 +1634,7 @@ loop: btvp = btvp = meta_vp[i];; if ((btvp==0) || (btvp->v_type == VNON) || (btvp->v_mount != mp)) continue; + simple_lock(&btvp->v_interlock); cp = VTOC(btvp); if (((cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) == 0) && @@ -1409,11 +1672,22 @@ loop: */ if (IsVCBDirty(vcb)) { + // XXXdbg - debugging, remove + if (hfsmp->jnl) { + //printf("hfs: sync: strange, a journaled volume w/dirty VCB? jnl 0x%x hfsmp 0x%x\n", + // hfsmp->jnl, hfsmp); + } + error = hfs_flushvolumeheader(hfsmp, waitfor, 0); - if (error) - allerror = error; + if (error) + allerror = error; } + if (hfsmp->jnl) { + journal_flush(hfsmp->jnl); + } + + err_exit: return (allerror); } @@ -1534,6 +1808,10 @@ hfs_init(vfsp) } +// XXXdbg +#include + + /* * HFS filesystem related variables. */ @@ -1550,12 +1828,133 @@ hfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) extern u_int32_t hfs_encodingbias; /* all sysctl names at this level are terminal */ - if (namelen != 1) - return (ENOTDIR); /* overloaded */ if (name[0] == HFS_ENCODINGBIAS) return (sysctl_int(oldp, oldlenp, newp, newlen, &hfs_encodingbias)); + else if (name[0] == 0x082969) { + // make the file system journaled... + struct vnode *vp = p->p_fd->fd_cdir, *jvp; + struct hfsmount *hfsmp; + ExtendedVCB *vcb; + int retval; + struct cat_attr jnl_attr, jinfo_attr; + struct cat_fork jnl_fork, jinfo_fork; + void *jnl = NULL; + + hfsmp = VTOHFS(vp); + if (hfsmp->hfs_fs_ronly) { + return EROFS; + } + if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) { + printf("hfs: can't make a plain hfs volume journaled.\n"); + return EINVAL; + } + + if (hfsmp->jnl) { + printf("hfs: volume @ mp 0x%x is already journaled!\n", vp->v_mount); + return EAGAIN; + } + + vcb = HFSTOVCB(hfsmp); + if (BTHasContiguousNodes(VTOF(vcb->catalogRefNum)) == 0 || + BTHasContiguousNodes(VTOF(vcb->extentsRefNum)) == 0) { + + printf("hfs: volume has a btree w/non-contiguous nodes. can not enable journaling.\n"); + return EINVAL; + } + + // make sure these both exist! + if ( GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jinfo_attr, &jinfo_fork) == 0 + || GetFileInfo(vcb, kRootDirID, ".journal", &jnl_attr, &jnl_fork) == 0) { + + return EINVAL; + } + + hfs_sync(hfsmp->hfs_mp, MNT_WAIT, FSCRED, p); + bflushq(BQ_META); + + printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", + (off_t)name[2], (off_t)name[3]); + + jvp = hfsmp->hfs_devvp; + jnl = journal_create(jvp, + (off_t)name[2] * (off_t)HFSTOVCB(hfsmp)->blockSize + + HFSTOVCB(hfsmp)->hfsPlusIOPosOffset, + (off_t)name[3], + hfsmp->hfs_devvp, + hfsmp->hfs_phys_block_size, + 0, + 0, + hfs_sync_metadata, hfsmp->hfs_mp); + + if (jnl == NULL) { + printf("hfs: FAILED to create the journal!\n"); + if (jvp && jvp != hfsmp->hfs_devvp) { + VOP_CLOSE(jvp, hfsmp->hfs_fs_ronly ? 
FREAD : FREAD|FWRITE, FSCRED, p); + } + jvp = NULL; + + return EINVAL; + } + + hfs_global_exclusive_lock_acquire(hfsmp); + + HFSTOVCB(hfsmp)->vcbJinfoBlock = name[1]; + HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeJournaledMask; + hfsmp->jvp = jvp; + hfsmp->jnl = jnl; + + // save this off for the hack-y check in hfs_remove() + hfsmp->jnl_start = (u_int32_t)name[2]; + hfsmp->hfs_jnlinfoblkid = jinfo_attr.ca_fileid; + hfsmp->hfs_jnlfileid = jnl_attr.ca_fileid; + + hfsmp->hfs_mp->mnt_flag |= MNT_JOURNALED; + + hfs_global_exclusive_lock_release(hfsmp); + hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1); + + return 0; + } else if (name[0] == 0x031272) { + // clear the journaling bit + struct vnode *vp = p->p_fd->fd_cdir; + struct hfsmount *hfsmp; + void *jnl; + int retval; + + hfsmp = VTOHFS(vp); + if (hfsmp->jnl == NULL) { + return EINVAL; + } + + printf("hfs: disabling journaling for mount @ 0x%x\n", vp->v_mount); + + jnl = hfsmp->jnl; + + hfs_global_exclusive_lock_acquire(hfsmp); + + // Lights out for you buddy! + hfsmp->jnl = NULL; + journal_close(jnl); + + if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) { + VOP_CLOSE(hfsmp->jvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE, FSCRED, p); + } + hfsmp->jnl = NULL; + hfsmp->jvp = NULL; + hfsmp->hfs_mp->mnt_flag &= ~MNT_JOURNALED; + hfsmp->jnl_start = 0; + hfsmp->hfs_jnlinfoblkid = 0; + hfsmp->hfs_jnlfileid = 0; + + HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeJournaledMask; + + hfs_global_exclusive_lock_release(hfsmp); + hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1); + + return 0; + } return (EOPNOTSUPP); } @@ -1688,6 +2087,11 @@ hfs_volupdate(struct hfsmount *hfsmp, enum volop op, int inroot) --vcb->vcbNmFls; break; } + + if (hfsmp->jnl) { + hfs_flushvolumeheader(hfsmp, 0, 0); + } + return (0); } @@ -1704,7 +2108,6 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) ByteCount namelen; sectorsize = hfsmp->hfs_phys_block_size; - retval = bread(hfsmp->hfs_devvp, HFS_PRI_SECTOR(sectorsize), sectorsize, NOCRED, &bp); if (retval) { if (bp) @@ -1716,6 +2119,10 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) DBG_ASSERT(bp->b_data != NULL); DBG_ASSERT(bp->b_bcount == size); + if (hfsmp->jnl) { + panic("hfs: standard hfs volumes should not be journaled!\n"); + } + mdb = (HFSMasterDirectoryBlock *)(bp->b_data + HFS_PRI_OFFSET(sectorsize)); mdb->drCrDate = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbCrDate))); @@ -1770,6 +2177,7 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) if (meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &alt_bp) == 0) { bcopy(mdb, alt_bp->b_data + HFS_ALT_OFFSET(sectorsize), kMDBSize); + (void) VOP_BWRITE(alt_bp); } else if (alt_bp) brelse(alt_bp); @@ -1777,7 +2185,7 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) if (waitfor != MNT_WAIT) bawrite(bp); - else + else retval = VOP_BWRITE(bp); MarkVCBClean( vcb ); @@ -1809,13 +2217,32 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) priIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) + HFS_PRI_SECTOR(sectorsize); + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if (journal_start_transaction(hfsmp->jnl) != 0) { + hfs_global_shared_lock_release(hfsmp); + return EINVAL; + } + } + retval = meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp); if (retval) { if (bp) brelse(bp); + + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + return (retval); } + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, 
bp); + } + volumeHeader = (HFSPlusVolumeHeader *)((char *)bp->b_data + HFS_PRI_OFFSET(sectorsize)); /* @@ -1839,9 +2266,19 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) if ( SWAP_BE32 (mdb->drCrDate) != vcb->localCreateDate ) { + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, bp2); + } + mdb->drCrDate = SWAP_BE32 (vcb->localCreateDate); /* pick up the new create date */ - (void) VOP_BWRITE(bp2); /* write out the changes */ + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_end(hfsmp->jnl, bp2); + } else { + (void) VOP_BWRITE(bp2); /* write out the changes */ + } } else { @@ -1850,9 +2287,36 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) } } +// XXXdbg - only monkey around with the volume signature on non-root volumes +// +#if 0 + if (hfsmp->jnl && + hfsmp->hfs_fs_ronly == 0 && + (HFSTOVFS(hfsmp)->mnt_flag & MNT_ROOTFS) == 0) { + + int old_sig = volumeHeader->signature; + + if (vcb->vcbAtrb & kHFSVolumeUnmountedMask) { + volumeHeader->signature = kHFSPlusSigWord; + } else { + volumeHeader->signature = kHFSJSigWord; + } + + if (old_sig != volumeHeader->signature) { + altflush = 1; + } + } +#endif +// XXXdbg + /* Note: only update the lower 16 bits worth of attributes */ volumeHeader->attributes = SWAP_BE32 ((SWAP_BE32 (volumeHeader->attributes) & 0xFFFF0000) + (UInt16) vcb->vcbAtrb); - volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion); + volumeHeader->journalInfoBlock = SWAP_BE32(vcb->vcbJinfoBlock); + if (hfsmp->jnl) { + volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSJMountVersion); + } else { + volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion); + } volumeHeader->createDate = SWAP_BE32 (vcb->localCreateDate); /* volume create date is in local time */ volumeHeader->modifyDate = SWAP_BE32 (to_hfs_time(vcb->vcbLsMod)); volumeHeader->backupDate = SWAP_BE32 (to_hfs_time(vcb->vcbVolBkUp)); @@ -1918,22 +2382,38 @@ hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count); if (meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &alt_bp) == 0) { + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, alt_bp); + } + bcopy(volumeHeader, alt_bp->b_data + HFS_ALT_OFFSET(sectorsize), kMDBSize); - (void) VOP_BWRITE(alt_bp); + + if (hfsmp->jnl) { + journal_modify_block_end(hfsmp->jnl, alt_bp); + } else { + (void) VOP_BWRITE(alt_bp); + } } else if (alt_bp) brelse(alt_bp); } - if (waitfor != MNT_WAIT) - bawrite(bp); - else { - retval = VOP_BWRITE(bp); - /* When critical data changes, flush the device cache */ - if (critical && (retval == 0)) { + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_end(hfsmp->jnl, bp); + journal_end_transaction(hfsmp->jnl); + } else { + if (waitfor != MNT_WAIT) + bawrite(bp); + else { + retval = VOP_BWRITE(bp); + /* When critical data changes, flush the device cache */ + if (critical && (retval == 0)) { (void) VOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, - NULL, FWRITE, NOCRED, current_proc()); + NULL, FWRITE, NOCRED, current_proc()); + } } } + hfs_global_shared_lock_release(hfsmp); vcb->vcbFlags &= 0x00FF; return (retval); diff --git a/bsd/hfs/hfs_vfsutils.c b/bsd/hfs/hfs_vfsutils.c index c45f8a898..386acae02 100644 --- a/bsd/hfs/hfs_vfsutils.c +++ b/bsd/hfs/hfs_vfsutils.c @@ -55,6 +55,7 @@ extern uid_t console_user; static void ReleaseMetaFileVNode(struct vnode *vp); +static int hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void 
*_args); u_int32_t GetLogicalBlockSize(struct vnode *vp); @@ -246,7 +247,7 @@ CmdDone: //******************************************************************************* OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, - off_t embeddedOffset, u_int64_t disksize, struct proc *p) + off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args) { register ExtendedVCB *vcb; struct cat_desc cndesc; @@ -254,9 +255,15 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, UInt32 blockSize; OSErr retval; - if (SWAP_BE16(vhp->signature) != kHFSPlusSigWord || - SWAP_BE16(vhp->version) != kHFSPlusVersion) - return (EINVAL); + // XXXdbg - added the kHFSJSigWord case + if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord && + SWAP_BE16(vhp->signature) != kHFSJSigWord) || + SWAP_BE16(vhp->version) != kHFSPlusVersion) { + // XXXdbg + printf("hfs: mount: sig 0x%x and version 0x%x are not HFS or HFS+.\n", + vhp->signature, vhp->version); + return (EINVAL); + } /* Block size must be at least 512 and a power of 2 */ blockSize = SWAP_BE32(vhp->blockSize); @@ -264,7 +271,7 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, return (EINVAL); /* don't mount a writable volume if its dirty, it must be cleaned by fsck_hfs */ - if (hfsmp->hfs_fs_ronly == 0 && (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0) + if (hfsmp->hfs_fs_ronly == 0 && hfsmp->jnl == NULL && (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0) return (EINVAL); /* Make sure we can live with the physical block size. */ @@ -280,6 +287,13 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, vcb = HFSTOVCB(hfsmp); vcb->vcbSigWord = SWAP_BE16(vhp->signature); + + // XXXdbg - remap this in case we've mounted a dirty journaled volume + if (vcb->vcbSigWord == kHFSJSigWord) { + vcb->vcbSigWord = kHFSPlusSigWord; + } + + vcb->vcbJinfoBlock = SWAP_BE32(vhp->journalInfoBlock); vcb->vcbLsMod = to_bsd_time(SWAP_BE32(vhp->modifyDate)); vcb->vcbAtrb = (UInt16)SWAP_BE32(vhp->attributes); vcb->vcbClpSiz = SWAP_BE32(vhp->rsrcClumpSize); @@ -413,6 +427,9 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, /* mark the volume dirty (clear clean unmount bit) */ vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask; + if (hfsmp->jnl && hfsmp->hfs_fs_ronly == 0) { + hfs_flushvolumeheader(hfsmp, TRUE, TRUE); + } /* * all done with metadata files so we can unlock now... @@ -423,12 +440,46 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, /* setup private/hidden directory for unlinked files */ hfsmp->hfs_private_metadata_dir = FindMetaDataDirectory(vcb); + if (hfsmp->jnl && (hfsmp->hfs_fs_ronly == 0)) + hfs_remove_orphans(hfsmp); if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask) ) // if the disk is not write protected { MarkVCBDirty( vcb ); // mark VCB dirty so it will be written } + + // + // Check if we need to do late journal initialization. This only + // happens if a previous version of MacOS X (or 9) touched the disk. + // In that case hfs_late_journal_init() will go re-locate the journal + // and journal_info_block files and validate that they're still kosher. 
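+	// (The trigger: the volume's journaled bit is set but its
+	// lastMountedVersion is not kHFSJMountVersion, i.e. the last
+	// mounter was not journaling-aware.)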
+ // + if ( (vcb->vcbAtrb & kHFSVolumeJournaledMask) + && (SWAP_BE32(vhp->lastMountedVersion) != kHFSJMountVersion) + && (hfsmp->jnl == NULL)) { + + retval = hfs_late_journal_init(hfsmp, vhp, args); + if (retval != 0) { + hfsmp->jnl = NULL; + goto ErrorExit; + } else if (hfsmp->jnl) { + hfsmp->hfs_mp->mnt_flag |= MNT_JOURNALED; + } + } else if (hfsmp->jnl) { + struct cat_attr jinfo_attr, jnl_attr; + + // if we're here we need to fill in the fileid's for the + // journal and journal_info_block. + hfsmp->hfs_jnlinfoblkid = GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jinfo_attr, NULL); + hfsmp->hfs_jnlfileid = GetFileInfo(vcb, kRootDirID, ".journal", &jnl_attr, NULL); + if (hfsmp->hfs_jnlinfoblkid == 0 || hfsmp->hfs_jnlfileid == 0) { + printf("hfs: danger! couldn't find the file-id's for the journal or journal_info_block\n"); + printf("hfs: jnlfileid %d, jnlinfoblkid %d\n", hfsmp->hfs_jnlfileid, hfsmp->hfs_jnlinfoblkid); + } + } + + return (0); ErrorExit: @@ -759,13 +810,28 @@ FindMetaDataDirectory(ExtendedVCB *vcb) fndrinfo->frLocation.h = SWAP_BE16 (22460); fndrinfo->frFlags |= SWAP_BE16 (kIsInvisible + kNameLocked); + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + hfs_global_shared_lock_release(hfsmp); + return (0); + } + } + error = cat_create(hfsmp, &hfsmp->hfs_privdir_desc, &hfsmp->hfs_privdir_attr, &out_desc); /* Unlock catalog b-tree */ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc()); - if (error) - return (0); + if (error) { + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + + return (0); + } hfsmp->hfs_privdir_desc.cd_hint = out_desc.cd_hint; hfsmp->hfs_privdir_desc.cd_cnid = out_desc.cd_cnid; @@ -783,11 +849,209 @@ FindMetaDataDirectory(ExtendedVCB *vcb) vput(dvp); } hfs_volupdate(hfsmp, VOL_MKDIR, 1); + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); + cat_releasedesc(&out_desc); return (out_desc.cd_cnid); } +__private_extern__ +u_long +GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, char *name, + struct cat_attr *fattr, struct cat_fork *forkinfo) +{ + struct hfsmount * hfsmp; + struct vnode * dvp = NULL; + struct cnode * dcp = NULL; + struct FndrDirInfo * fndrinfo; + struct cat_desc jdesc; + struct timeval tv; + int error; + + if (vcb->vcbSigWord != kHFSPlusSigWord) + return (0); + + hfsmp = VCBTOHFS(vcb); + + memset(&jdesc, 0, sizeof(struct cat_desc)); + jdesc.cd_parentcnid = kRootDirID; + jdesc.cd_nameptr = name; + jdesc.cd_namelen = strlen(name); + + /* Lock catalog b-tree */ + error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, current_proc()); + if (error) + return (0); + + error = cat_lookup(hfsmp, &jdesc, 0, NULL, fattr, forkinfo); + + (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc()); + + if (error == 0) { + return (fattr->ca_fileid); + } else if (hfsmp->hfs_fs_ronly) { + return (0); + } +} + + +/* + * On Journaled HFS, there can be orphaned files. These + * are files that were unlinked while busy. If the volume + * was not cleanly unmounted then some of these files may + * have persisted and need to be removed. 
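+ * Such orphans live in the private metadata directory under names
+ * built from the delete prefix plus the file's cnid in decimal (see
+ * the sprintf of HFS_DELETE_PREFIX below), which is what we scan for.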
+ */ +__private_extern__ +void +hfs_remove_orphans(struct hfsmount * hfsmp) +{ + struct BTreeIterator * iterator = NULL; + struct FSBufferDescriptor btdata; + struct HFSPlusCatalogFile filerec; + struct HFSPlusCatalogKey * keyp; + FCB *fcb; + ExtendedVCB *vcb; + char filename[32]; + char tempname[32]; + size_t namelen; + int catlock = 0; + int result, started_tr = 0; + + if (hfsmp->hfs_orphans_cleaned) + return; + + vcb = HFSTOVCB(hfsmp); + fcb = VTOF(vcb->catalogRefNum); + + btdata.bufferAddress = &filerec; + btdata.itemSize = sizeof(filerec); + btdata.itemCount = 1; + + MALLOC(iterator, struct BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + bzero(iterator, sizeof(*iterator)); + keyp = (HFSPlusCatalogKey*)&iterator->key; + keyp->parentID = hfsmp->hfs_private_metadata_dir; + + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if (journal_start_transaction(hfsmp->jnl) != 0) { + hfs_global_shared_lock_release(hfsmp); + return; + } + started_tr = 1; + } + + /* Lock catalog b-tree */ + result = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, current_proc()); + if (result) + goto exit; + catlock = 1; + + /* + * Position the iterator at the folder thread record. + * (i.e. one record before first child) + */ + result = BTSearchRecord(fcb, iterator, NULL, NULL, iterator); + if (result) + goto exit; + + /* Visit all the children in the HFS+ private directory. */ + for (;;) { + result = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); + if (result) + break; + if (keyp->parentID != hfsmp->hfs_private_metadata_dir) + break; + if (filerec.recordType != kHFSPlusFileRecord) + continue; + + (void) utf8_encodestr(keyp->nodeName.unicode, keyp->nodeName.length * 2, + filename, &namelen, sizeof(filename), 0, 0); + + (void) sprintf(tempname, "%s%d", HFS_DELETE_PREFIX, filerec.fileID); + + /* + * Delete all files named "tempxxx", where + * xxx is the file's cnid in decimal. + * + * Delete all files named "iNodexxx", that + * have a link count of zero. 
+ */
+	if (bcmp(tempname, filename, namelen) == 0) {
+	    struct filefork fork = {0};
+	    struct cnode cnode = {0};
+
+	    // XXXdebug
+	    //printf("hfs_remove_orphans: removing %s\n", filename);
+
+	    /* Build a fake cnode */
+	    cnode.c_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir;
+	    cnode.c_desc.cd_nameptr = filename;
+	    cnode.c_desc.cd_namelen = namelen;
+	    cnode.c_desc.cd_cnid = filerec.fileID;
+	    cnode.c_attr.ca_fileid = filerec.fileID;
+	    cnode.c_blocks = filerec.dataFork.totalBlocks +
+	        filerec.resourceFork.totalBlocks;
+
+	    /* Position iterator at previous entry */
+	    if (BTIterateRecord(fcb, kBTreePrevRecord, iterator,
+			NULL, NULL) != 0)
+		break;
+
+	    /* Truncate the file to zero (both forks) */
+	    if (filerec.dataFork.totalBlocks > 0) {
+		fork.ff_cp = &cnode;
+		cnode.c_datafork = &fork;
+		bcopy(&filerec.dataFork, &fork.ff_data, sizeof(struct cat_fork));
+		if (TruncateFileC(vcb, (FCB*)&fork, 0, false) != 0) {
+		    printf("error truncating data fork!\n");
+		    break;
+		}
+	    }
+	    if (filerec.resourceFork.totalBlocks > 0) {
+		fork.ff_cp = &cnode;
+		cnode.c_datafork = NULL;
+		cnode.c_rsrcfork = &fork;
+		bcopy(&filerec.resourceFork, &fork.ff_data, sizeof(struct cat_fork));
+		if (TruncateFileC(vcb, (FCB*)&fork, 0, false) != 0) {
+		    printf("error truncating rsrc fork!\n");
+		    break;
+		}
+	    }
+
+	    /* Remove the file record from the Catalog */
+	    if (cat_delete(hfsmp, &cnode.c_desc, &cnode.c_attr) != 0) {
+		printf("error deleting cat rec!\n");
+		break;
+	    }
+
+	    /* Update parent and volume counts */
+	    hfsmp->hfs_privdir_attr.ca_entries--;
+	    (void)cat_update(hfsmp, &hfsmp->hfs_privdir_desc,
+			&hfsmp->hfs_privdir_attr, NULL, NULL);
+	    hfs_volupdate(hfsmp, VOL_RMFILE, 0);
+	}
+    }
+
+exit:
+	/* Unlock catalog b-tree */
+	if (catlock)
+		(void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc());
+
+	if (started_tr) {
+		journal_end_transaction(hfsmp->jnl);
+	}
+	hfs_global_shared_lock_release(hfsmp);
+
+	FREE(iterator, M_TEMP);
+	hfsmp->hfs_orphans_cleaned = 1;
+}
+
 /*
  * This will return the correct logical block size for a given vnode.
@@ -860,12 +1124,14 @@ short MacToVFSError(OSErr err) switch (err) { case dskFulErr: /* -34 */ - case btNoSpaceAvail: /* -32733 */ + return ENOSPC; + case btNoSpaceAvail: /* -32733 */ + return EFBIG; case fxOvFlErr: /* -32750 */ - return ENOSPC; /* +28 */ + return EOVERFLOW; case btBadNode: /* -32731 */ - return EIO; /* +5 */ + return EBADF; case memFullErr: /* -108 */ return ENOMEM; /* +12 */ @@ -885,7 +1151,7 @@ short MacToVFSError(OSErr err) return EISDIR; /* 21 */ case fxRangeErr: /* -32751 */ - return EIO; /* 5 */ + return ERANGE; case bdNamErr: /* -37 */ return ENAMETOOLONG; /* 63 */ @@ -995,4 +1261,299 @@ hfs_relnamehints(struct cnode *dcp) } +__private_extern__ +int +hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, + void *_args, int embeddedOffset, int mdb_offset, + HFSMasterDirectoryBlock *mdbp, struct ucred *cred) +{ + JournalInfoBlock *jibp; + struct buf *jinfo_bp, *bp; + int sectors_per_fsblock, arg_flags=0, arg_tbufsz=0; + int retval, blksize = hfsmp->hfs_phys_block_size; + struct vnode *devvp; + struct hfs_mount_args *args = _args; + + devvp = hfsmp->hfs_devvp; + + if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS)) { + arg_flags = args->journal_flags; + arg_tbufsz = args->journal_tbuffer_size; + } + + sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / blksize; + + retval = meta_bread(devvp, + embeddedOffset/blksize + + (SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock), + SWAP_BE32(vhp->blockSize), cred, &jinfo_bp); + if (retval) + return retval; + + jibp = (JournalInfoBlock *)jinfo_bp->b_data; + jibp->flags = SWAP_BE32(jibp->flags); + jibp->offset = SWAP_BE64(jibp->offset); + jibp->size = SWAP_BE64(jibp->size); + + if (jibp->flags & kJIJournalInFSMask) { + hfsmp->jvp = hfsmp->hfs_devvp; + } else { + printf("hfs: journal not stored in fs! don't know what to do.\n"); + brelse(jinfo_bp); + return EINVAL; + } + + // save this off for the hack-y check in hfs_remove() + hfsmp->jnl_start = jibp->offset / SWAP_BE32(vhp->blockSize); + + if (jibp->flags & kJIJournalNeedInitMask) { + printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", + jibp->offset + (off_t)embeddedOffset, jibp->size); + hfsmp->jnl = journal_create(hfsmp->jvp, + jibp->offset + (off_t)embeddedOffset, + jibp->size, + devvp, + blksize, + arg_flags, + arg_tbufsz, + hfs_sync_metadata, hfsmp->hfs_mp); + + // no need to start a transaction here... if this were to fail + // we'd just re-init it on the next mount. + jibp->flags &= ~kJIJournalNeedInitMask; + jibp->flags = SWAP_BE32(jibp->flags); + bwrite(jinfo_bp); + jinfo_bp = NULL; + jibp = NULL; + } else { + //printf("hfs: Opening the journal (joffset 0x%llx sz 0x%llx vhp_blksize %d)...\n", + // jibp->offset + (off_t)embeddedOffset, + // jibp->size, SWAP_BE32(vhp->blockSize)); + + hfsmp->jnl = journal_open(hfsmp->jvp, + jibp->offset + (off_t)embeddedOffset, + jibp->size, + devvp, + blksize, + arg_flags, + arg_tbufsz, + hfs_sync_metadata, hfsmp->hfs_mp); + + brelse(jinfo_bp); + jinfo_bp = NULL; + jibp = NULL; + + if (hfsmp->jnl && mdbp) { + // reload the mdb because it could have changed + // if the journal had to be replayed. 
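+			// (journal replay writes committed metadata straight
+			// to disk, and the volume header can be among those
+			// blocks.)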
+ retval = meta_bread(devvp, mdb_offset, blksize, cred, &bp); + if (retval) { + brelse(bp); + printf("hfs: failed to reload the mdb after opening the journal (retval %d)!\n", + retval); + return retval; + } + bcopy(bp->b_data + HFS_PRI_OFFSET(blksize), mdbp, 512); + brelse(bp); + bp = NULL; + } + } + + + //printf("journal @ 0x%x\n", hfsmp->jnl); + + // if we expected the journal to be there and we couldn't + // create it or open it then we have to bail out. + if (hfsmp->jnl == NULL) { + hfsmp->jnl_start = 0; + + printf("hfs: failed to open/create the journal (retval %d).\n", retval); + return EINVAL; + } + return 0; +} + + +// +// This function will go and re-locate the .journal_info_block and +// the .journal files in case they moved (which can happen if you +// run Norton SpeedDisk). If we fail to find either file we just +// disable journaling for this volume and return. We turn off the +// journaling bit in the vcb and assume it will get written to disk +// later (if it doesn't on the next mount we'd do the same thing +// again which is harmless). If we disable journaling we don't +// return an error so that the volume is still mountable. +// +// If the info we find for the .journal_info_block and .journal files +// isn't what we had stored, we re-set our cached info and proceed +// with opening the journal normally. +// +static int +hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_args) +{ + JournalInfoBlock *jibp; + struct buf *jinfo_bp, *bp; + int sectors_per_fsblock, arg_flags=0, arg_tbufsz=0; + int retval, need_flush = 0, write_jibp = 0; + struct vnode *devvp; + struct cat_attr jib_attr, jattr; + struct cat_fork jib_fork, jfork; + ExtendedVCB *vcb; + u_long fid; + struct hfs_mount_args *args = _args; + + devvp = hfsmp->hfs_devvp; + vcb = HFSTOVCB(hfsmp); + + if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS)) { + if (args->journal_disable) { + return 0; + } + + arg_flags = args->journal_flags; + arg_tbufsz = args->journal_tbuffer_size; + } + + fid = GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jib_attr, &jib_fork); + if (fid == 0 || jib_fork.cf_extents[0].startBlock == 0 || jib_fork.cf_size == 0) { + printf("hfs: can't find the .journal_info_block! disabling journaling (start: %d).\n", + jib_fork.cf_extents[0].startBlock); + vcb->vcbAtrb &= ~kHFSVolumeJournaledMask; + return 0; + } + hfsmp->hfs_jnlinfoblkid = fid; + + // make sure the journal_info_block begins where we think it should. + if (SWAP_BE32(vhp->journalInfoBlock) != jib_fork.cf_extents[0].startBlock) { + printf("hfs: The journal_info_block moved (was: %d; is: %d). Fixing up\n", + SWAP_BE32(vhp->journalInfoBlock), jib_fork.cf_extents[0].startBlock); + + vcb->vcbJinfoBlock = jib_fork.cf_extents[0].startBlock; + vhp->journalInfoBlock = SWAP_BE32(jib_fork.cf_extents[0].startBlock); + } + + + sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / hfsmp->hfs_phys_block_size; + retval = meta_bread(devvp, + vcb->hfsPlusIOPosOffset / hfsmp->hfs_phys_block_size + + (SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock), + SWAP_BE32(vhp->blockSize), NOCRED, &jinfo_bp); + if (retval) { + printf("hfs: can't read journal info block. 
disabling journaling.\n"); + vcb->vcbAtrb &= ~kHFSVolumeJournaledMask; + return 0; + } + + jibp = (JournalInfoBlock *)jinfo_bp->b_data; + jibp->flags = SWAP_BE32(jibp->flags); + jibp->offset = SWAP_BE64(jibp->offset); + jibp->size = SWAP_BE64(jibp->size); + + fid = GetFileInfo(vcb, kRootDirID, ".journal", &jattr, &jfork); + if (fid == 0 || jfork.cf_extents[0].startBlock == 0 || jfork.cf_size == 0) { + printf("hfs: can't find the journal file! disabling journaling (start: %d)\n", + jfork.cf_extents[0].startBlock); + brelse(jinfo_bp); + vcb->vcbAtrb &= ~kHFSVolumeJournaledMask; + return 0; + } + hfsmp->hfs_jnlfileid = fid; + + // make sure the journal file begins where we think it should. + if ((jibp->offset / (u_int64_t)vcb->blockSize) != jfork.cf_extents[0].startBlock) { + printf("hfs: The journal file moved (was: %lld; is: %d). Fixing up\n", + (jibp->offset / (u_int64_t)vcb->blockSize), jfork.cf_extents[0].startBlock); + + jibp->offset = (u_int64_t)jfork.cf_extents[0].startBlock * (u_int64_t)vcb->blockSize; + write_jibp = 1; + } + + // check the size of the journal file. + if (jibp->size != (u_int64_t)jfork.cf_extents[0].blockCount*vcb->blockSize) { + printf("hfs: The journal file changed size! (was %lld; is %lld). Fixing up.\n", + jibp->size, (u_int64_t)jfork.cf_extents[0].blockCount*vcb->blockSize); + + jibp->size = (u_int64_t)jfork.cf_extents[0].blockCount * vcb->blockSize; + write_jibp = 1; + } + + if (jibp->flags & kJIJournalInFSMask) { + hfsmp->jvp = hfsmp->hfs_devvp; + } else { + printf("hfs: journal not stored in fs! don't know what to do.\n"); + brelse(jinfo_bp); + return EINVAL; + } + + // save this off for the hack-y check in hfs_remove() + hfsmp->jnl_start = jibp->offset / SWAP_BE32(vhp->blockSize); + + if (jibp->flags & kJIJournalNeedInitMask) { + printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n", + jibp->offset + (off_t)vcb->hfsPlusIOPosOffset, jibp->size); + hfsmp->jnl = journal_create(hfsmp->jvp, + jibp->offset + (off_t)vcb->hfsPlusIOPosOffset, + jibp->size, + devvp, + hfsmp->hfs_phys_block_size, + arg_flags, + arg_tbufsz, + hfs_sync_metadata, hfsmp->hfs_mp); + + // no need to start a transaction here... if this were to fail + // we'd just re-init it on the next mount. + jibp->flags &= ~kJIJournalNeedInitMask; + write_jibp = 1; + + } else { + // + // if we weren't the last person to mount this volume + // then we need to throw away the journal because it + // is likely that someone else mucked with the disk. + // if the journal is empty this is no big deal. if the + // disk is dirty this prevents us from replaying the + // journal over top of changes that someone else made. + // + arg_flags |= JOURNAL_RESET; + + //printf("hfs: Opening the journal (joffset 0x%llx sz 0x%llx vhp_blksize %d)...\n", + // jibp->offset + (off_t)vcb->hfsPlusIOPosOffset, + // jibp->size, SWAP_BE32(vhp->blockSize)); + + hfsmp->jnl = journal_open(hfsmp->jvp, + jibp->offset + (off_t)vcb->hfsPlusIOPosOffset, + jibp->size, + devvp, + hfsmp->hfs_phys_block_size, + arg_flags, + arg_tbufsz, + hfs_sync_metadata, hfsmp->hfs_mp); + } + + + if (write_jibp) { + jibp->flags = SWAP_BE32(jibp->flags); + jibp->offset = SWAP_BE64(jibp->offset); + jibp->size = SWAP_BE64(jibp->size); + + bwrite(jinfo_bp); + } else { + brelse(jinfo_bp); + } + jinfo_bp = NULL; + jibp = NULL; + + //printf("journal @ 0x%x\n", hfsmp->jnl); + + // if we expected the journal to be there and we couldn't + // create it or open it then we have to bail out. 
+ if (hfsmp->jnl == NULL) { + hfsmp->jnl_start = 0; + + printf("hfs: failed to open/create the journal (retval %d).\n", retval); + return EINVAL; + } + + return 0; +} diff --git a/bsd/hfs/hfs_vnops.c b/bsd/hfs/hfs_vnops.c index 19006da0e..0080c1400 100644 --- a/bsd/hfs/hfs_vnops.c +++ b/bsd/hfs/hfs_vnops.c @@ -561,6 +561,17 @@ hfs_setattr(ap) if (cp->c_flags & (IMMUTABLE | APPEND)) return (EPERM); + + // XXXdbg - don't allow modification of the journal or journal_info_block + if (VTOHFS(vp)->jnl && cp->c_datafork) { + struct HFSPlusExtentDescriptor *extd; + + extd = &cp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) { + return EPERM; + } + } + /* * Go through the fields and update iff not VNOVAL. */ @@ -649,6 +660,16 @@ hfs_chmod(vp, mode, cred, p) if (VTOVCB(vp)->vcbSigWord != kHFSPlusSigWord) return (0); + // XXXdbg - don't allow modification of the journal or journal_info_block + if (VTOHFS(vp)->jnl && cp && cp->c_datafork) { + struct HFSPlusExtentDescriptor *extd; + + extd = &cp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) { + return EPERM; + } + } + #if OVERRIDE_UNKNOWN_PERMISSIONS if (VTOVFS(vp)->mnt_flag & MNT_UNKNOWNPERMISSIONS) { return (0); @@ -915,7 +936,7 @@ hfs_exchange(ap) struct hfsmount *hfsmp = VTOHFS(from_vp); struct cat_desc tempdesc; struct cat_attr tempattr; - int error = 0; + int error = 0, started_tr = 0, grabbed_lock = 0; /* The files must be on the same volume. */ if (from_vp->v_mount != to_vp->v_mount) @@ -927,6 +948,25 @@ hfs_exchange(ap) VNODE_IS_RSRC(from_vp) || VNODE_IS_RSRC(to_vp)) return (EINVAL); + // XXXdbg - don't allow modification of the journal or journal_info_block + if (hfsmp->jnl) { + struct HFSPlusExtentDescriptor *extd; + + if (from_cp->c_datafork) { + extd = &from_cp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == VTOVCB(from_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { + return EPERM; + } + } + + if (to_cp->c_datafork) { + extd = &to_cp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == VTOVCB(to_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { + return EPERM; + } + } + } + from_rvp = from_cp->c_rsrc_vp; to_rvp = to_cp->c_rsrc_vp; @@ -952,6 +992,16 @@ hfs_exchange(ap) if (to_rvp) (void) vinvalbuf(to_rvp, V_SAVE, ap->a_cred, ap->a_p, 0, 0); + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + grabbed_lock = 1; + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + goto Err_Exit; + } + started_tr = 1; + } + /* Lock catalog b-tree */ error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, ap->a_p); if (error) goto Err_Exit; @@ -994,6 +1044,7 @@ hfs_exchange(ap) * (except the modify date) */ bcopy(&to_cp->c_desc, &from_cp->c_desc, sizeof(struct cat_desc)); + from_cp->c_hint = 0; from_cp->c_fileid = from_cp->c_cnid; from_cp->c_itime = to_cp->c_itime; @@ -1031,6 +1082,14 @@ Err_Exit: if (from_rvp) vrele(from_rvp); + // XXXdbg + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + return (error); } @@ -1046,7 +1105,6 @@ Err_Exit: IN struct proc *p; */ - static int hfs_fsync(ap) struct vop_fsync_args /* { @@ -1063,6 +1121,7 @@ hfs_fsync(ap) register struct buf *bp; struct timeval tv; struct buf *nbp; + struct hfsmount *hfsmp = VTOHFS(ap->a_vp); int s; int wait; int retry = 0; @@ -1078,8 +1137,17 @@ 
hfs_fsync(ap) * for regular files write out any clusters */ if (vp->v_flag & VSYSTEM) { - if (VTOF(vp)->fcbBTCBPtr != NULL) - BTFlushPath(VTOF(vp)); + if (VTOF(vp)->fcbBTCBPtr != NULL) { + // XXXdbg + if (hfsmp->jnl) { + if (BTIsDirty(VTOF(vp))) { + panic("hfs: system file vp 0x%x has dirty blocks (jnl 0x%x)\n", + vp, hfsmp->jnl); + } + } else { + BTFlushPath(VTOF(vp)); + } + } } else if (UBCINFOEXISTS(vp)) (void) cluster_push(vp); @@ -1139,11 +1207,27 @@ loop: if ((bp->b_flags & B_BUSY)) continue; if ((bp->b_flags & B_DELWRI) == 0) - panic("hfs_fsync: not dirty"); + panic("hfs_fsync: bp 0x% not dirty (hfsmp 0x%x)", bp, hfsmp); + // XXXdbg + if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) { + if ((bp->b_flags & B_META) == 0) { + panic("hfs: bp @ 0x%x is locked but not meta! jnl 0x%x\n", + bp, hfsmp->jnl); + } + // if journal_active() returns >= 0 then the journal is ok and we + // shouldn't do anything to this locked block (because it is part + // of a transaction). otherwise we'll just go through the normal + // code path and flush the buffer. + if (journal_active(hfsmp->jnl) >= 0) { + continue; + } + } + bremfree(bp); bp->b_flags |= B_BUSY; /* Clear B_LOCKED, should only be set on meta files */ bp->b_flags &= ~B_LOCKED; + splx(s); /* * Wait for I/O associated with indirect blocks to complete, @@ -1162,7 +1246,9 @@ loop: tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "hfs_fsync", 0); } - if (vp->v_dirtyblkhd.lh_first) { + // XXXdbg -- is checking for hfsmp->jnl == NULL the right + // thing to do? + if (hfsmp->jnl == NULL && vp->v_dirtyblkhd.lh_first) { /* still have some dirty buffers */ if (retry++ > 10) { vprint("hfs_fsync: dirty", vp); @@ -1216,6 +1302,11 @@ hfs_metasync(struct hfsmount *hfsmp, daddr_t node, struct proc *p) vp = HFSTOVCB(hfsmp)->catalogRefNum; + // XXXdbg - don't need to do this on a journaled volume + if (hfsmp->jnl) { + return 0; + } + if (hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p) != 0) return (0); @@ -1254,6 +1345,7 @@ hfs_btsync(struct vnode *vp, int sync_transaction) register struct buf *bp; struct timeval tv; struct buf *nbp; + struct hfsmount *hfsmp = VTOHFS(vp); int s; /* @@ -1267,13 +1359,30 @@ loop: if ((bp->b_flags & B_BUSY)) continue; if ((bp->b_flags & B_DELWRI) == 0) - panic("hfs_fsync: not dirty"); + panic("hfs_btsync: not dirty (bp 0x%x hfsmp 0x%x)", bp, hfsmp); + + // XXXdbg + if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) { + if ((bp->b_flags & B_META) == 0) { + panic("hfs: bp @ 0x%x is locked but not meta! jnl 0x%x\n", + bp, hfsmp->jnl); + } + // if journal_active() returns >= 0 then the journal is ok and we + // shouldn't do anything to this locked block (because it is part + // of a transaction). otherwise we'll just go through the normal + // code path and flush the buffer. + if (journal_active(hfsmp->jnl) >= 0) { + continue; + } + } + if (sync_transaction && !(bp->b_flags & B_LOCKED)) continue; bremfree(bp); bp->b_flags |= B_BUSY; bp->b_flags &= ~B_LOCKED; + splx(s); (void) bawrite(bp); @@ -1316,7 +1425,7 @@ hfs_rmdir(ap) struct cnode *dcp; struct hfsmount * hfsmp; struct timeval tv; - int error = 0; + int error = 0, started_tr = 0, grabbed_lock = 0; cp = VTOC(vp); dcp = VTOC(dvp); @@ -1327,6 +1436,17 @@ hfs_rmdir(ap) vput(vp); return (EINVAL); /* cannot remove "." */ } + + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + grabbed_lock = 1; + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + goto out; + } + started_tr = 1; + } + /* * Verify the directory is empty (and valid). * (Rmdir ".." 
won't be valid since @@ -1372,6 +1492,7 @@ hfs_rmdir(ap) dcp->c_flag |= C_CHANGE | C_UPDATE; tv = time; (void) VOP_UPDATE(dvp, &tv, &tv, 0); + hfs_volupdate(hfsmp, VOL_RMDIR, (dcp->c_cnid == kHFSRootFolderID)); cp->c_mode = 0; /* Makes the vnode go away...see inactive */ @@ -1380,6 +1501,15 @@ out: if (dvp) vput(dvp); vput(vp); + + // XXXdbg + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + return (error); } @@ -1415,6 +1545,7 @@ hfs_remove(ap) int truncated = 0; struct timeval tv; int error = 0; + int started_tr = 0, grabbed_lock = 0; /* Redirect directories to rmdir */ if (vp->v_type == VDIR) @@ -1435,7 +1566,7 @@ hfs_remove(ap) VNODE_IS_RSRC(vp)) { error = EPERM; goto out; - } + } /* * Aquire a vnode for a non-empty resource fork. @@ -1447,6 +1578,17 @@ hfs_remove(ap) goto out; } + // XXXdbg - don't allow deleting the journal or journal_info_block + if (hfsmp->jnl && cp->c_datafork) { + struct HFSPlusExtentDescriptor *extd; + + extd = &cp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { + error = EPERM; + goto out; + } + } + /* * Check if this file is being used. * @@ -1470,9 +1612,48 @@ hfs_remove(ap) goto out; } + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + grabbed_lock = 1; + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + goto out; + } + started_tr = 1; + } + /* Remove our entry from the namei cache. */ cache_purge(vp); + // XXXdbg - if we're journaled, kill any dirty symlink buffers + if (hfsmp->jnl && vp->v_type == VLNK && vp->v_dirtyblkhd.lh_first) { + struct buf *bp, *nbp; + + recheck: + for (bp=vp->v_dirtyblkhd.lh_first; bp; bp=nbp) { + nbp = bp->b_vnbufs.le_next; + + if ((bp->b_flags & B_BUSY)) { + // if it was busy, someone else must be dealing + // with it so just move on. + continue; + } + + if (!(bp->b_flags & B_META)) { + panic("hfs: symlink bp @ 0x%x is not marked meta-data!\n", bp); + } + + // if it's part of the current transaction, kill it. + if (bp->b_flags & B_LOCKED) { + bremfree(bp); + bp->b_flags |= B_BUSY; + journal_kill_block(hfsmp->jnl, bp); + goto recheck; + } + } + } + // XXXdbg + /* * Truncate any non-busy forks. Busy forks will * get trucated when their vnode goes inactive. @@ -1535,8 +1716,42 @@ hfs_remove(ap) if (error) goto out; + /* Delete the link record */ error = cat_delete(hfsmp, &desc, &cp->c_attr); + if ((error == 0) && (--cp->c_nlink < 1)) { + char inodename[32]; + char delname[32]; + struct cat_desc to_desc; + struct cat_desc from_desc; + + /* + * This is now esentially an open deleted file. + * Rename it to reflect this state which makes + * orphan file cleanup easier (see hfs_remove_orphans). + * Note: a rename failure here is not fatal. 
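+	 * The leftover entry stays in the private metadata directory,
+	 * and the orphan scan at mount time (hfs_remove_orphans) exists
+	 * to catch exactly these stragglers.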
+ */ + MAKE_INODE_NAME(inodename, cp->c_rdev); + bzero(&from_desc, sizeof(from_desc)); + from_desc.cd_nameptr = inodename; + from_desc.cd_namelen = strlen(inodename); + from_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir; + from_desc.cd_flags = 0; + from_desc.cd_cnid = cp->c_fileid; + + MAKE_DELETED_NAME(delname, cp->c_fileid); + bzero(&to_desc, sizeof(to_desc)); + to_desc.cd_nameptr = delname; + to_desc.cd_namelen = strlen(delname); + to_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir; + to_desc.cd_flags = 0; + to_desc.cd_cnid = cp->c_fileid; + + (void) cat_rename(hfsmp, &from_desc, &hfsmp->hfs_privdir_desc, + &to_desc, (struct cat_desc *)NULL); + cp->c_flag |= C_DELETED; + } + /* Unlock the Catalog */ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); @@ -1548,8 +1763,9 @@ hfs_remove(ap) goto out; cp->c_flag |= C_CHANGE; - if (--cp->c_nlink < 1) - cp->c_flag |= C_DELETED; + tv = time; + (void) VOP_UPDATE(vp, &tv, &tv, 0); + hfs_volupdate(hfsmp, VOL_RMFILE, (dcp->c_cnid == kHFSRootFolderID)); } else if (dataforkbusy || rsrcforkbusy) { @@ -1573,12 +1789,16 @@ hfs_remove(ap) /* Lock catalog b-tree */ error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); - if (error) goto out; + if (error) + goto out; error = cat_rename(hfsmp, &cp->c_desc, &todir_desc, &to_desc, (struct cat_desc *)NULL); - hfsmp->hfs_privdir_attr.ca_entries++; + // XXXdbg - only bump this count if we were successful + if (error == 0) { + hfsmp->hfs_privdir_attr.ca_entries++; + } (void)cat_update(hfsmp, &hfsmp->hfs_privdir_desc, &hfsmp->hfs_privdir_attr, NULL, NULL); @@ -1588,22 +1808,33 @@ hfs_remove(ap) cp->c_flag |= C_CHANGE | C_DELETED | C_NOEXISTS; --cp->c_nlink; + tv = time; + (void) VOP_UPDATE(vp, &tv, &tv, 0); } else /* Not busy */ { - /* Lock catalog b-tree */ - error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); - if (error) goto out; - if (vp->v_type == VDIR && cp->c_entries > 0) panic("hfs_remove: attempting to delete a non-empty directory!"); if (vp->v_type != VDIR && cp->c_blocks > 0) panic("hfs_remove: attempting to delete a non-empty file!"); + /* Lock catalog b-tree */ + error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); + if (error) + goto out; + error = cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); - if (error && truncated) - panic("hfs_remove: couldn't delete a truncated file!"); + if (error && error != ENXIO && truncated) { + if ((cp->c_datafork && cp->c_datafork->ff_data.cf_size != 0) || + (cp->c_rsrcfork && cp->c_rsrcfork->ff_data.cf_size != 0)) { + panic("hfs: remove: couldn't delete a truncated file! 
(%d, data sz %lld; rsrc sz %lld)", + error, cp->c_datafork->ff_data.cf_size, cp->c_rsrcfork->ff_data.cf_size); + } else { + printf("hfs: remove: strangely enough, deleting truncated file %s (%d) got err %d\n", + cp->c_desc.cd_nameptr, cp->c_attr.ca_fileid, error); + } + } /* Unlock the Catalog */ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); @@ -1642,10 +1873,23 @@ hfs_remove(ap) if (rvp) vrele(rvp); VOP_UNLOCK(vp, 0, p); - (void) ubc_uncache(vp); + // XXXdbg - try to prevent the lost ubc_info panic + if ((cp->c_flag & C_HARDLINK) == 0 || cp->c_nlink == 0) { + (void) ubc_uncache(vp); + } vrele(vp); vput(dvp); + + // XXXdbg + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + return (0); + out: if (rvp) vrele(rvp); @@ -1658,6 +1902,15 @@ out: } vput(vp); vput(dvp); + + // XXXdbg + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + return (error); } @@ -1736,10 +1989,20 @@ hfs_rename(ap) struct hfsmount *hfsmp; struct proc *p = fcnp->cn_proc; struct timeval tv; - int retval = 0; + int retval = 0, started_tr = 0, grabbed_lock = 0; + int fdvp_locked = 0; + int fvp_locked = 0; cnid_t oldparent = 0; cnid_t newparent = 0; + // XXXdbg + if (fvp) + hfsmp = VTOHFS(fvp); + else if (tvp) + hfsmp = VTOHFS(tvp); + else + hfsmp = NULL; + #if HFS_DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) @@ -1780,9 +2043,6 @@ hfs_rename(ap) goto abortop; } - if ((retval = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p))) - goto abortop; - /* * Make sure "from" vnode and its parent are changeable. */ @@ -1790,13 +2050,11 @@ hfs_rename(ap) fcp = VTOC(fvp); oldparent = fdcp->c_cnid; if ((fcp->c_flags & (IMMUTABLE | APPEND)) || (fdcp->c_flags & APPEND)) { - VOP_UNLOCK(fvp, 0, p); retval = EPERM; goto abortop; } if (fcp->c_parentcnid != fdcp->c_cnid) { - VOP_UNLOCK(fvp, 0, p); retval = EINVAL; goto abortop; } @@ -1812,7 +2070,6 @@ hfs_rename(ap) if (fvp == ap->a_tvp && (bcmp(fcp->c_desc.cd_nameptr, tcnp->cn_nameptr, fcp->c_desc.cd_namelen) == 0)) { - VOP_UNLOCK(fvp, 0, p); retval = 0; goto abortop; } @@ -1829,7 +2086,6 @@ hfs_rename(ap) || fdcp == fcp || (fcnp->cn_flags&ISDOTDOT) || (fcp->c_flag & C_RENAME)) { - VOP_UNLOCK(fvp, 0, p); retval = EINVAL; goto abortop; } @@ -1846,6 +2102,27 @@ hfs_rename(ap) newparent = tdcp->c_cnid; + // XXXdbg - don't allow renaming the journal or journal_info_block + if (hfsmp->jnl && fcp->c_datafork) { + struct HFSPlusExtentDescriptor *extd; + + extd = &fcp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { + retval = EPERM; + goto bad; + } + } + + if (hfsmp->jnl && tcp && tcp->c_datafork) { + struct HFSPlusExtentDescriptor *extd; + + extd = &tcp->c_datafork->ff_data.cf_extents[0]; + if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) { + retval = EPERM; + goto bad; + } + } + retval = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc); if ((fvp->v_type == VDIR) && (newparent != oldparent)) { if (retval) /* write access check above */ @@ -1853,6 +2130,42 @@ hfs_rename(ap) } retval = 0; /* Reset value from above, we dont care about it anymore */ + /* XXX + * Prevent lock heirarchy violation (deadlock): + * + * If fdvp is the parent of tdvp then we must drop + * tdvp lock before aquiring the lock for fdvp. 
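+	 * (Concretely: if fdvp is tdvp's parent we vput tdvp, take the
+	 * fdvp lock, then re-acquire tdvp, so the parent is always
+	 * locked first.)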
+ * + * XXXdbg - moved this to happen up here *before* we + * start a transaction. otherwise we can + * deadlock because the vnode layer may get + * this lock for someone else and then they'll + * never be able to start a transaction. + */ + if (newparent != oldparent) { + if (fdcp->c_cnid == tdcp->c_parentcnid) { + vput(tdvp); + vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p); + vget(tdvp, LK_EXCLUSIVE | LK_RETRY, p); + } else { + vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p); + } + } + fdvp_locked = 1; + if ((retval = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p))) + goto bad; + fvp_locked = 1; + + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + grabbed_lock = 1; + if (hfsmp->jnl) { + if ((retval = journal_start_transaction(hfsmp->jnl)) != 0) { + goto bad; + } + started_tr = 1; + } + /* * If the destination exists, then be sure its type (file or dir) * matches that of the source. And, if it is a directory make sure @@ -1904,19 +2217,9 @@ hfs_rename(ap) } - /* XXX - * Prevent lock heirarchy violation (deadlock): - * - * If fdvp is the parent of tdvp then we must drop - * tdvp lock before aquiring the lock for fdvp. - */ - if (newparent != oldparent) - vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p); - /* remove the existing entry from the namei cache: */ cache_purge(fvp); - hfsmp = VTOHFS(fvp); bzero(&from_desc, sizeof(from_desc)); from_desc.cd_nameptr = fcnp->cn_nameptr; from_desc.cd_namelen = fcnp->cn_namelen; @@ -1933,18 +2236,18 @@ hfs_rename(ap) /* Lock catalog b-tree */ retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p); if (retval) { - if (newparent != oldparent) /* unlock the lock we just got */ - VOP_UNLOCK(fdvp, 0, p); goto bad; } - retval = cat_rename(hfsmp, &from_desc, &tdcp->c_desc, - &to_desc, &out_desc); + retval = cat_rename(hfsmp, &from_desc, &tdcp->c_desc, + &to_desc, &out_desc); /* Unlock catalog b-tree */ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p); - if (newparent != oldparent) + if (newparent != oldparent) { VOP_UNLOCK(fdvp, 0, p); + fdvp_locked = 0; + } if (retval) goto bad; @@ -1965,13 +2268,19 @@ hfs_rename(ap) fdcp->c_entries--; tdcp->c_nlink++; tdcp->c_entries++; - fdcp->c_flag |= C_UPDATE; - tdcp->c_flag |= C_UPDATE; + fdcp->c_flag |= C_CHANGE | C_UPDATE; + tdcp->c_flag |= C_CHANGE | C_UPDATE; tv = time; CTIMES(fdcp, &tv, &tv); CTIMES(tdcp, &tv, &tv); tdcp->c_childhint = out_desc.cd_hint; /* Cache directory's location */ + // make sure both directories get updated on disk. + if (fdvp != tdvp) { + (void) VOP_UPDATE(fdvp, &tv, &tv, 0); + } + (void) VOP_UPDATE(tdvp, &tv, &tv, 0); + hfs_volupdate(hfsmp, fvp->v_type == VDIR ? VOL_RMDIR : VOL_RMFILE, (fdcp->c_cnid == kHFSRootFolderID)); hfs_volupdate(hfsmp, fvp->v_type == VDIR ? VOL_MKDIR : VOL_MKFILE, @@ -1980,23 +2289,52 @@ hfs_rename(ap) vput(tdvp); vrele(fdvp); vput(fvp); + + // XXXdbg + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + return (0); bad: if (fcp) fcp->c_flag &= ~C_RENAME; + + // XXXdbg make sure both directories get updated on disk. 
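+	// (VOP_UPDATE is what actually pushes the C_CHANGE/C_UPDATE state
+	// set above out to the catalog, so do it on the error path too.)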
+ if (fdvp != tdvp) { + (void) VOP_UPDATE(fdvp, &tv, &tv, 0); + } + (void) VOP_UPDATE(tdvp, &tv, &tv, 0); + if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); - vrele(fdvp); - if (VOP_ISLOCKED(fvp)) + if (fdvp_locked) + vput(fdvp); + else + vrele(fdvp); + + if (fvp_locked) vput(fvp); else vrele(fvp); + + // XXXdbg + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + } + return (retval); abortop: @@ -2011,6 +2349,7 @@ abortop: VOP_ABORTOP(fdvp, fcnp); vrele(fdvp); vrele(fvp); + return (retval); } @@ -2079,6 +2418,7 @@ hfs_symlink(ap) } */ *ap; { register struct vnode *vp, **vpp = ap->a_vpp; + struct hfsmount *hfsmp; struct filefork *fp; int len, error; struct buf *bp = NULL; @@ -2097,16 +2437,31 @@ hfs_symlink(ap) return (EINVAL); } + + hfsmp = VTOHFS(ap->a_dvp); + /* Create the vnode */ if ((error = hfs_makenode(S_IFLNK | ap->a_vap->va_mode, - ap->a_dvp, vpp, ap->a_cnp))) + ap->a_dvp, vpp, ap->a_cnp))) { return (error); + } vp = *vpp; len = strlen(ap->a_target); fp = VTOF(vp); fp->ff_clumpsize = VTOVCB(vp)->blockSize; + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + hfs_global_shared_lock_release(hfsmp); + VOP_ABORTOP(ap->a_dvp, ap->a_cnp); + vput(ap->a_dvp); + return (error); + } + } + /* Allocate space for the link */ error = VOP_TRUNCATE(vp, len, IO_NOZEROFILL, ap->a_cnp->cn_cred, ap->a_cnp->cn_proc); @@ -2116,10 +2471,21 @@ hfs_symlink(ap) /* Write the link to disk */ bp = getblk(vp, 0, roundup((int)fp->ff_size, VTOHFS(vp)->hfs_phys_block_size), 0, 0, BLK_META); + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, bp); + } bzero(bp->b_data, bp->b_bufsize); bcopy(ap->a_target, bp->b_data, len); - bawrite(bp); + if (hfsmp->jnl) { + journal_modify_block_end(hfsmp->jnl, bp); + } else { + bawrite(bp); + } out: + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); vput(vp); return (error); } @@ -2207,11 +2573,41 @@ hfs_readdir(ap) off_t off = uio->uio_offset; int retval = 0; int eofflag = 0; - + void *user_start = NULL; + int user_len; + /* We assume it's all one big buffer... */ if (uio->uio_iovcnt > 1 || uio->uio_resid < AVERAGE_HFSDIRENTRY_SIZE) return EINVAL; + // XXXdbg + // We have to lock the user's buffer here so that we won't + // fault on it after we've acquired a shared lock on the + // catalog file. The issue is that you can get a 3-way + // deadlock if someone else starts a transaction and then + // tries to lock the catalog file but can't because we're + // here and we can't service our page fault because VM is + // blocked trying to start a transaction as a result of + // trying to free up pages for our page fault. It's messy + // but it does happen on dual-procesors that are paging + // heavily (see radar 3082639 for more info). By locking + // the buffer up-front we prevent ourselves from faulting + // while holding the shared catalog file lock. + // + // Fortunately this and hfs_search() are the only two places + // currently (10/30/02) that can fault on user data with a + // shared lock on the catalog file. + // + if (hfsmp->jnl && uio->uio_segflg == UIO_USERSPACE) { + user_start = uio->uio_iov->iov_base; + user_len = uio->uio_iov->iov_len; + + if ((retval = vslock(user_start, user_len)) != 0) { + return retval; + } + } + + /* Create the entries for . and .. 
*/ if (uio->uio_offset < sizeof(rootdots)) { caddr_t dep; @@ -2297,6 +2693,10 @@ hfs_readdir(ap) } Exit:; + if (hfsmp->jnl && user_start) { + vsunlock(user_start, user_len, TRUE); + } + if (ap->a_eofflag) *ap->a_eofflag = eofflag; @@ -2359,7 +2759,9 @@ hfs_readlink(ap) } bcopy(bp->b_data, fp->ff_symlinkptr, (size_t)fp->ff_size); if (bp) { - bp->b_flags |= B_INVAL; /* data no longer needed */ + if (VTOHFS(vp)->jnl && (bp->b_flags & B_LOCKED) == 0) { + bp->b_flags |= B_INVAL; /* data no longer needed */ + } brelse(bp); } } @@ -2693,8 +3095,11 @@ hfs_update(ap) struct cat_fork *rsrcforkp = NULL; struct cat_fork datafork; int updateflag; + struct hfsmount *hfsmp; int error; + hfsmp = VTOHFS(vp); + /* XXX do we really want to clear the sytem cnode flags here???? */ if ((vp->v_flag & VSYSTEM) || (VTOVFS(vp)->mnt_flag & MNT_RDONLY) || @@ -2706,11 +3111,13 @@ hfs_update(ap) updateflag = cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE); /* Nothing to update. */ - if (updateflag == 0) + if (updateflag == 0) { return (0); + } /* HFS standard doesn't have access times. */ - if ((updateflag == C_ACCESS) && (VTOVCB(vp)->vcbSigWord == kHFSSigWord)) + if ((updateflag == C_ACCESS) && (VTOVCB(vp)->vcbSigWord == kHFSSigWord)) { return (0); + } if (updateflag & C_ACCESS) { /* * If only the access time is changing then defer @@ -2764,12 +3171,24 @@ hfs_update(ap) (dataforkp && cp->c_datafork->ff_unallocblocks) || (rsrcforkp && cp->c_rsrcfork->ff_unallocblocks)) { if (updateflag & (C_CHANGE | C_UPDATE)) - hfs_volupdate(VTOHFS(vp), VOL_UPDATE, 0); + hfs_volupdate(hfsmp, VOL_UPDATE, 0); cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_UPDATE); cp->c_flag |= C_MODIFIED; + return (0); } + + // XXXdbg + hfs_global_shared_lock_acquire(hfsmp); + if (hfsmp->jnl) { + if ((error = journal_start_transaction(hfsmp->jnl)) != 0) { + hfs_global_shared_lock_release(hfsmp); + return error; + } + } + + /* * For files with invalid ranges (holes) the on-disk * field representing the size of the file (cf_size) @@ -2786,18 +3205,29 @@ hfs_update(ap) * A shared lock is sufficient since an update doesn't change * the tree and the lock on vp protects the cnode. */ - error = hfs_metafilelocking(VTOHFS(vp), kHFSCatalogFileID, LK_SHARED, p); - if (error) + error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p); + if (error) { + if (hfsmp->jnl) { + journal_end_transaction(hfsmp->jnl); + } + hfs_global_shared_lock_release(hfsmp); return (error); + } /* XXX - waitfor is not enforced */ - error = cat_update(VTOHFS(vp), &cp->c_desc, &cp->c_attr, dataforkp, rsrcforkp); + error = cat_update(hfsmp, &cp->c_desc, &cp->c_attr, dataforkp, rsrcforkp); /* Unlock the Catalog b-tree file. 
*/
- (void) hfs_metafilelocking(VTOHFS(vp), kHFSCatalogFileID, LK_RELEASE, p);
+ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);

 if (updateflag & (C_CHANGE | C_UPDATE))
- hfs_volupdate(VTOHFS(vp), VOL_UPDATE, 0);
+ hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+
+ // XXXdbg
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);

 /* After the updates are finished, clear the flags */
 cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE | C_ATIMEMOD);
@@ -2826,7 +3256,7 @@ hfs_makenode(mode, dvp, vpp, cnp)
 struct proc *p;
 struct cat_desc in_desc, out_desc;
 struct cat_attr attr;
- int error;
+ int error, started_tr = 0, grabbed_lock = 0;
 enum vtype vnodetype;

 p = cnp->cn_proc;
@@ -2902,6 +3332,16 @@ hfs_makenode(mode, dvp, vpp, cnp)
 in_desc.cd_parentcnid = dcp->c_cnid;
 in_desc.cd_flags = S_ISDIR(mode) ? CD_ISDIR : 0;

+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ grabbed_lock = 1;
+ if (hfsmp->jnl) {
+ if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+ goto exit;
+ }
+ started_tr = 1;
+ }
+
 /* Lock catalog b-tree */
 error = hfs_metafilelocking(VTOHFS(dvp), kHFSCatalogFileID, LK_EXCLUSIVE, p);
 if (error)
@@ -2921,14 +3361,37 @@ hfs_makenode(mode, dvp, vpp, cnp)
 dcp->c_flag |= C_CHANGE | C_UPDATE;
 tv = time;
 (void) VOP_UPDATE(dvp, &tv, &tv, 0);
+
 hfs_volupdate(hfsmp, vnodetype == VDIR ? VOL_MKDIR : VOL_MKFILE,
 (dcp->c_cnid == kHFSRootFolderID));

+ // XXXdbg
+ // have to end the transaction here before we call hfs_getnewvnode()
+ // because that can cause us to try and reclaim a vnode on a different
+ // file system which could cause us to start a transaction which can
+ // deadlock with someone on that other file system (since we could be
+ // holding two transaction locks as well as various vnodes and we did
+ // not obtain the locks on them in the proper order).
+ //
+ // NOTE: this means that if the quota check fails or we have to update
+ // the change time on a block-special device, those changes
+ // will happen as part of independent transactions.
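+ // A crash that lands between those transactions can therefore
+ // leave the newly created entry committed without its quota or
+ // ctime follow-up: each transaction replays atomically on its
+ // own, but the pair does not.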
+ // + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + started_tr = 0; + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + grabbed_lock = 0; + } + /* Create a vnode for the object just created: */ error = hfs_getnewvnode(hfsmp, NULL, &out_desc, 0, &attr, NULL, &tvp); if (error) goto exit; + #if QUOTA cp = VTOC(tvp); /* @@ -2945,6 +3408,7 @@ hfs_makenode(mode, dvp, vpp, cnp) VOP_RMDIR(dvp,tvp, cnp); else VOP_REMOVE(dvp,tvp, cnp); + return (error); } #endif /* QUOTA */ @@ -2960,8 +3424,8 @@ hfs_makenode(mode, dvp, vpp, cnp) tvp->v_type = IFTOVT(mode); cp->c_flag |= C_CHANGE; tv = time; - if ((error = VOP_UPDATE(tvp, &tv, &tv, 1))) { - vput(tvp); + if ((error = VOP_UPDATE(tvp, &tv, &tv, 1))) { + vput(tvp); goto exit; } } @@ -2974,6 +3438,16 @@ exit: FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI); vput(dvp); + // XXXdbg + if (started_tr) { + journal_end_transaction(hfsmp->jnl); + started_tr = 0; + } + if (grabbed_lock) { + hfs_global_shared_lock_release(hfsmp); + grabbed_lock = 0; + } + return (error); } diff --git a/bsd/hfs/hfscommon/BTree/BTree.c b/bsd/hfs/hfscommon/BTree/BTree.c index 12c2680af..65c12839f 100644 --- a/bsd/hfs/hfscommon/BTree/BTree.c +++ b/bsd/hfs/hfscommon/BTree/BTree.c @@ -339,6 +339,20 @@ OSStatus BTOpenPath (FCB *filePtr, err = ReleaseNode (btreePtr, &nodeRec); M_ExitOnError (err); + /* + * Under Mac OS, b-tree nodes can be non-contiguous on disk when the + * allocation block size is smaller than the b-tree node size. + * + * If journaling is turned on for this volume we can't deal with this + * situation and so we bail out. If journaling isn't on it's ok as + * hfs_strategy_fragmented() deals with it. Journaling can't support + * this because it assumes that if you give it a block that it's + * contiguous on disk. + */ + if ( FCBTOHFS(filePtr)->jnl && !NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize) ) { + return fsBTInvalidNodeErr; + } + //////////////////////////////// Success //////////////////////////////////// //€€ align LEOF to multiple of node size? 
- just on close @@ -456,6 +470,9 @@ OSStatus BTSearchRecord (FCB *filePtr, if (filePtr == nil) return paramErr; if (searchIterator == nil) return paramErr; + node.buffer = nil; + node.blockHeader = nil; + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; if (btreePtr == nil) return fsBTInvalidFileErr; @@ -629,9 +646,12 @@ OSStatus BTIterateRecord (FCB *filePtr, ////////////////////////// Priliminary Checks /////////////////////////////// - left.buffer = nil; - right.buffer = nil; - node.buffer = nil; + left.buffer = nil; + left.blockHeader = nil; + right.buffer = nil; + right.blockHeader = nil; + node.buffer = nil; + node.blockHeader = nil; if (filePtr == nil) @@ -928,9 +948,12 @@ BTIterateRecords(FCB *filePtr, BTreeIterationOperation operation, BTreeIterator ////////////////////////// Priliminary Checks /////////////////////////////// - left.buffer = nil; - right.buffer = nil; - node.buffer = nil; + left.buffer = nil; + left.blockHeader = nil; + right.buffer = nil; + right.blockHeader = nil; + node.buffer = nil; + node.blockHeader = nil; btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; @@ -1201,10 +1224,10 @@ OSStatus BTInsertRecord (FCB *filePtr, UInt16 index; Boolean recordFit; - ////////////////////////// Priliminary Checks /////////////////////////////// nodeRec.buffer = nil; // so we can call ReleaseNode + nodeRec.blockHeader = nil; err = CheckInsertParams (filePtr, iterator, record, recordLen); if (err != noErr) @@ -1241,6 +1264,9 @@ OSStatus BTInsertRecord (FCB *filePtr, err = GetNewNode (btreePtr, insertNodeNum, &nodeRec); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + ((NodeDescPtr)nodeRec.buffer)->kind = kBTLeafNode; ((NodeDescPtr)nodeRec.buffer)->height = 1; @@ -1261,6 +1287,7 @@ OSStatus BTInsertRecord (FCB *filePtr, btreePtr->rootNode = insertNodeNum; btreePtr->firstLeafNode = insertNodeNum; btreePtr->lastLeafNode = insertNodeNum; + M_BTreeHeaderDirty (btreePtr); goto Success; @@ -1270,6 +1297,9 @@ OSStatus BTInsertRecord (FCB *filePtr, if (index > 0) { + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + recordFit = InsertKeyRecord (btreePtr, nodeRec.buffer, index, &iterator->key, KeyLength(btreePtr, &iterator->key), record->bufferAddress, recordLen); @@ -1308,7 +1338,7 @@ Success: ++btreePtr->writeCount; ++btreePtr->leafRecords; M_BTreeHeaderDirty (btreePtr); - + // create hint iterator->hint.writeCount = btreePtr->writeCount; iterator->hint.nodeNum = insertNodeNum; @@ -1359,6 +1389,7 @@ OSStatus BTReplaceRecord (FCB *filePtr, ////////////////////////// Priliminary Checks /////////////////////////////// nodeRec.buffer = nil; // so we can call ReleaseNode + nodeRec.blockHeader = nil; err = CheckInsertParams (filePtr, iterator, record, recordLen); if (err != noErr) @@ -1380,6 +1411,9 @@ OSStatus BTReplaceRecord (FCB *filePtr, err = GetNode (btreePtr, insertNodeNum, &nodeRec); if( err == noErr ) { + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + err = TrySimpleReplace (btreePtr, nodeRec.buffer, iterator, record, recordLen, &recordFit); M_ExitOnError (err); @@ -1415,6 +1449,9 @@ OSStatus BTReplaceRecord (FCB *filePtr, // optimization - if simple replace will work then don't extend btree // €€ if we tried this before, and failed because it wouldn't fit then we shouldn't try this again... 
+ // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + err = TrySimpleReplace (btreePtr, nodeRec.buffer, iterator, record, recordLen, &recordFit); M_ExitOnError (err); @@ -1441,6 +1478,9 @@ OSStatus BTReplaceRecord (FCB *filePtr, } + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + DeleteRecord (btreePtr, nodeRec.buffer, index); // delete existing key/record err = InsertTree (btreePtr, treePathTable, &iterator->key, record->bufferAddress, @@ -1498,6 +1538,7 @@ BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator, ////////////////////////// Priliminary Checks /////////////////////////////// nodeRec.buffer = nil; // so we can call ReleaseNode + nodeRec.blockHeader = nil; btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; @@ -1521,6 +1562,9 @@ BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator, err = GetRecordByIndex(btreePtr, nodeRec.buffer, index, &keyPtr, &recordPtr, &recordLen); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + err = callBackProc(keyPtr, recordPtr, recordLen, callBackState); M_ExitOnError (err); @@ -1553,6 +1597,9 @@ BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator, err = GetRecordByIndex(btreePtr, nodeRec.buffer, index, &keyPtr, &recordPtr, &recordLen); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &nodeRec); + err = callBackProc(keyPtr, recordPtr, recordLen, callBackState); M_ExitOnError (err); @@ -1600,6 +1647,7 @@ OSStatus BTDeleteRecord (FCB *filePtr, ////////////////////////// Priliminary Checks /////////////////////////////// nodeRec.buffer = nil; // so we can call ReleaseNode + nodeRec.blockHeader = nil; M_ReturnErrorIf (filePtr == nil, paramErr); M_ReturnErrorIf (iterator == nil, paramErr); @@ -1630,7 +1678,7 @@ OSStatus BTDeleteRecord (FCB *filePtr, ++btreePtr->writeCount; --btreePtr->leafRecords; M_BTreeHeaderDirty (btreePtr); - + iterator->hint.nodeNum = 0; return noErr; @@ -1682,7 +1730,16 @@ OSStatus BTGetInformation (FCB *filePtr, return noErr; } +// XXXdbg +__private_extern__ +OSStatus +BTIsDirty(FCB *filePtr) +{ + BTreeControlBlockPtr btreePtr; + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + return TreeIsDirty(btreePtr); +} /*------------------------------------------------------------------------------- Routine: BTFlushPath - Flush BTreeControlBlock to Header Node. @@ -1743,6 +1800,9 @@ BTReloadData(FCB *filePtr) BTHeaderRec *header; + node.buffer = nil; + node.blockHeader = nil; + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; if (btreePtr == nil) return (fsBTInvalidFileErr); @@ -1877,3 +1937,62 @@ OSStatus BTSetLastSync (FCB *filePtr, } +/*------------------------------------------------------------------------------- +Routine: BTCheckFreeSpace + +Function: Makes sure there is enough free space so that a tree operation + will succeed. + +Input: fcb - pointer file control block + +Output: none + +Result: noErr - success + +-------------------------------------------------------------------------------*/ + +__private_extern__ +OSStatus BTCheckFreeSpace (FCB *filePtr) +{ + BTreeControlBlockPtr btreePtr; + int nodesNeeded, err = noErr; + + + M_ReturnErrorIf (filePtr == nil, paramErr); + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); + + M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); + + // XXXdbg this is highly conservative but so much better than + // winding up with turds on your disk. 
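+ // (an insert can split at most one node per level plus add a
+ // new root, so treeDepth + 1 already covers the worst case;
+ // the factor of 10 is slack. e.g. a tree of depth 3
+ // preflights 40 free nodes.)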
+ // + nodesNeeded = (btreePtr->treeDepth + 1) * 10; + + if (btreePtr->freeNodes < nodesNeeded) { + err = ExtendBTree(btreePtr, nodesNeeded + btreePtr->totalNodes - btreePtr->freeNodes); + } + + return err; +} + + +__private_extern__ +OSStatus BTHasContiguousNodes (FCB *filePtr) +{ + BTreeControlBlockPtr btreePtr; + int nodesNeeded, err = noErr; + + + M_ReturnErrorIf (filePtr == nil, paramErr); + + btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr; + + REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true); + + M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr); + + return NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize); +} diff --git a/bsd/hfs/hfscommon/BTree/BTreeAllocate.c b/bsd/hfs/hfscommon/BTree/BTreeAllocate.c index 60cfa0635..a902d5087 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeAllocate.c +++ b/bsd/hfs/hfscommon/BTree/BTreeAllocate.c @@ -125,12 +125,16 @@ OSStatus AllocateNode (BTreeControlBlockPtr btreePtr, UInt32 *nodeNum) nodeNumber = 0; // first node number of header map record node.buffer = nil; // clear node.buffer to get header node // - and for ErrorExit + node.blockHeader = nil; while (true) { err = GetMapNode (btreePtr, &node, &mapPtr, &mapSize); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &node); + //////////////////////// Find Word with Free Bit //////////////////////////// pos = mapPtr; @@ -233,6 +237,7 @@ OSStatus FreeNode (BTreeControlBlockPtr btreePtr, UInt32 nodeNum) //////////////////////////// Find Map Record //////////////////////////////// nodeIndex = 0; // first node number of header map record node.buffer = nil; // invalidate node.buffer to get header node + node.blockHeader = nil; while (nodeNum >= nodeIndex) { @@ -244,6 +249,9 @@ OSStatus FreeNode (BTreeControlBlockPtr btreePtr, UInt32 nodeNum) //////////////////////////// Mark Node Free ///////////////////////////////// + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &node); + nodeNum -= (nodeIndex - (mapSize << 3)); // relative to this map record bitOffset = 15 - (nodeNum & 0x0000000F); // last 4 bits are bit offset mapPos += nodeNum >> 4; // point to word containing map bit @@ -319,7 +327,9 @@ OSStatus ExtendBTree (BTreeControlBlockPtr btreePtr, filePtr = GetFileControlBlock(btreePtr->fileRefNum); mapNode.buffer = nil; + mapNode.blockHeader = nil; newNode.buffer = nil; + newNode.blockHeader = nil; mapNodeRecSize = nodeSize - sizeof(BTNodeDescriptor) - 6; // 2 bytes of free space (see note) @@ -379,6 +389,8 @@ OSStatus ExtendBTree (BTreeControlBlockPtr btreePtr, /////////////////////// Initialize New Map Nodes //////////////////////////// + // XXXdbg - this is the correct place for this: + ModifyBlockStart(btreePtr->fileRefNum, &mapNode); ((BTNodeDescriptor*)mapNode.buffer)->fLink = firstNewMapNodeNum; @@ -388,6 +400,9 @@ OSStatus ExtendBTree (BTreeControlBlockPtr btreePtr, err = GetNewNode (btreePtr, nodeNum, &newNode); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &newNode); + ((NodeDescPtr)newNode.buffer)->numRecords = 1; ((NodeDescPtr)newNode.buffer)->kind = kBTMapNode; @@ -428,6 +443,9 @@ OSStatus ExtendBTree (BTreeControlBlockPtr btreePtr, err = GetNode (btreePtr, nextNodeNum, &mapNode); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &mapNode); + mapIndex = 0; mapStart = (UInt16 *) GetRecordAddress (btreePtr, mapNode.buffer, mapIndex); @@ -476,7 +494,7 @@ Success: ////////////////////////////// Error Exit /////////////////////////////////// ErrorExit: - + (void) ReleaseNode (btreePtr, &mapNode); 
(void) ReleaseNode (btreePtr, &newNode); diff --git a/bsd/hfs/hfscommon/BTree/BTreeMiscOps.c b/bsd/hfs/hfscommon/BTree/BTreeMiscOps.c index c71fab021..7d56bf4f8 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeMiscOps.c +++ b/bsd/hfs/hfscommon/BTree/BTreeMiscOps.c @@ -209,6 +209,14 @@ OSStatus VerifyHeader (FCB *filePtr, +__private_extern__ +OSStatus TreeIsDirty(BTreeControlBlockPtr btreePtr) +{ + return (btreePtr->flags & kBTHeaderDirty); +} + + + /*------------------------------------------------------------------------------- Routine: UpdateHeader - Write BTreeInfoRec fields to Header node. @@ -229,15 +237,18 @@ OSStatus UpdateHeader(BTreeControlBlockPtr btreePtr, Boolean forceWrite) BTHeaderRec *header; UInt32 options; - if ((btreePtr->flags & kBTHeaderDirty) == 0) // btree info already flushed return noErr; err = GetNode (btreePtr, kHeaderNodeNum, &node ); - if (err != noErr) + if (err != noErr) { return err; + } + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &node); + header = (BTHeaderRec*) ((char *)node.buffer + sizeof(BTNodeDescriptor)); header->treeDepth = btreePtr->treeDepth; @@ -315,8 +326,11 @@ OSStatus FindIteratorPosition (BTreeControlBlockPtr btreePtr, // assume foundRecord points to Boolean left->buffer = nil; + left->blockHeader = nil; middle->buffer = nil; + middle->blockHeader = nil; right->buffer = nil; + right->blockHeader = nil; foundIt = false; diff --git a/bsd/hfs/hfscommon/BTree/BTreeScanner.c b/bsd/hfs/hfscommon/BTree/BTreeScanner.c index 014069807..8cc50aaa1 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeScanner.c +++ b/bsd/hfs/hfscommon/BTree/BTreeScanner.c @@ -221,7 +221,7 @@ static int ReadMultipleNodes( BTScanState *theScanStatePtr ) // release old buffer if we have one if ( theScanStatePtr->bufferPtr != NULL ) { - theScanStatePtr->bufferPtr->b_flags |= (B_INVAL | B_AGE); + theScanStatePtr->bufferPtr->b_flags |= (B_INVAL | B_AGE); brelse( theScanStatePtr->bufferPtr ); theScanStatePtr->bufferPtr = NULL; theScanStatePtr->currentNodePtr = NULL; @@ -249,10 +249,10 @@ static int ReadMultipleNodes( BTScanState *theScanStatePtr ) // now read blocks from the device myErr = bread( myDevPtr, - myPhyBlockNum, - myBufferSize, - NOCRED, - &theScanStatePtr->bufferPtr ); + myPhyBlockNum, + myBufferSize, + NOCRED, + &theScanStatePtr->bufferPtr ); if ( myErr != E_NONE ) { goto ExitThisRoutine; @@ -374,7 +374,7 @@ int BTScanTerminate( BTScanState * scanState, if ( scanState->bufferPtr != NULL ) { scanState->bufferPtr->b_flags |= (B_INVAL | B_AGE); - brelse( scanState->bufferPtr ); + brelse( scanState->bufferPtr ); scanState->bufferPtr = NULL; scanState->currentNodePtr = NULL; } diff --git a/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c b/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c index 2de280321..3a8463911 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c +++ b/bsd/hfs/hfscommon/BTree/BTreeTreeOps.c @@ -395,13 +395,17 @@ OSStatus InsertLevel (BTreeControlBlockPtr btreePtr, PanicIf ((level == 1) && (((NodeDescPtr)targetNode->buffer)->kind != kBTLeafNode), "\P InsertLevel: non-leaf at level 1! 
"); #endif leftNode.buffer = nil; + leftNode.blockHeader = nil; targetNodeNum = treePathTable [level].node; insertParent = false; updateParent = false; + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, targetNode); + ////// process first insert ////// - + err = InsertNode (btreePtr, primaryKey, targetNode, targetNodeNum, index, &newNodeNum, &newIndex, &leftNode, &updateParent, &insertParent, &newRoot ); M_ExitOnError (err); @@ -446,6 +450,9 @@ OSStatus InsertLevel (BTreeControlBlockPtr btreePtr, UInt8 * recPtr; UInt16 recSize; + parentNode.buffer = nil; + parentNode.blockHeader = nil; + secondaryKey = nil; PanicIf ( (level == btreePtr->treeDepth), "\p InsertLevel: unfinished insert!?"); @@ -468,6 +475,9 @@ OSStatus InsertLevel (BTreeControlBlockPtr btreePtr, if ( updateParent ) { + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &parentNode); + //€€ debug: check if ptr == targetNodeNum GetRecordByIndex (btreePtr, parentNode.buffer, index, &keyPtr, &recPtr, &recSize); PanicIf( (*(UInt32 *) recPtr) != targetNodeNum, "\p InsertLevel: parent ptr doesn't match target node!"); @@ -594,6 +604,8 @@ static OSErr InsertNode (BTreeControlBlockPtr btreePtr, { err = GetNode (btreePtr, leftNodeNum, leftNode); // will be released by caller or a split below M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, leftNode); } PanicIf ( ((NodeDescPtr) leftNode->buffer)->fLink != node, "\p InsertNode, RotateLeft: invalid sibling link!" ); @@ -642,7 +654,6 @@ static OSErr InsertNode (BTreeControlBlockPtr btreePtr, return noErr; ErrorExit: - (void) ReleaseNode (btreePtr, leftNode); return err; @@ -678,7 +689,11 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, Boolean deleteRequired; Boolean updateRequired; - + // XXXdbg - initialize these to null in case we get an + // error and try to exit before it's initialized + parentNode.buffer = nil; + parentNode.blockHeader = nil; + deleteRequired = false; updateRequired = false; @@ -686,6 +701,9 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, targetNodePtr = targetNode->buffer; PanicIf (targetNodePtr == nil, "\pDeleteTree: targetNode has nil buffer!"); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, targetNode); + DeleteRecord (btreePtr, targetNodePtr, index); //€€ coalesce remaining records? 
@@ -697,6 +715,9 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, deleteRequired = true; + siblingNode.buffer = nil; + siblingNode.blockHeader = nil; + ////////////////// Get Siblings & Update Links ////////////////////////// siblingNodeNum = targetNodePtr->bLink; // Left Sibling Node @@ -704,6 +725,10 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, { err = GetNode (btreePtr, siblingNodeNum, &siblingNode); M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &siblingNode); + ((NodeDescPtr)siblingNode.buffer)->fLink = targetNodePtr->fLink; err = UpdateNode (btreePtr, &siblingNode, 0, kLockTransaction); M_ExitOnError (err); @@ -718,6 +743,10 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, { err = GetNode (btreePtr, siblingNodeNum, &siblingNode); M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &siblingNode); + ((NodeDescPtr)siblingNode.buffer)->bLink = targetNodePtr->bLink; err = UpdateNode (btreePtr, &siblingNode, 0, kLockTransaction); M_ExitOnError (err); @@ -733,6 +762,7 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, err = UpdateNode (btreePtr, targetNode, 0, kLockTransaction); M_ExitOnError (err); + err = FreeNode (btreePtr, targetNodeNum); M_ExitOnError (err); } @@ -776,6 +806,9 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, UInt16 recSize; UInt32 insertNode; + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &parentNode); + //€€ debug: check if ptr == targetNodeNum GetRecordByIndex (btreePtr, parentNode.buffer, index, &keyPtr, &recPtr, &recSize); PanicIf( (*(UInt32 *) recPtr) != targetNodeNum, "\p DeleteTree: parent ptr doesn't match targetNodeNum!!"); @@ -805,7 +838,7 @@ OSStatus DeleteTree (BTreeControlBlockPtr btreePtr, return noErr; ErrorExit: - + (void) ReleaseNode (btreePtr, targetNode); (void) ReleaseNode (btreePtr, &parentNode); @@ -826,6 +859,9 @@ static OSStatus CollapseTree (BTreeControlBlockPtr btreePtr, originalRoot = btreePtr->rootNode; + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, blockPtr); + while (true) { if ( ((NodeDescPtr)blockPtr->buffer)->numRecords > 1) @@ -848,6 +884,9 @@ static OSStatus CollapseTree (BTreeControlBlockPtr btreePtr, //// Get New Root Node err = GetNode (btreePtr, btreePtr->rootNode, blockPtr); M_ExitOnError (err); + + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, blockPtr); } if (btreePtr->rootNode != originalRoot) @@ -1110,6 +1149,9 @@ static OSStatus SplitLeft (BTreeControlBlockPtr btreePtr, if ( left != nil ) { + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, leftNode); + left->fLink = newNodeNum; err = UpdateNode (btreePtr, leftNode, 0, kLockTransaction); M_ExitOnError (err); @@ -1121,6 +1163,9 @@ static OSStatus SplitLeft (BTreeControlBlockPtr btreePtr, err = GetNewNode (btreePtr, newNodeNum, leftNode); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, leftNode); + left = leftNode->buffer; left->fLink = rightNodeNum; @@ -1145,8 +1190,9 @@ static OSStatus SplitLeft (BTreeControlBlockPtr btreePtr, err = RotateLeft (btreePtr, left, right, index, keyPtr, recPtr, recSize, insertIndex, insertNodeNum, &recordFit, recsRotated); - M_ExitOnError (err); + M_ExitOnError (err); + return noErr; ErrorExit: @@ -1202,6 +1248,9 @@ static OSStatus AddNewRootNode (BTreeControlBlockPtr btreePtr, Boolean didItFit; UInt16 keyLength; + rootNode.buffer = nil; + rootNode.blockHeader = nil; + PanicIf (leftNode == nil, "\pAddNewRootNode: leftNode == nil"); PanicIf (rightNode == nil, "\pAddNewRootNode: rightNode == nil"); @@ -1214,6 +1263,9 @@ 
static OSStatus AddNewRootNode (BTreeControlBlockPtr btreePtr, err = GetNewNode (btreePtr, rootNum, &rootNode); M_ExitOnError (err); + // XXXdbg + ModifyBlockStart(btreePtr->fileRefNum, &rootNode); + ((NodeDescPtr)rootNode.buffer)->kind = kBTIndexNode; ((NodeDescPtr)rootNode.buffer)->height = ++btreePtr->treeDepth; diff --git a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c b/bsd/hfs/hfscommon/Catalog/FileIDsServices.c index 923e90334..44e5996a0 100644 --- a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c +++ b/bsd/hfs/hfscommon/Catalog/FileIDsServices.c @@ -65,6 +65,9 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param err = BuildCatalogKeyUTF8(vcb, destID, destName, kUndefinedStrLen, &destKey, NULL); ReturnIfError(err); + err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum)); + ReturnIfError(err); + if ( isHFSPlus ) { //-- Step 1: Check the catalog nodes for extents diff --git a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c b/bsd/hfs/hfscommon/Misc/FileExtentMapping.c index b294edd9a..6831d79c0 100644 --- a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c +++ b/bsd/hfs/hfscommon/Misc/FileExtentMapping.c @@ -495,6 +495,12 @@ static OSErr CreateExtentRecord( err = noErr; *hint = 0; + + // XXXdbg - preflight that there's enough space + err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum)); + if (err) + return err; + MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK); bzero(btIterator, sizeof(*btIterator)); @@ -530,6 +536,8 @@ static OSErr CreateExtentRecord( if (err == noErr) *hint = btIterator->hint.nodeNum; + (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum)); + FREE(btIterator, M_TEMP); return err; } @@ -545,6 +553,12 @@ OSErr DeleteExtentRecord( OSErr err; err = noErr; + + // XXXdbg - preflight that there's enough space + err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum)); + if (err) + return err; + MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK); bzero(btIterator, sizeof(*btIterator)); @@ -569,7 +583,8 @@ OSErr DeleteExtentRecord( } err = BTDeleteRecord(GetFileControlBlock(vcb->extentsRefNum), btIterator); - + (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum)); + FREE(btIterator, M_TEMP); return err; } @@ -1730,6 +1745,12 @@ static OSErr UpdateExtentRecord ( // Need to find and change a record in Extents BTree // btFCB = GetFileControlBlock(vcb->extentsRefNum); + + // XXXdbg - preflight that there's enough space + err = BTCheckFreeSpace(btFCB); + if (err) + return err; + MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK); bzero(btIterator, sizeof(*btIterator)); @@ -1757,6 +1778,7 @@ static OSErr UpdateExtentRecord ( if (err == noErr) err = BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize); + (void) BTFlushPath(btFCB); } else { // HFS Plus volume HFSPlusExtentRecord foundData; // The extent data actually found @@ -1776,6 +1798,7 @@ static OSErr UpdateExtentRecord ( BlockMoveData(extentData, &foundData, sizeof(HFSPlusExtentRecord)); err = BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize); } + (void) BTFlushPath(btFCB); } FREE(btIterator, M_TEMP); } @@ -1887,3 +1910,58 @@ static Boolean ExtentsAreIntegral( return true; } + + +//_________________________________________________________________________________ +// +// Routine: NodesAreContiguous +// +// Purpose: Ensure that all b-tree nodes are contiguous on disk +// Called by BTOpenPath during volume mount 
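+// Example: with 4K b-tree nodes on a volume using 512-byte
+// allocation blocks, mask = (4096 / 512) - 1 = 7, so every
+// extent of the file must begin and end on an 8-block boundary
+// or some node would straddle two extents.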
+//_________________________________________________________________________________ + +Boolean NodesAreContiguous( + ExtendedVCB *vcb, + FCB *fcb, + UInt32 nodeSize) +{ + UInt32 mask; + UInt32 startBlock; + UInt32 blocksChecked; + UInt32 hint; + HFSPlusExtentKey key; + HFSPlusExtentRecord extents; + OSErr result; + Boolean lastExtentReached; + + + if (vcb->blockSize >= nodeSize) + return TRUE; + + mask = (nodeSize / vcb->blockSize) - 1; + + // check the local extents + (void) GetFCBExtentRecord(fcb, extents); + if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) ) + return FALSE; + + if (lastExtentReached || (SInt64)((SInt64)blocksChecked * (SInt64)vcb->blockSize) >= fcb->ff_size) + return TRUE; + + startBlock = blocksChecked; + + // check the overflow extents (if any) + while ( !lastExtentReached ) + { + result = FindExtentRecord(vcb, kDataForkType, fcb->ff_cp->c_fileid, startBlock, FALSE, &key, extents, &hint); + if (result) break; + + if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) ) + return FALSE; + + startBlock += blocksChecked; + } + + return TRUE; +} + diff --git a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c index ae4fccf6f..4fe649921 100644 --- a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c +++ b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c @@ -476,7 +476,14 @@ static OSErr ReleaseBitmapBlock( if (bp) { if (dirty) { - bdwrite(bp); + // XXXdbg + struct hfsmount *hfsmp = VCBTOHFS(vcb); + + if (hfsmp->jnl) { + journal_modify_block_end(hfsmp->jnl, bp); + } else { + bdwrite(bp); + } } else { brelse(bp); } @@ -597,6 +604,7 @@ static OSErr BlockAllocateAny( UInt32 bitsPerBlock; UInt32 wordsPerBlock; Boolean dirty = false; + struct hfsmount *hfsmp = VCBTOHFS(vcb); // Since this routine doesn't wrap around if (maxBlocks > (endingBlock - startingBlock)) { @@ -678,6 +686,11 @@ static OSErr BlockAllocateAny( endingBlock = block + maxBlocks; // if we get this far, we've found enough } + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + // // Allocate all of the consecutive blocks // @@ -709,6 +722,11 @@ static OSErr BlockAllocateAny( if (err != noErr) goto Exit; buffer = currCache; + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + wordsLeft = wordsPerBlock; } @@ -845,6 +863,8 @@ static OSErr BlockMarkAllocated( UInt32 blockRef; UInt32 bitsPerBlock; UInt32 wordsPerBlock; + // XXXdbg + struct hfsmount *hfsmp = VCBTOHFS(vcb); // // Pre-read the bitmap block containing the first word of allocation @@ -866,6 +886,11 @@ static OSErr BlockMarkAllocated( wordsLeft = wordsPerBlock - wordIndexInBlock; } + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + // // If the first block to allocate doesn't start on a word // boundary in the bitmap, then treat that first word @@ -909,6 +934,11 @@ static OSErr BlockMarkAllocated( err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); if (err != noErr) goto Exit; + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + // Readjust currentWord and wordsLeft currentWord = buffer; wordsLeft = wordsPerBlock; @@ -942,6 +972,11 @@ static OSErr BlockMarkAllocated( err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); if (err != noErr) goto Exit; + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + // Readjust 
currentWord and wordsLeft currentWord = buffer; wordsLeft = wordsPerBlock; @@ -995,6 +1030,8 @@ static OSErr BlockMarkFree( UInt32 blockRef; UInt32 bitsPerBlock; UInt32 wordsPerBlock; + // XXXdbg + struct hfsmount *hfsmp = VCBTOHFS(vcb); // // Pre-read the bitmap block containing the first word of allocation @@ -1002,6 +1039,11 @@ static OSErr BlockMarkFree( err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); if (err != noErr) goto Exit; + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + // // Initialize currentWord, and wordsLeft. // @@ -1058,6 +1100,11 @@ static OSErr BlockMarkFree( err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); if (err != noErr) goto Exit; + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + // Readjust currentWord and wordsLeft currentWord = buffer; wordsLeft = wordsPerBlock; @@ -1092,6 +1139,11 @@ static OSErr BlockMarkFree( err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); if (err != noErr) goto Exit; + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + // Readjust currentWord and wordsLeft currentWord = buffer; wordsLeft = wordsPerBlock; diff --git a/bsd/hfs/hfscommon/headers/BTreesInternal.h b/bsd/hfs/hfscommon/headers/BTreesInternal.h index a473cfceb..4ae9e7ad3 100644 --- a/bsd/hfs/hfscommon/headers/BTreesInternal.h +++ b/bsd/hfs/hfscommon/headers/BTreesInternal.h @@ -115,7 +115,8 @@ struct BlockDescriptor{ void *blockHeader; ByteCount blockSize; Boolean blockReadFromDisk; - Byte reserved[3]; + Byte isModified; // XXXdbg - for journaling + Byte reserved[2]; }; typedef struct BlockDescriptor BlockDescriptor; typedef BlockDescriptor *BlockDescPtr; @@ -338,6 +339,10 @@ extern OSStatus BTGetLastSync (FCB *filePtr, extern OSStatus BTSetLastSync (FCB *filePtr, UInt32 lastfsync ); +extern OSStatus BTCheckFreeSpace (FCB *filePtr); + +extern OSStatus BTHasContiguousNodes(FCB *filePtr); + #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ #endif // __BTREESINTERNAL__ diff --git a/bsd/hfs/hfscommon/headers/BTreesPrivate.h b/bsd/hfs/hfscommon/headers/BTreesPrivate.h index 4721f13a5..805c86346 100644 --- a/bsd/hfs/hfscommon/headers/BTreesPrivate.h +++ b/bsd/hfs/hfscommon/headers/BTreesPrivate.h @@ -382,6 +382,10 @@ OSStatus ReleaseNode (BTreeControlBlockPtr btreePtr, OSStatus TrashNode (BTreeControlBlockPtr btreePtr, NodePtr nodePtr ); +// XXXdbg +void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr); +// XXXdbg + OSStatus UpdateNode (BTreeControlBlockPtr btreePtr, NodePtr nodePtr, UInt32 transactionID, diff --git a/bsd/kern/kern_mman.c b/bsd/kern/kern_mman.c index ed614c238..3febc75bf 100644 --- a/bsd/kern/kern_mman.c +++ b/bsd/kern/kern_mman.c @@ -1086,6 +1086,10 @@ kern_return_t map_fd_funneled( if (fp->f_type != DTYPE_VNODE) return(KERN_INVALID_ARGUMENT); + + if (!(fp->f_flag & FREAD)) + return (KERN_PROTECTION_FAILURE); + vp = (struct vnode *)fp->f_data; if (vp->v_type != VREG) diff --git a/bsd/kern/qsort.c b/bsd/kern/qsort.c index 6ccb04112..d1505f175 100644 --- a/bsd/kern/qsort.c +++ b/bsd/kern/qsort.c @@ -58,7 +58,7 @@ #include -#include +//#include static inline char *med3 __P((char *, char *, char *, int (*)())); static inline void swapfunc __P((char *, char *, int, int)); @@ -113,6 +113,7 @@ med3(a, b, c, cmp) :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? 
a : c ));
 }

+__private_extern__
 void
 qsort(a, n, es, cmp)
 void *a;
diff --git a/bsd/kern/ubc_subr.c b/bsd/kern/ubc_subr.c
index 955b6b638..47cb041ab 100644
--- a/bsd/kern/ubc_subr.c
+++ b/bsd/kern/ubc_subr.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999-2001 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1999-2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
@@ -78,6 +78,62 @@ ubc_unlock(struct vnode *vp)
 simple_unlock(&vp->v_interlock);
 }

+/*
+ * Serialize the requests to the VM
+ * Returns:
+ * 0 - Failure
+ * 1 - Successful in acquiring the lock
+ * 2 - Successful in acquiring the lock recursively
+ * do not call ubc_unbusy()
+ * [This is strange, but saves 4 bytes in struct ubc_info]
+ */
+static int
+ubc_busy(struct vnode *vp)
+{
+ register struct ubc_info *uip;
+
+ if (!UBCINFOEXISTS(vp))
+ return (0);
+
+ uip = vp->v_ubcinfo;
+
+ while (ISSET(uip->ui_flags, UI_BUSY)) {
+
+ if (uip->ui_owner == (void *)current_thread())
+ return (2);
+
+ SET(uip->ui_flags, UI_WANTED);
+ (void) tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "ubcbusy", 0);
+
+ if (!UBCINFOEXISTS(vp))
+ return (0);
+ }
+ uip->ui_owner = (void *)current_thread();
+
+ SET(uip->ui_flags, UI_BUSY);
+
+ return (1);
+}
+
+static void
+ubc_unbusy(struct vnode *vp)
+{
+ register struct ubc_info *uip;
+
+ if (!UBCINFOEXISTS(vp)) {
+ wakeup((caddr_t)&vp->v_ubcinfo);
+ return;
+ }
+ uip = vp->v_ubcinfo;
+ CLR(uip->ui_flags, UI_BUSY);
+ uip->ui_owner = (void *)NULL;
+
+ if (ISSET(uip->ui_flags, UI_WANTED)) {
+ CLR(uip->ui_flags, UI_WANTED);
+ wakeup((caddr_t)&vp->v_ubcinfo);
+ }
+}
+
 /*
 * Initialization of the zone for Unified Buffer Cache.
 */
@@ -139,6 +195,7 @@ ubc_info_init(struct vnode *vp)
 uip->ui_refcount = 1;
 uip->ui_size = 0;
 uip->ui_mapped = 0;
+ uip->ui_owner = (void *)NULL;
 ubc_lock(vp);
 }
 #if DIAGNOSTIC
@@ -232,10 +289,20 @@ ubc_info_free(struct ubc_info *uip)
 void
 ubc_info_deallocate(struct ubc_info *uip)
 {
+ assert(uip->ui_refcount > 0);

- if (uip->ui_refcount-- == 1)
+ if (uip->ui_refcount-- == 1) {
+ struct vnode *vp;
+
+ vp = uip->ui_vnode;
+ if (ISSET(uip->ui_flags, UI_WANTED)) {
+ CLR(uip->ui_flags, UI_WANTED);
+ wakeup((caddr_t)&vp->v_ubcinfo);
+ }
+
 ubc_info_free(uip);
+ }
 }

 /*
@@ -339,12 +406,16 @@ ubc_uncache(struct vnode *vp)
 {
 kern_return_t kret;
 struct ubc_info *uip;
+ int recursed;
 memory_object_control_t control;
 memory_object_perf_info_data_t perf;

 if (!UBCINFOEXISTS(vp))
 return (0);

+ if ((recursed = ubc_busy(vp)) == 0)
+ return (0);
+
 uip = vp->v_ubcinfo;

 assert(uip != UBC_INFO_NULL);
@@ -372,11 +443,15 @@ ubc_uncache(struct vnode *vp)
 if (kret != KERN_SUCCESS) {
 printf("ubc_uncache: memory_object_change_attributes_named "
 "kret = %d", kret);
+ if (recursed == 1)
+ ubc_unbusy(vp);
 return (0);
 }

 ubc_release_named(vp);

+ if (recursed == 1)
+ ubc_unbusy(vp);
 return (1);
 }

@@ -506,15 +581,16 @@ memory_object_control_t
 ubc_getobject(struct vnode *vp, int flags)
 {
 struct ubc_info *uip;
+ int recursed;
 memory_object_control_t control;

- uip = vp->v_ubcinfo;
-
 if (UBCINVALID(vp))
 return (0);

- ubc_lock(vp);
+ if ((recursed = ubc_busy(vp)) == 0)
+ return (0);

+ uip = vp->v_ubcinfo;
 control = uip->ui_control;

 if ((flags & UBC_HOLDOBJECT) && (!ISSET(uip->ui_flags, UI_HASOBJREF))) {
@@ -523,19 +599,21 @@ ubc_getobject(struct vnode *vp, int flags)
 * Take a temporary reference on the ubc info so that it won't go
 * away during our recovery attempt. 
*/ + ubc_lock(vp); uip->ui_refcount++; ubc_unlock(vp); if (memory_object_recover_named(control, TRUE) == KERN_SUCCESS) { - ubc_lock(vp); SET(uip->ui_flags, UI_HASOBJREF); - ubc_unlock(vp); } else { control = MEMORY_OBJECT_CONTROL_NULL; } + if (recursed == 1) + ubc_unbusy(vp); ubc_info_deallocate(uip); } else { - ubc_unlock(vp); + if (recursed == 1) + ubc_unbusy(vp); } return (control); @@ -770,15 +848,16 @@ int ubc_hold(struct vnode *vp) { struct ubc_info *uip; + int recursed; memory_object_control_t object; if (UBCINVALID(vp)) return (0); - if (!UBCINFOEXISTS(vp)) { + if ((recursed = ubc_busy(vp)) == 0) { /* must be invalid or dying vnode */ assert(UBCINVALID(vp) || - ((vp->v_flag & VXLOCK) || (vp->v_flag & VTERMINATE))); + ((vp->v_flag & VXLOCK) || (vp->v_flag & VTERMINATE))); return (0); } @@ -787,21 +866,23 @@ ubc_hold(struct vnode *vp) ubc_lock(vp); uip->ui_refcount++; + ubc_unlock(vp); if (!ISSET(uip->ui_flags, UI_HASOBJREF)) { - ubc_unlock(vp); - if (memory_object_recover_named(uip->ui_control, TRUE) != KERN_SUCCESS) { + if (memory_object_recover_named(uip->ui_control, TRUE) + != KERN_SUCCESS) { + if (recursed == 1) + ubc_unbusy(vp); ubc_info_deallocate(uip); return (0); } - ubc_lock(vp); SET(uip->ui_flags, UI_HASOBJREF); - ubc_unlock(vp); - } else { - ubc_unlock(vp); } + if (recursed == 1) + ubc_unbusy(vp); assert(uip->ui_refcount > 0); + return (1); } @@ -872,28 +953,30 @@ int ubc_release_named(struct vnode *vp) { struct ubc_info *uip; + int recursed; memory_object_control_t control; - kern_return_t kret; + kern_return_t kret = KERN_FAILURE; if (UBCINVALID(vp)) return (0); - if (!UBCINFOEXISTS(vp)) + if ((recursed = ubc_busy(vp)) == 0) return (0); - uip = vp->v_ubcinfo; /* can not release held or mapped vnodes */ if (ISSET(uip->ui_flags, UI_HASOBJREF) && - (uip->ui_refcount == 1) && !uip->ui_mapped) { + (uip->ui_refcount == 1) && !uip->ui_mapped) { control = uip->ui_control; assert(control); CLR(uip->ui_flags, UI_HASOBJREF); kret = memory_object_release_name(control, MEMORY_OBJECT_RESPECT_CACHE); - return ((kret != KERN_SUCCESS) ? 0 : 1); - } else - return (0); + } + + if (recursed == 1) + ubc_unbusy(vp); + return ((kret != KERN_SUCCESS) ? 0 : 1); } /* diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index 52eea7f81..f44cd9323 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -555,7 +555,8 @@ loop: s = splbio(); for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; - if ((bp->b_flags & B_BUSY)) + // XXXdbg - don't flush locked blocks. they may be journaled. 
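+ // (B_LOCKED metadata buffers belong to an active or committing
+ // transaction; the journal writes them out itself at commit)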
+ if ((bp->b_flags & B_BUSY) || (bp->b_flags & B_LOCKED)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("spec_fsync: not dirty"); diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index 7a7394c78..4b77d6637 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -115,7 +115,8 @@ nfs_bioread(vp, uio, ioflag, cred, getpages) int getpages; { register struct nfsnode *np = VTONFS(vp); - register int biosize, diff, i; + register int biosize, i; + off_t diff; struct buf *bp = 0, *rabp; struct vattr vattr; struct proc *p; @@ -268,7 +269,7 @@ again: bufsize = biosize; if ((off_t)(lbn + 1) * biosize > np->n_size && (off_t)(lbn + 1) * biosize - np->n_size < biosize) { - bufsize = np->n_size - lbn * biosize; + bufsize = np->n_size - (off_t)lbn * biosize; bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); } bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation); @@ -876,7 +877,7 @@ nfs_getcacheblk(vp, bn, size, p, operation) bp = getblk(vp, bn, size, 0, 0, operation); if( vp->v_type == VREG) - bp->b_blkno = (bn * biosize) / DEV_BSIZE; + bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE; return (bp); } diff --git a/bsd/nfs/nfs_socket.c b/bsd/nfs/nfs_socket.c index ef42d4683..8038b43a6 100644 --- a/bsd/nfs/nfs_socket.c +++ b/bsd/nfs/nfs_socket.c @@ -2204,7 +2204,7 @@ nfsrv_getstream(slp, waitflag) register struct mbuf *m, **mpp; register char *cp1, *cp2; register int len; - struct mbuf *om, *m2, *recm = 0; + struct mbuf *om, *m2, *recm; u_long recmark; if (slp->ns_flag & SLP_GETSTREAM) @@ -2249,7 +2249,11 @@ nfsrv_getstream(slp, waitflag) /* * Now get the record part. + * + * Note that slp->ns_reclen may be 0. Linux sometimes + * generates 0-length RPCs */ + recm = NULL; if (slp->ns_cc == slp->ns_reclen) { recm = slp->ns_raw; slp->ns_raw = slp->ns_rawend = (struct mbuf *)0; diff --git a/bsd/nfs/nfs_vnops.c b/bsd/nfs/nfs_vnops.c index 2d516acf2..e8c78eee8 100644 --- a/bsd/nfs/nfs_vnops.c +++ b/bsd/nfs/nfs_vnops.c @@ -4512,8 +4512,8 @@ again: #if 0 /* (removed for UBC) */ bufsize = biosize; - if ((lbn + 1) * biosize > np->n_size) { - bufsize = np->n_size - lbn * biosize; + if ((off_t)(lbn + 1) * biosize > np->n_size) { + bufsize = np->n_size - (off_t)lbn * biosize; bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); } #endif @@ -4618,7 +4618,7 @@ nfs_blktooff(ap) biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); /* nfs_bio.c */ - *ap->a_offset = (off_t)(ap->a_lblkno * biosize); + *ap->a_offset = (off_t)ap->a_lblkno * biosize; return (0); } diff --git a/bsd/sys/buf.h b/bsd/sys/buf.h index fb456c562..d051d11f0 100644 --- a/bsd/sys/buf.h +++ b/bsd/sys/buf.h @@ -132,6 +132,15 @@ struct buf { #define b_trans_head b_freelist.tqe_prev #define b_trans_next b_freelist.tqe_next #define b_real_bp b_saveaddr +#define b_iostate b_rcred + +/* journaling uses this cluster i/o field for its own + * purposes because meta data buf's should never go + * through the clustering code. + */ +#define b_transaction b_vectorlist + + /* * These flags are kept in b_flags. @@ -163,7 +172,7 @@ struct buf { #define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define B_WRITEINPROG 0x01000000 /* Write in progress. 
*/ #define B_HDRALLOC 0x02000000 /* zone allocated buffer header */ -#define B_UNUSED1 0x04000000 /* Unused bit */ +#define B_NORELSE 0x04000000 /* don't brelse() in bwrite() */ #define B_NEED_IODONE 0x08000000 /* need to do a biodone on the */ /* real_bp associated with a cluster_io */ diff --git a/bsd/sys/disk.h b/bsd/sys/disk.h index 74b269c58..65a4bffdd 100644 --- a/bsd/sys/disk.h +++ b/bsd/sys/disk.h @@ -44,8 +44,12 @@ typedef struct #define DKIOCGETMAXBLOCKCOUNTREAD _IOR('d', 64, u_int64_t) #define DKIOCGETMAXBLOCKCOUNTWRITE _IOR('d', 65, u_int64_t) +#define DKIOCGETMAXBYTECOUNTREAD _IOR('d', 70, u_int64_t) +#define DKIOCGETMAXBYTECOUNTWRITE _IOR('d', 71, u_int64_t) #define DKIOCGETMAXSEGMENTCOUNTREAD _IOR('d', 66, u_int64_t) #define DKIOCGETMAXSEGMENTCOUNTWRITE _IOR('d', 67, u_int64_t) +#define DKIOCGETMAXSEGMENTBYTECOUNTREAD _IOR('d', 68, u_int64_t) +#define DKIOCGETMAXSEGMENTBYTECOUNTWRITE _IOR('d', 69, u_int64_t) #ifdef KERNEL #define DKIOCSETBLOCKSIZE _IOW('d', 24, u_int32_t) diff --git a/bsd/sys/malloc.h b/bsd/sys/malloc.h index fb05b8734..751de10e5 100644 --- a/bsd/sys/malloc.h +++ b/bsd/sys/malloc.h @@ -164,8 +164,9 @@ #define M_IP6MISC 88 /* IPv6 misc. memory */ #define M_TSEGQ 89 /* TCP segment queue entry */ #define M_IGMP 90 +#define M_JOURNAL 91 /* VFS Journaling code */ -#define M_LAST 91 /* Must be last type + 1 */ +#define M_LAST 92 /* Must be last type + 1 */ /* Strings corresponding to types of memory */ /* Must be in synch with the #defines above */ @@ -258,9 +259,10 @@ "UDF mount" /* 85 M_UDFMNT */ \ "IPv6 NDP", /* 86 M_IP6NDP */ \ "IPv6 options", /* 87 M_IP6OPT */ \ - "IPv6 Misc" /* 88 M_IP6MISC */\ - "TCP Segment Q" /* 89 M_TSEGQ */\ - "IGMP state" /* 90 M_IGMP */\ + "IPv6 Misc", /* 88 M_IP6MISC */\ + "TCP Segment Q",/* 89 M_TSEGQ */\ + "IGMP state", /* 90 M_IGMP */\ + "Journaling" /* 91 M_JOURNAL */\ } struct kmemstats { diff --git a/bsd/sys/mount.h b/bsd/sys/mount.h index a2840d9bc..2b8e1e05c 100644 --- a/bsd/sys/mount.h +++ b/bsd/sys/mount.h @@ -159,6 +159,7 @@ struct mount { #define MNT_DONTBROWSE 0x00100000 /* file system is not appropriate path to user data */ #define MNT_UNKNOWNPERMISSIONS 0x00200000 /* no known mapping for uid/gid in permissions information on disk */ #define MNT_AUTOMOUNTED 0x00400000 /* filesystem was mounted by automounter */ +#define MNT_JOURNALED 0x00800000 /* filesystem is journaled */ /* * NFS export related mount flags. @@ -188,7 +189,7 @@ struct mount { MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB | \ MNT_LOCAL | MNT_QUOTA | \ MNT_ROOTFS | MNT_DOVOLFS | MNT_DONTBROWSE | \ - MNT_UNKNOWNPERMISSIONS | MNT_AUTOMOUNTED | MNT_FIXEDSCRIPTENCODING ) + MNT_UNKNOWNPERMISSIONS | MNT_AUTOMOUNTED | MNT_JOURNALED | MNT_FIXEDSCRIPTENCODING ) /* * External filesystem command modifier flags. * Unmount can use the MNT_FORCE flag. 
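
Because the hunk above also adds MNT_JOURNALED to the visible-flags mask (MNT_VISFLAGMASK in this header), the bit should surface in the f_flags that statfs(2) reports, just like the neighboring MNT_* flags. A minimal userspace sketch of testing it; the fallback define is only needed until the revised header is installed and simply mirrors the value above:

	#include <sys/param.h>
	#include <sys/mount.h>
	#include <stdio.h>

	#ifndef MNT_JOURNALED
	#define MNT_JOURNALED 0x00800000 /* same value as the definition above */
	#endif

	int
	main(int argc, char **argv)
	{
		struct statfs sfs;

		/* default to the root file system if no mount point is given */
		if (statfs(argc > 1 ? argv[1] : "/", &sfs) != 0) {
			perror("statfs");
			return (1);
		}
		printf("%s on %s: %sjournaled\n", sfs.f_mntfromname,
		    sfs.f_mntonname, (sfs.f_flags & MNT_JOURNALED) ? "" : "not ");
		return (0);
	}
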
diff --git a/bsd/sys/ubc.h b/bsd/sys/ubc.h index d243bb97f..e6a2a189d 100644 --- a/bsd/sys/ubc.h +++ b/bsd/sys/ubc.h @@ -60,6 +60,7 @@ struct ubc_info { int ui_refcount;/* ref count on the ubc_info */ off_t ui_size; /* file size for the vnode */ long ui_mapped; /* is it currently mapped */ + void *ui_owner; /* for recursive ubc_busy */ }; /* Defines for ui_flags */ @@ -69,6 +70,8 @@ struct ubc_info { #define UI_HASOBJREF 0x00000004 /* hold a reference on object */ #define UI_WASMAPPED 0x00000008 /* vnode was mapped */ #define UI_DONTCACHE 0x00000010 /* do not cache object */ +#define UI_BUSY 0x00000020 /* for VM synchronization */ +#define UI_WANTED 0x00000040 /* for VM synchronization */ #endif /* __APPLE_API_PRIVATE */ diff --git a/bsd/vfs/Makefile b/bsd/vfs/Makefile index ce2bd8753..1ed043ac2 100644 --- a/bsd/vfs/Makefile +++ b/bsd/vfs/Makefile @@ -20,7 +20,7 @@ EXPINC_SUBDIRS_PPC = \ EXPINC_SUBDIRS_I386 = \ DATAFILES = \ - vfs_support.h + vfs_support.h vfs_journal.h INSTALL_MI_LIST = ${DATAFILES} diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c index c11c03bea..57c206760 100644 --- a/bsd/vfs/vfs_bio.c +++ b/bsd/vfs/vfs_bio.c @@ -180,6 +180,7 @@ simple_lock_data_t bufhashlist_slock; /* lock on buffer hash list */ /* number of per vnode, "in flight" buffer writes */ #define BUFWRITE_THROTTLE 9 + /* * Time in seconds before a buffer on a list is * considered as a stale buffer @@ -211,9 +212,9 @@ binshash(struct buf *bp, struct bufhashhdr *dp) simple_lock(&bufhashlist_slock); -#if 0 - if(incore(bp->b_vp, bp->b_lblkno)) - panic("binshash: already incore"); +#if 0 + if((bad = incore(bp->b_vp, bp->b_lblkno))) + panic("binshash: already incore bp 0x%x, bad 0x%x\n", bp, bad); #endif /* 0 */ BHASHENTCHECK(bp); @@ -459,6 +460,7 @@ bio_doread(vp, blkno, size, cred, async, queuetype) */ bp->b_rcred = crdup(cred); } + VOP_STRATEGY(bp); trace(TR_BREADMISS, pack(vp, size), blkno); @@ -627,7 +629,12 @@ bwrite(bp) p->p_stats->p_ru.ru_oublock++; /* XXX */ /* Release the buffer. */ - brelse(bp); + // XXXdbg - only if the unused bit is set + if (!ISSET(bp->b_flags, B_NORELSE)) { + brelse(bp); + } else { + CLR(bp->b_flags, B_NORELSE); + } return (rv); } else { @@ -707,7 +714,10 @@ bdwrite_internal(bp, return_error) if (nbdwrite < 0) panic("bdwrite: Negative nbdwrite"); - if (nbdwrite > ((nbuf/4)*3)) { + // can't do a bawrite() if the LOCKED bit is set because the + // buffer is part of a transaction and can't go to disk until + // the LOCKED bit is cleared. + if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) { if (return_error) return (EAGAIN); else @@ -807,6 +817,27 @@ brelse(bp) trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); + // if we're invalidating a buffer that has the B_CALL bit + // set then call the b_iodone function so it gets cleaned + // up properly. + // + if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) { + if (ISSET(bp->b_flags, B_CALL) && !ISSET(bp->b_flags, B_DELWRI)) { + panic("brelse: CALL flag set but not DELWRI! bp 0x%x\n", bp); + } + if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */ + void (*iodone_func)(struct buf *) = bp->b_iodone; + + CLR(bp->b_flags, B_CALL); /* but note callout done */ + bp->b_iodone = NULL; + + if (iodone_func == NULL) { + panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp); + } + (*iodone_func)(bp); + } + } + /* IO is done. 
Cleanup the UPL state */
 if (!ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
@@ -1121,6 +1152,10 @@ start:
 brelse(bp);
 goto start;
 }
+ /*
+ * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
+ * CALLED! BE CAREFUL.
+ */

 /*
 * if it is meta, the queue may be set to other
@@ -1451,7 +1486,7 @@ allocbuf(bp, size)
 }

 if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
- panic("allocbuf: bp->b_data is NULL");
+ panic("allocbuf: bp->b_data is NULL, buf @ 0x%x", bp);

 bp->b_bufsize = desired_size;
 bp->b_bcount = size;
@@ -1603,11 +1638,15 @@ start:
 panic("getnewbuf: null bp");

 found:
+ if (ISSET(bp->b_flags, B_LOCKED)) {
+ panic("getnewbuf: bp @ 0x%x is LOCKED! (flags 0x%x)\n", bp, bp->b_flags);
+ }
+
 if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
- panic("getnewbuf: le_prev is deadbeef");
+ panic("getnewbuf: le_prev is deadbeef, buf @ 0x%x", bp);

 if(ISSET(bp->b_flags, B_BUSY))
- panic("getnewbuf reusing BUSY buf");
+ panic("getnewbuf reusing BUSY buf @ 0x%x", bp);

 /* Clean it */
 if (bcleanbuf(bp)) {
@@ -1822,8 +1861,16 @@ biodone(bp)
 }

 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
+ void (*iodone_func)(struct buf *) = bp->b_iodone;
+
 CLR(bp->b_flags, B_CALL); /* but note callout done */
- (*bp->b_iodone)(bp);
+ bp->b_iodone = NULL;
+
+ if (iodone_func == NULL) {
+ panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
+ } else {
+ (*iodone_func)(bp);
+ }
 } else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
 brelse(bp);
 else { /* or just wakeup the buffer */
@@ -1932,6 +1979,7 @@ alloc_io_buf(vp, priv)
 /* clear out various fields */
 bp->b_flags = B_BUSY;
 bp->b_blkno = bp->b_lblkno = 0;
+ bp->b_iodone = 0;
 bp->b_error = 0;
 bp->b_resid = 0;
@@ -2344,3 +2392,76 @@ doit:

 (void) thread_funnel_set(kernel_flock, funnel_state);
 }
+
+
+static int
+bp_cmp(void *a, void *b)
+{
+ struct buf *bp_a = *(struct buf **)a,
+ *bp_b = *(struct buf **)b;
+ daddr_t res;
+
+ // don't have to worry about negative block
+ // numbers so this is ok to do.
+ //
+ res = (bp_a->b_blkno - bp_b->b_blkno);
+
+ return (int)res;
+}
+
+#define NFLUSH 32
+
+int
+bflushq(int whichq, struct mount *mp)
+{
+ struct buf *bp, *next;
+ int i, buf_count, s;
+ int counter=0, total_writes=0;
+ static struct buf *flush_table[NFLUSH];
+
+ if (whichq < 0 || whichq >= BQUEUES) {
+ return (0);
+ }
+
+
+ restart:
+ bp = TAILQ_FIRST(&bufqueues[whichq]);
+ for(buf_count=0; bp; bp=next) {
+ next = bp->b_freelist.tqe_next;
+
+ if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
+ continue;
+ }
+
+ if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_BUSY) == 0) {
+ if (whichq != BQ_LOCKED && (bp->b_flags & B_LOCKED)) {
+ panic("bflushq: bp @ 0x%x is locked!\n", bp);
+ }
+
+ bremfree(bp);
+ bp->b_flags |= B_BUSY;
+ flush_table[buf_count] = bp;
+ buf_count++;
+ total_writes++;
+
+ if (buf_count >= NFLUSH) {
+ qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
+
+ for(i=0; i < buf_count; i++) {
+ bawrite(flush_table[i]);
+ }
+
+ goto restart;
+ }
+ }
+ }
+
+ if (buf_count > 0) {
+ qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
+ for(i=0; i < buf_count; i++) {
+ bawrite(flush_table[i]);
+ }
+ }
+
+ return total_writes;
+}
diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c
index df2e73751..49b0938bb 100644
--- a/bsd/vfs/vfs_cluster.c
+++ b/bsd/vfs/vfs_cluster.c
@@ -1,4 +1,3 @@
-
 /*
 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. 
* @@ -80,6 +79,16 @@ #define CL_NOZERO 0x80 #define CL_PAGEIN 0x100 #define CL_DEV_MEMORY 0x200 +#define CL_PRESERVE 0x400 + +struct clios { + u_int io_completed; + u_int io_issued; + off_t io_offset; + int io_error; + int io_wanted; +}; + static void cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, struct buf *bp); @@ -93,8 +102,11 @@ static int cluster_nocopy_read(struct vnode *vp, struct uio *uio, static int cluster_nocopy_write(struct vnode *vp, struct uio *uio, off_t newEOF, int devblocksize, int flags); static int cluster_phys_read(struct vnode *vp, struct uio *uio, - off_t filesize); -static int cluster_phys_write(struct vnode *vp, struct uio *uio, off_t newEOF); + off_t filesize, int devblocksize, int flags); +static int cluster_phys_write(struct vnode *vp, struct uio *uio, + off_t newEOF, int devblocksize, int flags); +static int cluster_align_phys_io(struct vnode *vp, struct uio *uio, + vm_offset_t usr_paddr, int xsize, int devblocksize, int flags); static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay); static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all); @@ -116,12 +128,14 @@ cluster_iodone(bp) int total_resid; int upl_offset; int zero_offset; + int l_blkno; upl_t upl; struct buf *cbp; struct buf *cbp_head; struct buf *cbp_next; struct buf *real_bp; struct vnode *vp; + struct clios *iostate; int commit_size; int pg_offset; @@ -155,6 +169,8 @@ cluster_iodone(bp) real_bp = cbp->b_real_bp; vp = cbp->b_vp; zero_offset= cbp->b_validend; + l_blkno = cbp->b_lblkno; + iostate = (struct clios *)cbp->b_iostate; while (cbp) { if (cbp->b_vectorcount > 1) @@ -172,13 +188,34 @@ cluster_iodone(bp) cbp = cbp_next; } + if (zero_offset) + cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp); + if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) { vp->v_flag &= ~VTHROTTLED; wakeup((caddr_t)&vp->v_numoutput); } - if (zero_offset) - cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp); + if (iostate) { + if (error) { + off_t error_offset; + + error_offset = (off_t)l_blkno * PAGE_SIZE_64; + if (iostate->io_error == 0) { + iostate->io_error = error; + iostate->io_offset = error_offset; + } else { + if (error_offset < iostate->io_offset) + iostate->io_offset = error_offset; + } + } + iostate->io_completed += total_size; + + if (iostate->io_wanted) { + iostate->io_wanted = 0; + wakeup((caddr_t)&iostate->io_wanted); + } + } if ((b_flags & B_NEED_IODONE) && real_bp) { if (error) { real_bp->b_flags |= B_ERROR; @@ -192,13 +229,15 @@ cluster_iodone(bp) error = EIO; if (b_flags & B_COMMIT_UPL) { - pg_offset = upl_offset & PAGE_MASK; + pg_offset = upl_offset & PAGE_MASK; commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE; - if (error || (b_flags & B_NOCACHE)) { + if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) { int upl_abort_code; - if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */ + if (b_flags & B_PHYS) + upl_abort_code = UPL_ABORT_FREE_ON_EMPTY; + else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */ upl_abort_code = UPL_ABORT_FREE_ON_EMPTY; else if (b_flags & B_PGIN) upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR; @@ -215,7 +254,9 @@ cluster_iodone(bp) } else { int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY; - if ( !(b_flags & B_PAGEOUT)) + if (b_flags & B_PHYS) + upl_commit_flags |= UPL_COMMIT_SET_DIRTY; + else if ( 
!(b_flags & B_PAGEOUT)) upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY; if (b_flags & B_AGE) upl_commit_flags |= UPL_COMMIT_INACTIVATE; @@ -271,7 +312,7 @@ cluster_zero(upl, upl_offset, size, bp) } static int -cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp) +cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate) struct vnode *vp; upl_t upl; vm_offset_t upl_offset; @@ -280,10 +321,12 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, int devblocksize; int flags; struct buf *real_bp; + struct clios *iostate; { struct buf *cbp; struct iovec *iovp; - u_int size; + u_int size; + u_int io_size; int io_flags; int error = 0; int retval = 0; @@ -297,6 +340,7 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, u_int max_vectors; int priv; int zero_offset = 0; + u_int first_lblkno; if (flags & CL_READ) { io_flags = (B_VECTORLIST | B_READ); @@ -309,14 +353,18 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, } pl = ubc_upl_pageinfo(upl); - if (flags & CL_ASYNC) - io_flags |= (B_CALL | B_ASYNC); if (flags & CL_AGE) io_flags |= B_AGE; if (flags & CL_DUMP) io_flags |= B_NOCACHE; if (flags & CL_PAGEIN) io_flags |= B_PGIN; + if (flags & CL_PAGEOUT) + io_flags |= B_PAGEOUT; + if (flags & CL_COMMIT) + io_flags |= B_COMMIT_UPL; + if (flags & CL_PRESERVE) + io_flags |= B_PHYS; if (devblocksize) size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1); @@ -338,7 +386,6 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, zero_offset = upl_offset + non_rounded_size; } while (size) { - size_t io_size; int vsize; int i; int pl_index; @@ -352,7 +399,7 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, else io_size = size; - if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, &io_size, NULL)) { + if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) { if (error == EOPNOTSUPP) panic("VOP_CMAP Unimplemented"); break; @@ -587,8 +634,10 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, if (error) break; - if (flags & CL_ASYNC) - cbp->b_iodone = (void *)cluster_iodone; + if (flags & CL_ASYNC) { + cbp->b_flags |= (B_CALL | B_ASYNC); + cbp->b_iodone = (void *)cluster_iodone; + } cbp->b_flags |= io_flags; cbp->b_lblkno = lblkno; @@ -598,6 +647,9 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, cbp->b_uploffset = upl_offset; cbp->b_trans_next = (struct buf *)0; + if (cbp->b_iostate = (void *)iostate) + iostate->io_issued += io_size; + if (flags & CL_READ) KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE, cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0); @@ -631,13 +683,6 @@ cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, * then go ahead and issue the I/O */ start_io: - if (flags & CL_COMMIT) - cbp_head->b_flags |= B_COMMIT_UPL; - if (flags & CL_PAGEOUT) - cbp_head->b_flags |= B_PAGEOUT; - if (flags & CL_PAGEIN) - cbp_head->b_flags |= B_PGIN; - if (real_bp) { cbp_head->b_flags |= B_NEED_IODONE; cbp_head->b_real_bp = real_bp; @@ -687,6 +732,8 @@ start_io: if (error) { int abort_size; + io_size = 0; + for (cbp = cbp_head; cbp;) { struct buf * cbp_next; @@ -694,21 +741,36 @@ start_io: _FREE(cbp->b_vectorlist, M_SEGMENT); upl_offset -= cbp->b_bcount; size += cbp->b_bcount; + io_size += cbp->b_bcount; cbp_next = cbp->b_trans_next; 
free_io_buf(cbp); cbp = cbp_next; } + if (iostate) { + if (iostate->io_error == 0) { + iostate->io_error = error; + iostate->io_offset = f_offset - (off_t)io_size; + } + iostate->io_issued -= io_size; + + if (iostate->io_wanted) { + iostate->io_wanted = 0; + wakeup((caddr_t)&iostate->io_wanted); + } + } pg_offset = upl_offset & PAGE_MASK; abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE; if (flags & CL_COMMIT) { int upl_abort_code; - if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */ + if (flags & CL_PRESERVE) + upl_abort_code = UPL_ABORT_FREE_ON_EMPTY; + else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */ upl_abort_code = UPL_ABORT_FREE_ON_EMPTY; else if (flags & CL_PAGEIN) - upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR; + upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR; else upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES; @@ -910,7 +972,7 @@ cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, fla } return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize, - local_flags, (struct buf *)0)); + local_flags, (struct buf *)0, (struct clios *)0)); } int @@ -968,7 +1030,7 @@ cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flag size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize, - local_flags | CL_READ | CL_PAGEIN, (struct buf *)0); + local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0); if (retval == 0) { int b_lblkno; @@ -1010,7 +1072,7 @@ cluster_bp(bp) f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno); - return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp)); + return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0)); } int @@ -1037,7 +1099,7 @@ cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) int retval = 0; - if ((!uio) || (uio->uio_segflg != UIO_USERSPACE) || (!(vp->v_flag & VNOCACHE_DATA))) + if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE)) { retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags); return(retval); @@ -1074,14 +1136,6 @@ cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) if (upl_flags & UPL_PHYS_CONTIG) { - /* - * since the interface to the IOKit below us uses physical block #'s and - * block counts to specify the I/O, we can't handle anything that isn't - * devblocksize aligned - */ - if ((uio->uio_offset & (devblocksize - 1)) || (uio->uio_resid & (devblocksize - 1))) - return(EINVAL); - if (flags & IO_HEADZEROFILL) { flags &= ~IO_HEADZEROFILL; @@ -1090,7 +1144,7 @@ cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) return(retval); } - retval = cluster_phys_write(vp, uio, newEOF); + retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags); if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL)) { @@ -1172,6 +1226,7 @@ cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) return(retval); } + static int cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) struct vnode *vp; @@ -1326,7 +1381,7 @@ cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) (int)upl_offset, (int)uio->uio_offset, io_size, 0, 0); error = cluster_io(vp, upl, upl_offset, uio->uio_offset, - io_size, devblocksize, 0, (struct buf *)0); + io_size, 
devblocksize, 0, (struct buf *)0, (struct clios *)0); if (error == 0) { /* @@ -1361,14 +1416,20 @@ cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags) return (error); } + static int -cluster_phys_write(vp, uio, newEOF) +cluster_phys_write(vp, uio, newEOF, devblocksize, flags) struct vnode *vp; struct uio *uio; off_t newEOF; + int devblocksize; + int flags; { + upl_page_info_t *pl; + vm_offset_t src_paddr; upl_t upl; vm_offset_t upl_offset; + int tail_size; int io_size; int upl_size; int upl_needed_size; @@ -1399,49 +1460,78 @@ cluster_phys_write(vp, uio, newEOF) (vm_offset_t)iov->iov_base & ~PAGE_MASK, &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0); - if (kret != KERN_SUCCESS) - { - /* cluster_phys_write: failed to get pagelist */ - /* note: return kret here */ + if (kret != KERN_SUCCESS) { + /* + * cluster_phys_write: failed to get pagelist + * note: return kret here + */ return(EINVAL); - } - + } /* * Consider the possibility that upl_size wasn't satisfied. * This is a failure in the physical memory case. */ - if (upl_size < upl_needed_size) - { - kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); - return(EINVAL); - } + if (upl_size < upl_needed_size) { + kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); + return(EINVAL); + } + pl = ubc_upl_pageinfo(upl); - /* - * issue a synchronous write to cluster_io - */ + src_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK); - error = cluster_io(vp, upl, upl_offset, uio->uio_offset, - io_size, 0, CL_DEV_MEMORY, (struct buf *)0); + while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { + int head_size; - if (error == 0) { - /* - * The cluster_io write completed successfully, - * update the uio structure and commit. 
- */ + head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1)); - ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY); - - iov->iov_base += io_size; - iov->iov_len -= io_size; - uio->uio_resid -= io_size; - uio->uio_offset += io_size; + if (head_size > io_size) + head_size = io_size; + + error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0); + + if (error) { + ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); + + return(EINVAL); + } + upl_offset += head_size; + src_paddr += head_size; + io_size -= head_size; } - else - ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); + tail_size = io_size & (devblocksize - 1); + io_size -= tail_size; + + if (io_size) { + /* + * issue a synchronous write to cluster_io + */ + error = cluster_io(vp, upl, upl_offset, uio->uio_offset, + io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0); + } + if (error == 0) { + /* + * The cluster_io write completed successfully, + * update the uio structure + */ + uio->uio_resid -= io_size; + iov->iov_len -= io_size; + iov->iov_base += io_size; + uio->uio_offset += io_size; + src_paddr += io_size; + + if (tail_size) + error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0); + } + /* + * just release our hold on the physically contiguous + * region without changing any state + */ + ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); return (error); } + static int cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) struct vnode *vp; @@ -1593,7 +1683,7 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) read_size = newEOF - upl_f_offset; retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize, - CL_READ, (struct buf *)0); + CL_READ, (struct buf *)0, (struct clios *)0); if (retval) { /* * we had an error during the read which causes us to abort @@ -1627,7 +1717,7 @@ cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags) read_size = newEOF - (upl_f_offset + upl_offset); retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize, - CL_READ, (struct buf *)0); + CL_READ, (struct buf *)0, (struct clios *)0); if (retval) { /* * we had an error during the read which causes us to abort @@ -1934,7 +2024,7 @@ delay_io: if (last_blkno > vp->v_lastw) vp->v_lastw = last_blkno; - ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); + ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY); continue; issue_io: /* @@ -1963,7 +2053,7 @@ issue_io: tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0); } retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize, - io_flags, (struct buf *)0); + io_flags, (struct buf *)0, (struct clios *)0); } } KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, @@ -2039,7 +2129,7 @@ cluster_read(vp, uio, filesize, devblocksize, flags) if (upl_flags & UPL_PHYS_CONTIG) { - retval = cluster_phys_read(vp, uio, filesize); + retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags); } else if (uio->uio_resid < 4 * PAGE_SIZE) { @@ -2119,6 +2209,7 @@ cluster_read(vp, uio, filesize, devblocksize, flags) return(retval); } + static int cluster_read_x(vp, uio, filesize, devblocksize, flags) struct vnode *vp; @@ -2288,7 +2379,7 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) */ error = cluster_io(vp, upl, upl_offset, 
upl_f_offset + upl_offset, - io_size, devblocksize, CL_READ, (struct buf *)0); + io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0); } if (error == 0) { /* @@ -2481,6 +2572,7 @@ cluster_read_x(vp, uio, filesize, devblocksize, flags) return (retval); } + static int cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) struct vnode *vp; @@ -2687,7 +2779,7 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0); error = cluster_io(vp, upl, upl_offset, start_upl_f_offset, - io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0); + io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0, (struct clios *)0); if (error == 0) { /* @@ -2724,22 +2816,29 @@ cluster_nocopy_read(vp, uio, filesize, devblocksize, flags) } + static int -cluster_phys_read(vp, uio, filesize) +cluster_phys_read(vp, uio, filesize, devblocksize, flags) struct vnode *vp; struct uio *uio; off_t filesize; + int devblocksize; + int flags; { + upl_page_info_t *pl; upl_t upl; vm_offset_t upl_offset; + vm_offset_t dst_paddr; off_t max_size; int io_size; + int tail_size; int upl_size; int upl_needed_size; int pages_in_pl; int upl_flags; kern_return_t kret; struct iovec *iov; + struct clios iostate; int error; /* @@ -2752,14 +2851,15 @@ cluster_phys_read(vp, uio, filesize) max_size = filesize - uio->uio_offset; - if (max_size < (off_t)((unsigned int)iov->iov_len)) - io_size = max_size; + if (max_size > (off_t)((unsigned int)iov->iov_len)) + io_size = iov->iov_len; else - io_size = iov->iov_len; + io_size = max_size; upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64; upl_needed_size = upl_offset + io_size; + error = 0; pages_in_pl = 0; upl_size = upl_needed_size; upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL; @@ -2768,48 +2868,112 @@ cluster_phys_read(vp, uio, filesize) (vm_offset_t)iov->iov_base & ~PAGE_MASK, &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0); - if (kret != KERN_SUCCESS) - { - /* cluster_phys_read: failed to get pagelist */ - return(EINVAL); - } + if (kret != KERN_SUCCESS) { + /* + * cluster_phys_read: failed to get pagelist + */ + return(EINVAL); + } + if (upl_size < upl_needed_size) { + /* + * The upl_size wasn't satisfied. + */ + ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); + + return(EINVAL); + } + pl = ubc_upl_pageinfo(upl); + + dst_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK); + while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) { + int head_size; + + head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1)); + + if (head_size > io_size) + head_size = io_size; + + error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ); + + if (error) { + ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); + + return(EINVAL); + } + upl_offset += head_size; + dst_paddr += head_size; + io_size -= head_size; + } + tail_size = io_size & (devblocksize - 1); + io_size -= tail_size; + + iostate.io_completed = 0; + iostate.io_issued = 0; + iostate.io_error = 0; + iostate.io_wanted = 0; + + while (io_size && error == 0) { + int xsize; + + if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE)) + xsize = MAX_UPL_TRANSFER * PAGE_SIZE; + else + xsize = io_size; + /* + * request asynchronously so that we can overlap + * the preparation of the next I/O... 
we'll do + * the commit after all the I/O has completed + * since its all issued against the same UPL + * if there are already too many outstanding reads + * throttle back until we reach a more reasonable level + */ + while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) { + iostate.io_wanted = 1; + tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0); + } + + error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0, + CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC, + (struct buf *)0, &iostate); + /* + * The cluster_io read was issued successfully, + * update the uio structure + */ + if (error == 0) { + uio->uio_resid -= xsize; + iov->iov_len -= xsize; + iov->iov_base += xsize; + uio->uio_offset += xsize; + dst_paddr += xsize; + upl_offset += xsize; + io_size -= xsize; + } + } /* - * Consider the possibility that upl_size wasn't satisfied. + * make sure any async reads have completed before + * we proceed */ - if (upl_size < upl_needed_size) - { - ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); - return(EINVAL); - } + while (iostate.io_issued != iostate.io_completed) { + iostate.io_wanted = 1; + tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0); + } + if (iostate.io_error) { + error = iostate.io_error; + } + if (error == 0 && tail_size) + error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ); /* - * issue a synchronous read to cluster_io + * just release our hold on the physically contiguous + * region without changing any state */ - - error = cluster_io(vp, upl, upl_offset, uio->uio_offset, - io_size, 0, CL_READ| CL_NOZERO | CL_DEV_MEMORY, (struct buf *)0); - - if (error == 0) - { - /* - * The cluster_io read completed successfully, - * update the uio structure and commit. 
- */ - - ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY); - - iov->iov_base += io_size; - iov->iov_len -= io_size; - uio->uio_resid -= io_size; - uio->uio_offset += io_size; - } - else - ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); + ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY); return (error); } + /* * generate advisory I/O's in the largest chunks possible * the completed pages will be released into the VM cache @@ -2932,7 +3096,7 @@ advisory_read(vp, filesize, f_offset, resid, devblocksize) * issue an asynchronous read to cluster_io */ retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize, - CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0); + CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0); issued_io = 1; } @@ -3228,7 +3392,7 @@ cluster_push_x(vp, EOF, first, last, can_delay) vp->v_flag |= VTHROTTLED; tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0); } - cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0); + cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0); size -= io_size; } @@ -3236,3 +3400,64 @@ cluster_push_x(vp, EOF, first, last, can_delay) return(1); } + + + +static int +cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, int xsize, int devblocksize, int flags) +{ + struct iovec *iov; + upl_page_info_t *pl; + upl_t upl; + vm_offset_t ubc_paddr; + kern_return_t kret; + int error = 0; + + iov = uio->uio_iov; + + kret = ubc_create_upl(vp, + uio->uio_offset & ~PAGE_MASK_64, + PAGE_SIZE, + &upl, + &pl, + UPL_FLAGS_NONE); + + if (kret != KERN_SUCCESS) + return(EINVAL); + + if (!upl_valid_page(pl, 0)) { + /* + * issue a synchronous read to cluster_io + */ + error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize, + CL_READ, (struct buf *)0, (struct clios *)0); + if (error) { + ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); + + return(error); + } + } + ubc_paddr = (vm_offset_t)upl_phys_page(pl, 0) + (int)(uio->uio_offset & PAGE_MASK_64); + + if (flags & CL_READ) + copyp2p(ubc_paddr, usr_paddr, xsize, 2); + else + copyp2p(usr_paddr, ubc_paddr, xsize, 1); + + if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) { + /* + * issue a synchronous write to cluster_io + */ + error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize, + 0, (struct buf *)0, (struct clios *)0); + } + if (error == 0) { + uio->uio_offset += xsize; + iov->iov_base += xsize; + iov->iov_len -= xsize; + uio->uio_resid -= xsize; + } + ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY); + + return (error); +} diff --git a/bsd/vfs/vfs_journal.c b/bsd/vfs/vfs_journal.c new file mode 100644 index 000000000..2acb4fab2 --- /dev/null +++ b/bsd/vfs/vfs_journal.c @@ -0,0 +1,2067 @@ +/* + * Copyright (c) 1995-2002 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * The contents of this file constitute Original Code as defined in and + * are subject to the Apple Public Source License Version 1.1 (the + * "License"). You may not use this file except in compliance with the + * License. Please obtain a copy of the License at + * http://www.apple.com/publicsource and read it before using this file. 
+ * + * This Original Code and all software distributed under the License are + * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +// +// This file implements a simple write-ahead journaling layer. +// In theory any file system can make use of it by calling these +// functions when the fs wants to modify meta-data blocks. See +// vfs_journal.h for a more detailed description of the api and +// data structures. +// +// Dominic Giampaolo (dbg@apple.com) +// + +#ifdef KERNEL + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern task_t kernel_task; + +#else + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compat.h" + +#endif /* KERNEL */ + +#include "vfs_journal.h" + + +// number of bytes to checksum in a block_list_header +// NOTE: this should be enough to clear out the header +// fields as well as the first entry of binfo[] +#define BLHDR_CHECKSUM_SIZE 32 + + + +static int end_transaction(transaction *tr, int force_it); +static void abort_transaction(journal *jnl, transaction *tr); +static void dump_journal(journal *jnl); + + +#define CHECK_JOURNAL(jnl) \ + do { \ + if (jnl == NULL) {\ + panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\ + }\ + if (jnl->jdev == NULL) { \ + panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\ + } \ + if (jnl->fsdev == NULL) { \ + panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\ + } \ + if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) {\ + panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\ + __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);\ + }\ + if ( jnl->jhdr->start <= 0 \ + || jnl->jhdr->start > jnl->jhdr->size\ + || jnl->jhdr->start > 128*1024*1024) {\ + panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \ + __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);\ + }\ + if ( jnl->jhdr->end <= 0 \ + || jnl->jhdr->end > jnl->jhdr->size\ + || jnl->jhdr->end > 128*1024*1024) {\ + panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \ + __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);\ + }\ + if (jnl->jhdr->size > 128*1024*1024) {\ + panic("%s:%d: jhdr size looks bad (0x%llx)\n",\ + __FILE__, __LINE__, jnl->jhdr->size);\ + } \ + } while(0) + +#define CHECK_TRANSACTION(tr) \ + do {\ + if (tr == NULL) {\ + panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\ + }\ + if (tr->jnl == NULL) {\ + panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\ + }\ + if (tr->blhdr != (block_list_header *)tr->tbuffer) {\ + panic("%s:%d: blhdr (0x%x) != tbuffer (0x%x)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer);\ + }\ + if (tr->total_bytes < 0) {\ + panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes);\ + }\ + if (tr->journal_start < 0 || tr->journal_start > 128*1024*1024) {\ + panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start);\ + }\ + if (tr->journal_end < 0 || tr->journal_end > 128*1024*1024) {\ + panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end);\ 
+ }\ + if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > 2048)) {\ + panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);\ + }\ + } while(0) + + + +// +// this isn't a great checksum routine but it will do for now. +// we use it to checksum the journal header and the block list +// headers that are at the start of each transaction. +// +static int +calc_checksum(char *ptr, int len) +{ + int i, cksum=0; + + // this is a lame checksum but for now it'll do + for(i=0; i < len; i++, ptr++) { + cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr); + } + + return (~cksum); +} + + +#define JNL_WRITE 1 +#define JNL_READ 2 + +// +// This function sets up a fake buf and passes it directly to the +// journal device strategy routine (so that it won't get cached in +// the block cache. +// +// It also handles range checking the i/o so that we don't write +// outside the journal boundaries and it will wrap the i/o back +// to the beginning if necessary (skipping over the journal header) +// +static size_t +do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction) +{ + int err, io_sz=0, curlen=len; + struct buf *bp; + int max_iosize=0, max_vectors; + + if (*offset < 0 || *offset > jnl->jhdr->size) { + panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size); + } + + again: + bp = alloc_io_buf(jnl->jdev, 1); + + if (direction == JNL_WRITE) { + bp->b_flags |= 0; // don't have to set any flags (was: B_WRITEINPROG) + jnl->jdev->v_numoutput++; + vfs_io_attributes(jnl->jdev, B_WRITE, &max_iosize, &max_vectors); + } else if (direction == JNL_READ) { + bp->b_flags |= B_READ; + vfs_io_attributes(jnl->jdev, B_READ, &max_iosize, &max_vectors); + } + + if (max_iosize == 0) { + max_iosize = 128 * 1024; + } + + if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) { + if (*offset == jnl->jhdr->size) { + *offset = jnl->jhdr->jhdr_size; + } else { + curlen = (off_t)jnl->jhdr->size - *offset; + } + } + + if (curlen > max_iosize) { + curlen = max_iosize; + } + + if (curlen <= 0) { + panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %d\n", curlen, *offset, len); + } + + bp->b_bufsize = curlen; + bp->b_bcount = curlen; + bp->b_data = data; + bp->b_blkno = (daddr_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size); + bp->b_lblkno = (daddr_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size); + + err = VOP_STRATEGY(bp); + if (!err) { + err = biowait(bp); + } + + bp->b_data = NULL; + bp->b_bufsize = bp->b_bcount = 0; + bp->b_blkno = bp->b_lblkno = -1; + + free_io_buf(bp); + + if (err) { + printf("jnl: do_jnl_io: strategy err 0x%x\n", err); + return 0; + } + + *offset += curlen; + io_sz += curlen; + if (io_sz != len) { + // handle wrap-around + data = (char *)data + curlen; + curlen = len - io_sz; + if (*offset >= jnl->jhdr->size) { + *offset = jnl->jhdr->jhdr_size; + } + goto again; + } + + return io_sz; +} + +static size_t +read_journal_data(journal *jnl, off_t *offset, void *data, size_t len) +{ + return do_journal_io(jnl, offset, data, len, JNL_READ); +} + +static size_t +write_journal_data(journal *jnl, off_t *offset, void *data, size_t len) +{ + return do_journal_io(jnl, offset, data, len, JNL_WRITE); +} + + +static int +write_journal_header(journal *jnl) +{ + int ret; + off_t jhdr_offset = 0; + + // + // XXXdbg note: this ioctl doesn't seem to do anything on firewire disks. 
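/*
 * The wrap-around rule that do_journal_io() implements, restated as a pure
 * function: the journal is circular, its first jhdr_size bytes always hold
 * the header, so an I/O that would run past jhdr->size is clipped at the end
 * and the offset then wraps to jhdr_size, never to 0.  journal_chunk() is a
 * hypothetical simplification (no buf setup, no max_iosize clamp); a caller
 * loops, subtracting the returned count from len, until len is consumed.
 */
#include <sys/types.h>

static size_t
journal_chunk(off_t *offset, size_t len, off_t jsize, off_t jhdr_size)
{
	size_t curlen = len;

	if (*offset >= jsize)			/* already at the end: wrap past the header */
		*offset = jhdr_size;

	if (*offset + (off_t)curlen > jsize)	/* clip at the journal's end */
		curlen = (size_t)(jsize - *offset);

	*offset += curlen;
	return curlen;				/* 0 < curlen <= len */
}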
+ // + ret = VOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NOCRED, current_proc()); + if (ret != 0) { + printf("jnl: flushing fs disk buffer returned 0x%x\n", ret); + } + + + jnl->jhdr->checksum = 0; + jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header)); + if (write_journal_data(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size) != jnl->jhdr->jhdr_size) { + printf("jnl: write_journal_header: error writing the journal header!\n"); + jnl->flags |= JOURNAL_INVALID; + return -1; + } + + return 0; +} + + + +// +// this is a work function used to free up transactions that +// completed. they can't be free'd from buffer_flushed_callback +// because it is called from deep with the disk driver stack +// and thus can't do something that would potentially cause +// paging. it gets called by each of the journal api entry +// points so stuff shouldn't hang around for too long. +// +static void +free_old_stuff(journal *jnl) +{ + transaction *tr, *next; + + for(tr=jnl->tr_freeme; tr; tr=next) { + next = tr->next; + kmem_free(kernel_map, (vm_offset_t)tr, sizeof(transaction)); + } + + jnl->tr_freeme = NULL; +} + + + +// +// This is our callback that lets us know when a buffer has been +// flushed to disk. It's called from deep within the driver stack +// and thus is quite limited in what it can do. Notably, it can +// not initiate any new i/o's or allocate/free memory. +// +static void +buffer_flushed_callback(struct buf *bp) +{ + transaction *tr; + journal *jnl; + transaction *ctr, *prev=NULL, *next; + int i, bufsize; + + + //printf("jnl: buf flush: bp @ 0x%x l/blkno %d/%d vp 0x%x tr @ 0x%x\n", + // bp, bp->b_lblkno, bp->b_blkno, bp->b_vp, bp->b_transaction); + + // snarf out the bits we want + bufsize = bp->b_bufsize; + tr = bp->b_transaction; + + bp->b_iodone = NULL; // don't call us for this guy again + bp->b_transaction = NULL; + + // + // This is what biodone() would do if it didn't call us. + // NOTE: THIS CODE *HAS* TO BE HERE! + // + if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */ + brelse(bp); + } else { /* or just wakeup the buffer */ + CLR(bp->b_flags, B_WANTED); + wakeup(bp); + } + + // NOTE: from here on out we do *NOT* touch bp anymore. + + + // then we've already seen it + if (tr == NULL) { + return; + } + + CHECK_TRANSACTION(tr); + + jnl = tr->jnl; + if (jnl->flags & JOURNAL_INVALID) { + return; + } + + CHECK_JOURNAL(jnl); + + // update the number of blocks that have been flushed. + // this buf may represent more than one block so take + // that into account. + tr->num_flushed += bufsize; + + + // if this transaction isn't done yet, just return as + // there is nothing to do. 
+ if ((tr->num_flushed + tr->num_killed) < tr->total_bytes) { + return; + } + + //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n", + // tr, tr->journal_start, tr->journal_end, jnl); + + // find this entry in the old_start[] index and mark it completed + simple_lock(&jnl->old_start_lock); + for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { + + if ((jnl->old_start[i] & ~(0x8000000000000000LL)) == tr->journal_start) { + jnl->old_start[i] &= ~(0x8000000000000000LL); + break; + } + } + if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) { + panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr 0x%x, jnl 0x%x)\n", + tr->journal_start, tr, jnl); + } + simple_unlock(&jnl->old_start_lock); + + + // if we are here then we need to update the journal header + // to reflect that this transaction is complete + if (tr->journal_start == jnl->active_start) { + jnl->active_start = tr->journal_end; + tr->journal_start = tr->journal_end = (off_t)0; + } + + // go through the completed_trs list and try to coalesce + // entries, restarting back at the beginning if we have to. + for(ctr=jnl->completed_trs; ctr; prev=ctr, ctr=next) { + if (ctr->journal_start == jnl->active_start) { + jnl->active_start = ctr->journal_end; + if (prev) { + prev->next = ctr->next; + } + if (ctr == jnl->completed_trs) { + jnl->completed_trs = ctr->next; + } + + next = jnl->completed_trs; // this starts us over again + ctr->next = jnl->tr_freeme; + jnl->tr_freeme = ctr; + ctr = NULL; + } else if (tr->journal_end == ctr->journal_start) { + ctr->journal_start = tr->journal_start; + next = jnl->completed_trs; // this starts us over again + ctr = NULL; + tr->journal_start = tr->journal_end = (off_t)0; + } else if (tr->journal_start == ctr->journal_end) { + ctr->journal_end = tr->journal_end; + next = ctr->next; + tr->journal_start = tr->journal_end = (off_t)0; + } else { + next = ctr->next; + } + } + + // at this point no one should be using this guy anymore + tr->total_bytes = 0xfbadc0de; + + // if this is true then we didn't merge with anyone + // so link ourselves in at the head of the completed + // transaction list. + if (tr->journal_start != 0) { + // put this entry into the correct sorted place + // in the list instead of just at the head. + // + + prev = NULL; + for(ctr=jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) { + // just keep looping + } + + if (ctr == NULL && prev == NULL) { + jnl->completed_trs = tr; + tr->next = NULL; + } else if (ctr == jnl->completed_trs) { + tr->next = jnl->completed_trs; + jnl->completed_trs = tr; + } else { + tr->next = prev->next; + prev->next = tr; + } + } else { + // if we're here this tr got merged with someone else so + // put it on the list to be free'd + tr->next = jnl->tr_freeme; + jnl->tr_freeme = tr; + } +} + +static int +update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize) +{ + int ret; + struct buf *oblock_bp=NULL; + + // first read the block we want. + ret = meta_bread(jnl->fsdev, (daddr_t)fs_block, bsize, NOCRED, &oblock_bp); + if (ret != 0) { + printf("jnl: update_fs_block: error reading fs block # %lld! (ret %d)\n", fs_block, ret); + + if (oblock_bp) { + brelse(oblock_bp); + oblock_bp = NULL; + } + + // let's try to be aggressive here and just re-write the block + oblock_bp = getblk(jnl->fsdev, (daddr_t)fs_block, bsize, 0, 0, BLK_META); + if (oblock_bp == NULL) { + printf("jnl: update_fs_block: getblk() for %lld failed! 
failing update.\n", fs_block); + return -1; + } + } + + // make sure it's the correct size. + if (oblock_bp->b_bufsize != bsize) { + brelse(oblock_bp); + return -1; + } + + // copy the journal data over top of it + memcpy(oblock_bp->b_data, block_ptr, bsize); + + if ((ret = VOP_BWRITE(oblock_bp)) != 0) { + printf("jnl: update_fs_block: failed to update block %lld (ret %d)\n", fs_block,ret); + brelse(oblock_bp); + return ret; + } + + // and now invalidate it so that if someone else wants to read + // it in a different size they'll be able to do it. + ret = meta_bread(jnl->fsdev, (daddr_t)fs_block, bsize, NOCRED, &oblock_bp); + if (oblock_bp) { + oblock_bp->b_flags |= B_INVAL; + brelse(oblock_bp); + } + + return 0; +} + + +static int +replay_journal(journal *jnl) +{ + int i, ret, checksum, max_bsize; + struct buf *oblock_bp; + block_list_header *blhdr; + off_t offset; + char *buf, *block_ptr=NULL; + + // wrap the start ptr if it points to the very end of the journal + if (jnl->jhdr->start == jnl->jhdr->size) { + jnl->jhdr->start = jnl->jhdr->jhdr_size; + } + if (jnl->jhdr->end == jnl->jhdr->size) { + jnl->jhdr->end = jnl->jhdr->jhdr_size; + } + + if (jnl->jhdr->start == jnl->jhdr->end) { + return 0; + } + + // allocate memory for the header_block. we'll read each blhdr into this + if (kmem_alloc(kernel_map, (vm_offset_t *)&buf, jnl->jhdr->blhdr_size)) { + printf("jnl: replay_journal: no memory for block buffer! (%d bytes)\n", + jnl->jhdr->blhdr_size); + return -1; + } + + + printf("jnl: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n", + jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset); + + while(jnl->jhdr->start != jnl->jhdr->end) { + offset = jnl->jhdr->start; + ret = read_journal_data(jnl, &offset, buf, jnl->jhdr->blhdr_size); + if (ret != jnl->jhdr->blhdr_size) { + printf("jnl: replay_journal: Could not read block list header block @ 0x%llx!\n", offset); + goto bad_replay; + } + + blhdr = (block_list_header *)buf; + checksum = blhdr->checksum; + blhdr->checksum = 0; + if (checksum != calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE)) { + printf("jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n", + offset, checksum, calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE)); + goto bad_replay; + } + if ( blhdr->max_blocks <= 0 || blhdr->max_blocks > 2048 + || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) { + printf("jnl: replay_journal: bad looking journal entry: max: %d num: %d\n", + blhdr->max_blocks, blhdr->num_blocks); + goto bad_replay; + } + + for(i=1,max_bsize=0; i < blhdr->num_blocks; i++) { + if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) { + printf("jnl: replay_journal: bogus block number 0x%llx\n", blhdr->binfo[i].bnum); + goto bad_replay; + } + if (blhdr->binfo[i].bsize > max_bsize) { + max_bsize = blhdr->binfo[i].bsize; + } + } + + // make sure it's at least one page in size. 
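/*
 * Both write_journal_header() and the replay path above rely on the same
 * idiom: the checksum field lives inside the checksummed region, so it is
 * saved, zeroed, and only then recomputed.  A stand-alone demonstration with
 * calc_checksum() transcribed from this file; demo_hdr is a hypothetical
 * 32-byte record, mirroring BLHDR_CHECKSUM_SIZE.
 */
#include <assert.h>
#include <string.h>

static int
calc_checksum(char *ptr, int len)
{
	int i, cksum = 0;

	for (i = 0; i < len; i++, ptr++)
		cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr);
	return (~cksum);
}

struct demo_hdr {
	int	checksum;
	char	payload[28];		/* 32 bytes total */
};

int
main(void)
{
	struct demo_hdr h;
	int saved;

	memset(&h, 0, sizeof(h));
	memcpy(h.payload, "journal", 7);

	/* writer: zero the field, then checksum the whole record */
	h.checksum = 0;
	h.checksum = calc_checksum((char *)&h, sizeof(h));

	/* reader: save, zero, recompute, compare -- as replay_journal() does */
	saved = h.checksum;
	h.checksum = 0;
	assert(saved == calc_checksum((char *)&h, sizeof(h)));
	return 0;
}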
+ if (max_bsize & (PAGE_SIZE - 1)) { + max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1); + } + + if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) { + goto bad_replay; + } + + //printf("jnl: replay_journal: %d blocks in journal entry @ 0x%llx\n", blhdr->num_blocks-1, + // jnl->jhdr->start); + for(i=1; i < blhdr->num_blocks; i++) { + int size; + + size = blhdr->binfo[i].bsize; + + ret = read_journal_data(jnl, &offset, block_ptr, size); + if (ret != size) { + printf("jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", offset); + goto bad_replay; + } + + // don't replay "killed" blocks + if (blhdr->binfo[i].bnum == (off_t)-1) { + // printf("jnl: replay_journal: skipping killed fs block (slot %d)\n", i); + } else { + //printf("jnl: replay_journal: fixing fs block # %lld (%d)\n", + // blhdr->binfo[i].bnum, blhdr->binfo[i].bsize); + + if (update_fs_block(jnl, block_ptr, blhdr->binfo[i].bnum, blhdr->binfo[i].bsize) != 0) { + goto bad_replay; + } + } + + // check if we need to wrap offset back to the beginning + // (which is just past the journal header) + // + if (offset >= jnl->jhdr->size) { + offset = jnl->jhdr->jhdr_size; + } + } + + kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); + block_ptr = NULL; + + jnl->jhdr->start += blhdr->bytes_used; + if (jnl->jhdr->start >= jnl->jhdr->size) { + // wrap around and skip the journal header block + jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size; + } + + // only update the on-disk journal header if we've reached the + // last chunk of updates from this transaction. if binfo[0].bnum + // is zero then we know we're at the end. + if (blhdr->binfo[0].bnum == 0) { + if (write_journal_header(jnl) != 0) { + goto bad_replay; + } + } + } + + kmem_free(kernel_map, (vm_offset_t)buf, jnl->jhdr->blhdr_size); + return 0; + + bad_replay: + if (block_ptr) { + kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); + } + kmem_free(kernel_map, (vm_offset_t)buf, jnl->jhdr->blhdr_size); + return -1; +} + + +#define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024) +//#define DEFAULT_TRANSACTION_BUFFER_SIZE (256*1024) // better performance but uses more mem +#define MAX_TRANSACTION_BUFFER_SIZE (512*1024) + +// XXXdbg - so I can change it in the debugger +int def_tbuffer_size = 0; + + +// +// This function sets the size of the tbuffer and the +// size of the blhdr. It assumes that jnl->jhdr->size +// and jnl->jhdr->jhdr_size are already valid. +// +static void +size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz) +{ + // + // one-time initialization based on how much memory + // there is in the machine. + // + if (def_tbuffer_size == 0) { + if (mem_size < (256*1024*1024)) { + def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE; + } else if (mem_size < (512*1024*1024)) { + def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2; + } else if (mem_size < (1024*1024*1024)) { + def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3; + } else if (mem_size >= (1024*1024*1024)) { + def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 4; + } + } + + // size up the transaction buffer... can't be larger than the number + // of blocks that can fit in a block_list_header block. 
+ if (tbuffer_size == 0) { + jnl->tbuffer_size = def_tbuffer_size; + } else { + // make sure that the specified tbuffer_size isn't too small + if (tbuffer_size < jnl->jhdr->blhdr_size * 2) { + tbuffer_size = jnl->jhdr->blhdr_size * 2; + } + // and make sure it's an even multiple of the block size + if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) { + tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size); + } + + jnl->tbuffer_size = tbuffer_size; + } + + if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) { + jnl->tbuffer_size = (jnl->jhdr->size / 2); + } + + if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) { + jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE; + } + + jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info); + if (jnl->jhdr->blhdr_size < phys_blksz) { + jnl->jhdr->blhdr_size = phys_blksz; + } +} + + + +journal * +journal_create(struct vnode *jvp, + off_t offset, + off_t journal_size, + struct vnode *fsvp, + size_t min_fs_blksz, + int32_t flags, + int32_t tbuffer_size, + void (*flush)(void *arg), + void *arg) +{ + journal *jnl; + int ret, phys_blksz; + + /* Get the real physical block size. */ + if (VOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, FSCRED, NULL)) { + return NULL; + } + + if (phys_blksz > min_fs_blksz) { + printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n", + phys_blksz, min_fs_blksz); + return NULL; + } + + if ((journal_size % phys_blksz) != 0) { + printf("jnl: create: journal size 0x%llx is not an even multiple of block size 0x%x\n", + journal_size, phys_blksz); + return NULL; + } + + if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl, sizeof(struct journal))) { + return NULL; + } + memset(jnl, 0, sizeof(*jnl)); + + jnl->jdev = jvp; + jnl->jdev_offset = offset; + jnl->fsdev = fsvp; + jnl->flush = flush; + jnl->flush_arg = arg; + jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); + simple_lock_init(&jnl->old_start_lock); + + if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { + printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz); + goto bad_kmem_alloc; + } + + memset(jnl->header_buf, 0, phys_blksz); + + jnl->jhdr = (journal_header *)jnl->header_buf; + jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; + jnl->jhdr->endian = ENDIAN_MAGIC; + jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself + jnl->jhdr->end = phys_blksz; + jnl->jhdr->size = journal_size; + jnl->jhdr->jhdr_size = phys_blksz; + size_up_tbuffer(jnl, tbuffer_size, phys_blksz); + + jnl->active_start = jnl->jhdr->start; + + // XXXdbg - for testing you can force the journal to wrap around + // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3); + // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3); + + if (semaphore_create(kernel_task, &jnl->jsem, SYNC_POLICY_FIFO, 1) != 0) { + printf("jnl: journal_create: failed to create journal semaphore..\n"); + goto bad_sem; + } + + if (write_journal_header(jnl) != 0) { + printf("jnl: journal_create: failed to write journal header.\n"); + goto bad_write; + } + + return jnl; + + + bad_write: + semaphore_destroy(kernel_task, jnl->jsem); + bad_sem: + kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); + bad_kmem_alloc: + jnl->jhdr = NULL; + kmem_free(kernel_map, (vm_offset_t)jnl, sizeof(struct journal)); + return NULL; +} + + +journal * +journal_open(struct vnode *jvp, + off_t offset, + off_t journal_size, + struct vnode *fsvp, + size_t min_fs_blksz, + int32_t flags, + int32_t tbuffer_size, + void 
(*flush)(void *arg), + void *arg) +{ + journal *jnl; + int orig_blksz=0, phys_blksz, blhdr_size; + off_t hdr_offset=0; + + /* Get the real physical block size. */ + if (VOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, FSCRED, NULL)) { + return NULL; + } + + if (phys_blksz > min_fs_blksz) { + printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n", + phys_blksz, min_fs_blksz); + return NULL; + } + + if ((journal_size % phys_blksz) != 0) { + printf("jnl: open: journal size 0x%llx is not an even multiple of block size 0x%x\n", + journal_size, phys_blksz); + return NULL; + } + + if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl, sizeof(struct journal))) { + return NULL; + } + memset(jnl, 0, sizeof(*jnl)); + + jnl->jdev = jvp; + jnl->jdev_offset = offset; + jnl->fsdev = fsvp; + jnl->flush = flush; + jnl->flush_arg = arg; + jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); + simple_lock_init(&jnl->old_start_lock); + + if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { + printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz); + goto bad_kmem_alloc; + } + + jnl->jhdr = (journal_header *)jnl->header_buf; + memset(jnl->jhdr, 0, sizeof(journal_header)+4); + + // we have to set this up here so that do_journal_io() will work + jnl->jhdr->jhdr_size = phys_blksz; + + if (read_journal_data(jnl, &hdr_offset, jnl->jhdr, phys_blksz) != phys_blksz) { + printf("jnl: open: could not read %d bytes for the journal header.\n", + phys_blksz); + goto bad_journal; + } + + if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) { + printf("jnl: open: journal magic is bad (0x%x != 0x%x)\n", + jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); + goto bad_journal; + } + + // only check if we're the current journal header magic value + if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) { + int orig_checksum = jnl->jhdr->checksum; + + jnl->jhdr->checksum = 0; + if (orig_checksum != calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header))) { + printf("jnl: open: journal checksum is bad (0x%x != 0x%x)\n", orig_checksum, + calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header))); + //goto bad_journal; + } + } + + // XXXdbg - convert old style magic numbers to the new one + if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) { + jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; + } + + if (phys_blksz != jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) { + printf("jnl: open: phys_blksz %d does not match journal header size %d\n", + phys_blksz, jnl->jhdr->jhdr_size); + + orig_blksz = phys_blksz; + phys_blksz = jnl->jhdr->jhdr_size; + if (VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, FSCRED, NULL)) { + printf("jnl: could not set block size to %d bytes.\n", phys_blksz); + goto bad_journal; + } +// goto bad_journal; + } + + if ( jnl->jhdr->start <= 0 + || jnl->jhdr->start > jnl->jhdr->size + || jnl->jhdr->start > 128*1024*1024) { + printf("jnl: open: jhdr start looks bad (0x%llx max size 0x%llx)\n", + jnl->jhdr->start, jnl->jhdr->size); + goto bad_journal; + } + + if ( jnl->jhdr->end <= 0 + || jnl->jhdr->end > jnl->jhdr->size + || jnl->jhdr->end > 128*1024*1024) { + printf("jnl: open: jhdr end looks bad (0x%llx max size 0x%llx)\n", + jnl->jhdr->end, jnl->jhdr->size); + goto bad_journal; + } + + if (jnl->jhdr->size > 128*1024*1024) { + printf("jnl: open: jhdr size looks bad (0x%llx)\n", jnl->jhdr->size); + goto bad_journal; + } + +// XXXdbg - can't do these checks because hfs writes all kinds of +// 
non-uniform sized blocks even on devices that have a block size +// that is larger than 512 bytes (i.e. optical media w/2k blocks). +// therefore these checks will fail and so we just have to punt and +// do more relaxed checking... +// XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) { + if ((jnl->jhdr->start % 512) != 0) { + printf("jnl: open: journal start (0x%llx) not a multiple of 512?\n", + jnl->jhdr->start); + goto bad_journal; + } + +//XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) { + if ((jnl->jhdr->end % 512) != 0) { + printf("jnl: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n", + jnl->jhdr->end, jnl->jhdr->jhdr_size); + goto bad_journal; + } + + // take care of replaying the journal if necessary + if (flags & JOURNAL_RESET) { + printf("jnl: journal start/end pointers reset! (jnl 0x%x; s 0x%llx e 0x%llx)\n", + jnl, jnl->jhdr->start, jnl->jhdr->end); + jnl->jhdr->start = jnl->jhdr->end; + } else if (replay_journal(jnl) != 0) { + printf("jnl: journal_open: Error replaying the journal!\n"); + goto bad_journal; + } + + if (orig_blksz != 0) { + VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, FSCRED, NULL); + phys_blksz = orig_blksz; + } + + // make sure this is in sync! + jnl->active_start = jnl->jhdr->start; + + // set this now, after we've replayed the journal + size_up_tbuffer(jnl, tbuffer_size, phys_blksz); + + if (semaphore_create(kernel_task, &jnl->jsem, SYNC_POLICY_FIFO, 1) != 0) { + printf("jnl: journal_create: failed to create journal semaphore..\n"); + goto bad_journal; + } + + return jnl; + + bad_journal: + if (orig_blksz != 0) { + phys_blksz = orig_blksz; + VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, FSCRED, NULL); + } + kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); + bad_kmem_alloc: + kmem_free(kernel_map, (vm_offset_t)jnl, sizeof(struct journal)); + return NULL; +} + +void +journal_close(journal *jnl) +{ + volatile off_t *start, *end; + int counter=0; + + CHECK_JOURNAL(jnl); + + // set this before doing anything that would block so that + // we start tearing things down properly. + // + jnl->flags |= JOURNAL_CLOSE_PENDING; + + if (jnl->owner != current_act()) { + int ret; + + while ((ret = semaphore_wait(jnl->jsem)) == KERN_ABORTED) { + // just keep trying if we've been ^C'ed + } + if (ret != 0) { + printf("jnl: close: sem wait failed.\n"); + return; + } + } + + // + // only write stuff to disk if the journal is still valid + // + if ((jnl->flags & JOURNAL_INVALID) == 0) { + + if (jnl->active_tr) { + journal_end_transaction(jnl); + } + + // flush any buffered transactions + if (jnl->cur_tr) { + transaction *tr = jnl->cur_tr; + + jnl->cur_tr = NULL; + end_transaction(tr, 1); // force it to get flushed + } + + //start = &jnl->jhdr->start; + start = &jnl->active_start; + end = &jnl->jhdr->end; + + while (*start != *end && counter++ < 500) { + printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end); + if (jnl->flush) { + jnl->flush(jnl->flush_arg); + } + + } + + if (*start != *end) { + printf("jnl: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n", + *start, *end); + } + + // make sure this is in sync when we close the journal + jnl->jhdr->start = jnl->active_start; + + // if this fails there's not much we can do at this point... + write_journal_header(jnl); + } else { + // if we're here the journal isn't valid any more. 
+ // so make sure we don't leave any locked blocks lying around + printf("jnl: close: journal 0x%x, is invalid. aborting outstanding transactions\n", jnl); + if (jnl->active_tr || jnl->cur_tr) { + transaction *tr; + if (jnl->active_tr) { + tr = jnl->active_tr; + jnl->active_tr = NULL; + } else { + tr = jnl->cur_tr; + jnl->cur_tr = NULL; + } + + abort_transaction(jnl, tr); + if (jnl->active_tr || jnl->cur_tr) { + panic("jnl: close: jnl @ 0x%x had both an active and cur tr\n", jnl); + } + } + } + + free_old_stuff(jnl); + + kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->jhdr->jhdr_size); + jnl->jhdr = (void *)0xbeefbabe; + + semaphore_destroy(kernel_task, jnl->jsem); + kmem_free(kernel_map, (vm_offset_t)jnl, sizeof(struct journal)); +} + +static void +dump_journal(journal *jnl) +{ + transaction *ctr; + + printf("journal:"); + printf(" jdev_offset %.8llx\n", jnl->jdev_offset); + printf(" magic: 0x%.8x\n", jnl->jhdr->magic); + printf(" start: 0x%.8llx\n", jnl->jhdr->start); + printf(" end: 0x%.8llx\n", jnl->jhdr->end); + printf(" size: 0x%.8llx\n", jnl->jhdr->size); + printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size); + printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size); + printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum); + + printf(" completed transactions:\n"); + for(ctr=jnl->completed_trs; ctr; ctr=ctr->next) { + printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end); + } +} + + + +static off_t +free_space(journal *jnl) +{ + off_t free_space; + + if (jnl->jhdr->start < jnl->jhdr->end) { + free_space = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size; + } else if (jnl->jhdr->start > jnl->jhdr->end) { + free_space = jnl->jhdr->start - jnl->jhdr->end; + } else { + // journal is completely empty + free_space = jnl->jhdr->size - jnl->jhdr->jhdr_size; + } + + return free_space; +} + + +// +// The journal must be locked on entry to this function. +// The "desired_size" is in bytes. +// +static int +check_free_space(journal *jnl, int desired_size) +{ + int i, counter=0; + + //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n", +// desired_size, free_space(jnl)); + + while (1) { + if (counter++ == 5000) { + dump_journal(jnl); + panic("jnl: check_free_space: buffer flushing isn't working " + "(jnl @ 0x%x s %lld e %lld f %lld [active start %lld]).\n", jnl, + jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start); + } + if (counter > 7500) { + printf("jnl: check_free_space: giving up waiting for free space.\n"); + return ENOSPC; + } + + // make sure there's space in the journal to hold this transaction + if (free_space(jnl) > desired_size) { + break; + } + + // + // here's where we lazily bump up jnl->jhdr->start. we'll consume + // entries until there is enough space for the next transaction. 
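/*
 * The circular-log arithmetic of free_space() above, as a pure function that
 * can be checked by inspection: when start < end the live region sits in the
 * middle and the header block is never usable; when start > end the live
 * region wraps and only the gap between end and start is free.  ring_free()
 * and the sizes in main() are hypothetical; the formulas are the patch's.
 */
#include <assert.h>
#include <sys/types.h>

static off_t
ring_free(off_t start, off_t end, off_t size, off_t jhdr_size)
{
	if (start < end)
		return size - (end - start) - jhdr_size;	/* live data in the middle */
	else if (start > end)
		return start - end;				/* live data wraps */
	else
		return size - jhdr_size;			/* journal is empty */
}

int
main(void)
{
	/* a 1 MiB journal with a 512-byte header block */
	assert(ring_free(512, 512, 1 << 20, 512) == (1 << 20) - 512);
	assert(ring_free(512, 4096, 1 << 20, 512) == (1 << 20) - 3584 - 512);
	assert(ring_free(8192, 4096, 1 << 20, 512) == 4096);
	return 0;
}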
+ // + simple_lock(&jnl->old_start_lock); + for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { + int counter; + + counter = 0; + while (jnl->old_start[i] & 0x8000000000000000LL) { + if (counter++ > 100) { + panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl 0x%x).\n", + jnl->old_start[i], jnl); + } + + simple_unlock(&jnl->old_start_lock); + if (jnl->flush) { + jnl->flush(jnl->flush_arg); + } + tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1); + simple_lock(&jnl->old_start_lock); + } + + if (jnl->old_start[i] == 0) { + continue; + } + + jnl->jhdr->start = jnl->old_start[i]; + jnl->old_start[i] = 0; + if (free_space(jnl) > desired_size) { + write_journal_header(jnl); + break; + } + } + simple_unlock(&jnl->old_start_lock); + + // if we bumped the start, loop and try again + if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) { + continue; + } + + + // if the file system gave us a flush function, call it to so that + // it can flush some blocks which hopefully will cause some transactions + // to complete and thus free up space in the journal. + if (jnl->flush) { + jnl->flush(jnl->flush_arg); + } + + // wait for a while to avoid being cpu-bound (this will + // put us to sleep for 10 milliseconds) + tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1); + } + + return 0; +} + +int +journal_start_transaction(journal *jnl) +{ + int ret; + transaction *tr; + + CHECK_JOURNAL(jnl); + + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + + if (jnl->owner == current_act()) { + if (jnl->active_tr == NULL) { + panic("jnl: start_tr: active_tr is NULL (jnl @ 0x%x, owner 0x%x, current_act 0x%x\n", + jnl, jnl->owner, current_act()); + } + jnl->nested_count++; + return 0; + } + + while ((ret = semaphore_wait(jnl->jsem)) == KERN_ABORTED) { + // just keep looping if we've been ^C'ed + } + if (ret != 0) { + printf("jnl: start_tr: sem wait failed.\n"); + return EINVAL; + } + + if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) { + panic("jnl: start_tr: owner 0x%x, nested count 0x%x, active_tr 0x%x jnl @ 0x%x\n", + jnl->owner, jnl->nested_count, jnl->active_tr, jnl); + } + + jnl->owner = current_act(); + jnl->nested_count = 1; + + free_old_stuff(jnl); + + // make sure there's room in the journal + if (check_free_space(jnl, jnl->tbuffer_size) != 0) { + printf("jnl: start transaction failed: no space\n"); + ret = ENOSPC; + goto bad_start; + } + + // if there's a buffered transaction, use it. + if (jnl->cur_tr) { + jnl->active_tr = jnl->cur_tr; + jnl->cur_tr = NULL; + + return 0; + } + + if (kmem_alloc(kernel_map, (vm_offset_t *)&tr, sizeof(transaction))) { + printf("jnl: start transaction failed: no mem\n"); + ret = ENOMEM; + goto bad_start; + } + memset(tr, 0, sizeof(transaction)); + + tr->tbuffer_size = jnl->tbuffer_size; + if (kmem_alloc(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) { + kmem_free(kernel_map, (vm_offset_t)tr, sizeof(transaction)); + printf("jnl: start transaction failed: no tbuffer mem\n"); + ret = ENOMEM; + goto bad_start; + } + + // journal replay code checksum check depends on this. 
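/*
 * journal_start_transaction() above is re-entrant per thread: when the
 * caller already owns the journal it only bumps nested_count, and the
 * semaphore is taken just once.  This skeleton restates that ownership
 * protocol with pthreads standing in for jnl->jsem and current_act(); the
 * demo_end_transaction() side is an assumption about the matching
 * journal_end_transaction(), which appears later in the file.
 */
#include <pthread.h>

struct demo_jnl {
	pthread_mutex_t	sem;		/* plays the role of jnl->jsem */
	pthread_t	owner;
	int		owned;
	int		nested_count;
};

void
demo_start_transaction(struct demo_jnl *jnl)
{
	/* the unlocked owner check is safe only for the owning thread itself,
	 * the same pattern the kernel uses with jnl->owner == current_act() */
	if (jnl->owned && pthread_equal(jnl->owner, pthread_self())) {
		jnl->nested_count++;	/* nested: reuse the open transaction */
		return;
	}
	pthread_mutex_lock(&jnl->sem);
	jnl->owner = pthread_self();
	jnl->owned = 1;
	jnl->nested_count = 1;
	/* ... check_free_space() and tbuffer setup happen here ... */
}

void
demo_end_transaction(struct demo_jnl *jnl)
{
	if (--jnl->nested_count > 0)
		return;			/* inner end: nothing to commit yet */
	/* ... outermost end: the transaction is written out here ... */
	jnl->owned = 0;
	pthread_mutex_unlock(&jnl->sem);
}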
+ memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE); + + tr->blhdr = (block_list_header *)tr->tbuffer; + tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1; + tr->blhdr->num_blocks = 1; // accounts for this header block + tr->blhdr->bytes_used = jnl->jhdr->blhdr_size; + + tr->num_blhdrs = 1; + tr->total_bytes = jnl->jhdr->blhdr_size; + tr->jnl = jnl; + + jnl->active_tr = tr; + + // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, tr); + + return 0; + + bad_start: + jnl->owner = NULL; + jnl->nested_count = 0; + semaphore_signal(jnl->jsem); + return ret; +} + + +int +journal_modify_block_start(journal *jnl, struct buf *bp) +{ + transaction *tr; + + CHECK_JOURNAL(jnl); + + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + + // XXXdbg - for debugging I want this to be true. later it may + // not be necessary. + if ((bp->b_flags & B_META) == 0) { + panic("jnl: modify_block_start: bp @ 0x%x is not a meta-data block! (jnl 0x%x)\n", bp, jnl); + } + + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + + if (jnl->owner != current_act()) { + panic("jnl: modify_block_start: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n", + jnl, jnl->owner, current_act()); + } + + free_old_stuff(jnl); + + //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d; total bytes %d)\n", + // bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes); + + // can't allow blocks that aren't an even multiple of the + // underlying block size. + if ((bp->b_bufsize % jnl->jhdr->jhdr_size) != 0) { + panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n", + bp->b_bufsize, jnl->jhdr->jhdr_size); + return -1; + } + + // make sure that this transaction isn't bigger than the whole journal + if (tr->total_bytes+bp->b_bufsize >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) { + panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr 0x%x bp 0x%x)\n", + tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), bp->b_bufsize, tr, bp); + return -1; + } + + // if the block is dirty and not already locked we have to write + // it out before we muck with it because it has data that belongs + // (presumably) to another transaction. + // + if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) { + + // this will cause it to not be brelse()'d + bp->b_flags |= B_NORELSE; + VOP_BWRITE(bp); + } + + bp->b_flags |= B_LOCKED; + + return 0; +} + +int +journal_modify_block_abort(journal *jnl, struct buf *bp) +{ + transaction *tr; + block_list_header *blhdr; + int i, j; + + CHECK_JOURNAL(jnl); + + tr = jnl->active_tr; + + // + // if there's no active transaction then we just want to + // call brelse() and return since this is just a block + // that happened to be modified as part of another tr. + // + if (tr == NULL) { + brelse(bp); + return 0; + } + + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + + CHECK_TRANSACTION(tr); + + if (jnl->owner != current_act()) { + panic("jnl: modify_block_abort: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n", + jnl, jnl->owner, current_act()); + } + + free_old_stuff(jnl); + + // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp); + + // first check if it's already part of this transaction + for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { + for(i=1; i < blhdr->num_blocks; i++) { + if (bp == blhdr->binfo[i].bp) { + if (bp->b_bufsize != blhdr->binfo[i].bsize) { + panic("jnl: bp @ 0x%x changed size on me! (%d vs. 
%d, jnl 0x%x)\n", + bp, bp->b_bufsize, blhdr->binfo[i].bsize, jnl); + } + break; + } + } + + if (i < blhdr->num_blocks) { + break; + } + } + + // + // if blhdr is null, then this block has only had modify_block_start + // called on it as part of the current transaction. that means that + // it is ok to clear the LOCKED bit since it hasn't actually been + // modified. if blhdr is non-null then modify_block_end was called + // on it and so we need to keep it locked in memory. + // + if (blhdr == NULL) { + bp->b_flags &= ~(B_LOCKED); + } + + brelse(bp); + return 0; +} + + +int +journal_modify_block_end(journal *jnl, struct buf *bp) +{ + int i, j, tbuffer_offset; + char *blkptr; + block_list_header *blhdr, *prev=NULL; + transaction *tr; + + CHECK_JOURNAL(jnl); + + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + + if (jnl->owner != current_act()) { + panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n", + jnl, jnl->owner, current_act()); + } + + free_old_stuff(jnl); + + //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d, total bytes %d)\n", + // bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes); + + if ((bp->b_flags & B_LOCKED) == 0) { + panic("jnl: modify_block_end: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl); + bp->b_flags |= B_LOCKED; + } + + // first check if it's already part of this transaction + for(blhdr=tr->blhdr; blhdr; prev=blhdr,blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { + tbuffer_offset = jnl->jhdr->blhdr_size; + + for(i=1; i < blhdr->num_blocks; i++) { + if (bp == blhdr->binfo[i].bp) { + if (bp->b_bufsize != blhdr->binfo[i].bsize) { + panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n", + bp, bp->b_bufsize, blhdr->binfo[i].bsize, jnl); + } + break; + } + tbuffer_offset += blhdr->binfo[i].bsize; + } + + if (i < blhdr->num_blocks) { + break; + } + } + + if (blhdr == NULL + && prev + && (prev->num_blocks+1) <= prev->max_blocks + && (prev->bytes_used+bp->b_bufsize) <= tr->tbuffer_size) { + blhdr = prev; + } else if (blhdr == NULL) { + block_list_header *nblhdr; + + if (prev == NULL) { + panic("jnl: modify block end: no way man, prev == NULL?!?, jnl 0x%x, bp 0x%x\n", jnl, bp); + } + + // we got to the end of the list, didn't find the block and there's + // no room in the block_list_header pointed to by prev + + // we allocate another tbuffer and link it in at the end of the list + // through prev->binfo[0].bnum. that's a skanky way to do things but + // avoids having yet another linked list of small data structures to manage. + + if (kmem_alloc(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) { + panic("jnl: end_tr: no space for new block tr @ 0x%x (total bytes: %d)!\n", + tr, tr->total_bytes); + } + + // journal replay code checksum check depends on this. 
+	memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
+
+	// initialize the new guy
+	nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
+	nblhdr->num_blocks = 1;      // accounts for this header block
+	nblhdr->bytes_used = jnl->jhdr->blhdr_size;
+
+	tr->num_blhdrs++;
+	tr->total_bytes += jnl->jhdr->blhdr_size;
+
+	// then link him in at the end
+	prev->binfo[0].bnum = (off_t)((long)nblhdr);
+
+	// and finally switch to using the new guy
+	blhdr          = nblhdr;
+	tbuffer_offset = jnl->jhdr->blhdr_size;
+	i              = 1;
+    }
+
+
+    if ((i+1) > blhdr->max_blocks) {
+	panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
+    }
+
+    // copy the data into the in-memory transaction buffer
+    blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
+    memcpy(blkptr, bp->b_data, bp->b_bufsize);
+
+    // if this is true then this is a new block we haven't seen
+    if (i >= blhdr->num_blocks) {
+	vget(bp->b_vp, 0, current_proc());
+
+	blhdr->binfo[i].bnum  = bp->b_blkno;
+	blhdr->binfo[i].bsize = bp->b_bufsize;
+	blhdr->binfo[i].bp    = bp;
+
+	blhdr->bytes_used += bp->b_bufsize;
+	tr->total_bytes   += bp->b_bufsize;
+
+	blhdr->num_blocks++;
+    }
+
+    bdwrite(bp);
+
+    return 0;
+}
+
+int
+journal_kill_block(journal *jnl, struct buf *bp)
+{
+    int i;
+    block_list_header *blhdr;
+    transaction *tr;
+
+    CHECK_JOURNAL(jnl);
+
+    if (jnl->flags & JOURNAL_INVALID) {
+	return EINVAL;
+    }
+
+    tr = jnl->active_tr;
+    CHECK_TRANSACTION(tr);
+
+    if (jnl->owner != current_act()) {
+	panic("jnl: kill_block: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+	      jnl, jnl->owner, current_act());
+    }
+
+    free_old_stuff(jnl);
+
+    if ((bp->b_flags & B_LOCKED) == 0) {
+	panic("jnl: kill block: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl);
+    }
+
+    // first check if it's already part of this transaction
+    for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
+
+	for(i=1; i < blhdr->num_blocks; i++) {
+	    if (bp == blhdr->binfo[i].bp) {
+		bp->b_flags &= ~B_LOCKED;
+
+		// this undoes the vget() in journal_modify_block_end()
+		vrele(bp->b_vp);
+
+		// if the block has the DELWRI and CALL bits set, then
+		// things are seriously weird.  if it was part of another
+		// transaction then journal_modify_block_start() should
+		// have forced it to be written.
+		//
+		if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_CALL)) {
+		    panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
+		} else {
+		    tr->num_killed += bp->b_bufsize;
+		}
+
+		if (bp->b_flags & B_BUSY) {
+		    brelse(bp);
+		}
+
+		blhdr->binfo[i].bp   = NULL;
+		blhdr->binfo[i].bnum = (off_t)-1;
+		break;
+	    }
+	}
+
+	if (i < blhdr->num_blocks) {
+	    break;
+	}
+    }
+
+    return 0;
+}
+
+
+static int
+journal_binfo_cmp(void *a, void *b)
+{
+    block_info *bi_a = (struct block_info *)a,
+	       *bi_b = (struct block_info *)b;
+    daddr_t res;
+
+    if (bi_a->bp == NULL) {
+	return 1;
+    }
+    if (bi_b->bp == NULL) {
+	return -1;
+    }
+
+    // don't have to worry about negative block
+    // numbers so this is ok to do.
+    //
+    res = (bi_a->bp->b_blkno - bi_b->bp->b_blkno);
+
+    return (int)res;
+}
+
+
+static int
+end_transaction(transaction *tr, int force_it)
+{
+    int    i, j, ret, amt;
+    off_t  end;
+    journal *jnl = tr->jnl;
+    struct buf *bp;
+    block_list_header  *blhdr=NULL, *next=NULL;
+
+    if (jnl->cur_tr) {
+	panic("jnl: jnl @ 0x%x already has cur_tr 0x%x, new tr: 0x%x\n",
+	      jnl, jnl->cur_tr, tr);
+    }
+
+    // if there weren't any modified blocks in the transaction
+    // just save off the transaction pointer and return.
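+    // (a transaction that logged nothing still holds exactly one
+    // block_list_header, so total_bytes == blhdr_size means empty)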
+    if (tr->total_bytes == jnl->jhdr->blhdr_size) {
+	jnl->cur_tr = tr;
+	return 0;
+    }
+
+    // if our transaction buffer isn't very full, just hang
+    // on to it and don't actually flush anything.  this is
+    // what is known as "group commit".  we will flush the
+    // transaction buffer if it's full or if we have more than
+    // one of them so we don't start hogging too much memory.
+    //
+    if (   force_it == 0
+	&& (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
+	&& tr->num_blhdrs < 3
+	&& (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))) {
+
+	jnl->cur_tr = tr;
+	return 0;
+    }
+
+
+    // if we're here we're going to flush the transaction buffer to disk.
+    // make sure there is room in the journal first.
+    check_free_space(jnl, tr->total_bytes);
+
+    // range check the end index
+    if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
+	panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
+	      jnl->jhdr->end, jnl->jhdr->size);
+    }
+
+    // this transaction starts where the current journal ends
+    tr->journal_start = jnl->jhdr->end;
+    end               = jnl->jhdr->end;
+
+    //
+    // if the first entry in old_start[] isn't free yet, loop calling the
+    // file system flush routine until it is (or we panic).
+    //
+    i = 0;
+    simple_lock(&jnl->old_start_lock);
+    while ((jnl->old_start[0] & 0x8000000000000000LL) != 0) {
+	if (jnl->flush) {
+	    simple_unlock(&jnl->old_start_lock);
+
+	    if (jnl->flush) {
+		jnl->flush(jnl->flush_arg);
+	    }
+
+	    // yield the cpu so others can get in to clear the lock bit
+	    (void)tsleep((void *)jnl, PRIBIO, "jnl-old-start-sleep", 1);
+
+	    simple_lock(&jnl->old_start_lock);
+	}
+	if (i++ >= 100) {
+	    panic("jnl: transaction that started at 0x%llx is not completing! jnl 0x%x\n",
+		  jnl->old_start[0] & (~0x8000000000000000LL), jnl);
+	}
+    }
+
+    //
+    // slide everyone else down and put our latest guy in the last
+    // entry in the old_start array
+    //
+    memcpy(&jnl->old_start[0], &jnl->old_start[1], sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
+    jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;
+
+    simple_unlock(&jnl->old_start_lock);
+
+
+    // for each block, make sure that the physical block # is set
+    for(blhdr=tr->blhdr; blhdr; blhdr=next) {
+
+	for(i=1; i < blhdr->num_blocks; i++) {
+
+	    bp = blhdr->binfo[i].bp;
+	    if (bp == NULL) {   // only true if a block was "killed"
+		if (blhdr->binfo[i].bnum != (off_t)-1) {
+		    panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ 0x%x, tr 0x%x)\n",
+			  blhdr->binfo[i].bnum, jnl, tr);
+		}
+		continue;
+	    }
+
+	    if (bp->b_vp == NULL && bp->b_lblkno == bp->b_blkno) {
+		panic("jnl: end_tr: DANGER! bp @ 0x%x w/null vp and l/blkno = %d/%d\n",
+		      bp, bp->b_lblkno, bp->b_blkno);
+	    }
+
+	    // if the lblkno is the same as blkno and this bp isn't
+	    // associated with the underlying file system device then
+	    // we need to call bmap() to get the actual physical block.
+	    //
+	    if ((bp->b_lblkno == bp->b_blkno) && (bp->b_vp != jnl->fsdev)) {
+		if (VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL) != 0) {
+		    printf("jnl: end_tr: can't bmap the bp @ 0x%x, jnl 0x%x\n", bp, jnl);
+		    goto bad_journal;
+		}
+	    }
+
+	    // update this so we write out the correct physical block number!
+ blhdr->binfo[i].bnum = bp->b_blkno; + } + + next = (block_list_header *)((long)blhdr->binfo[0].bnum); + } + + for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { + + amt = blhdr->bytes_used; + + blhdr->checksum = 0; + blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE); + + ret = write_journal_data(jnl, &end, blhdr, amt); + if (ret != amt) { + printf("jnl: end_transaction: only wrote %d of %d bytes to the journal!\n", + ret, amt); + + goto bad_journal; + } + } + + jnl->jhdr->end = end; // update where the journal now ends + tr->journal_end = end; // the transaction ends here too + if (tr->journal_start == 0 || tr->journal_end == 0) { + panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n", + tr->journal_start, tr->journal_end); + } + + if (write_journal_header(jnl) != 0) { + goto bad_journal; + } + + // + // setup for looping through all the blhdr's. we null out the + // tbuffer and blhdr fields so that they're not used any more. + // + blhdr = tr->blhdr; + tr->tbuffer = NULL; + tr->blhdr = NULL; + + // the buffer_flushed_callback will only be called for the + // real blocks that get flushed so we have to account for + // the block_list_headers here. + // + tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size; + + // for each block, set the iodone callback and unlock it + for(; blhdr; blhdr=next) { + + // we can re-order the buf ptrs because everything is written out already + qsort(&blhdr->binfo[1], blhdr->num_blocks-1, sizeof(block_info), journal_binfo_cmp); + + for(i=1; i < blhdr->num_blocks; i++) { + if (blhdr->binfo[i].bp == NULL) { + continue; + } + + ret = meta_bread(blhdr->binfo[i].bp->b_vp, + (daddr_t)blhdr->binfo[i].bp->b_lblkno, + blhdr->binfo[i].bp->b_bufsize, + NOCRED, + &bp); + if (ret == 0 && bp != NULL) { + struct vnode *save_vp; + + if (bp != blhdr->binfo[i].bp) { + panic("jnl: end_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n", + bp, blhdr->binfo[i].bp, jnl); + } + + if ((bp->b_flags & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) { + if (jnl->flags & JOURNAL_CLOSE_PENDING) { + brelse(bp); + continue; + } else { + panic("jnl: end_tr: !!!DANGER!!! bp 0x%x flags (0x%x) not LOCKED & DELWRI\n", bp, bp->b_flags); + } + } + + if (bp->b_iodone != NULL) { + panic("jnl: bp @ 0x%x (blkno %d, vp 0x%x) has non-null iodone (0x%x) buffflushcb 0x%x\n", + bp, bp->b_blkno, bp->b_vp, bp->b_iodone, buffer_flushed_callback); + } + + save_vp = bp->b_vp; + + bp->b_iodone = buffer_flushed_callback; + bp->b_transaction = tr; + bp->b_flags |= B_CALL; + bp->b_flags &= ~(B_LOCKED); + + // kicking off the write here helps performance + bawrite(bp); + // XXXdbg this is good for testing: bdwrite(bp); + //bdwrite(bp); + + // this undoes the vget() in journal_modify_block_end() + vrele(save_vp); + + } else { + printf("jnl: end_transaction: could not find block %Ld vp 0x%x!\n", + blhdr->binfo[i].bnum, blhdr->binfo[i].bp); + } + } + + next = (block_list_header *)((long)blhdr->binfo[0].bnum); + + // we can free blhdr here since we won't need it any more + blhdr->binfo[0].bnum = 0xdeadc0de; + kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size); + } + + //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. 
exit!\n", + // tr, tr->journal_start, tr->journal_end); + return 0; + + + bad_journal: + jnl->flags |= JOURNAL_INVALID; + abort_transaction(jnl, tr); + return -1; +} + +static void +abort_transaction(journal *jnl, transaction *tr) +{ + int i, ret; + block_list_header *blhdr, *next; + struct buf *bp; + + // for each block list header, iterate over the blocks then + // free up the memory associated with the block list. + // + // for each block, clear the lock bit and release it. + // + for(blhdr=tr->blhdr; blhdr; blhdr=next) { + + for(i=1; i < blhdr->num_blocks; i++) { + if (blhdr->binfo[i].bp == NULL) { + continue; + } + + ret = meta_bread(blhdr->binfo[i].bp->b_vp, + (daddr_t)blhdr->binfo[i].bp->b_lblkno, + blhdr->binfo[i].bp->b_bufsize, + NOCRED, + &bp); + if (ret == 0 && bp != NULL) { + if (bp != blhdr->binfo[i].bp) { + panic("jnl: abort_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n", + bp, blhdr->binfo[i].bp, jnl); + } + + // clear the locked bit and the delayed-write bit. we + // don't want these blocks going to disk. + bp->b_flags &= ~(B_LOCKED|B_DELWRI); + bp->b_flags |= B_INVAL; + + brelse(bp); + + } else { + printf("jnl: abort_tr: could not find block %Ld vp 0x%x!\n", + blhdr->binfo[i].bnum, blhdr->binfo[i].bp); + } + } + + next = (block_list_header *)((long)blhdr->binfo[0].bnum); + + // we can free blhdr here since we won't need it any more + blhdr->binfo[0].bnum = 0xdeadc0de; + kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size); + } + + tr->tbuffer = NULL; + tr->blhdr = NULL; + tr->total_bytes = 0xdbadc0de; + kmem_free(kernel_map, (vm_offset_t)tr, sizeof(transaction)); +} + + +int +journal_end_transaction(journal *jnl) +{ + int ret; + transaction *tr; + + CHECK_JOURNAL(jnl); + + if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) { + return 0; + } + + if (jnl->owner != current_act()) { + panic("jnl: end_tr: I'm not the owner! jnl 0x%x, owner 0x%x, curact 0x%x\n", + jnl, jnl->owner, current_act()); + } + + free_old_stuff(jnl); + + jnl->nested_count--; + if (jnl->nested_count > 0) { + return 0; + } else if (jnl->nested_count < 0) { + panic("jnl: jnl @ 0x%x has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count); + } + + if (jnl->flags & JOURNAL_INVALID) { + if (jnl->active_tr) { + transaction *tr; + + if (jnl->cur_tr != NULL) { + panic("jnl: journal @ 0x%x has active tr (0x%x) and cur tr (0x%x)\n", + jnl, jnl->active_tr, jnl->cur_tr); + } + + tr = jnl->active_tr; + jnl->active_tr = NULL; + abort_transaction(jnl, tr); + } + + jnl->owner = NULL; + semaphore_signal(jnl->jsem); + + return EINVAL; + } + + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); + + // clear this out here so that when check_free_space() calls + // the FS flush function, we don't panic in journal_flush() + // if the FS were to call that. note: check_free_space() is + // called from end_transaction(). 
+    //
+    jnl->active_tr = NULL;
+    ret = end_transaction(tr, 0);
+
+    jnl->owner = NULL;
+    semaphore_signal(jnl->jsem);
+
+    return ret;
+}
+
+
+int
+journal_flush(journal *jnl)
+{
+    int need_signal = 0;
+
+    CHECK_JOURNAL(jnl);
+
+    if (jnl->flags & JOURNAL_INVALID) {
+	return -1;
+    }
+
+    if (jnl->owner != current_act()) {
+	int ret;
+
+	while ((ret = semaphore_wait(jnl->jsem)) == KERN_ABORTED) {
+	    // just keep looping if we've been ^C'ed
+	}
+	if (ret != 0) {
+	    printf("jnl: flush: sem wait failed.\n");
+	    return -1;
+	}
+	need_signal = 1;
+    }
+
+    free_old_stuff(jnl);
+
+    // if we're not active, flush any buffered transactions
+    if (jnl->active_tr == NULL && jnl->cur_tr) {
+	transaction *tr = jnl->cur_tr;
+
+	jnl->cur_tr = NULL;
+	end_transaction(tr, 1);   // force it to get flushed
+    }
+
+    if (need_signal) {
+	semaphore_signal(jnl->jsem);
+    }
+
+    return 0;
+}
+
+int
+journal_active(journal *jnl)
+{
+    if (jnl->flags & JOURNAL_INVALID) {
+	return -1;
+    }
+
+    return (jnl->active_tr == NULL) ? 0 : 1;
+}
diff --git a/bsd/vfs/vfs_journal.h b/bsd/vfs/vfs_journal.h
new file mode 100644
index 000000000..523ba7d52
--- /dev/null
+++ b/bsd/vfs/vfs_journal.h
@@ -0,0 +1,238 @@
+
+/*
+ * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License").  You may not use this file except in compliance with the
+ * License.  Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/*
+ * This header contains the structures and function prototypes
+ * for the vfs journaling code.  The data types are not meant
+ * to be modified by user code.  Just use the functions and do
+ * not mess around with the structs.
+ */
+#ifndef _SYS_VFS_JOURNAL_H_
+#define _SYS_VFS_JOURNAL_H_
+
+#include <sys/appleapiopts.h>
+
+#ifdef __APPLE_API_UNSTABLE
+
+#include <sys/types.h>
+
+typedef struct block_info {
+    off_t       bnum;               // block # on the file system device
+    size_t      bsize;              // in bytes
+    struct buf *bp;
+} block_info;
+
+typedef struct block_list_header {
+    u_int16_t   max_blocks;         // max number of blocks in this chunk
+    u_int16_t   num_blocks;         // number of valid entries in binfo
+    int32_t     bytes_used;         // how many bytes of this tbuffer are used
+    int32_t     checksum;           // on-disk: checksum of this header and binfo[0]
+    int32_t     pad;                // pad out to 16 bytes
+    block_info  binfo[1];           // so we can reference them by name
+} block_list_header;
+
+
+struct journal;
+
+typedef struct transaction {
+    int                 tbuffer_size;  // in bytes
+    char               *tbuffer;       // memory copy of the transaction
+    block_list_header  *blhdr;         // points to the first byte of tbuffer
+    int                 num_blhdrs;    // how many buffers we've allocated
+    int                 total_bytes;   // total # of bytes in transaction
+    int                 num_flushed;   // how many bytes have been flushed
+    int                 num_killed;    // how many bytes were "killed"
+    off_t               journal_start; // where in the journal this transaction starts
+    off_t               journal_end;   // where in the journal this transaction ends
+    struct journal     *jnl;           // ptr back to the journal structure
+    struct transaction *next;          // list of tr's (either completed or to be free'd)
+} transaction;
+
+
+/*
+ * This is written to block zero of the journal and it
+ * maintains overall state about the journal.
+ */
+typedef struct journal_header {
+    int32_t        magic;
+    int32_t        endian;
+    volatile off_t start;          // zero-based byte offset of the start of the first transaction
+    volatile off_t end;            // zero-based byte offset of where free space begins
+    off_t          size;           // size in bytes of the entire journal
+    int32_t        blhdr_size;     // size in bytes of each block_list_header in the journal
+    int32_t        checksum;
+    int32_t        jhdr_size;      // block size (in bytes) of the journal header
+} journal_header;
+
+#define JOURNAL_HEADER_MAGIC  0x4a4e4c78   // 'JNLx'
+#define ENDIAN_MAGIC          0x12345678
+
+#define OLD_JOURNAL_HEADER_MAGIC  0x4a484452   // 'JHDR'
+
+
+/*
+ * In memory structure about the journal.
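+ * One of these is allocated by journal_create()/journal_open()
+ * and freed again by journal_close().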
+ */
+typedef struct journal {
+    struct vnode       *jdev;           // vnode of the device where the journal lives
+    off_t               jdev_offset;    // byte offset to the start of the journal
+
+    struct vnode       *fsdev;          // vnode of the file system device
+
+    void              (*flush)(void *arg);  // fs callback to flush meta data blocks
+    void               *flush_arg;          // arg that's passed to flush()
+
+    int32_t             flags;
+    int32_t             tbuffer_size;   // default transaction buffer size
+
+    char               *header_buf;     // in-memory copy of the journal header
+    journal_header     *jhdr;           // points to the first byte of header_buf
+
+    transaction        *cur_tr;         // for group-commit
+    transaction        *completed_trs;  // out-of-order transactions that completed
+    transaction        *active_tr;      // for nested transactions
+    int32_t             nested_count;   // for nested transactions
+    void               *owner;          // a ptr that's unique to the calling process
+
+    transaction        *tr_freeme;      // transaction structs that need to be free'd
+
+    volatile off_t      active_start;   // the active start that we only keep in memory
+    simple_lock_data_t  old_start_lock; // guard access
+    volatile off_t      old_start[16];  // this is how we do lazy start update
+
+    semaphore_t         jsem;
+} journal;
+
+/* internal-only journal flags (top 16 bits) */
+#define JOURNAL_CLOSE_PENDING  0x00010000
+#define JOURNAL_INVALID        0x00020000
+
+/* journal_open/create options are always in the low-16 bits */
+#define JOURNAL_OPTION_FLAGS_MASK  0x0000ffff
+
+/*
+ * Prototypes.
+ */
+
+/*
+ * Call journal_create() to create a new journal.  You only
+ * call this once, typically at file system creation time.
+ *
+ * The "jvp" argument is the vnode where the journal is written.
+ * The journal starts at "offset" and is "journal_size" bytes long.
+ *
+ * The "fsvp" argument is the vnode of your file system.  It may be
+ * the same as "jvp".
+ *
+ * The "min_fs_block_size" argument is the minimum block size
+ * (in bytes) that the file system will ever write.  Typically
+ * this is the block size of the file system (1k, 4k, etc) but
+ * on HFS+ it is the minimum block size of the underlying device.
+ *
+ * The flags argument lets you disable group commit if you
+ * want tighter guarantees on transactions (in exchange for
+ * lower performance).
+ *
+ * The tbuffer_size is the size of the transaction buffer
+ * used by the journal.  If you specify zero, the journal code
+ * will use a reasonable default.  The tbuffer_size should
+ * be an integer multiple of the min_fs_block_size.
+ *
+ * Returns a valid journal pointer or NULL if one could not
+ * be created.
+ */
+journal *journal_create(struct vnode *jvp,
+			off_t         offset,
+			off_t         journal_size,
+			struct vnode *fsvp,
+			size_t        min_fs_block_size,
+			int32_t       flags,
+			int32_t       tbuffer_size,
+			void        (*flush)(void *arg),
+			void         *arg);
+
+/*
+ * Call journal_open() when mounting an existing file system
+ * that has a previously created journal.  It will take care
+ * of validating the journal and replaying it if necessary.
+ *
+ * See journal_create() for a description of the arguments.
+ *
+ * Returns a valid journal pointer or NULL if it runs into
+ * trouble reading/playing back the journal.
+ */
+journal *journal_open(struct vnode *jvp,
+		      off_t         offset,
+		      off_t         journal_size,
+		      struct vnode *fsvp,
+		      size_t        min_fs_block_size,
+		      int32_t       flags,
+		      int32_t       tbuffer_size,
+		      void        (*flush)(void *arg),
+		      void         *arg);
+
+/*
+ * Call journal_close() just before your file system is unmounted.
+ * It flushes any outstanding transactions and makes sure the
+ * journal is in a consistent state.
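+ *
+ * A rough sketch of the overall call sequence, with error handling
+ * omitted (the "fs_flush" callback and "fsp" argument are hypothetical
+ * names, not part of this API):
+ *
+ *	jnl = journal_open(jvp, off, sz, fsvp, min_blksz, 0, 0, fs_flush, fsp);
+ *
+ *	journal_start_transaction(jnl);
+ *	journal_modify_block_start(jnl, bp);
+ *	... modify bp->b_data ...
+ *	journal_modify_block_end(jnl, bp);
+ *	journal_end_transaction(jnl);
+ *
+ *	journal_close(jnl);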
+ */
+void journal_close(journal *journal);
+
+/*
+ * flags for journal_create/open.  only the low 16 bits
+ * can be used for flags because the internal bits go in
+ * the high 16.
+ */
+#define JOURNAL_NO_GROUP_COMMIT   0x00000001
+#define JOURNAL_RESET             0x00000002
+
+/*
+ * Transaction related functions.
+ *
+ * Before you start modifying file system meta data, you
+ * should call journal_start_transaction().  Then before
+ * you modify each block, call journal_modify_block_start()
+ * and when you're done, journal_modify_block_end().  When
+ * you've modified the last block as part of a transaction,
+ * call journal_end_transaction() to commit the changes.
+ *
+ * If you decide to abort the modifications to a block you
+ * should call journal_modify_block_abort().
+ *
+ * If as part of a transaction you want to throw out
+ * any previous copies of a block (because it got deleted)
+ * then call journal_kill_block().  This will mark it so
+ * that the journal does not play it back (effectively
+ * dropping it).
+ */
+int   journal_start_transaction(journal *jnl);
+int   journal_modify_block_start(journal *jnl, struct buf *bp);
+int   journal_modify_block_abort(journal *jnl, struct buf *bp);
+int   journal_modify_block_end(journal *jnl, struct buf *bp);
+int   journal_kill_block(journal *jnl, struct buf *bp);
+int   journal_end_transaction(journal *jnl);
+
+int   journal_active(journal *jnl);
+int   journal_flush(journal *jnl);
+
+#endif /* __APPLE_API_UNSTABLE */
+#endif /* !_SYS_VFS_JOURNAL_H_ */
diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c
index c49f321c2..ce79f9d4d 100644
--- a/bsd/vfs/vfs_subr.c
+++ b/bsd/vfs/vfs_subr.c
@@ -677,12 +677,22 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
 		if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) {
 			return (error);
 		}
-		if (vp->v_dirtyblkhd.lh_first)
-			panic("vinvalbuf: dirty bufs");
+
+		// XXXdbg - if there are dirty bufs, wait for 'em if they're busy
+		for (bp=vp->v_dirtyblkhd.lh_first; bp; bp=nbp) {
+			nbp = bp->b_vnbufs.le_next;
+			if (ISSET(bp->b_flags, B_BUSY)) {
+				SET(bp->b_flags, B_WANTED);
+				tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), "vinvalbuf", 0);
+				nbp = vp->v_dirtyblkhd.lh_first;
+			} else {
+				panic("vinvalbuf: dirty buf (vp 0x%x, bp 0x%x)", vp, bp);
+			}
+		}
 	}
 
 	for (;;) {
-		if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA)
+		if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
 			while (blist && blist->b_lblkno < 0)
 				blist = blist->b_vnbufs.le_next;
 		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
@@ -694,7 +704,7 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
 
 		for (bp = blist; bp; bp = nbp) {
 			nbp = bp->b_vnbufs.le_next;
-			if (flags & V_SAVEMETA && bp->b_lblkno < 0)
+			if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
 				continue;
 			s = splbio();
 			if (ISSET(bp->b_flags, B_BUSY)) {
@@ -720,7 +730,13 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
 				(void) VOP_BWRITE(bp);
 				break;
 			}
-			SET(bp->b_flags, B_INVAL);
+
+			if (bp->b_flags & B_LOCKED) {
+				panic("vinvalbuf: bp @ 0x%x is locked!\n", bp);
+				break;
+			} else {
+				SET(bp->b_flags, B_INVAL);
+			}
 			brelse(bp);
 		}
 	}
diff --git a/iokit/IOKit/IOKitKeys.h b/iokit/IOKit/IOKitKeys.h
index 6862e89e9..65fe7197d 100644
--- a/iokit/IOKit/IOKitKeys.h
+++ b/iokit/IOKit/IOKitKeys.h
@@ -96,10 +96,14 @@
 #define kIOCommandPoolSizeKey		"IOCommandPoolSize"		// (OSNumber)
 
 // properties found in services that have transfer constraints
-#define kIOMaximumBlockCountReadKey		"IOMaximumBlockCountRead"		// (OSNumber)
-#define kIOMaximumBlockCountWriteKey		"IOMaximumBlockCountWrite"		// (OSNumber)
-#define kIOMaximumSegmentCountReadKey		"IOMaximumSegmentCountRead"		// (OSNumber)
-#define kIOMaximumSegmentCountWriteKey		"IOMaximumSegmentCountWrite"		// (OSNumber)
+#define kIOMaximumBlockCountReadKey		"IOMaximumBlockCountRead"		// (OSNumber)
+#define kIOMaximumBlockCountWriteKey		"IOMaximumBlockCountWrite"		// (OSNumber)
+#define kIOMaximumByteCountReadKey		"IOMaximumByteCountRead"		// (OSNumber)
+#define kIOMaximumByteCountWriteKey		"IOMaximumByteCountWrite"		// (OSNumber)
+#define kIOMaximumSegmentCountReadKey		"IOMaximumSegmentCountRead"		// (OSNumber)
+#define kIOMaximumSegmentCountWriteKey		"IOMaximumSegmentCountWrite"		// (OSNumber)
+#define kIOMaximumSegmentByteCountReadKey	"IOMaximumSegmentByteCountRead"		// (OSNumber)
+#define kIOMaximumSegmentByteCountWriteKey	"IOMaximumSegmentByteCountWrite"	// (OSNumber)
 
 // properties found in services that wish to describe an icon
 //
diff --git a/iokit/KernelConfigTables.cpp b/iokit/KernelConfigTables.cpp
index ff0b955c9..1eedcc6df 100644
--- a/iokit/KernelConfigTables.cpp
+++ b/iokit/KernelConfigTables.cpp
@@ -28,16 +28,16 @@
  */
 const char * gIOKernelKmods =
 "{
-   'com.apple.kernel'                         = '6.1';
-   'com.apple.kernel.bsd'                     = '6.1';
-   'com.apple.kernel.iokit'                   = '6.1';
-   'com.apple.kernel.libkern'                 = '6.1';
-   'com.apple.kernel.mach'                    = '6.1';
-   'com.apple.iokit.IOADBFamily'              = '1.1';
-   'com.apple.iokit.IONVRAMFamily'            = '1.1';
-   'com.apple.iokit.IOSystemManagementFamily' = '1.1';
-   'com.apple.iokit.ApplePlatformFamily'      = '1.0';
-   'com.apple.driver.AppleNMI'                = '1.0';
+   'com.apple.kernel'                         = '6.2';
+   'com.apple.kernel.bsd'                     = '6.2';
+   'com.apple.kernel.iokit'                   = '6.2';
+   'com.apple.kernel.libkern'                 = '6.2';
+   'com.apple.kernel.mach'                    = '6.2';
+   'com.apple.iokit.IOADBFamily'              = '6.2';
+   'com.apple.iokit.IONVRAMFamily'            = '6.2';
+   'com.apple.iokit.IOSystemManagementFamily' = '6.2';
+   'com.apple.iokit.ApplePlatformFamily'      = '6.2';
+   'com.apple.driver.AppleNMI'                = '6.2';
 }";
diff --git a/iokit/conf/version.minor b/iokit/conf/version.minor
index d00491fd7..0cfbf0888 100644
--- a/iokit/conf/version.minor
+++ b/iokit/conf/version.minor
@@ -1 +1 @@
-1
+2
diff --git a/libkern/conf/version.minor b/libkern/conf/version.minor
index d00491fd7..0cfbf0888 100644
--- a/libkern/conf/version.minor
+++ b/libkern/conf/version.minor
@@ -1 +1 @@
-1
+2
diff --git a/libsa/conf/version.minor b/libsa/conf/version.minor
index d00491fd7..0cfbf0888 100644
--- a/libsa/conf/version.minor
+++ b/libsa/conf/version.minor
@@ -1 +1 @@
-1
+2
diff --git a/osfmk/conf/kernelversion.minor b/osfmk/conf/kernelversion.minor
index d00491fd7..0cfbf0888 100644
--- a/osfmk/conf/kernelversion.minor
+++ b/osfmk/conf/kernelversion.minor
@@ -1 +1 @@
-1
+2
diff --git a/osfmk/conf/version.minor b/osfmk/conf/version.minor
index d00491fd7..0cfbf0888 100644
--- a/osfmk/conf/version.minor
+++ b/osfmk/conf/version.minor
@@ -1 +1 @@
-1
+2
diff --git a/osfmk/i386/loose_ends.c b/osfmk/i386/loose_ends.c
index e64faedb2..b0cd27fff 100644
--- a/osfmk/i386/loose_ends.c
+++ b/osfmk/i386/loose_ends.c
@@ -64,6 +64,49 @@
  */
 
+/*
+ * copy 'size' bytes from physical to physical address
+ * the caller must validate the physical ranges
+ *
+ * if flush_action == 0, no cache flush necessary
+ * if flush_action == 1, flush the source
+ * if flush_action == 2, flush the dest
+ * if flush_action == 3, flush both source and dest
+ */
+
+kern_return_t copyp2p(vm_offset_t source, vm_offset_t dest, unsigned int size, unsigned int flush_action) {
+
+	switch(flush_action) {
+	case 1:
+		flush_dcache(source, size, 1);
+		break;
+	case 2:
+		flush_dcache(dest, size, 1);
+		break;
+	case 3:
+		flush_dcache(source, size, 1);
+		flush_dcache(dest, size, 1);
+		break;
+
+	}
+	bcopy_phys((char *)source, (char *)dest, size);	/* Do a physical copy */
+
+	switch(flush_action) {
+	case 1:
+		flush_dcache(source, size, 1);
+		break;
+	case 2:
+		flush_dcache(dest, size, 1);
+		break;
+	case 3:
+		flush_dcache(source, size, 1);
+		flush_dcache(dest, size, 1);
+		break;
+
+	}
+	return KERN_SUCCESS;
+}
+
+
 
 /*
  * Copies data from a physical page to a virtual page.  This is used to
diff --git a/osfmk/ppc/cswtch.s b/osfmk/ppc/cswtch.s
index 3cca411b2..148730393 100644
--- a/osfmk/ppc/cswtch.s
+++ b/osfmk/ppc/cswtch.s
@@ -871,7 +871,7 @@ fsenable:	lwz	r8,savesrr1(r25)		; Get the msr of the interrupted guy
 			rlwinm.	r0,r8,0,MSR_PR_BIT,MSR_PR_BIT	; See if we are doing this for user state
 			stw	r8,savesrr1(r25)		; Set the msr of the interrupted guy
 			xor	r3,r25,r5			; Get the real address of the savearea
-			bne-	fsnuser				; We are not user state...
+			beq-	fsnuser				; We are not user state...
 
 			stw	r10,ACT_MACT_SPF(r17)		; Set the activation copy
 			stw	r10,spcFlags(r26)		; Set per_proc copy
@@ -2297,7 +2297,7 @@ vrenable:	lwz	r8,savesrr1(r25)		; Get the msr of the interrupted guy
 			rlwinm.	r0,r8,0,MSR_PR_BIT,MSR_PR_BIT	; See if we are doing this for user state
 			stw	r8,savesrr1(r25)		; Set the msr of the interrupted guy
 			xor	r3,r25,r5			; Get the real address of the savearea
-			bne-	vrnuser				; We are not user state...
+			beq-	vrnuser				; We are not user state...
 
 			stw	r10,ACT_MACT_SPF(r17)		; Set the activation copy
 			stw	r10,spcFlags(r26)		; Set per_proc copy
diff --git a/osfmk/ppc/mappings.c b/osfmk/ppc/mappings.c
index de3411de9..237e2bc12 100644
--- a/osfmk/ppc/mappings.c
+++ b/osfmk/ppc/mappings.c
@@ -70,6 +70,7 @@
 #endif
 
 vm_map_t	mapping_map = VM_MAP_NULL;
+#define		MAPPING_MAP_SIZE	33554432	/* 32MB address space */
 
 unsigned int	incrVSID = 0;		/* VSID increment value */
 unsigned int	mappingdeb0 = 0;
@@ -1548,7 +1549,7 @@ void mapping_free_prime(void) {	/* Primes the mapping block release list
 	mappingblok	*mbn;
 	vm_offset_t	mapping_min;
 	
-	retr = kmem_suballoc(kernel_map, &mapping_min, mem_size / 16,
+	retr = kmem_suballoc(kernel_map, &mapping_min, MAPPING_MAP_SIZE,
 			     FALSE, TRUE, &mapping_map);
 
 	if (retr != KERN_SUCCESS)
@@ -1877,6 +1878,50 @@ kern_return_t copyp2v(vm_offset_t source, vm_offset_t sink, unsigned int size)
 
 }
 
+/*
+ * copy 'size' bytes from physical to physical address
+ * the caller must validate the physical ranges
+ *
+ * if flush_action == 0, no cache flush necessary
+ * if flush_action == 1, flush the source
+ * if flush_action == 2, flush the dest
+ * if flush_action == 3, flush both source and dest
+ */
+
+kern_return_t copyp2p(vm_offset_t source, vm_offset_t dest, unsigned int size, unsigned int flush_action) {
+
+	switch(flush_action) {
+	case 1:
+		flush_dcache(source, size, 1);
+		break;
+	case 2:
+		flush_dcache(dest, size, 1);
+		break;
+	case 3:
+		flush_dcache(source, size, 1);
+		flush_dcache(dest, size, 1);
+		break;
+
+	}
+	bcopy_phys((char *)source, (char *)dest, size);	/* Do a physical copy */
+
+	switch(flush_action) {
+	case 1:
+		flush_dcache(source, size, 1);
+		break;
+	case 2:
+		flush_dcache(dest, size, 1);
+		break;
+	case 3:
+		flush_dcache(source, size, 1);
+		flush_dcache(dest, size, 1);
+		break;
+
+	}
+	return KERN_SUCCESS;
+}
+
+
+
 #if DEBUG
 /*
  * Dumps out the mapping stuff associated with a virtual address
diff --git a/osfmk/ppc/pmap.c b/osfmk/ppc/pmap.c
index f6f6a8e34..85fabb668 100644
--- a/osfmk/ppc/pmap.c
+++ b/osfmk/ppc/pmap.c
@@ -483,6 +483,9 @@ pmap_bootstrap(unsigned int mem_size, vm_offset_t *first_avail, vm_offset_t *fir
 			hash_table_size *= 2)
 		continue;
 
+	if (num > (sizeof(pte_t) * 524288))
+		hash_table_size = hash_table_size/2;	/* reduce by half above 512MB */
+
 	/* Scale to within any physical memory layout constraints */
 	do {
 		num = atop(mem_size);	/* num now holds mem_size in pages */
diff --git a/osfmk/vm/vm_init.c b/osfmk/vm/vm_init.c
index 0a729a2a8..d066f2f58 100644
--- a/osfmk/vm/vm_init.c
+++ b/osfmk/vm/vm_init.c
@@ -100,7 +100,11 @@ vm_mem_bootstrap(void)
 	kmem_init(start, end);
 	pmap_init();
 
-	zsize = mem_size >> 2;			/* Get target zone size as 1/4 of physical memory */
+	if (PE_parse_boot_arg("zsize", &zsize))
+		zsize = zsize * 1024 * 1024;
+	else {
+		zsize = mem_size >> 2;		/* Get target zone size as 1/4 of physical memory */
+	}
 	if(zsize < ZONE_MAP_MIN) zsize = ZONE_MAP_MIN;	/* Clamp to min */
 	if(zsize > ZONE_MAP_MAX) zsize = ZONE_MAP_MAX;	/* Clamp to max */
 	zone_init(zsize);		/* Allocate address space for zones */
diff --git a/osfmk/vm/vm_kern.c b/osfmk/vm/vm_kern.c
index ab065e048..4d4ee31cb 100644
--- a/osfmk/vm/vm_kern.c
+++ b/osfmk/vm/vm_kern.c
@@ -85,9 +85,7 @@ vm_map_t	kernel_pageable_map;
 extern kern_return_t kmem_alloc_pages(
 	register vm_object_t		object,
 	register vm_object_offset_t	offset,
-	register vm_offset_t		start,
-	register vm_offset_t		end,
-	vm_prot_t			protection);
+	register vm_size_t		size);
 
 extern void kmem_remap_pages(
 	register vm_object_t		object,
@@ -254,8 +252,13 @@ kernel_memory_allocate(
 
 	/*
 	 *	Since we have not given out this address yet,
-	 *	it is safe to unlock the map.
+	 *	it is safe to unlock the map. Except of course
+	 *	we must make certain no one coalesces our address
+	 *	or does a blind vm_deallocate and removes the object;
+	 *	an extra object reference will suffice to protect
+	 *	against both contingencies.
 	 */
+	vm_object_reference(object);
 	vm_map_unlock(map);
 
 	vm_object_lock(object);
@@ -271,6 +274,7 @@ kernel_memory_allocate(
 					offset + (vm_object_offset_t)i);
 			vm_object_unlock(object);
 			vm_map_remove(map, addr, addr + size, 0);
+			vm_object_deallocate(object);
 			return KERN_RESOURCE_SHORTAGE;
 		}
 		vm_object_unlock(object);
@@ -289,8 +293,11 @@ kernel_memory_allocate(
 			vm_object_unlock(object);
 		}
 		vm_map_remove(map, addr, addr + size, 0);
+		vm_object_deallocate(object);
 		return (kr);
 	}
+	/* now that the page is wired, we no longer have to fear coalesce */
+	vm_object_deallocate(object);
 
 	if (object == kernel_object)
 		vm_map_simplify(map, addr);
@@ -338,31 +345,26 @@ kmem_realloc(
 	vm_offset_t *newaddrp,
 	vm_size_t newsize)
 {
-	vm_offset_t oldmin, oldmax;
-	vm_offset_t newaddr;
-	vm_object_t object;
-	vm_map_entry_t oldentry, newentry;
-	kern_return_t kr;
+	vm_offset_t oldmin, oldmax;
+	vm_offset_t newaddr;
+	vm_offset_t offset;
+	vm_object_t object;
+	vm_map_entry_t oldentry, newentry;
+	vm_page_t mem;
+	kern_return_t kr;
 
 	oldmin = trunc_page(oldaddr);
 	oldmax = round_page(oldaddr + oldsize);
 	oldsize = oldmax - oldmin;
 	newsize = round_page(newsize);
 
-	/*
-	 *	Find space for the new region.
-	 */
-
-	kr = vm_map_find_space(map, &newaddr, newsize, (vm_offset_t) 0,
-			       &newentry);
-	if (kr != KERN_SUCCESS) {
-		return kr;
-	}
 
 	/*
 	 *	Find the VM object backing the old region.
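 	 *	(In the rewritten path below, the object is grown and its new
 	 *	pages are allocated while the expanded region is still unmapped;
 	 *	only then is a new mapping found and wired down.)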
 	 */
+
+	vm_map_lock(map);
+
 	if (!vm_map_lookup_entry(map, oldmin, &oldentry))
 		panic("kmem_realloc");
 	object = oldentry->object.vm_object;
@@ -373,36 +375,71 @@ kmem_realloc(
 	 */
 	vm_object_reference(object);
+	/* by grabbing the object lock before unlocking the map */
+	/* we guarantee that we will panic if more than one     */
+	/* attempt is made to realloc a kmem_alloc'd area       */
 	vm_object_lock(object);
+	vm_map_unlock(map);
 	if (object->size != oldsize)
 		panic("kmem_realloc");
 	object->size = newsize;
 	vm_object_unlock(object);
 
-	newentry->object.vm_object = object;
-	newentry->offset = 0;
-	assert (newentry->wired_count == 0);
-	newentry->wired_count = 1;
+	/* allocate the new pages while expanded portion of the */
+	/* object is still not mapped */
+	kmem_alloc_pages(object, oldsize, newsize-oldsize);
 
 	/*
-	 * Since we have not given out this address yet,
-	 * it is safe to unlock the map.  We are trusting
-	 * that nobody will play with either region.
+	 * Find space for the new region.
 	 */
+	kr = vm_map_find_space(map, &newaddr, newsize, (vm_offset_t) 0,
+			       &newentry);
+	if (kr != KERN_SUCCESS) {
+		vm_object_lock(object);
+		for(offset = oldsize; offset < newsize; offset += PAGE_SIZE) {
+			if ((mem = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
+				vm_page_lock_queues();
+				vm_page_free(mem);
+				vm_page_unlock_queues();
+			}
+		}
+		object->size = oldsize;
+		vm_object_unlock(object);
+		vm_object_deallocate(object);
+		return kr;
+	}
+	newentry->object.vm_object = object;
+	newentry->offset = 0;
+	assert (newentry->wired_count == 0);
+
+
+	/* add an extra reference in case we have someone doing an */
+	/* unexpected deallocate */
+	vm_object_reference(object);
 	vm_map_unlock(map);
 
-	/*
-	 * Remap the pages in the old region and
-	 * allocate more pages for the new region.
-	 */
+	if ((kr = vm_map_wire(map, newaddr, newaddr + newsize,
+			      VM_PROT_DEFAULT, FALSE)) != KERN_SUCCESS) {
+		vm_map_remove(map, newaddr, newaddr + newsize, 0);
+		vm_object_lock(object);
+		for(offset = oldsize; offset < newsize; offset += PAGE_SIZE) {
+			if ((mem = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
+				vm_page_lock_queues();
+				vm_page_free(mem);
+				vm_page_unlock_queues();
+			}
+		}
+		object->size = oldsize;
+		vm_object_unlock(object);
+		vm_object_deallocate(object);
+		return (kr);
+	}
+	vm_object_deallocate(object);
 
-	kmem_remap_pages(object, 0,
-			 newaddr, newaddr + oldsize,
-			 VM_PROT_DEFAULT);
-	kmem_alloc_pages(object, oldsize,
-			 newaddr + oldsize, newaddr + newsize,
-			 VM_PROT_DEFAULT);
 
 	*newaddrp = newaddr;
 	return KERN_SUCCESS;
@@ -500,28 +537,21 @@ kmem_free(
 }
 
 /*
- *	Allocate new wired pages in an object.
- *	The object is assumed to be mapped into the kernel map or
- *	a submap.
+ *	Allocate new pages in an object.
  */
 
 kern_return_t
 kmem_alloc_pages(
 	register vm_object_t		object,
 	register vm_object_offset_t	offset,
-	register vm_offset_t		start,
-	register vm_offset_t		end,
-	vm_prot_t			protection)
+	register vm_size_t		size)
 {
 
-	/*
-	 *	Mark the pmap region as not pageable.
- */ - pmap_pageable(kernel_pmap, start, end, FALSE); - while (start < end) { + size = round_page(size); + vm_object_lock(object); + while (size) { register vm_page_t mem; - vm_object_lock(object); /* * Allocate a page @@ -533,27 +563,12 @@ kmem_alloc_pages( vm_object_lock(object); } - /* - * Wire it down - */ - vm_page_lock_queues(); - vm_page_wire(mem); - vm_page_unlock_queues(); - vm_object_unlock(object); - - /* - * Enter it in the kernel pmap - */ - PMAP_ENTER(kernel_pmap, start, mem, protection, - VM_WIMG_USE_DEFAULT, TRUE); - - vm_object_lock(object); - PAGE_WAKEUP_DONE(mem); - vm_object_unlock(object); - start += PAGE_SIZE; - offset += PAGE_SIZE_64; + offset += PAGE_SIZE; + size -= PAGE_SIZE; + mem->busy = FALSE; } + vm_object_unlock(object); return KERN_SUCCESS; } diff --git a/pexpert/conf/version.minor b/pexpert/conf/version.minor index d00491fd7..0cfbf0888 100644 --- a/pexpert/conf/version.minor +++ b/pexpert/conf/version.minor @@ -1 +1 @@ -1 +2 -- 2.45.2