bsd/vfs/vfs_utfconv.c standard
bsd/vfs/vfs_vnops.c standard
bsd/vfs/vnode_if.c standard
+bsd/vfs/vfs_journal.c standard
bsd/miscfs/deadfs/dead_vnops.c standard
bsd/miscfs/fdesc/fdesc_vfsops.c optional fdesc
bsd/kern/mach_loader.c standard
bsd/kern/posix_sem.c standard
bsd/kern/posix_shm.c standard
+# XXXdbg - I need this in the journaling and block cache code
+bsd/kern/qsort.c standard
bsd/vm/vnode_pager.c standard
bsd/vm/vm_unix.c standard
#include <sys/quota.h>
#include <sys/dirent.h>
+#include <vfs/vfs_journal.h>
+
#include <hfs/hfs_format.h>
#include <hfs/hfs_catalog.h>
#include <hfs/hfs_cnode.h>
int16_t vcbAtrb;
int16_t vcbFlags;
int16_t vcbspare;
+ u_int32_t vcbJinfoBlock;
u_int32_t vcbCrDate;
u_int32_t vcbLsMod;
u_int8_t hfs_fs_ronly; /* Whether this was mounted read-only initially */
u_int8_t hfs_unknownpermissions; /* Whether this was mounted with MNT_UNKNOWNPERMISSIONS */
u_int8_t hfs_media_writeable;
+ u_int8_t hfs_orphans_cleaned;
/* Physical Description */
u_long hfs_phys_block_count; /* Num of PHYSICAL blocks of volume */
unicode_to_hfs_func_t hfs_get_hfsname;
struct quotafile hfs_qfiles[MAXQUOTAS]; /* quota files */
+
+ // XXXdbg
+ void *jnl; // the journal for this volume (if one exists)
+ struct vnode *jvp; // device where the journal lives (may be equal to devvp)
+ u_int32_t jnl_start; // start block of the journal file (so we don't delete it)
+ u_int32_t hfs_jnlfileid;
+ u_int32_t hfs_jnlinfoblkid;
+ volatile int readers;
+ volatile int blocker;
} hfsmount_t;
#define hfs_private_metadata_dir hfs_privdir_desc.cd_cnid
+#define hfs_global_shared_lock_acquire(hfsmp) \
+ do { \
+ if (hfsmp->blocker) { \
+ tsleep((caddr_t)&hfsmp->blocker, PRIBIO, "journal_blocker", 0); \
+ continue; \
+ } \
+ hfsmp->readers++; \
+ break; \
+ } while (1)
+
+#define hfs_global_shared_lock_release(hfsmp) \
+ do { \
+ hfsmp->readers--; \
+ if (hfsmp->readers == 0) { \
+ wakeup((caddr_t)&hfsmp->readers); \
+ } \
+ } while (0)
+
+#define hfs_global_exclusive_lock_acquire(hfsmp) \
+ do { \
+ if (hfsmp->blocker) { \
+ tsleep((caddr_t)&hfsmp->blocker, PRIBIO, "journal_blocker", 0); \
+ continue; \
+ } \
+ if (hfsmp->readers != 0) { \
+			tsleep((caddr_t)&hfsmp->readers, PRIBIO, "journal_enable/disable", 0); \
+ continue; \
+ } \
+ hfsmp->blocker = 1; \
+ break; \
+ } while (1)
+
+#define hfs_global_exclusive_lock_release(hfsmp) \
+	do { \
+		hfsmp->blocker = 0; \
+		wakeup((caddr_t)&hfsmp->blocker); \
+	} while (0)
+
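+/*
+ * A minimal usage sketch of the macros above (illustrative; the exact error
+ * handling is up to each caller).  The shared form wraps ordinary journal
+ * transactions in the vnode operations below; the exclusive form is only
+ * taken when journaling is enabled or disabled on a live volume, so that
+ * in-flight transactions drain first:
+ *
+ *	hfs_global_shared_lock_acquire(hfsmp);
+ *	if (hfsmp->jnl) {
+ *		if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+ *			hfs_global_shared_lock_release(hfsmp);
+ *			return error;
+ *		}
+ *	}
+ *	... lock the b-trees and do the metadata work ...
+ *	if (hfsmp->jnl) {
+ *		journal_end_transaction(hfsmp->jnl);
+ *	}
+ *	hfs_global_shared_lock_release(hfsmp);
+ */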
#define MAXHFSVNODELEN 31
#define VTOHFS(VP) ((struct hfsmount *)((VP)->v_mount->mnt_data))
#define VFSTOHFS(MP) ((struct hfsmount *)(MP)->mnt_data)
#define VCBTOHFS(VCB) (((struct vfsVCB *)(VCB))->vcb_hfsmp)
+#define FCBTOHFS(FCB) ((struct hfsmount *)(FCB)->ff_cp->c_vp->v_mount->mnt_data)
/*
* Various ways to acquire a VCB pointer:
#define VTOVCB(VP) (&(((struct hfsmount *)((VP)->v_mount->mnt_data))->hfs_vcb.vcb_vcb))
#define VFSTOVCB(MP) (&(((struct hfsmount *)(MP)->mnt_data)->hfs_vcb.vcb_vcb))
#define HFSTOVCB(HFSMP) (&(HFSMP)->hfs_vcb.vcb_vcb)
+#define FCBTOVCB(FCB) (&(((struct hfsmount *)((FCB)->ff_cp->c_vp->v_mount->mnt_data))->hfs_vcb.vcb_vcb))
#define E_NONE 0
extern u_int32_t hfs_freeblks(struct hfsmount * hfsmp, int wantreserve);
+extern void hfs_remove_orphans(struct hfsmount *);
+
short MacToVFSError(OSErr err);
#define HFS_SYNCTRANS 1
extern int hfs_btsync(struct vnode *vp, int sync_transaction);
+// used as a callback by the journaling code
+extern void hfs_sync_metadata(void *arg);
short make_dir_entry(FCB **fileptr, char *name, u_int32_t fileID);
OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb,
struct proc *p);
OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
- off_t embeddedOffset, u_int64_t disksize, struct proc *p);
+ off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args);
+
+extern int hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
+ void *_args, int embeddedOffset, int mdb_offset,
+ HFSMasterDirectoryBlock *mdbp, struct ucred *cred);
+extern u_long GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, char *name,
+ struct cat_attr *fattr, struct cat_fork *forkinfo);
int hfs_getconverter(u_int32_t encoding, hfs_to_unicode_func_t *get_unicode,
unicode_to_hfs_func_t *get_hfsname);
if ((error = hfs_write_access(vp, ap->a_cred, ap->a_p, false)) != 0)
return (error);
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ if (hfsmp->jnl) {
+ if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+ hfs_global_shared_lock_release(hfsmp);
+ return error;
+ }
+ }
+
/* Lock catalog b-tree */
error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, ap->a_p);
- if (error)
- return (error);
+ if (error) {
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+ return (error);
+ }
error = cat_insertfilethread(hfsmp, &cp->c_desc);
/* Unlock catalog b-tree */
(void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, ap->a_p);
+
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+
if (error)
return (error);
}
}
if (cp->c_flag & (C_NOEXISTS | C_DELETED))
return (ENOENT);
+
+ // XXXdbg - don't allow modifying the journal or journal_info_block
+ if (hfsmp->jnl && cp->c_datafork) {
+ struct HFSPlusExtentDescriptor *extd;
+
+ extd = &cp->c_datafork->ff_data.cf_extents[0];
+ if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+ return EPERM;
+ }
+ }
+
/*
* Ownership of a file is required in one of two classes of calls:
*
* If any cnode attributes changed then do an update.
*/
if (alist->volattr == 0) {
- struct timeval atime, mtime;
+ struct timeval tv;
- atime.tv_sec = cp->c_atime;
- atime.tv_usec = 0;
- mtime.tv_sec = cp->c_mtime;
- mtime.tv_usec = cp->c_mtime_nsec / 1000;
cp->c_flag |= C_MODIFIED;
- if ((error = VOP_UPDATE(vp, &atime, &mtime, 1)))
+ tv = time;
+ CTIMES(cp, &tv, &tv);
+ if ((error = VOP_UPDATE(vp, &tv, &tv, 1)))
goto ErrorExit;
}
/* Volume Rename */
to_desc.cd_cnid = cp->c_cnid;
to_desc.cd_flags = CD_ISDIR;
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ if (hfsmp->jnl) {
+ if (journal_start_transaction(hfsmp->jnl) != 0) {
+ hfs_global_shared_lock_release(hfsmp);
+ error = EINVAL;
+ /* Restore the old name in the VCB */
+ copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL);
+ vcb->vcbFlags |= 0xFF00;
+ goto ErrorExit;
+ }
+ }
+
+
/* Lock catalog b-tree */
error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
if (error) {
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+
/* Restore the old name in the VCB */
copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL);
vcb->vcbFlags |= 0xFF00;
/* Unlock the Catalog */
(void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
-
+
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+
if (error) {
/* Restore the old name in the VCB */
copystr(cp->c_desc.cd_nameptr, vcb->vcbVN, sizeof(vcb->vcbVN), NULL);
int error = 0;
int depleted = 0;
int index, startindex;
- int i;
+ int i, dir_entries;
struct cat_desc *lastdescp = NULL;
struct cat_desc prevdesc;
char * prevnamebuf = NULL;
struct cat_entrylist *ce_list = NULL;
+ dir_entries = dcp->c_entries;
+ if (dcp->c_attr.ca_fileid == kHFSRootFolderID && hfsmp->jnl) {
+ dir_entries -= 3;
+ }
+
*(ap->a_actualcount) = 0;
*(ap->a_eofflag) = 0;
/* Convert uio_offset into a directory index. */
startindex = index = uio->uio_offset / sizeof(struct dirent);
- if ((index + 1) > dcp->c_entries) {
+ if ((index + 1) > dir_entries) {
*(ap->a_eofflag) = 1;
error = 0;
goto exit;
/* Termination checks */
if ((--maxcount <= 0) ||
(uio->uio_resid < (fixedblocksize + HFS_AVERAGE_NAME_SIZE)) ||
- (index >= dcp->c_entries)) {
+ (index >= dir_entries)) {
depleted = 1;
break;
}
} /* for each catalog entry */
/* If there are more entries then save the last name. */
- if (index < dcp->c_entries
+ if (index < dir_entries
&& !(*(ap->a_eofflag))
&& lastdescp != NULL) {
if (prevnamebuf == NULL)
if (ATTR_DIR_ENTRYCOUNT & attr) {
u_long entries = cattrp->ca_entries;
- if ((descp->cd_parentcnid == kRootParID) &&
- (hfsmp->hfs_private_metadata_dir != 0))
- --entries; /* hide private dir */
+ if (descp->cd_parentcnid == kRootParID) {
+ if (hfsmp->hfs_private_metadata_dir != 0)
+ --entries; /* hide private dir */
+ if (hfsmp->jnl)
+ entries -= 2; /* hide the journal files */
+ }
*((u_long *)attrbufptr)++ = entries;
}
if (options & kGetEmptyBlock)
bp = getblk(vp, blockNum, block->blockSize, 0, 0, BLK_META);
else
- retval = meta_bread(vp, blockNum, block->blockSize, NOCRED, &bp);
+ retval = meta_bread(vp, blockNum, block->blockSize, NOCRED, &bp);
DBG_ASSERT(bp != NULL);
DBG_ASSERT(bp->b_data != NULL);
block->buffer = bp->b_data;
block->blockReadFromDisk = (bp->b_flags & B_CACHE) == 0; /* not found in cache ==> came from disk */
+ // XXXdbg
+ block->isModified = 0;
+
#if BYTE_ORDER == LITTLE_ENDIAN
/* Endian swap B-Tree node (only if it's a valid block) */
if (!(options & kGetEmptyBlock)) {
}
+__private_extern__
+void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr)
+{
+ struct hfsmount *hfsmp = VTOHFS(vp);
+ struct buf *bp = NULL;
+
+ if (hfsmp->jnl == NULL) {
+ return;
+ }
+
+ bp = (struct buf *) blockPtr->blockHeader;
+ if (bp == NULL) {
+ panic("ModifyBlockStart: null bp for blockdescptr 0x%x?!?\n", blockPtr);
+ return;
+ }
+
+ journal_modify_block_start(hfsmp->jnl, bp);
+ blockPtr->isModified = 1;
+}
+
+
__private_extern__
OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlockOptions options)
{
+ struct hfsmount *hfsmp = VTOHFS(vp);
extern int bdwrite_internal(struct buf *, int);
OSStatus retval = E_NONE;
struct buf *bp = NULL;
}
if (options & kTrashBlock) {
- bp->b_flags |= B_INVAL;
- brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+ bp->b_flags |= B_INVAL;
+ if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) {
+ journal_kill_block(hfsmp->jnl, bp);
+ } else {
+ brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+ }
} else {
if (options & kForceWriteBlock) {
- retval = VOP_BWRITE(bp);
+ if (hfsmp->jnl) {
+ if (blockPtr->isModified == 0) {
+ panic("hfs: releaseblock: modified is 0 but forcewrite set! bp 0x%x\n", bp);
+ }
+ retval = journal_modify_block_end(hfsmp->jnl, bp);
+ blockPtr->isModified = 0;
+ } else {
+ retval = VOP_BWRITE(bp);
+ }
} else if (options & kMarkBlockDirty) {
-#if FORCESYNCBTREEWRITES
- VOP_BWRITE(bp);
-#else
- if (options & kLockTransaction) {
+ if ((options & kLockTransaction) && hfsmp->jnl == NULL) {
/*
*
* Set the B_LOCKED flag and unlock the buffer, causing brelse to move
/* Rollback sync time to cause a sync on lock release... */
(void) BTSetLastSync(VTOF(vp), time.tv_sec - (kMaxSecsForFsync + 1));
}
- bp->b_flags |= B_LOCKED;
- }
+
+ bp->b_flags |= B_LOCKED;
+ }
+
/*
* Delay-write this block.
* If the maximum delayed buffers has been exceeded then
* free up some buffers and fall back to an asynchronous write.
*/
- if (bdwrite_internal(bp, 1) != 0) {
+ if (hfsmp->jnl) {
+ if (blockPtr->isModified == 0) {
+ panic("hfs: releaseblock: modified is 0 but markdirty set! bp 0x%x\n", bp);
+ }
+ retval = journal_modify_block_end(hfsmp->jnl, bp);
+ blockPtr->isModified = 0;
+ } else if (bdwrite_internal(bp, 1) != 0) {
hfs_btsync(vp, 0);
/* Rollback sync time to cause a sync on lock release... */
(void) BTSetLastSync(VTOF(vp), time.tv_sec - (kMaxSecsForFsync + 1));
bp->b_flags &= ~B_LOCKED;
bawrite(bp);
}
-
-#endif
} else {
- brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+ // check if we had previously called journal_modify_block_start()
+ // on this block and if so, abort it (which will call brelse()).
+ if (hfsmp->jnl && blockPtr->isModified) {
+ // XXXdbg - I don't want to call modify_block_abort()
+ // because I think it may be screwing up the
+ // journal and blowing away a block that has
+ // valid data in it.
+ //
+ // journal_modify_block_abort(hfsmp->jnl, bp);
+ //panic("hfs: releaseblock called for 0x%x but mod_block_start previously called.\n", bp);
+ journal_modify_block_end(hfsmp->jnl, bp);
+ blockPtr->isModified = 0;
+ } else {
+ brelse(bp); /* note: B-tree code will clear blockPtr->blockHeader and blockPtr->buffer */
+ }
};
};
{
#pragma unused (maxEOF)
- OSStatus retval;
- UInt64 actualBytesAdded;
+ OSStatus retval, ret;
+ UInt64 actualBytesAdded, origSize;
UInt64 bytesToAdd;
- UInt32 extendFlags;
u_int32_t startAllocation;
u_int32_t fileblocks;
BTreeInfoRec btInfo;
ExtendedVCB *vcb;
FCB *filePtr;
struct proc *p = NULL;
-
+ UInt64 trim = 0;
filePtr = GetFileControlBlock(vp);
{
p = current_proc();
/* lock extents b-tree (also protects volume bitmap) */
- retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, p);
+ retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, p);
if (retval)
return (retval);
}
(void) BTGetInformation(filePtr, 0, &btInfo);
+#if 0 // XXXdbg
/*
* The b-tree code expects nodes to be contiguous. So when
* the allocation block size is less than the b-tree node
extendFlags = 0;
} else {
/* Ensure that all b-tree nodes are contiguous on disk */
- extendFlags = kEFAllMask | kEFContigMask;
+ extendFlags = kEFContigMask;
}
+#endif
+ origSize = filePtr->fcbEOF;
fileblocks = filePtr->ff_blocks;
startAllocation = vcb->nextAllocation;
- retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0, extendFlags, &actualBytesAdded);
-
+ // loop trying to get a contiguous chunk that's an integer multiple
+ // of the btree node size. if we can't get a contiguous chunk that
+ // is at least the node size then we break out of the loop and let
+ // the error propagate back up.
+ do {
+ retval = ExtendFileC(vcb, filePtr, bytesToAdd, 0, kEFContigMask, &actualBytesAdded);
+ if (retval == dskFulErr && actualBytesAdded == 0) {
+
+ if (bytesToAdd == btInfo.nodeSize || bytesToAdd < (minEOF - origSize)) {
+ // if we're here there's nothing else to try, we're out
+ // of space so we break and bail out.
+ break;
+ } else {
+ bytesToAdd >>= 1;
+ if (bytesToAdd < btInfo.nodeSize) {
+ bytesToAdd = btInfo.nodeSize;
+ } else if ((bytesToAdd % btInfo.nodeSize) != 0) {
+ // make sure it's an integer multiple of the nodeSize
+ bytesToAdd -= (bytesToAdd % btInfo.nodeSize);
+ }
+ }
+ }
+ } while (retval == dskFulErr && actualBytesAdded == 0);
+
/*
* If a new extent was added then move the roving allocator
* reference forward by the current b-tree file size so
vcb->nextAllocation += fileblocks;
}
+ filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
+
+ // XXXdbg ExtendFileC() could have returned an error even though
+ // it grew the file to be big enough for our needs. If this is
+ // the case, we don't care about retval so we blow it away.
+ //
+ if (filePtr->fcbEOF >= minEOF && retval != 0) {
+ retval = 0;
+ }
+
+ // XXXdbg if the file grew but isn't large enough or isn't an
+ // even multiple of the nodeSize then trim things back. if
+ // the file isn't large enough we trim back to the original
+ // size. otherwise we trim back to be an even multiple of the
+ // btree node size.
+ //
+ if ((filePtr->fcbEOF < minEOF) || (actualBytesAdded % btInfo.nodeSize) != 0) {
+
+ if (filePtr->fcbEOF < minEOF) {
+ retval = dskFulErr;
+
+ if (filePtr->fcbEOF < origSize) {
+ panic("hfs: btree file eof %lld less than orig size %lld!\n",
+ filePtr->fcbEOF, origSize);
+ }
+
+ trim = filePtr->fcbEOF - origSize;
+ if (trim != actualBytesAdded) {
+ panic("hfs: trim == %lld but actualBytesAdded == %lld\n",
+ trim, actualBytesAdded);
+ }
+ } else {
+ trim = (actualBytesAdded % btInfo.nodeSize);
+ }
+
+ ret = TruncateFileC(vcb, filePtr, filePtr->fcbEOF - trim, 0);
+ filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
+
+ // XXXdbg - panic if the file didn't get trimmed back properly
+ if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) {
+ panic("hfs: truncate file didn't! fcbEOF %lld nsize %d fcb 0x%x\n",
+ filePtr->fcbEOF, btInfo.nodeSize, filePtr);
+ }
+
+ if (ret) {
+ // XXXdbg - this probably doesn't need to be a panic()
+ panic("hfs: error truncating btree files (sz 0x%llx, trim %lld, ret %d)\n",
+ filePtr->fcbEOF, trim, ret);
+ return ret;
+ }
+ actualBytesAdded -= trim;
+ }
+
if(VTOC(vp)->c_fileid != kHFSExtentsFileID) {
/*
* Get any extents overflow b-tree changes to disk ASAP!
*/
- if (retval == 0) {
- (void) BTFlushPath(VTOF(vcb->extentsRefNum));
- (void) VOP_FSYNC(vcb->extentsRefNum, NOCRED, MNT_WAIT, p);
- }
+ (void) BTFlushPath(VTOF(vcb->extentsRefNum));
+ (void) VOP_FSYNC(vcb->extentsRefNum, NOCRED, MNT_WAIT, p);
+
(void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, p);
}
- if (retval)
- return (retval);
-
- filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize;
- retval = ClearBTNodes(vp, btInfo.nodeSize, filePtr->fcbEOF - actualBytesAdded, actualBytesAdded);
- if (retval)
- return (retval);
-
+ if ((filePtr->fcbEOF % btInfo.nodeSize) != 0) {
+ panic("hfs: extendbtree: fcb 0x%x has eof 0x%llx not a multiple of 0x%x (trim %llx)\n",
+ filePtr, filePtr->fcbEOF, btInfo.nodeSize, trim);
+ }
+
/*
* Update the Alternate MDB or Alternate VolumeHeader
*/
(VTOC(vp)->c_fileid == kHFSAttributesFileID)
) {
MarkVCBDirty( vcb );
- retval = hfs_flushvolumeheader(VCBTOHFS(vcb), MNT_WAIT, HFS_ALTFLUSH);
+ ret = hfs_flushvolumeheader(VCBTOHFS(vcb), MNT_WAIT, HFS_ALTFLUSH);
}
+
+ ret = ClearBTNodes(vp, btInfo.nodeSize, filePtr->fcbEOF - actualBytesAdded, actualBytesAdded);
+ if (ret)
+ return (ret);
return retval;
}
static int
ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount)
{
+ struct hfsmount *hfsmp = VTOHFS(vp);
struct buf *bp = NULL;
daddr_t blk;
daddr_t blkcnt;
bp = getblk(vp, blk, blksize, 0, 0, BLK_META);
if (bp == NULL)
continue;
+
+ // XXXdbg
+ if (hfsmp->jnl) {
+ // XXXdbg -- skipping this for now since it makes a transaction
+ // become *way* too large
+ //journal_modify_block_start(hfsmp->jnl, bp);
+ }
+
bzero((char *)bp->b_data, blksize);
bp->b_flags |= B_AGE;
- /* wait/yield every 32 blocks so we don't hog all the buffers */
- if ((blk % 32) == 0)
- VOP_BWRITE(bp);
- else
- bawrite(bp);
+ // XXXdbg
+ if (hfsmp->jnl) {
+ // XXXdbg -- skipping this for now since it makes a transaction
+ // become *way* too large
+ //journal_modify_block_end(hfsmp->jnl, bp);
+
+ // XXXdbg - remove this once we decide what to do with the
+ // writes to the journal
+ if ((blk % 32) == 0)
+ VOP_BWRITE(bp);
+ else
+ bawrite(bp);
+ } else {
+ /* wait/yield every 32 blocks so we don't hog all the buffers */
+ if ((blk % 32) == 0)
+ VOP_BWRITE(bp);
+ else
+ bawrite(bp);
+ }
--blkcnt;
++blk;
}
if (result)
goto exit;
+ // XXXdbg - preflight all btree operations to make sure there's enough space
+ result = BTCheckFreeSpace(fcb);
+ if (result)
+ goto exit;
+
BDINIT(file_data, &file_rec);
result = BTSearchRecord(fcb, &iterator[0], &file_data, &datasize, &iterator[0]);
if (result)
(void) BTFlushPath(fcb);
}
exit:
+ (void) BTFlushPath(fcb);
FREE(iterator, M_TEMP);
return MacToVFSError(result);
encoding = getencoding(recp);
hint = iterator->hint.nodeNum;
+ /* Hide the journal files (if any) */
+ if (hfsmp->jnl &&
+ ((cnid == hfsmp->hfs_jnlfileid) ||
+ (cnid == hfsmp->hfs_jnlinfoblkid))) {
+
+ result = ENOENT;
+ goto exit;
+ }
+
/*
* When a hardlink link is encountered, auto resolve it
*/
hfs_setencodingbits(hfsmp, encoding);
}
+ // XXXdbg - preflight all btree operations to make sure there's enough space
+ result = BTCheckFreeSpace(fcb);
+ if (result)
+ goto exit;
+
/*
* Insert the thread record first
*/
vcb->vcbNxtCNID = nextCNID;
vcb->vcbFlags |= 0xFF00;
- (void) BTFlushPath(fcb);
-
exit:
+ (void) BTFlushPath(fcb);
FREE(bto, M_TEMP);
return MacToVFSError(result);
if ((result = buildkey(hfsmp, to_cdp, (HFSPlusCatalogKey *)&to_iterator->key, 0)))
goto exit;
+ // XXXdbg - preflight all btree operations to make sure there's enough space
+ result = BTCheckFreeSpace(fcb);
+ if (result)
+ goto exit;
+
to_key = (HFSPlusCatalogKey *)&to_iterator->key;
MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK);
BDINIT(btdata, recp);
result = BTInsertRecord(fcb, to_iterator, &btdata, datasize);
if (result) {
/* Try and restore original before leaving */
+ // XXXdbg
+ #if 1
+ {
+ int err;
+ err = BTInsertRecord(fcb, from_iterator, &btdata, datasize);
+ if (err)
+			panic("cat_rename: could not undo (BTInsert = %d)", err);
+ }
+ #else
(void) BTInsertRecord(fcb, from_iterator, &btdata, datasize);
+ #endif
goto exit;
}
sourcegone = 1;
result = BTDeleteRecord(fcb, from_iterator);
if (result) {
/* Try and delete new record before leaving */
+ // XXXdbg
+ #if 1
+ {
+ int err;
+ err = BTDeleteRecord(fcb, to_iterator);
+ if (err)
+			panic("cat_rename: could not undo (BTDelete = %d)", err);
+ }
+ #else
(void) BTDeleteRecord(fcb, to_iterator);
+ #endif
goto exit;
}
}
FREE(pluskey, M_TEMP);
}
}
- (void) BTFlushPath(fcb);
exit:
+ (void) BTFlushPath(fcb);
if (from_iterator)
FREE(from_iterator, M_TEMP);
if (to_iterator)
* A directory must be empty
* A file must be zero length (no blocks)
*/
-
if (descp->cd_cnid < kHFSFirstUserCatalogNodeID ||
descp->cd_parentcnid == kRootParID)
return (EINVAL);
if (result)
goto exit;
+ // XXXdbg - preflight all btree operations to make sure there's enough space
+ result = BTCheckFreeSpace(fcb);
+ if (result)
+ goto exit;
+
/* Delete record */
result = BTDeleteRecord(fcb, iterator);
if (result)
TrashCatalogIterator(vcb, descp->cd_parentcnid);
- (void) BTFlushPath(fcb);
exit:
+ (void) BTFlushPath(fcb);
FREE(iterator, M_TEMP);
return MacToVFSError(result);
/* Update the node hint. */
descp->cd_hint = iterator->hint.nodeNum;
- (void) BTFlushPath(fcb);
-
exit:
+ (void) BTFlushPath(fcb);
FREE(iterator, M_TEMP);
return MacToVFSError(result);
return (0); /* stop */
}
- /* Hide the private meta data directory. */
- if (parentcnid == kRootDirID &&
- rec->recordType == kHFSPlusFolderRecord &&
- rec->hfsPlusFolder.folderID == hfsmp->hfs_private_metadata_dir) {
- return (1); /* continue */
+ /* Hide the private meta data directory and journal files */
+ if (parentcnid == kRootDirID) {
+ if ((rec->recordType == kHFSPlusFolderRecord) &&
+ (rec->hfsPlusFolder.folderID == hfsmp->hfs_private_metadata_dir)) {
+ return (1); /* continue */
+ }
+ if (hfsmp->jnl &&
+ (rec->recordType == kHFSPlusFileRecord) &&
+ ((rec->hfsPlusFile.fileID == hfsmp->hfs_jnlfileid) ||
+ (rec->hfsPlusFile.fileID == hfsmp->hfs_jnlinfoblkid))) {
+
+ return (1); /* continue */
+ }
}
+
cep = &list->entry[list->realentries++];
if (state->stdhfs) {
struct read_state {
u_int32_t cbs_parentID;
u_int32_t cbs_hiddenDirID;
+ u_int32_t cbs_hiddenJournalID;
+ u_int32_t cbs_hiddenInfoBlkID;
off_t cbs_lastoffset;
struct uio * cbs_uio;
ExtendedVCB * cbs_vcb;
catent.d_type == DT_DIR)
goto lastitem;
+ /* Hide the journal files */
+ if ((curID == kRootDirID) &&
+ (catent.d_type == DT_REG) &&
+ ((catent.d_fileno == state->cbs_hiddenJournalID) ||
+ (catent.d_fileno == state->cbs_hiddenInfoBlkID))) {
+
+ return (1); /* skip and continue */
+ }
+
state->cbs_lastoffset = state->cbs_uio->uio_offset;
/* if this entry won't fit then we're done */
goto cleanup;
state.cbs_hiddenDirID = hfsmp->hfs_private_metadata_dir;
+ if (hfsmp->jnl) {
+ state.cbs_hiddenJournalID = hfsmp->hfs_jnlfileid;
+ state.cbs_hiddenInfoBlkID = hfsmp->hfs_jnlinfoblkid;
+ }
+
state.cbs_lastoffset = cip->currentOffset;
state.cbs_vcb = vcb;
state.cbs_uio = uio;
case kHFSPlusFileRecord:
cnid = crp->hfsPlusFile.fileID;
break;
+ default:
+ panic("hfs: getcnid: unknown recordType (crp @ 0x%x)\n", crp);
+ break;
}
+
return (cnid);
}
case kHFSPlusFolderThreadRecord:
cnid = recp->hfsPlusThread.parentID;
break;
+ default:
+ panic("hfs: getparentcnid: unknown recordType (crp @ 0x%x)\n", recp);
+ break;
}
+
return (cnid);
}
int recycle = 0;
int forkcount = 0;
int truncated = 0;
+ int started_tr = 0, grabbed_lock = 0;
if (prtactive && vp->v_usecount != 0)
vprint("hfs_inactive: pushing active", vp);
vp->v_type == VREG &&
(VTOF(vp)->ff_blocks != 0)) {
error = VOP_TRUNCATE(vp, (off_t)0, IO_NDELAY, NOCRED, p);
- if (error) goto out;
truncated = 1;
+ // have to do this to prevent the lost ubc_info panic
+ SET(cp->c_flag, C_TRANSIT);
recycle = 1;
+ if (error) goto out;
}
/*
cp->c_flag &= ~C_DELETED;
cp->c_rdev = 0;
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ grabbed_lock = 1;
+ if (hfsmp->jnl) {
+ if (journal_start_transaction(hfsmp->jnl) != 0) {
+ error = EINVAL;
+ goto out;
+ }
+ started_tr = 1;
+ }
+
/* Lock catalog b-tree */
error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
if (error) goto out;
if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord)
cp->c_flag |= C_MODIFIED;
}
- if (cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) {
- tv = time;
- VOP_UPDATE(vp, &tv, &tv, 0);
- }
+
+ if (cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) {
+ tv = time;
+ VOP_UPDATE(vp, &tv, &tv, 0);
+ }
out:
+ // XXXdbg - have to do this because a goto could have come here
+ if (started_tr) {
+ journal_end_transaction(hfsmp->jnl);
+ started_tr = 0;
+ }
+ if (grabbed_lock) {
+ hfs_global_shared_lock_release(hfsmp);
+ }
+
VOP_UNLOCK(vp, 0, p);
/*
* If we are done with the vnode, reclaim it
retval = ENOENT;
goto exit;
}
+
+ /* Hide private journal files */
+ if (hfsmp->jnl &&
+ (cp->c_parentcnid == kRootDirID) &&
+ ((cp->c_cnid == hfsmp->hfs_jnlfileid) ||
+ (cp->c_cnid == hfsmp->hfs_jnlinfoblkid))) {
+ retval = ENOENT;
+ goto exit;
+ }
+
if (wantrsrc && rvp != NULL) {
vp = rvp;
rvp = NULL;
enum {
kHFSSigWord = 0x4244, /* 'BD' in ASCII */
kHFSPlusSigWord = 0x482B, /* 'H+' in ASCII */
+ kHFSJSigWord = 0x484a, /* 'HJ' in ASCII */
kHFSPlusVersion = 0x0004, /* will change as format changes */
/* version 4 shipped with Mac OS 8.1 */
- kHFSPlusMountVersion = 0x31302E30 /* '10.0' for Mac OS X */
+ kHFSPlusMountVersion = 0x31302E30, /* '10.0' for Mac OS X */
+ kHFSJMountVersion = 0x4846534a /* 'HFSJ' for journaled HFS+ on OS X */
};
kHFSVolumeNoCacheRequiredBit = 10, /* don't cache volume blocks (i.e. RAM or ROM disk) */
kHFSBootVolumeInconsistentBit = 11, /* boot volume is inconsistent (System 7.6 and later) */
kHFSCatalogNodeIDsReusedBit = 12,
- /* Bits 13-14 are reserved for future use */
+ kHFSVolumeJournaledBit = 13, /* this volume has a journal on it */
+ /* Bit 14 is reserved for future use */
kHFSVolumeSoftwareLockBit = 15, /* volume is locked by software */
kHFSVolumeHardwareLockMask = 1 << kHFSVolumeHardwareLockBit,
kHFSVolumeNoCacheRequiredMask = 1 << kHFSVolumeNoCacheRequiredBit,
kHFSBootVolumeInconsistentMask = 1 << kHFSBootVolumeInconsistentBit,
kHFSCatalogNodeIDsReusedMask = 1 << kHFSCatalogNodeIDsReusedBit,
+ kHFSVolumeJournaledMask = 1 << kHFSVolumeJournaledBit,
kHFSVolumeSoftwareLockMask = 1 << kHFSVolumeSoftwareLockBit,
kHFSMDBAttributesMask = 0x8380
};
u_int16_t version; /* == kHFSPlusVersion */
u_int32_t attributes; /* volume attributes */
u_int32_t lastMountedVersion; /* implementation version which last mounted volume */
- u_int32_t reserved; /* reserved - initialized as zero */
+//XXXdbg u_int32_t reserved; /* reserved - initialized as zero */
+ u_int32_t journalInfoBlock; /* block addr of journal info (if volume is journaled, zero otherwise) */
u_int32_t createDate; /* date and time of volume creation */
u_int32_t modifyDate; /* date and time of last modification */
kBTVariableIndexKeysMask = 0x00000004 /* keys in index nodes are variable length */
};
+/* JournalInfoBlock - Structure that describes where our journal lives */
+struct JournalInfoBlock {
+ u_int32_t flags;
+ u_int32_t device_signature[8]; // signature used to locate our device.
+ u_int64_t offset; // byte offset to the journal on the device
+ u_int64_t size; // size in bytes of the journal
+ u_int32_t reserved[32];
+};
+typedef struct JournalInfoBlock JournalInfoBlock;
+
+enum {
+ kJIJournalInFSMask = 0x00000001,
+ kJIJournalOnOtherDeviceMask = 0x00000002,
+ kJIJournalNeedInitMask = 0x00000004
+};
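+
+/*
+ * A minimal sketch of interpreting the journal info block (illustrative).
+ * Field meanings follow the comments above; all on-disk fields are
+ * big-endian.  The byte-swap helpers and the flag interpretations shown
+ * here are assumptions based on how other on-disk structures are handled.
+ *
+ *	JournalInfoBlock *jibp = (JournalInfoBlock *)bp->b_data;
+ *	u_int32_t flags   = SWAP_BE32(jibp->flags);
+ *	u_int64_t joffset = SWAP_BE64(jibp->offset);	// byte offset to the journal
+ *	u_int64_t jsize   = SWAP_BE64(jibp->size);	// journal length in bytes
+ *
+ *	if (flags & kJIJournalInFSMask) {
+ *		// journal lives in a file inside this volume
+ *	} else if (flags & kJIJournalOnOtherDeviceMask) {
+ *		// journal lives on the device identified by device_signature[]
+ *	}
+ *	if (flags & kJIJournalNeedInitMask) {
+ *		// presumably: journal has never been written, create it
+ *		// instead of replaying/opening it
+ *	}
+ */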
+
+
#pragma options align=reset
#ifdef __cplusplus
fip->fdCreator = SWAP_BE32 (kHFSPlusCreator); /* 'hfs+' */
fip->fdFlags = SWAP_BE16 (kHasBeenInited);
+ hfs_global_shared_lock_acquire(hfsmp);
+ if (hfsmp->jnl) {
+ if (journal_start_transaction(hfsmp->jnl) != 0) {
+ hfs_global_shared_lock_release(hfsmp);
+ return EINVAL;
+ }
+ }
+
/* Create the indirect link directly in the catalog */
result = cat_create(hfsmp, &desc, &attr, NULL);
- if (linkcnid != NULL)
+ if (result == 0 && linkcnid != NULL)
*linkcnid = attr.ca_fileid;
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+
return (result);
}
/* Lock catalog b-tree */
retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
- if (retval)
- return retval;
+ if (retval) {
+ return retval;
+ }
/*
* If this is a new hardlink then we need to create the data
bzero(&to_desc, sizeof(to_desc));
to_desc.cd_parentcnid = hfsmp->hfs_privdir_desc.cd_cnid;
to_desc.cd_cnid = cp->c_fileid;
+
do {
/* get a unique indirect node number */
indnodeno = ((random() & 0x3fffffff) + 100);
cp->c_desc.cd_nameptr, &cp->c_desc.cd_cnid);
if (retval) {
/* put it source file back */
+ // XXXdbg
+ #if 1
+ {
+ int err;
+ err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
+ if (err)
+ panic("hfs_makelink: error %d from cat_rename backout 1", err);
+ }
+ #else
(void) cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
+ #endif
goto out;
}
cp->c_rdev = indnodeno;
(void) cat_delete(hfsmp, &cp->c_desc, &cp->c_attr);
/* Put the source file back */
+ // XXXdbg
+ #if 1
+ {
+ int err;
+ err = cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
+ if (err)
+ panic("hfs_makelink: error %d from cat_rename backout 2", err);
+ }
+ #else
(void) cat_rename(hfsmp, &to_desc, &dcp->c_desc, &cp->c_desc, NULL);
+ #endif
goto out;
}
struct componentname *a_cnp;
} */ *ap;
{
+ struct hfsmount *hfsmp;
struct vnode *vp = ap->a_vp;
struct vnode *tdvp = ap->a_tdvp;
struct componentname *cnp = ap->a_cnp;
struct timeval tv;
int error;
+ hfsmp = VTOHFS(vp);
+
#if HFS_DIAGNOSTIC
if ((cnp->cn_flags & HASBUF) == 0)
panic("hfs_link: no name");
if (VTOVCB(tdvp)->vcbSigWord != kHFSPlusSigWord)
return err_link(ap); /* hfs disks don't support hard links */
- if (VTOHFS(vp)->hfs_private_metadata_dir == 0)
+ if (hfsmp->hfs_private_metadata_dir == 0)
return err_link(ap); /* no private metadata dir, no links possible */
if (tdvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE, p))) {
goto out1;
}
+ hfs_global_shared_lock_acquire(hfsmp);
+ if (hfsmp->jnl) {
+ if (journal_start_transaction(hfsmp->jnl) != 0) {
+ hfs_global_shared_lock_release(hfsmp);
+ return EINVAL;
+ }
+ }
+
cp->c_nlink++;
cp->c_flag |= C_CHANGE;
tv = time;
+
error = VOP_UPDATE(vp, &tv, &tv, 1);
- if (!error)
- error = hfs_makelink(VTOHFS(vp), cp, tdcp, cnp);
+ if (!error) {
+ error = hfs_makelink(hfsmp, cp, tdcp, cnp);
+ }
if (error) {
cp->c_nlink--;
cp->c_flag |= C_CHANGE;
tdcp->c_flag |= C_CHANGE | C_UPDATE;
tv = time;
(void) VOP_UPDATE(tdvp, &tv, &tv, 0);
- hfs_volupdate(VTOHFS(vp), VOL_MKFILE,
+
+ hfs_volupdate(hfsmp, VOL_MKFILE,
(tdcp->c_cnid == kHFSRootFolderID));
}
+
+ // XXXdbg - need to do this here as well because cp could have changed
+ error = VOP_UPDATE(vp, &tv, &tv, 1);
+
FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI);
+
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+
out1:
if (tdvp != vp)
VOP_UNLOCK(vp, 0, p);
* creation of files in the directory.
*/
retval = VOP_ACCESS(dvp, VWRITE, cred, cnp->cn_proc);
- if (retval)
+ if (retval) {
goto exit;
+ }
cnp->cn_flags |= SAVENAME;
if (!(flags & LOCKPARENT))
u_long hfs_encoding; /* encoding for this volume (standard HFS only) */
struct timezone hfs_timezone; /* user time zone info (standard HFS only) */
int flags; /* mounting flags, see below */
+ int journal_tbuffer_size; /* size in bytes of the journal transaction buffer */
+ int journal_flags; /* flags to pass to journal_open/create */
+ int journal_disable; /* don't use journaling (potentially dangerous) */
};
#define HFSFSMNT_NOXONFILES 0x1 /* disable execute permissions for files */
#define HFSFSMNT_WRAPPER 0x2 /* mount HFS wrapper (if it exists) */
+#define HFSFSMNT_EXTENDED_ARGS 0x4 /* indicates new fields after "flags" are valid */
+
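+/*
+ * A minimal sketch, from the mounting side, of passing the extended
+ * journaling arguments (illustrative; the device path, mount point and
+ * buffer size are made-up values, and only the fields and flag defined
+ * above are taken from this header):
+ *
+ *	struct hfs_mount_args args;
+ *
+ *	bzero(&args, sizeof(args));
+ *	args.fspec = "/dev/disk1s9";
+ *	args.flags = HFSFSMNT_EXTENDED_ARGS;
+ *	args.journal_tbuffer_size = 512 * 1024;	// illustrative value
+ *	args.journal_flags = 0;
+ *	args.journal_disable = 0;	// nonzero would skip journaling
+ *
+ *	if (mount("hfs", "/Volumes/Test", 0, &args) != 0)
+ *		err(1, "mount");
+ */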
#endif /* __APPLE_API_UNSTABLE */
#endif /* ! _HFS_MOUNT_H_ */
int retval;
off_t filebytes;
u_long fileblocks;
+ struct hfsmount *hfsmp;
+ int started_tr = 0, grabbed_lock = 0;
ioflag = ap->a_ioflag;
if ((cp->c_flags & APPEND) && uio->uio_offset != fp->ff_size)
return (EPERM);
+ // XXXdbg - don't allow modification of the journal or journal_info_block
+ if (VTOHFS(vp)->jnl && cp->c_datafork) {
+ struct HFSPlusExtentDescriptor *extd;
+
+ extd = &cp->c_datafork->ff_data.cf_extents[0];
+ if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) {
+ return EPERM;
+ }
+ }
+
writelimit = uio->uio_offset + uio->uio_resid;
/*
if(writelimit > filebytes) {
bytesToAdd = writelimit - filebytes;
- retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, fp->ff_clumpsize)),
+ retval = hfs_chkdq(cp, (int64_t)(roundup(bytesToAdd, vcb->blockSize)),
ap->a_cred, 0);
if (retval)
return (retval);
}
#endif /* QUOTA */
+ hfsmp = VTOHFS(vp);
+ if (writelimit > filebytes) {
+ hfs_global_shared_lock_acquire(hfsmp);
+ grabbed_lock = 1;
+ }
+ if (hfsmp->jnl && (writelimit > filebytes)) {
+ if (journal_start_transaction(hfsmp->jnl) != 0) {
+ hfs_global_shared_lock_release(hfsmp);
+ return EINVAL;
+ }
+ started_tr = 1;
+ }
+
while (writelimit > filebytes) {
bytesToAdd = writelimit - filebytes;
(int)uio->uio_offset, uio->uio_resid, (int)fp->ff_size, (int)filebytes, 0);
}
+ // XXXdbg
+ if (started_tr) {
+ hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+ journal_end_transaction(hfsmp->jnl);
+ started_tr = 0;
+ }
+ if (grabbed_lock) {
+ hfs_global_shared_lock_release(hfsmp);
+ grabbed_lock = 0;
+ }
+
if (UBCISVALID(vp) && retval == E_NONE) {
off_t filesize;
off_t zero_off;
struct proc *p = NULL;
struct rl_entry *invalid_range;
enum rl_overlaptype overlaptype;
+ int started_tr = 0, grabbed_lock = 0;
/*
* Check for underlying vnode requests and ensure that logical
if (ap->a_bpn == NULL)
return (0);
- if (overflow_extents(fp) || fp->ff_unallocblocks) {
+ p = current_proc();
+ if (fp->ff_unallocblocks) {
lockExtBtree = 1;
- p = current_proc();
+
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ grabbed_lock = 1;
+
+ if (hfsmp->jnl) {
+ if (journal_start_transaction(hfsmp->jnl) != 0) {
+ hfs_global_shared_lock_release(hfsmp);
+ return EINVAL;
+ } else {
+ started_tr = 1;
+ }
+ }
+
if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) {
+ if (started_tr) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ if (grabbed_lock) {
+ hfs_global_shared_lock_release(hfsmp);
+ }
return (retval);
- }
+ }
+ } else if (overflow_extents(fp)) {
+ lockExtBtree = 1;
+ if (retval = hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_EXCLUSIVE | LK_CANRECURSE, p)) {
+ return retval;
+ }
}
/*
}
if (retval) {
- (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
- return (retval);
- }
+ (void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
+ if (started_tr) {
+ hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+ journal_end_transaction(hfsmp->jnl);
+ }
+ if (grabbed_lock) {
+ hfs_global_shared_lock_release(hfsmp);
+ }
+ return (retval);
+ }
VTOC(ap->a_vp)->c_flag |= C_MODIFIED;
}
if (lockExtBtree)
(void) hfs_metafilelocking(hfsmp, kHFSExtentsFileID, LK_RELEASE, p);
+ // XXXdbg
+ if (started_tr) {
+ hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+ journal_end_transaction(hfsmp->jnl);
+ started_tr = 0;
+ }
+ if (grabbed_lock) {
+ hfs_global_shared_lock_release(hfsmp);
+ grabbed_lock = 0;
+ }
+
if (retval == E_NONE) {
/* Adjust the mapping information for invalid file ranges: */
overlaptype = rl_scan(&fp->ff_invalidranges,
}
frag->b_vp = NULL;
+ //
+ // XXXdbg - in the case that this is a meta-data block, it won't affect
+ // the journal because this bp is for a physical disk block,
+ // not a logical block that is part of the catalog or extents
+ // files.
SET(frag->b_flags, B_INVAL);
brelse(frag);
off_t filebytes;
u_long fileblocks;
int blksize;
+ struct hfsmount *hfsmp;
if (vp->v_type != VREG && vp->v_type != VLNK)
return (EISDIR); /* cannot truncate an HFS directory! */
if ((!ISHFSPLUS(VTOVCB(vp))) && (length > (off_t)MAXHFSFILESIZE))
return (EFBIG);
+ hfsmp = VTOHFS(vp);
tv = time;
retval = E_NONE;
*/
if (length > fp->ff_size) {
#if QUOTA
- retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, fp->ff_clumpsize)),
+ retval = hfs_chkdq(cp, (int64_t)(roundup(length - filebytes, blksize)),
ap->a_cred, 0);
if (retval)
goto Err_Exit;
if (suser(ap->a_cred, NULL) != 0)
eflags |= kEFReserveMask; /* keep a reserve */
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ if (hfsmp->jnl) {
+ if (journal_start_transaction(hfsmp->jnl) != 0) {
+ retval = EINVAL;
+ goto Err_Exit;
+ }
+ }
+
/* lock extents b-tree (also protects volume bitmap) */
retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
- if (retval)
+ if (retval) {
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+
goto Err_Exit;
+ }
while ((length > filebytes) && (retval == E_NONE)) {
bytesToAdd = length - filebytes;
break;
}
} /* endwhile */
+
(void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
+
+ // XXXdbg
+ if (hfsmp->jnl) {
+ hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+
if (retval)
goto Err_Exit;
#if QUOTA
off_t savedbytes = ((off_t)fp->ff_blocks * (off_t)blksize);
#endif /* QUOTA */
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ if (hfsmp->jnl) {
+ if (journal_start_transaction(hfsmp->jnl) != 0) {
+ retval = EINVAL;
+ goto Err_Exit;
+ }
+ }
+
/* lock extents b-tree (also protects volume bitmap) */
retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
- if (retval)
+ if (retval) {
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
goto Err_Exit;
+ }
if (fp->ff_unallocblocks == 0)
retval = MacToVFSError(TruncateFileC(VTOVCB(vp),
(FCB*)fp, length, false));
(void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
+
+ // XXXdbg
+ if (hfsmp->jnl) {
+ hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+
filebytes = (off_t)fp->ff_blocks * (off_t)blksize;
if (retval)
goto Err_Exit;
int retval, retval2;
UInt32 blockHint;
UInt32 extendFlags =0; /* For call to ExtendFileC */
+ struct hfsmount *hfsmp;
+
+ hfsmp = VTOHFS(vp);
*(ap->a_bytesallocated) = 0;
fileblocks = fp->ff_blocks;
moreBytesRequested = length - filebytes;
#if QUOTA
- retval = hfs_chkdq(cp, (int64_t)(roundup(moreBytesRequested, fp->ff_clumpsize)),
+ retval = hfs_chkdq(cp,
+ (int64_t)(roundup(moreBytesRequested, VTOVCB(vp)->blockSize)),
ap->a_cred, 0);
if (retval)
return (retval);
#endif /* QUOTA */
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ if (hfsmp->jnl) {
+ if (journal_start_transaction(hfsmp->jnl) != 0) {
+ retval = EINVAL;
+ goto Err_Exit;
+ }
+ }
+
/* lock extents b-tree (also protects volume bitmap) */
retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
- if (retval) goto Err_Exit;
+ if (retval) {
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+ goto Err_Exit;
+ }
retval = MacToVFSError(ExtendFileC(VTOVCB(vp),
(FCB*)fp,
*(ap->a_bytesallocated) = actualBytesAdded;
filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize;
+
(void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
+ // XXXdbg
+ if (hfsmp->jnl) {
+ hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+
/*
* if we get an error and no changes were made then exit
* otherwise we must do the VOP_UPDATE to reflect the changes
(void) vinvalbuf(vp, vflags, ap->a_cred, ap->a_p, 0, 0);
}
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ if (hfsmp->jnl) {
+ if (journal_start_transaction(hfsmp->jnl) != 0) {
+ retval = EINVAL;
+ goto Err_Exit;
+ }
+ }
+
/* lock extents b-tree (also protects volume bitmap) */
retval = hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_EXCLUSIVE, ap->a_p);
- if (retval) goto Err_Exit;
+ if (retval) {
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+
+ goto Err_Exit;
+ }
retval = MacToVFSError(
TruncateFileC(
false));
(void) hfs_metafilelocking(VTOHFS(vp), kHFSExtentsFileID, LK_RELEASE, ap->a_p);
filebytes = (off_t)fp->ff_blocks * (off_t)VTOVCB(vp)->blockSize;
+
+ if (hfsmp->jnl) {
+ hfs_flushvolumeheader(hfsmp, MNT_NOWAIT, 0);
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+
+
/*
* if we get an error and no changes were made then exit
* otherwise we must do the VOP_UPDATE to reflect the changes
} */ *ap;
{
int retval = 0;
-#if BYTE_ORDER == LITTLE_ENDIAN
register struct buf *bp = ap->a_bp;
register struct vnode *vp = bp->b_vp;
+#if BYTE_ORDER == LITTLE_ENDIAN
BlockDescriptor block;
/* Trap B-Tree writes */
}
#endif
/* This buffer shouldn't be locked anymore but if it is clear it */
- if (ISSET(ap->a_bp->b_flags, B_LOCKED)) {
- CLR(ap->a_bp->b_flags, B_LOCKED);
+ if (ISSET(bp->b_flags, B_LOCKED)) {
+ // XXXdbg
+ if (VTOHFS(vp)->jnl) {
+ panic("hfs: CLEARING the lock bit on bp 0x%x\n", bp);
+ }
+ CLR(bp->b_flags, B_LOCKED);
printf("hfs_bwrite: called with lock bit set\n");
}
retval = vn_bwrite (ap);
CatalogRecord * myCurrentDataPtr;
CatPosition * myCatPositionPtr;
BTScanState myBTScanState;
+ void *user_start = NULL;
+ int user_len;
/* XXX Parameter check a_searchattrs? */
MALLOC( attributesBuffer, void *, eachReturnBufferSize, M_TEMP, M_WAITOK );
variableBuffer = (void*)((char*) attributesBuffer + fixedBlockSize);
+ // XXXdbg - have to lock the user's buffer so we don't fault
+ // while holding the shared catalog file lock. see the comment
+ // in hfs_readdir() for more details.
+ //
+ if (VTOHFS(ap->a_vp)->jnl && ap->a_uio->uio_segflg == UIO_USERSPACE) {
+ user_start = ap->a_uio->uio_iov->iov_base;
+ user_len = ap->a_uio->uio_iov->iov_len;
+
+ if ((err = vslock(user_start, user_len)) != 0) {
+ user_start = NULL;
+ goto ExitThisRoutine;
+ }
+ }
+
/* Lock catalog b-tree */
err = hfs_metafilelocking(VTOHFS(ap->a_vp), kHFSCatalogFileID, LK_SHARED, p);
if (err)
ExitThisRoutine:
FREE( attributesBuffer, M_TEMP );
+ if (VTOHFS(ap->a_vp)->jnl && user_start) {
+ vsunlock(user_start, user_len, TRUE);
+ }
+
return (MacToVFSError(err));
}
goto exit;
}
+ /* Hide the private journal files */
+ if (VTOHFS(root_vp)->jnl &&
+ ((c_attr.ca_fileid == VTOHFS(root_vp)->hfs_jnlfileid) ||
+ (c_attr.ca_fileid == VTOHFS(root_vp)->hfs_jnlinfoblkid))) {
+ err = 0;
+ goto exit;
+ }
+
if (returnAttrList->commonattr & ATTR_CMN_NAME) {
cat_convertkey(VTOHFS(root_vp), key, rec, &c_desc);
} else {
#include <sys/quota.h>
#include <sys/disk.h>
+// XXXdbg
+#include <vfs/vfs_journal.h>
+
#include <miscfs/specfs/specdev.h>
#include <hfs/hfs_mount.h>
(HFSTOVCB(hfsmp)->vcbSigWord == kHFSPlusSigWord)) {
/* setup private/hidden directory for unlinked files */
hfsmp->hfs_private_metadata_dir = FindMetaDataDirectory(HFSTOVCB(hfsmp));
+ if (hfsmp->jnl)
+ hfs_remove_orphans(hfsmp);
}
if (args.fspec == 0) {
goto error_exit;
}
-
/* Set the mount flag to indicate that we support volfs */
mp->mnt_flag |= MNT_DOVOLFS;
if (VFSTOVCB(mp)->vcbSigWord == kHFSSigWord) {
mp->mnt_flag |= MNT_FIXEDSCRIPTENCODING;
}
(void) copyinstr(path, mp->mnt_stat.f_mntonname, MNAMELEN-1, &size);
+
bzero(mp->mnt_stat.f_mntonname + size, MNAMELEN - size);
(void) copyinstr(args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
vcb->vcbLsMod = to_bsd_time(SWAP_BE32(vhp->modifyDate));
vcb->vcbAtrb = (UInt16) SWAP_BE32 (vhp->attributes); /* VCB only uses lower 16 bits */
+ vcb->vcbJinfoBlock = SWAP_BE32(vhp->journalInfoBlock);
vcb->vcbClpSiz = SWAP_BE32 (vhp->rsrcClumpSize);
vcb->vcbNxtCNID = SWAP_BE32 (vhp->nextCatalogID);
vcb->vcbVolBkUp = to_bsd_time(SWAP_BE32(vhp->backupDate));
}
+static int
+get_raw_device(char *fspec, int is_user, int ronly, struct vnode **rvp, struct ucred *cred, struct proc *p)
+{
+ char *rawbuf;
+ char *dp;
+ size_t namelen;
+ struct nameidata nd;
+ int retval;
+
+ *rvp = NULL;
+
+ MALLOC(rawbuf, char *, MAXPATHLEN, M_HFSMNT, M_WAITOK);
+ if (rawbuf == NULL) {
+ retval = ENOMEM;
+ goto error_exit;
+ }
+
+ if (is_user) {
+ retval = copyinstr(fspec, rawbuf, MAXPATHLEN - 1, &namelen);
+ if (retval != E_NONE) {
+ goto error_exit;
+ }
+ } else {
+ strcpy(rawbuf, fspec);
+ namelen = strlen(rawbuf);
+ }
+
+ /* make sure it's null terminated */
+ rawbuf[MAXPATHLEN-1] = '\0';
+
+ dp = &rawbuf[namelen-1];
+ while(dp >= rawbuf && *dp != '/') {
+ dp--;
+ }
+
+	if (dp >= rawbuf) {
+		dp++;		/* step past the '/' we found */
+	} else {
+		dp = rawbuf;	/* no '/' anywhere in the name */
+	}
+
+ /* make room for and insert the 'r' for the raw device */
+ memmove(dp+1, dp, strlen(dp)+1);
+ *dp = 'r';
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, rawbuf, p);
+ retval = namei(&nd);
+ if (retval != E_NONE) {
+		DBG_ERR(("hfs_mountfs: can't open raw device for journal: %s (error %d)\n", rawbuf, retval));
+ goto error_exit;
+ }
+
+ *rvp = nd.ni_vp;
+ if ((retval = VOP_OPEN(*rvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p))) {
+		vrele(*rvp);	/* drop the ref taken by namei since the open failed */
+		*rvp = NULL;
+ goto error_exit;
+ }
+
+ // don't need this any more
+ FREE(rawbuf, M_HFSMNT);
+
+ return 0;
+
+ error_exit:
+ if (*rvp) {
+ (void)VOP_CLOSE(*rvp, ronly ? FREAD : FREAD|FWRITE, cred, p);
+ }
+
+ if (rawbuf) {
+ FREE(rawbuf, M_HFSMNT);
+ }
+ return retval;
+}
+
+
+
/*
* Common code for mount and mountroot
*/
u_int32_t blksize;
u_int32_t minblksize;
u_int32_t iswritable;
+ daddr_t mdb_offset;
dev = devvp->v_rdev;
cred = p ? p->p_ucred : NOCRED;
return (retval);
}
+ mdb_offset = HFS_PRI_SECTOR(blksize);
if ((retval = meta_bread(devvp, HFS_PRI_SECTOR(blksize), blksize, cred, &bp))) {
goto error_exit;
}
bzero(hfsmp, sizeof(struct hfsmount));
simple_lock_init(&hfsmp->hfs_renamelock);
-
+
/*
* Init the volume information structure
*/
} else /* Mount an HFS Plus disk */ {
HFSPlusVolumeHeader *vhp;
off_t embeddedOffset;
+ int jnl_disable = 0;
/* Get the embedded Volume Header */
if (SWAP_BE16(mdbp->drEmbedSigWord) == kHFSPlusSigWord) {
hfsmp->hfs_phys_block_count = disksize / blksize;
- retval = meta_bread(devvp, (embeddedOffset / blksize) +
- HFS_PRI_SECTOR(blksize), blksize, cred, &bp);
+ mdb_offset = (embeddedOffset / blksize) + HFS_PRI_SECTOR(blksize);
+ retval = meta_bread(devvp, mdb_offset, blksize, cred, &bp);
if (retval)
goto error_exit;
bcopy(bp->b_data + HFS_PRI_OFFSET(blksize), mdbp, 512);
vhp = (HFSPlusVolumeHeader*) mdbp;
}
+ // XXXdbg
+ //
+ hfsmp->jnl = NULL;
+ hfsmp->jvp = NULL;
+ if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS) && args->journal_disable) {
+ jnl_disable = 1;
+ }
+
+ //
+ // We only initialize the journal here if the last person
+ // to mount this volume was journaling aware. Otherwise
+ // we delay journal initialization until later at the end
+ // of hfs_MountHFSPlusVolume() because the last person who
+ // mounted it could have messed things up behind our back
+ // (so we need to go find the .journal file, make sure it's
+ // the right size, re-sync up if it was moved, etc).
+ //
+ if ( (SWAP_BE32(vhp->lastMountedVersion) == kHFSJMountVersion)
+ && (SWAP_BE32(vhp->attributes) & kHFSVolumeJournaledMask)
+ && !jnl_disable) {
+
+ // if we're able to init the journal, mark the mount
+ // point as journaled.
+ //
+ if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) {
+ mp->mnt_flag |= MNT_JOURNALED;
+ } else {
+ retval = EINVAL;
+ goto error_exit;
+ }
+ }
+ // XXXdbg
+
(void) hfs_getconverter(0, &hfsmp->hfs_get_unicode, &hfsmp->hfs_get_hfsname);
- retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p);
+ retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args);
/*
* If the backend didn't like our physical blocksize
* then retry with physical blocksize of 512.
hfsmp->hfs_phys_block_size = blksize;
/* Try again with a smaller block size... */
- retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p);
+ retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args);
}
if (retval)
(void) hfs_relconverter(0);
if (mdbp)
FREE(mdbp, M_TEMP);
(void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, cred, p);
+ if (hfsmp && hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
+ (void)VOP_CLOSE(hfsmp->jvp, ronly ? FREAD : FREAD|FWRITE, cred, p);
+ hfsmp->jvp = NULL;
+ }
if (hfsmp) {
FREE(hfsmp, M_HFSMNT);
mp->mnt_data = (qaddr_t)0;
int retval = E_NONE;
int flags;
int force;
+ int started_tr = 0, grabbed_lock = 0;
flags = 0;
force = 0;
* Flush out the b-trees, volume bitmap and Volume Header
*/
if (hfsmp->hfs_fs_ronly == 0) {
+ hfs_global_shared_lock_acquire(hfsmp);
+ grabbed_lock = 1;
+ if (hfsmp->jnl) {
+ journal_start_transaction(hfsmp->jnl);
+ started_tr = 1;
+ }
+
retval = VOP_FSYNC(HFSTOVCB(hfsmp)->catalogRefNum, NOCRED, MNT_WAIT, p);
if (retval && !force)
- return (retval);
-
+ goto err_exit;
+
retval = VOP_FSYNC(HFSTOVCB(hfsmp)->extentsRefNum, NOCRED, MNT_WAIT, p);
if (retval && !force)
- return (retval);
+ goto err_exit;
+
+ // if we have an allocation file, sync it too so we don't leave dirty
+ // blocks around
+ if (HFSTOVCB(hfsmp)->allocationsRefNum) {
+ if (retval = VOP_FSYNC(HFSTOVCB(hfsmp)->allocationsRefNum, NOCRED, MNT_WAIT, p)) {
+ if (!force)
+ goto err_exit;
+ }
+ }
if (retval = VOP_FSYNC(hfsmp->hfs_devvp, NOCRED, MNT_WAIT, p)) {
if (!force)
- return (retval);
+ goto err_exit;
}
/* See if this volume is damaged, is so do not unmount cleanly */
HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask;
}
- retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
+ retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
if (retval) {
HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask;
if (!force)
- return (retval); /* could not flush everything */
+ goto err_exit; /* could not flush everything */
+ }
+
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ started_tr = 0;
+ }
+ if (grabbed_lock) {
+ hfs_global_shared_lock_release(hfsmp);
+ grabbed_lock = 0;
}
}
+ if (hfsmp->jnl) {
+ journal_flush(hfsmp->jnl);
+ }
+
/*
* Invalidate our caches and release metadata vnodes
*/
if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord)
(void) hfs_relconverter(hfsmp->hfs_encoding);
+ // XXXdbg
+ if (hfsmp->jnl) {
+ journal_close(hfsmp->jnl);
+ }
+
+ if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
+ retval = VOP_CLOSE(hfsmp->jvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE,
+ NOCRED, p);
+ vrele(hfsmp->jvp);
+ hfsmp->jvp = NULL;
+ }
+ // XXXdbg
+
hfsmp->hfs_devvp->v_specflags &= ~SI_MOUNTEDON;
retval = VOP_CLOSE(hfsmp->hfs_devvp,
hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE,
FREE(hfsmp, M_HFSMNT);
mp->mnt_data = (qaddr_t)0;
return (0);
+
+ err_exit:
+ if (hfsmp->jnl && started_tr) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ if (grabbed_lock) {
+ hfs_global_shared_lock_release(hfsmp);
+ }
+ return retval;
}
}
+
+
/*
* Get file system statistics.
*/
}
+//
+// XXXdbg -- this is a callback to be used by the journal to
+// get meta data blocks flushed out to disk.
+//
+// XXXdbg -- be smarter and don't flush *every* block on each
+// call. try to only flush some so we don't wind up
+// being too synchronous.
+//
+__private_extern__
+void
+hfs_sync_metadata(void *arg)
+{
+ struct mount *mp = (struct mount *)arg;
+ struct cnode *cp;
+ struct hfsmount *hfsmp;
+ ExtendedVCB *vcb;
+ struct vnode *meta_vp[3];
+ struct buf *bp;
+ int i, sectorsize, priIDSector, altIDSector, retval;
+ int error, allerror = 0;
+
+ hfsmp = VFSTOHFS(mp);
+ vcb = HFSTOVCB(hfsmp);
+
+ bflushq(BQ_META, mp);
+
+
+#if 1 // XXXdbg - I do not believe this is necessary...
+ // but if I pull it out, then the journal
+ // does not seem to get flushed properly
+ // when it is closed....
+
+ // now make sure the super block is flushed
+ sectorsize = hfsmp->hfs_phys_block_size;
+ priIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) +
+ HFS_PRI_SECTOR(sectorsize);
+ retval = meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp);
+ if (retval != 0) {
+		panic("hfs: sync_metadata: can't read super-block?! (retval 0x%x, priIDSector %d)\n",
+			  retval, priIDSector);
+ }
+
+ if (retval == 0 && (bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) {
+ bwrite(bp);
+ } else if (bp) {
+ brelse(bp);
+ }
+
+ // the alternate super block...
+ // XXXdbg - we probably don't need to do this each and every time.
+ // hfs_btreeio.c:FlushAlternate() should flag when it was
+ // written...
+ altIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) +
+ HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count);
+ retval = meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &bp);
+ if (retval == 0 && (bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) {
+ bwrite(bp);
+ } else if (bp) {
+ brelse(bp);
+ }
+#endif
+
+}
+
/*
* Go through the disk queues to initiate sandbagged IO;
* go through the inodes to write those that have been modified;
panic("update: rofs mod");
};
+#if 0
+ // XXXdbg first go through and flush out any modified
+ // meta data blocks so they go out in order...
+ bflushq(BQ_META, mp);
+ bflushq(BQ_LRU, mp);
+ // only flush locked blocks if we're not doing journaling
+ if (hfsmp->jnl == NULL) {
+ bflushq(BQ_LOCKED, mp);
+ }
+#endif
+
/*
* Write back each 'modified' vnode
*/
simple_unlock(&mntvnode_slock);
goto loop;
}
+
simple_lock(&vp->v_interlock);
nvp = vp->v_mntvnodes.le_next;
+
cp = VTOC(vp);
+	// skip this vnode if it's locked or being reclaimed
+ if (cp == NULL || vp->v_flag & (VXLOCK|VORECLAIM)) {
+ simple_unlock(&vp->v_interlock);
+ continue;
+ }
+
if ((vp->v_flag & VSYSTEM) || (vp->v_type == VNON) ||
(((cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) == 0) &&
(vp->v_dirtyblkhd.lh_first == NULL) && !(vp->v_flag & VHASDIRTY))) {
btvp = meta_vp[i];
if ((btvp==0) || (btvp->v_type == VNON) || (btvp->v_mount != mp))
continue;
+
simple_lock(&btvp->v_interlock);
cp = VTOC(btvp);
if (((cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE)) == 0) &&
*/
if (IsVCBDirty(vcb)) {
+ // XXXdbg - debugging, remove
+ if (hfsmp->jnl) {
+ //printf("hfs: sync: strange, a journaled volume w/dirty VCB? jnl 0x%x hfsmp 0x%x\n",
+ // hfsmp->jnl, hfsmp);
+ }
+
error = hfs_flushvolumeheader(hfsmp, waitfor, 0);
- if (error)
- allerror = error;
+ if (error)
+ allerror = error;
}
+ if (hfsmp->jnl) {
+ journal_flush(hfsmp->jnl);
+ }
+
+ err_exit:
return (allerror);
}
}
+// XXXdbg
+#include <sys/filedesc.h>
+
+
/*
* HFS filesystem related variables.
*/
extern u_int32_t hfs_encodingbias;
/* all sysctl names at this level are terminal */
- if (namelen != 1)
- return (ENOTDIR); /* overloaded */
if (name[0] == HFS_ENCODINGBIAS)
return (sysctl_int(oldp, oldlenp, newp, newlen,
&hfs_encodingbias));
+ else if (name[0] == 0x082969) {
+ // make the file system journaled...
+ struct vnode *vp = p->p_fd->fd_cdir, *jvp;
+ struct hfsmount *hfsmp;
+ ExtendedVCB *vcb;
+ int retval;
+ struct cat_attr jnl_attr, jinfo_attr;
+ struct cat_fork jnl_fork, jinfo_fork;
+ void *jnl = NULL;
+
+ hfsmp = VTOHFS(vp);
+ if (hfsmp->hfs_fs_ronly) {
+ return EROFS;
+ }
+ if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) {
+ printf("hfs: can't make a plain hfs volume journaled.\n");
+ return EINVAL;
+ }
+
+ if (hfsmp->jnl) {
+ printf("hfs: volume @ mp 0x%x is already journaled!\n", vp->v_mount);
+ return EAGAIN;
+ }
+
+ vcb = HFSTOVCB(hfsmp);
+ if (BTHasContiguousNodes(VTOF(vcb->catalogRefNum)) == 0 ||
+ BTHasContiguousNodes(VTOF(vcb->extentsRefNum)) == 0) {
+
+		printf("hfs: volume has a btree w/non-contiguous nodes. cannot enable journaling.\n");
+ return EINVAL;
+ }
+
+ // make sure these both exist!
+ if ( GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jinfo_attr, &jinfo_fork) == 0
+ || GetFileInfo(vcb, kRootDirID, ".journal", &jnl_attr, &jnl_fork) == 0) {
+
+ return EINVAL;
+ }
+
+ hfs_sync(hfsmp->hfs_mp, MNT_WAIT, FSCRED, p);
+ bflushq(BQ_META);
+
+ printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n",
+ (off_t)name[2], (off_t)name[3]);
+
+ jvp = hfsmp->hfs_devvp;
+ jnl = journal_create(jvp,
+ (off_t)name[2] * (off_t)HFSTOVCB(hfsmp)->blockSize
+ + HFSTOVCB(hfsmp)->hfsPlusIOPosOffset,
+ (off_t)name[3],
+ hfsmp->hfs_devvp,
+ hfsmp->hfs_phys_block_size,
+ 0,
+ 0,
+ hfs_sync_metadata, hfsmp->hfs_mp);
+
+ if (jnl == NULL) {
+ printf("hfs: FAILED to create the journal!\n");
+ if (jvp && jvp != hfsmp->hfs_devvp) {
+ VOP_CLOSE(jvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE, FSCRED, p);
+ }
+ jvp = NULL;
+
+ return EINVAL;
+ }
+
+ hfs_global_exclusive_lock_acquire(hfsmp);
+
+ HFSTOVCB(hfsmp)->vcbJinfoBlock = name[1];
+ HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeJournaledMask;
+ hfsmp->jvp = jvp;
+ hfsmp->jnl = jnl;
+
+ // save this off for the hack-y check in hfs_remove()
+ hfsmp->jnl_start = (u_int32_t)name[2];
+ hfsmp->hfs_jnlinfoblkid = jinfo_attr.ca_fileid;
+ hfsmp->hfs_jnlfileid = jnl_attr.ca_fileid;
+
+ hfsmp->hfs_mp->mnt_flag |= MNT_JOURNALED;
+
+ hfs_global_exclusive_lock_release(hfsmp);
+ hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
+
+ return 0;
+ } else if (name[0] == 0x031272) {
+ // clear the journaling bit
+ struct vnode *vp = p->p_fd->fd_cdir;
+ struct hfsmount *hfsmp;
+ void *jnl;
+ int retval;
+
+ hfsmp = VTOHFS(vp);
+ if (hfsmp->jnl == NULL) {
+ return EINVAL;
+ }
+
+ printf("hfs: disabling journaling for mount @ 0x%x\n", vp->v_mount);
+
+ jnl = hfsmp->jnl;
+
+ hfs_global_exclusive_lock_acquire(hfsmp);
+
+ // Lights out for you buddy!
+ hfsmp->jnl = NULL;
+ journal_close(jnl);
+
+ if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
+ VOP_CLOSE(hfsmp->jvp, hfsmp->hfs_fs_ronly ? FREAD : FREAD|FWRITE, FSCRED, p);
+ }
+ hfsmp->jnl = NULL;
+ hfsmp->jvp = NULL;
+ hfsmp->hfs_mp->mnt_flag &= ~MNT_JOURNALED;
+ hfsmp->jnl_start = 0;
+ hfsmp->hfs_jnlinfoblkid = 0;
+ hfsmp->hfs_jnlfileid = 0;
+
+ HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeJournaledMask;
+
+ hfs_global_exclusive_lock_release(hfsmp);
+ hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
+
+ return 0;
+ }
return (EOPNOTSUPP);
}
--vcb->vcbNmFls;
break;
}
+
+ if (hfsmp->jnl) {
+ hfs_flushvolumeheader(hfsmp, 0, 0);
+ }
+
return (0);
}
ByteCount namelen;
sectorsize = hfsmp->hfs_phys_block_size;
-
retval = bread(hfsmp->hfs_devvp, HFS_PRI_SECTOR(sectorsize), sectorsize, NOCRED, &bp);
if (retval) {
if (bp)
DBG_ASSERT(bp->b_data != NULL);
DBG_ASSERT(bp->b_bcount == size);
+ if (hfsmp->jnl) {
+ panic("hfs: standard hfs volumes should not be journaled!\n");
+ }
+
mdb = (HFSMasterDirectoryBlock *)(bp->b_data + HFS_PRI_OFFSET(sectorsize));
mdb->drCrDate = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbCrDate)));
if (meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &alt_bp) == 0) {
bcopy(mdb, alt_bp->b_data + HFS_ALT_OFFSET(sectorsize), kMDBSize);
+
(void) VOP_BWRITE(alt_bp);
} else if (alt_bp)
brelse(alt_bp);
if (waitfor != MNT_WAIT)
bawrite(bp);
- else
+ else
retval = VOP_BWRITE(bp);
MarkVCBClean( vcb );
priIDSector = (vcb->hfsPlusIOPosOffset / sectorsize) +
HFS_PRI_SECTOR(sectorsize);
+ // XXXdbg
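+ // Journaling bracket: grab the global shared lock first, then start
+ // a transaction. Every exit path below has to end the transaction
+ // (if one was started) and release the shared lock, in that order.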
+ hfs_global_shared_lock_acquire(hfsmp);
+ if (hfsmp->jnl) {
+ if (journal_start_transaction(hfsmp->jnl) != 0) {
+ hfs_global_shared_lock_release(hfsmp);
+ return EINVAL;
+ }
+ }
+
retval = meta_bread(hfsmp->hfs_devvp, priIDSector, sectorsize, NOCRED, &bp);
if (retval) {
if (bp)
brelse(bp);
+
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+
return (retval);
}
+ if (hfsmp->jnl) {
+ journal_modify_block_start(hfsmp->jnl, bp);
+ }
+
volumeHeader = (HFSPlusVolumeHeader *)((char *)bp->b_data + HFS_PRI_OFFSET(sectorsize));
/*
if ( SWAP_BE32 (mdb->drCrDate) != vcb->localCreateDate )
{
+ // XXXdbg
+ if (hfsmp->jnl) {
+ journal_modify_block_start(hfsmp->jnl, bp2);
+ }
+
mdb->drCrDate = SWAP_BE32 (vcb->localCreateDate); /* pick up the new create date */
- (void) VOP_BWRITE(bp2); /* write out the changes */
+ // XXXdbg
+ if (hfsmp->jnl) {
+ journal_modify_block_end(hfsmp->jnl, bp2);
+ } else {
+ (void) VOP_BWRITE(bp2); /* write out the changes */
+ }
}
else
{
}
}
+// XXXdbg - only monkey around with the volume signature on non-root volumes
+//
+#if 0
+ if (hfsmp->jnl &&
+ hfsmp->hfs_fs_ronly == 0 &&
+ (HFSTOVFS(hfsmp)->mnt_flag & MNT_ROOTFS) == 0) {
+
+ int old_sig = volumeHeader->signature;
+
+ if (vcb->vcbAtrb & kHFSVolumeUnmountedMask) {
+ volumeHeader->signature = kHFSPlusSigWord;
+ } else {
+ volumeHeader->signature = kHFSJSigWord;
+ }
+
+ if (old_sig != volumeHeader->signature) {
+ altflush = 1;
+ }
+ }
+#endif
+// XXXdbg
+
/* Note: only update the lower 16 bits worth of attributes */
volumeHeader->attributes = SWAP_BE32 ((SWAP_BE32 (volumeHeader->attributes) & 0xFFFF0000) + (UInt16) vcb->vcbAtrb);
- volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion);
+ volumeHeader->journalInfoBlock = SWAP_BE32(vcb->vcbJinfoBlock);
+ if (hfsmp->jnl) {
+ volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSJMountVersion);
+ } else {
+ volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion);
+ }
volumeHeader->createDate = SWAP_BE32 (vcb->localCreateDate); /* volume create date is in local time */
volumeHeader->modifyDate = SWAP_BE32 (to_hfs_time(vcb->vcbLsMod));
volumeHeader->backupDate = SWAP_BE32 (to_hfs_time(vcb->vcbVolBkUp));
HFS_ALT_SECTOR(sectorsize, hfsmp->hfs_phys_block_count);
if (meta_bread(hfsmp->hfs_devvp, altIDSector, sectorsize, NOCRED, &alt_bp) == 0) {
+ if (hfsmp->jnl) {
+ journal_modify_block_start(hfsmp->jnl, alt_bp);
+ }
+
bcopy(volumeHeader, alt_bp->b_data + HFS_ALT_OFFSET(sectorsize), kMDBSize);
- (void) VOP_BWRITE(alt_bp);
+
+ if (hfsmp->jnl) {
+ journal_modify_block_end(hfsmp->jnl, alt_bp);
+ } else {
+ (void) VOP_BWRITE(alt_bp);
+ }
} else if (alt_bp)
brelse(alt_bp);
}
- if (waitfor != MNT_WAIT)
- bawrite(bp);
- else {
- retval = VOP_BWRITE(bp);
- /* When critical data changes, flush the device cache */
- if (critical && (retval == 0)) {
+ // XXXdbg
+ if (hfsmp->jnl) {
+ journal_modify_block_end(hfsmp->jnl, bp);
+ journal_end_transaction(hfsmp->jnl);
+ } else {
+ if (waitfor != MNT_WAIT)
+ bawrite(bp);
+ else {
+ retval = VOP_BWRITE(bp);
+ /* When critical data changes, flush the device cache */
+ if (critical && (retval == 0)) {
(void) VOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE,
- NULL, FWRITE, NOCRED, current_proc());
+ NULL, FWRITE, NOCRED, current_proc());
+ }
}
}
+ hfs_global_shared_lock_release(hfsmp);
vcb->vcbFlags &= 0x00FF;
return (retval);
static void ReleaseMetaFileVNode(struct vnode *vp);
+static int hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_args);
u_int32_t GetLogicalBlockSize(struct vnode *vp);
//*******************************************************************************
OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
- off_t embeddedOffset, u_int64_t disksize, struct proc *p)
+ off_t embeddedOffset, u_int64_t disksize, struct proc *p, void *args)
{
register ExtendedVCB *vcb;
struct cat_desc cndesc;
UInt32 blockSize;
OSErr retval;
- if (SWAP_BE16(vhp->signature) != kHFSPlusSigWord ||
- SWAP_BE16(vhp->version) != kHFSPlusVersion)
- return (EINVAL);
+ // XXXdbg - added the kHFSJSigWord case
+ if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord &&
+ SWAP_BE16(vhp->signature) != kHFSJSigWord) ||
+ SWAP_BE16(vhp->version) != kHFSPlusVersion) {
+ // XXXdbg
+ printf("hfs: mount: sig 0x%x and version 0x%x are not HFS or HFS+.\n",
+ vhp->signature, vhp->version);
+ return (EINVAL);
+ }
/* Block size must be at least 512 and a power of 2 */
blockSize = SWAP_BE32(vhp->blockSize);
return (EINVAL);
/* don't mount a writable volume if its dirty, it must be cleaned by fsck_hfs */
- if (hfsmp->hfs_fs_ronly == 0 && (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0)
+ if (hfsmp->hfs_fs_ronly == 0 && hfsmp->jnl == NULL && (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0)
return (EINVAL);
/* Make sure we can live with the physical block size. */
vcb = HFSTOVCB(hfsmp);
vcb->vcbSigWord = SWAP_BE16(vhp->signature);
+
+ // XXXdbg - remap this in case we've mounted a dirty journaled volume
+ if (vcb->vcbSigWord == kHFSJSigWord) {
+ vcb->vcbSigWord = kHFSPlusSigWord;
+ }
+
+ vcb->vcbJinfoBlock = SWAP_BE32(vhp->journalInfoBlock);
vcb->vcbLsMod = to_bsd_time(SWAP_BE32(vhp->modifyDate));
vcb->vcbAtrb = (UInt16)SWAP_BE32(vhp->attributes);
vcb->vcbClpSiz = SWAP_BE32(vhp->rsrcClumpSize);
/* mark the volume dirty (clear clean unmount bit) */
vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask;
+ if (hfsmp->jnl && hfsmp->hfs_fs_ronly == 0) {
+ hfs_flushvolumeheader(hfsmp, TRUE, TRUE);
+ }
/*
* all done with metadata files so we can unlock now...
/* setup private/hidden directory for unlinked files */
hfsmp->hfs_private_metadata_dir = FindMetaDataDirectory(vcb);
+ if (hfsmp->jnl && (hfsmp->hfs_fs_ronly == 0))
+ hfs_remove_orphans(hfsmp);
if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask) ) // if the disk is not write protected
{
MarkVCBDirty( vcb ); // mark VCB dirty so it will be written
}
+
+ //
+ // Check if we need to do late journal initialization. This only
+ // happens if a previous version of MacOS X (or 9) touched the disk.
+ // In that case hfs_late_journal_init() will go re-locate the journal
+ // and journal_info_block files and validate that they're still kosher.
+ //
+ if ( (vcb->vcbAtrb & kHFSVolumeJournaledMask)
+ && (SWAP_BE32(vhp->lastMountedVersion) != kHFSJMountVersion)
+ && (hfsmp->jnl == NULL)) {
+
+ retval = hfs_late_journal_init(hfsmp, vhp, args);
+ if (retval != 0) {
+ hfsmp->jnl = NULL;
+ goto ErrorExit;
+ } else if (hfsmp->jnl) {
+ hfsmp->hfs_mp->mnt_flag |= MNT_JOURNALED;
+ }
+ } else if (hfsmp->jnl) {
+ struct cat_attr jinfo_attr, jnl_attr;
+
+ // if we're here we need to fill in the fileid's for the
+ // journal and journal_info_block.
+ hfsmp->hfs_jnlinfoblkid = GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jinfo_attr, NULL);
+ hfsmp->hfs_jnlfileid = GetFileInfo(vcb, kRootDirID, ".journal", &jnl_attr, NULL);
+ if (hfsmp->hfs_jnlinfoblkid == 0 || hfsmp->hfs_jnlfileid == 0) {
+ printf("hfs: danger! couldn't find the file-id's for the journal or journal_info_block\n");
+ printf("hfs: jnlfileid %d, jnlinfoblkid %d\n", hfsmp->hfs_jnlfileid, hfsmp->hfs_jnlinfoblkid);
+ }
+ }
+
+
return (0);
ErrorExit:
fndrinfo->frLocation.h = SWAP_BE16 (22460);
fndrinfo->frFlags |= SWAP_BE16 (kIsInvisible + kNameLocked);
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ if (hfsmp->jnl) {
+ if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+ hfs_global_shared_lock_release(hfsmp);
+ return (0);
+ }
+ }
+
error = cat_create(hfsmp, &hfsmp->hfs_privdir_desc,
&hfsmp->hfs_privdir_attr, &out_desc);
/* Unlock catalog b-tree */
(void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc());
- if (error)
- return (0);
+ if (error) {
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+
+ return (0);
+ }
hfsmp->hfs_privdir_desc.cd_hint = out_desc.cd_hint;
hfsmp->hfs_privdir_desc.cd_cnid = out_desc.cd_cnid;
vput(dvp);
}
hfs_volupdate(hfsmp, VOL_MKDIR, 1);
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+
cat_releasedesc(&out_desc);
return (out_desc.cd_cnid);
}
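+
+/*
+ * GetFileInfo - look up a name in the root directory of the given
+ * volume and return its file ID (0 if it isn't found), optionally
+ * copying out its catalog attributes and fork information. Note
+ * that the dirid argument is currently ignored; lookups always
+ * start at the root directory.
+ */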
+__private_extern__
+u_long
+GetFileInfo(ExtendedVCB *vcb, u_int32_t dirid, char *name,
+ struct cat_attr *fattr, struct cat_fork *forkinfo)
+{
+ struct hfsmount * hfsmp;
+ struct vnode * dvp = NULL;
+ struct cnode * dcp = NULL;
+ struct FndrDirInfo * fndrinfo;
+ struct cat_desc jdesc;
+ struct timeval tv;
+ int error;
+
+ if (vcb->vcbSigWord != kHFSPlusSigWord)
+ return (0);
+
+ hfsmp = VCBTOHFS(vcb);
+
+ memset(&jdesc, 0, sizeof(struct cat_desc));
+ jdesc.cd_parentcnid = kRootDirID;
+ jdesc.cd_nameptr = name;
+ jdesc.cd_namelen = strlen(name);
+
+ /* Lock catalog b-tree */
+ error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, current_proc());
+ if (error)
+ return (0);
+
+ error = cat_lookup(hfsmp, &jdesc, 0, NULL, fattr, forkinfo);
+
+ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc());
+
+ if (error == 0) {
+ return (fattr->ca_fileid);
+ }
+
+ return (0);
+}
+
+
+/*
+ * On Journaled HFS, there can be orphaned files. These
+ * are files that were unlinked while busy. If the volume
+ * was not cleanly unmounted then some of these files may
+ * have persisted and need to be removed.
+ */
+__private_extern__
+void
+hfs_remove_orphans(struct hfsmount * hfsmp)
+{
+ struct BTreeIterator * iterator = NULL;
+ struct FSBufferDescriptor btdata;
+ struct HFSPlusCatalogFile filerec;
+ struct HFSPlusCatalogKey * keyp;
+ FCB *fcb;
+ ExtendedVCB *vcb;
+ char filename[32];
+ char tempname[32];
+ size_t namelen;
+ int catlock = 0;
+ int result, started_tr = 0;
+
+ if (hfsmp->hfs_orphans_cleaned)
+ return;
+
+ vcb = HFSTOVCB(hfsmp);
+ fcb = VTOF(vcb->catalogRefNum);
+
+ btdata.bufferAddress = &filerec;
+ btdata.itemSize = sizeof(filerec);
+ btdata.itemCount = 1;
+
+ MALLOC(iterator, struct BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK);
+ bzero(iterator, sizeof(*iterator));
+ keyp = (HFSPlusCatalogKey*)&iterator->key;
+ keyp->parentID = hfsmp->hfs_private_metadata_dir;
+
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ if (hfsmp->jnl) {
+ if (journal_start_transaction(hfsmp->jnl) != 0) {
+ hfs_global_shared_lock_release(hfsmp);
+ return;
+ }
+ started_tr = 1;
+ }
+
+ /* Lock catalog b-tree */
+ result = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, current_proc());
+ if (result)
+ goto exit;
+ catlock = 1;
+
+ /*
+ * Position the iterator at the folder thread record.
+ * (i.e. one record before first child)
+ */
+ result = BTSearchRecord(fcb, iterator, NULL, NULL, iterator);
+ if (result)
+ goto exit;
+
+ /* Visit all the children in the HFS+ private directory. */
+ for (;;) {
+ result = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
+ if (result)
+ break;
+ if (keyp->parentID != hfsmp->hfs_private_metadata_dir)
+ break;
+ if (filerec.recordType != kHFSPlusFileRecord)
+ continue;
+
+ (void) utf8_encodestr(keyp->nodeName.unicode, keyp->nodeName.length * 2,
+ filename, &namelen, sizeof(filename), 0, 0);
+
+ (void) sprintf(tempname, "%s%d", HFS_DELETE_PREFIX, filerec.fileID);
+
+ /*
+ * Delete all files named "tempxxx", where
+ * xxx is the file's cnid in decimal.
+ *
+ * Delete all files named "iNodexxx", that
+ * have a link count of zero.
+ */
+ if (bcmp(tempname, filename, namelen) == 0) {
+ struct filefork fork = {0};
+ struct cnode cnode = {0};
+
+ // XXXdebug
+ //printf("hfs_remove_orphans: removing %s\n", filename);
+
+ /* Build a fake cnode */
+ cnode.c_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir;
+ cnode.c_desc.cd_nameptr = filename;
+ cnode.c_desc.cd_namelen = namelen;
+ cnode.c_desc.cd_cnid = filerec.fileID;
+ cnode.c_attr.ca_fileid = filerec.fileID;
+ cnode.c_blocks = filerec.dataFork.totalBlocks +
+ filerec.resourceFork.totalBlocks;
+
+ /* Position iterator at previous entry */
+ if (BTIterateRecord(fcb, kBTreePrevRecord, iterator,
+ NULL, NULL) != 0)
+ break;
+
+ /* Truncate the file to zero (both forks) */
+ if (filerec.dataFork.totalBlocks > 0) {
+ fork.ff_cp = &cnode;
+ cnode.c_datafork = &fork;
+ bcopy(&filerec.dataFork, &fork.ff_data, sizeof(struct cat_fork));
+ if (TruncateFileC(vcb, (FCB*)&fork, 0, false) != 0) {
+ printf("error truncting data fork!\n");
+ break;
+ }
+ }
+ if (filerec.resourceFork.totalBlocks > 0) {
+ fork.ff_cp = &cnode;
+ cnode.c_datafork = NULL;
+ cnode.c_rsrcfork = &fork;
+ bcopy(&filerec.resourceFork, &fork.ff_data, sizeof(struct cat_fork));
+ if (TruncateFileC(vcb, (FCB*)&fork, 0, false) != 0) {
+ printf("error truncting rsrc fork!\n");
+ break;
+ }
+ }
+
+ /* Remove the file record from the Catalog */
+ if (cat_delete(hfsmp, &cnode.c_desc, &cnode.c_attr) != 0) {
+ printf("error deleting cat rec!\n");
+ break;
+ }
+
+ /* Update parent and volume counts */
+ hfsmp->hfs_privdir_attr.ca_entries--;
+ (void)cat_update(hfsmp, &hfsmp->hfs_privdir_desc,
+ &hfsmp->hfs_privdir_attr, NULL, NULL);
+ hfs_volupdate(hfsmp, VOL_RMFILE, 0);
+ }
+ }
+
+exit:
+ /* Unlock catalog b-tree */
+ if (catlock)
+ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, current_proc());
+
+ if (started_tr) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
+
+ FREE(iterator, M_TEMP);
+ hfsmp->hfs_orphans_cleaned = 1;
+}
+
/*
* This will return the correct logical block size for a given vnode.
switch (err) {
case dskFulErr: /* -34 */
- case btNoSpaceAvail: /* -32733 */
+ return ENOSPC;
+ case btNoSpaceAvail: /* -32733 */
+ return EFBIG;
case fxOvFlErr: /* -32750 */
- return ENOSPC; /* +28 */
+ return EOVERFLOW;
case btBadNode: /* -32731 */
- return EIO; /* +5 */
+ return EBADF;
case memFullErr: /* -108 */
return ENOMEM; /* +12 */
return EISDIR; /* 21 */
case fxRangeErr: /* -32751 */
- return EIO; /* 5 */
+ return ERANGE;
case bdNamErr: /* -37 */
return ENAMETOOLONG; /* 63 */
}
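+
+/*
+ * hfs_early_journal_init - read the journal info block directly off
+ * the device (no catalog lookups here), then either create the
+ * journal if its "needs init" flag is set, or open and replay the
+ * existing one. When a replay may have changed the MDB, the caller's
+ * copy is re-read from disk.
+ */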
+__private_extern__
+int
+hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
+ void *_args, int embeddedOffset, int mdb_offset,
+ HFSMasterDirectoryBlock *mdbp, struct ucred *cred)
+{
+ JournalInfoBlock *jibp;
+ struct buf *jinfo_bp, *bp;
+ int sectors_per_fsblock, arg_flags=0, arg_tbufsz=0;
+ int retval, blksize = hfsmp->hfs_phys_block_size;
+ struct vnode *devvp;
+ struct hfs_mount_args *args = _args;
+
+ devvp = hfsmp->hfs_devvp;
+
+ if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS)) {
+ arg_flags = args->journal_flags;
+ arg_tbufsz = args->journal_tbuffer_size;
+ }
+
+ sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / blksize;
+
+ retval = meta_bread(devvp,
+ embeddedOffset/blksize +
+ (SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock),
+ SWAP_BE32(vhp->blockSize), cred, &jinfo_bp);
+ if (retval)
+ return retval;
+
+ jibp = (JournalInfoBlock *)jinfo_bp->b_data;
+ jibp->flags = SWAP_BE32(jibp->flags);
+ jibp->offset = SWAP_BE64(jibp->offset);
+ jibp->size = SWAP_BE64(jibp->size);
+
+ if (jibp->flags & kJIJournalInFSMask) {
+ hfsmp->jvp = hfsmp->hfs_devvp;
+ } else {
+ printf("hfs: journal not stored in fs! don't know what to do.\n");
+ brelse(jinfo_bp);
+ return EINVAL;
+ }
+
+ // save this off for the hack-y check in hfs_remove()
+ hfsmp->jnl_start = jibp->offset / SWAP_BE32(vhp->blockSize);
+
+ if (jibp->flags & kJIJournalNeedInitMask) {
+ printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n",
+ jibp->offset + (off_t)embeddedOffset, jibp->size);
+ hfsmp->jnl = journal_create(hfsmp->jvp,
+ jibp->offset + (off_t)embeddedOffset,
+ jibp->size,
+ devvp,
+ blksize,
+ arg_flags,
+ arg_tbufsz,
+ hfs_sync_metadata, hfsmp->hfs_mp);
+
+ // no need to start a transaction here... if this were to fail
+ // we'd just re-init it on the next mount.
+ jibp->flags &= ~kJIJournalNeedInitMask;
+ jibp->flags = SWAP_BE32(jibp->flags);
+ bwrite(jinfo_bp);
+ jinfo_bp = NULL;
+ jibp = NULL;
+ } else {
+ //printf("hfs: Opening the journal (joffset 0x%llx sz 0x%llx vhp_blksize %d)...\n",
+ // jibp->offset + (off_t)embeddedOffset,
+ // jibp->size, SWAP_BE32(vhp->blockSize));
+
+ hfsmp->jnl = journal_open(hfsmp->jvp,
+ jibp->offset + (off_t)embeddedOffset,
+ jibp->size,
+ devvp,
+ blksize,
+ arg_flags,
+ arg_tbufsz,
+ hfs_sync_metadata, hfsmp->hfs_mp);
+
+ brelse(jinfo_bp);
+ jinfo_bp = NULL;
+ jibp = NULL;
+
+ if (hfsmp->jnl && mdbp) {
+ // reload the mdb because it could have changed
+ // if the journal had to be replayed.
+ retval = meta_bread(devvp, mdb_offset, blksize, cred, &bp);
+ if (retval) {
+ brelse(bp);
+ printf("hfs: failed to reload the mdb after opening the journal (retval %d)!\n",
+ retval);
+ return retval;
+ }
+ bcopy(bp->b_data + HFS_PRI_OFFSET(blksize), mdbp, 512);
+ brelse(bp);
+ bp = NULL;
+ }
+ }
+
+
+ //printf("journal @ 0x%x\n", hfsmp->jnl);
+
+ // if we expected the journal to be there and we couldn't
+ // create it or open it then we have to bail out.
+ if (hfsmp->jnl == NULL) {
+ hfsmp->jnl_start = 0;
+
+ printf("hfs: failed to open/create the journal (retval %d).\n", retval);
+ return EINVAL;
+ }
+ return 0;
+}
+
+
+//
+// This function will go and re-locate the .journal_info_block and
+// the .journal files in case they moved (which can happen if you
+// run Norton SpeedDisk). If we fail to find either file we just
+// disable journaling for this volume and return. We turn off the
+// journaling bit in the vcb and assume it will get written to disk
+// later (if it doesn't on the next mount we'd do the same thing
+// again which is harmless). If we disable journaling we don't
+// return an error so that the volume is still mountable.
+//
+// If the info we find for the .journal_info_block and .journal files
+// isn't what we had stored, we re-set our cached info and proceed
+// with opening the journal normally.
+//
+static int
+hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_args)
+{
+ JournalInfoBlock *jibp;
+ struct buf *jinfo_bp, *bp;
+ int sectors_per_fsblock, arg_flags=0, arg_tbufsz=0;
+ int retval, need_flush = 0, write_jibp = 0;
+ struct vnode *devvp;
+ struct cat_attr jib_attr, jattr;
+ struct cat_fork jib_fork, jfork;
+ ExtendedVCB *vcb;
+ u_long fid;
+ struct hfs_mount_args *args = _args;
+
+ devvp = hfsmp->hfs_devvp;
+ vcb = HFSTOVCB(hfsmp);
+
+ if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS)) {
+ if (args->journal_disable) {
+ return 0;
+ }
+
+ arg_flags = args->journal_flags;
+ arg_tbufsz = args->journal_tbuffer_size;
+ }
+
+ fid = GetFileInfo(vcb, kRootDirID, ".journal_info_block", &jib_attr, &jib_fork);
+ if (fid == 0 || jib_fork.cf_extents[0].startBlock == 0 || jib_fork.cf_size == 0) {
+ printf("hfs: can't find the .journal_info_block! disabling journaling (start: %d).\n",
+ jib_fork.cf_extents[0].startBlock);
+ vcb->vcbAtrb &= ~kHFSVolumeJournaledMask;
+ return 0;
+ }
+ hfsmp->hfs_jnlinfoblkid = fid;
+
+ // make sure the journal_info_block begins where we think it should.
+ if (SWAP_BE32(vhp->journalInfoBlock) != jib_fork.cf_extents[0].startBlock) {
+ printf("hfs: The journal_info_block moved (was: %d; is: %d). Fixing up\n",
+ SWAP_BE32(vhp->journalInfoBlock), jib_fork.cf_extents[0].startBlock);
+
+ vcb->vcbJinfoBlock = jib_fork.cf_extents[0].startBlock;
+ vhp->journalInfoBlock = SWAP_BE32(jib_fork.cf_extents[0].startBlock);
+ }
+
+
+ sectors_per_fsblock = SWAP_BE32(vhp->blockSize) / hfsmp->hfs_phys_block_size;
+ retval = meta_bread(devvp,
+ vcb->hfsPlusIOPosOffset / hfsmp->hfs_phys_block_size +
+ (SWAP_BE32(vhp->journalInfoBlock)*sectors_per_fsblock),
+ SWAP_BE32(vhp->blockSize), NOCRED, &jinfo_bp);
+ if (retval) {
+ printf("hfs: can't read journal info block. disabling journaling.\n");
+ vcb->vcbAtrb &= ~kHFSVolumeJournaledMask;
+ return 0;
+ }
+
+ jibp = (JournalInfoBlock *)jinfo_bp->b_data;
+ jibp->flags = SWAP_BE32(jibp->flags);
+ jibp->offset = SWAP_BE64(jibp->offset);
+ jibp->size = SWAP_BE64(jibp->size);
+
+ fid = GetFileInfo(vcb, kRootDirID, ".journal", &jattr, &jfork);
+ if (fid == 0 || jfork.cf_extents[0].startBlock == 0 || jfork.cf_size == 0) {
+ printf("hfs: can't find the journal file! disabling journaling (start: %d)\n",
+ jfork.cf_extents[0].startBlock);
+ brelse(jinfo_bp);
+ vcb->vcbAtrb &= ~kHFSVolumeJournaledMask;
+ return 0;
+ }
+ hfsmp->hfs_jnlfileid = fid;
+
+ // make sure the journal file begins where we think it should.
+ if ((jibp->offset / (u_int64_t)vcb->blockSize) != jfork.cf_extents[0].startBlock) {
+ printf("hfs: The journal file moved (was: %lld; is: %d). Fixing up\n",
+ (jibp->offset / (u_int64_t)vcb->blockSize), jfork.cf_extents[0].startBlock);
+
+ jibp->offset = (u_int64_t)jfork.cf_extents[0].startBlock * (u_int64_t)vcb->blockSize;
+ write_jibp = 1;
+ }
+
+ // check the size of the journal file.
+ if (jibp->size != (u_int64_t)jfork.cf_extents[0].blockCount*vcb->blockSize) {
+ printf("hfs: The journal file changed size! (was %lld; is %lld). Fixing up.\n",
+ jibp->size, (u_int64_t)jfork.cf_extents[0].blockCount*vcb->blockSize);
+
+ jibp->size = (u_int64_t)jfork.cf_extents[0].blockCount * vcb->blockSize;
+ write_jibp = 1;
+ }
+
+ if (jibp->flags & kJIJournalInFSMask) {
+ hfsmp->jvp = hfsmp->hfs_devvp;
+ } else {
+ printf("hfs: journal not stored in fs! don't know what to do.\n");
+ brelse(jinfo_bp);
+ return EINVAL;
+ }
+
+ // save this off for the hack-y check in hfs_remove()
+ hfsmp->jnl_start = jibp->offset / SWAP_BE32(vhp->blockSize);
+
+ if (jibp->flags & kJIJournalNeedInitMask) {
+ printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n",
+ jibp->offset + (off_t)vcb->hfsPlusIOPosOffset, jibp->size);
+ hfsmp->jnl = journal_create(hfsmp->jvp,
+ jibp->offset + (off_t)vcb->hfsPlusIOPosOffset,
+ jibp->size,
+ devvp,
+ hfsmp->hfs_phys_block_size,
+ arg_flags,
+ arg_tbufsz,
+ hfs_sync_metadata, hfsmp->hfs_mp);
+
+ // no need to start a transaction here... if this were to fail
+ // we'd just re-init it on the next mount.
+ jibp->flags &= ~kJIJournalNeedInitMask;
+ write_jibp = 1;
+
+ } else {
+ //
+ // if we weren't the last person to mount this volume
+ // then we need to throw away the journal because it
+ // is likely that someone else mucked with the disk.
+ // if the journal is empty this is no big deal. if the
+ // disk is dirty this prevents us from replaying the
+ // journal over top of changes that someone else made.
+ //
+ arg_flags |= JOURNAL_RESET;
+
+ //printf("hfs: Opening the journal (joffset 0x%llx sz 0x%llx vhp_blksize %d)...\n",
+ // jibp->offset + (off_t)vcb->hfsPlusIOPosOffset,
+ // jibp->size, SWAP_BE32(vhp->blockSize));
+
+ hfsmp->jnl = journal_open(hfsmp->jvp,
+ jibp->offset + (off_t)vcb->hfsPlusIOPosOffset,
+ jibp->size,
+ devvp,
+ hfsmp->hfs_phys_block_size,
+ arg_flags,
+ arg_tbufsz,
+ hfs_sync_metadata, hfsmp->hfs_mp);
+ }
+
+
+ if (write_jibp) {
+ jibp->flags = SWAP_BE32(jibp->flags);
+ jibp->offset = SWAP_BE64(jibp->offset);
+ jibp->size = SWAP_BE64(jibp->size);
+
+ bwrite(jinfo_bp);
+ } else {
+ brelse(jinfo_bp);
+ }
+ jinfo_bp = NULL;
+ jibp = NULL;
+
+ //printf("journal @ 0x%x\n", hfsmp->jnl);
+
+ // if we expected the journal to be there and we couldn't
+ // create it or open it then we have to bail out.
+ if (hfsmp->jnl == NULL) {
+ hfsmp->jnl_start = 0;
+
+ printf("hfs: failed to open/create the journal (retval %d).\n", retval);
+ return EINVAL;
+ }
+
+ return 0;
+}
if (cp->c_flags & (IMMUTABLE | APPEND))
return (EPERM);
+
+ // XXXdbg - don't allow modification of the journal or journal_info_block
+ if (VTOHFS(vp)->jnl && cp->c_datafork) {
+ struct HFSPlusExtentDescriptor *extd;
+
+ extd = &cp->c_datafork->ff_data.cf_extents[0];
+ if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) {
+ return EPERM;
+ }
+ }
+
/*
* Go through the fields and update iff not VNOVAL.
*/
if (VTOVCB(vp)->vcbSigWord != kHFSPlusSigWord)
return (0);
+ // XXXdbg - don't allow modification of the journal or journal_info_block
+ if (VTOHFS(vp)->jnl && cp && cp->c_datafork) {
+ struct HFSPlusExtentDescriptor *extd;
+
+ extd = &cp->c_datafork->ff_data.cf_extents[0];
+ if (extd->startBlock == VTOVCB(vp)->vcbJinfoBlock || extd->startBlock == VTOHFS(vp)->jnl_start) {
+ return EPERM;
+ }
+ }
+
#if OVERRIDE_UNKNOWN_PERMISSIONS
if (VTOVFS(vp)->mnt_flag & MNT_UNKNOWNPERMISSIONS) {
return (0);
struct hfsmount *hfsmp = VTOHFS(from_vp);
struct cat_desc tempdesc;
struct cat_attr tempattr;
- int error = 0;
+ int error = 0, started_tr = 0, grabbed_lock = 0;
/* The files must be on the same volume. */
if (from_vp->v_mount != to_vp->v_mount)
VNODE_IS_RSRC(from_vp) || VNODE_IS_RSRC(to_vp))
return (EINVAL);
+ // XXXdbg - don't allow modification of the journal or journal_info_block
+ if (hfsmp->jnl) {
+ struct HFSPlusExtentDescriptor *extd;
+
+ if (from_cp->c_datafork) {
+ extd = &from_cp->c_datafork->ff_data.cf_extents[0];
+ if (extd->startBlock == VTOVCB(from_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+ return EPERM;
+ }
+ }
+
+ if (to_cp->c_datafork) {
+ extd = &to_cp->c_datafork->ff_data.cf_extents[0];
+ if (extd->startBlock == VTOVCB(to_vp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+ return EPERM;
+ }
+ }
+ }
+
from_rvp = from_cp->c_rsrc_vp;
to_rvp = to_cp->c_rsrc_vp;
if (to_rvp)
(void) vinvalbuf(to_rvp, V_SAVE, ap->a_cred, ap->a_p, 0, 0);
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ grabbed_lock = 1;
+ if (hfsmp->jnl) {
+ if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+ goto Err_Exit;
+ }
+ started_tr = 1;
+ }
+
/* Lock catalog b-tree */
error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, ap->a_p);
if (error) goto Err_Exit;
* (except the modify date)
*/
bcopy(&to_cp->c_desc, &from_cp->c_desc, sizeof(struct cat_desc));
+
from_cp->c_hint = 0;
from_cp->c_fileid = from_cp->c_cnid;
from_cp->c_itime = to_cp->c_itime;
if (from_rvp)
vrele(from_rvp);
+ // XXXdbg
+ if (started_tr) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ if (grabbed_lock) {
+ hfs_global_shared_lock_release(hfsmp);
+ }
+
return (error);
}
IN struct proc *p;
*/
-
static int
hfs_fsync(ap)
struct vop_fsync_args /* {
register struct buf *bp;
struct timeval tv;
struct buf *nbp;
+ struct hfsmount *hfsmp = VTOHFS(ap->a_vp);
int s;
int wait;
int retry = 0;
* for regular files write out any clusters
*/
if (vp->v_flag & VSYSTEM) {
- if (VTOF(vp)->fcbBTCBPtr != NULL)
- BTFlushPath(VTOF(vp));
+ if (VTOF(vp)->fcbBTCBPtr != NULL) {
+ // XXXdbg
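+ // on a journaled volume the b-tree control data should already have
+ // been written out as part of a transaction, so finding it dirty
+ // here means something modified it outside the journal.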
+ if (hfsmp->jnl) {
+ if (BTIsDirty(VTOF(vp))) {
+ panic("hfs: system file vp 0x%x has dirty blocks (jnl 0x%x)\n",
+ vp, hfsmp->jnl);
+ }
+ } else {
+ BTFlushPath(VTOF(vp));
+ }
+ }
} else if (UBCINFOEXISTS(vp))
(void) cluster_push(vp);
if ((bp->b_flags & B_BUSY))
continue;
if ((bp->b_flags & B_DELWRI) == 0)
- panic("hfs_fsync: not dirty");
+ panic("hfs_fsync: bp 0x% not dirty (hfsmp 0x%x)", bp, hfsmp);
+ // XXXdbg
+ if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) {
+ if ((bp->b_flags & B_META) == 0) {
+ panic("hfs: bp @ 0x%x is locked but not meta! jnl 0x%x\n",
+ bp, hfsmp->jnl);
+ }
+ // if journal_active() returns >= 0 then the journal is ok and we
+ // shouldn't do anything to this locked block (because it is part
+ // of a transaction). otherwise we'll just go through the normal
+ // code path and flush the buffer.
+ if (journal_active(hfsmp->jnl) >= 0) {
+ continue;
+ }
+ }
+
bremfree(bp);
bp->b_flags |= B_BUSY;
/* Clear B_LOCKED, should only be set on meta files */
bp->b_flags &= ~B_LOCKED;
+
splx(s);
/*
* Wait for I/O associated with indirect blocks to complete,
tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "hfs_fsync", 0);
}
- if (vp->v_dirtyblkhd.lh_first) {
+ // XXXdbg -- is checking for hfsmp->jnl == NULL the right
+ // thing to do?
+ if (hfsmp->jnl == NULL && vp->v_dirtyblkhd.lh_first) {
/* still have some dirty buffers */
if (retry++ > 10) {
vprint("hfs_fsync: dirty", vp);
vp = HFSTOVCB(hfsmp)->catalogRefNum;
+ // XXXdbg - don't need to do this on a journaled volume
+ if (hfsmp->jnl) {
+ return 0;
+ }
+
if (hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p) != 0)
return (0);
register struct buf *bp;
struct timeval tv;
struct buf *nbp;
+ struct hfsmount *hfsmp = VTOHFS(vp);
int s;
/*
if ((bp->b_flags & B_BUSY))
continue;
if ((bp->b_flags & B_DELWRI) == 0)
- panic("hfs_fsync: not dirty");
+ panic("hfs_btsync: not dirty (bp 0x%x hfsmp 0x%x)", bp, hfsmp);
+
+ // XXXdbg
+ if (hfsmp->jnl && (bp->b_flags & B_LOCKED)) {
+ if ((bp->b_flags & B_META) == 0) {
+ panic("hfs: bp @ 0x%x is locked but not meta! jnl 0x%x\n",
+ bp, hfsmp->jnl);
+ }
+ // if journal_active() returns >= 0 then the journal is ok and we
+ // shouldn't do anything to this locked block (because it is part
+ // of a transaction). otherwise we'll just go through the normal
+ // code path and flush the buffer.
+ if (journal_active(hfsmp->jnl) >= 0) {
+ continue;
+ }
+ }
+
if (sync_transaction && !(bp->b_flags & B_LOCKED))
continue;
bremfree(bp);
bp->b_flags |= B_BUSY;
bp->b_flags &= ~B_LOCKED;
+
splx(s);
(void) bawrite(bp);
struct cnode *dcp;
struct hfsmount * hfsmp;
struct timeval tv;
- int error = 0;
+ int error = 0, started_tr = 0, grabbed_lock = 0;
cp = VTOC(vp);
dcp = VTOC(dvp);
vput(vp);
return (EINVAL); /* cannot remove "." */
}
+
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ grabbed_lock = 1;
+ if (hfsmp->jnl) {
+ if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+ goto out;
+ }
+ started_tr = 1;
+ }
+
/*
* Verify the directory is empty (and valid).
* (Rmdir ".." won't be valid since
dcp->c_flag |= C_CHANGE | C_UPDATE;
tv = time;
(void) VOP_UPDATE(dvp, &tv, &tv, 0);
+
hfs_volupdate(hfsmp, VOL_RMDIR, (dcp->c_cnid == kHFSRootFolderID));
cp->c_mode = 0; /* Makes the vnode go away...see inactive */
if (dvp)
vput(dvp);
vput(vp);
+
+ // XXXdbg
+ if (started_tr) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ if (grabbed_lock) {
+ hfs_global_shared_lock_release(hfsmp);
+ }
+
return (error);
}
int truncated = 0;
struct timeval tv;
int error = 0;
+ int started_tr = 0, grabbed_lock = 0;
/* Redirect directories to rmdir */
if (vp->v_type == VDIR)
VNODE_IS_RSRC(vp)) {
error = EPERM;
goto out;
- }
+ }
/*
* Acquire a vnode for a non-empty resource fork.
goto out;
}
+ // XXXdbg - don't allow deleting the journal or journal_info_block
+ if (hfsmp->jnl && cp->c_datafork) {
+ struct HFSPlusExtentDescriptor *extd;
+
+ extd = &cp->c_datafork->ff_data.cf_extents[0];
+ if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+ error = EPERM;
+ goto out;
+ }
+ }
+
/*
* Check if this file is being used.
*
goto out;
}
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ grabbed_lock = 1;
+ if (hfsmp->jnl) {
+ if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+ goto out;
+ }
+ started_tr = 1;
+ }
+
/* Remove our entry from the namei cache. */
cache_purge(vp);
+ // XXXdbg - if we're journaled, kill any dirty symlink buffers
+ if (hfsmp->jnl && vp->v_type == VLNK && vp->v_dirtyblkhd.lh_first) {
+ struct buf *bp, *nbp;
+
+ recheck:
+ for (bp=vp->v_dirtyblkhd.lh_first; bp; bp=nbp) {
+ nbp = bp->b_vnbufs.le_next;
+
+ if ((bp->b_flags & B_BUSY)) {
+ // if it was busy, someone else must be dealing
+ // with it so just move on.
+ continue;
+ }
+
+ if (!(bp->b_flags & B_META)) {
+ panic("hfs: symlink bp @ 0x%x is not marked meta-data!\n", bp);
+ }
+
+ // if it's part of the current transaction, kill it.
+ if (bp->b_flags & B_LOCKED) {
+ bremfree(bp);
+ bp->b_flags |= B_BUSY;
+ journal_kill_block(hfsmp->jnl, bp);
+ goto recheck;
+ }
+ }
+ }
+ // XXXdbg
+
/*
* Truncate any non-busy forks. Busy forks will
* get truncated when their vnode goes inactive.
if (error)
goto out;
+ /* Delete the link record */
error = cat_delete(hfsmp, &desc, &cp->c_attr);
+ if ((error == 0) && (--cp->c_nlink < 1)) {
+ char inodename[32];
+ char delname[32];
+ struct cat_desc to_desc;
+ struct cat_desc from_desc;
+
+ /*
+ * This is now essentially an open deleted file.
+ * Rename it to reflect this state which makes
+ * orphan file cleanup easier (see hfs_remove_orphans).
+ * Note: a rename failure here is not fatal.
+ */
+ MAKE_INODE_NAME(inodename, cp->c_rdev);
+ bzero(&from_desc, sizeof(from_desc));
+ from_desc.cd_nameptr = inodename;
+ from_desc.cd_namelen = strlen(inodename);
+ from_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir;
+ from_desc.cd_flags = 0;
+ from_desc.cd_cnid = cp->c_fileid;
+
+ MAKE_DELETED_NAME(delname, cp->c_fileid);
+ bzero(&to_desc, sizeof(to_desc));
+ to_desc.cd_nameptr = delname;
+ to_desc.cd_namelen = strlen(delname);
+ to_desc.cd_parentcnid = hfsmp->hfs_private_metadata_dir;
+ to_desc.cd_flags = 0;
+ to_desc.cd_cnid = cp->c_fileid;
+
+ (void) cat_rename(hfsmp, &from_desc, &hfsmp->hfs_privdir_desc,
+ &to_desc, (struct cat_desc *)NULL);
+ cp->c_flag |= C_DELETED;
+ }
+
/* Unlock the Catalog */
(void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
goto out;
cp->c_flag |= C_CHANGE;
- if (--cp->c_nlink < 1)
- cp->c_flag |= C_DELETED;
+ tv = time;
+ (void) VOP_UPDATE(vp, &tv, &tv, 0);
+
hfs_volupdate(hfsmp, VOL_RMFILE, (dcp->c_cnid == kHFSRootFolderID));
} else if (dataforkbusy || rsrcforkbusy) {
/* Lock catalog b-tree */
error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
- if (error) goto out;
+ if (error)
+ goto out;
error = cat_rename(hfsmp, &cp->c_desc, &todir_desc,
&to_desc, (struct cat_desc *)NULL);
- hfsmp->hfs_privdir_attr.ca_entries++;
+ // XXXdbg - only bump this count if we were successful
+ if (error == 0) {
+ hfsmp->hfs_privdir_attr.ca_entries++;
+ }
(void)cat_update(hfsmp, &hfsmp->hfs_privdir_desc,
&hfsmp->hfs_privdir_attr, NULL, NULL);
cp->c_flag |= C_CHANGE | C_DELETED | C_NOEXISTS;
--cp->c_nlink;
+ tv = time;
+ (void) VOP_UPDATE(vp, &tv, &tv, 0);
} else /* Not busy */ {
- /* Lock catalog b-tree */
- error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
- if (error) goto out;
-
if (vp->v_type == VDIR && cp->c_entries > 0)
panic("hfs_remove: attempting to delete a non-empty directory!");
if (vp->v_type != VDIR && cp->c_blocks > 0)
panic("hfs_remove: attempting to delete a non-empty file!");
+ /* Lock catalog b-tree */
+ error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
+ if (error)
+ goto out;
+
error = cat_delete(hfsmp, &cp->c_desc, &cp->c_attr);
- if (error && truncated)
- panic("hfs_remove: couldn't delete a truncated file!");
+ if (error && error != ENXIO && truncated) {
+ if ((cp->c_datafork && cp->c_datafork->ff_data.cf_size != 0) ||
+ (cp->c_rsrcfork && cp->c_rsrcfork->ff_data.cf_size != 0)) {
+ panic("hfs: remove: couldn't delete a truncated file! (%d, data sz %lld; rsrc sz %lld)",
+ error, cp->c_datafork->ff_data.cf_size, cp->c_rsrcfork->ff_data.cf_size);
+ } else {
+ printf("hfs: remove: strangely enough, deleting truncated file %s (%d) got err %d\n",
+ cp->c_desc.cd_nameptr, cp->c_attr.ca_fileid, error);
+ }
+ }
/* Unlock the Catalog */
(void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
if (rvp)
vrele(rvp);
VOP_UNLOCK(vp, 0, p);
- (void) ubc_uncache(vp);
+ // XXXdbg - try to prevent the lost ubc_info panic
+ if ((cp->c_flag & C_HARDLINK) == 0 || cp->c_nlink == 0) {
+ (void) ubc_uncache(vp);
+ }
vrele(vp);
vput(dvp);
+
+ // XXXdbg
+ if (started_tr) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ if (grabbed_lock) {
+ hfs_global_shared_lock_release(hfsmp);
+ }
+
return (0);
+
out:
if (rvp)
vrele(rvp);
}
vput(vp);
vput(dvp);
+
+ // XXXdbg
+ if (started_tr) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ if (grabbed_lock) {
+ hfs_global_shared_lock_release(hfsmp);
+ }
+
return (error);
}
struct hfsmount *hfsmp;
struct proc *p = fcnp->cn_proc;
struct timeval tv;
- int retval = 0;
+ int retval = 0, started_tr = 0, grabbed_lock = 0;
+ int fdvp_locked = 0;
+ int fvp_locked = 0;
cnid_t oldparent = 0;
cnid_t newparent = 0;
+ // XXXdbg
+ if (fvp)
+ hfsmp = VTOHFS(fvp);
+ else if (tvp)
+ hfsmp = VTOHFS(tvp);
+ else
+ hfsmp = NULL;
+
#if HFS_DIAGNOSTIC
if ((tcnp->cn_flags & HASBUF) == 0 ||
(fcnp->cn_flags & HASBUF) == 0)
goto abortop;
}
- if ((retval = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p)))
- goto abortop;
-
/*
* Make sure "from" vnode and its parent are changeable.
*/
fcp = VTOC(fvp);
oldparent = fdcp->c_cnid;
if ((fcp->c_flags & (IMMUTABLE | APPEND)) || (fdcp->c_flags & APPEND)) {
- VOP_UNLOCK(fvp, 0, p);
retval = EPERM;
goto abortop;
}
if (fcp->c_parentcnid != fdcp->c_cnid) {
- VOP_UNLOCK(fvp, 0, p);
retval = EINVAL;
goto abortop;
}
if (fvp == ap->a_tvp &&
(bcmp(fcp->c_desc.cd_nameptr, tcnp->cn_nameptr,
fcp->c_desc.cd_namelen) == 0)) {
- VOP_UNLOCK(fvp, 0, p);
retval = 0;
goto abortop;
}
|| fdcp == fcp
|| (fcnp->cn_flags&ISDOTDOT)
|| (fcp->c_flag & C_RENAME)) {
- VOP_UNLOCK(fvp, 0, p);
retval = EINVAL;
goto abortop;
}
newparent = tdcp->c_cnid;
+ // XXXdbg - don't allow renaming the journal or journal_info_block
+ if (hfsmp->jnl && fcp->c_datafork) {
+ struct HFSPlusExtentDescriptor *extd;
+
+ extd = &fcp->c_datafork->ff_data.cf_extents[0];
+ if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+ retval = EPERM;
+ goto bad;
+ }
+ }
+
+ if (hfsmp->jnl && tcp && tcp->c_datafork) {
+ struct HFSPlusExtentDescriptor *extd;
+
+ extd = &tcp->c_datafork->ff_data.cf_extents[0];
+ if (extd->startBlock == HFSTOVCB(hfsmp)->vcbJinfoBlock || extd->startBlock == hfsmp->jnl_start) {
+ retval = EPERM;
+ goto bad;
+ }
+ }
+
retval = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc);
if ((fvp->v_type == VDIR) && (newparent != oldparent)) {
if (retval) /* write access check above */
}
retval = 0; /* Reset value from above, we dont care about it anymore */
+ /* XXX
+ * Prevent lock hierarchy violation (deadlock):
+ *
+ * If fdvp is the parent of tdvp then we must drop
+ * the tdvp lock before acquiring the lock for fdvp.
+ *
+ * XXXdbg - moved this to happen up here *before* we
+ * start a transaction. otherwise we can
+ * deadlock because the vnode layer may get
+ * this lock for someone else and then they'll
+ * never be able to start a transaction.
+ */
+ if (newparent != oldparent) {
+ if (fdcp->c_cnid == tdcp->c_parentcnid) {
+ vput(tdvp);
+ vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p);
+ vget(tdvp, LK_EXCLUSIVE | LK_RETRY, p);
+ } else {
+ vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p);
+ }
+ }
+ fdvp_locked = 1;
+ if ((retval = vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p)))
+ goto bad;
+ fvp_locked = 1;
+
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ grabbed_lock = 1;
+ if (hfsmp->jnl) {
+ if ((retval = journal_start_transaction(hfsmp->jnl)) != 0) {
+ goto bad;
+ }
+ started_tr = 1;
+ }
+
/*
* If the destination exists, then be sure its type (file or dir)
* matches that of the source. And, if it is a directory make sure
}
- /* XXX
- * Prevent lock heirarchy violation (deadlock):
- *
- * If fdvp is the parent of tdvp then we must drop
- * tdvp lock before aquiring the lock for fdvp.
- */
- if (newparent != oldparent)
- vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY, p);
-
/* remove the existing entry from the namei cache: */
cache_purge(fvp);
- hfsmp = VTOHFS(fvp);
bzero(&from_desc, sizeof(from_desc));
from_desc.cd_nameptr = fcnp->cn_nameptr;
from_desc.cd_namelen = fcnp->cn_namelen;
/* Lock catalog b-tree */
retval = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_EXCLUSIVE, p);
if (retval) {
- if (newparent != oldparent) /* unlock the lock we just got */
- VOP_UNLOCK(fdvp, 0, p);
goto bad;
}
- retval = cat_rename(hfsmp, &from_desc, &tdcp->c_desc,
- &to_desc, &out_desc);
+ retval = cat_rename(hfsmp, &from_desc, &tdcp->c_desc,
+ &to_desc, &out_desc);
/* Unlock catalog b-tree */
(void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
- if (newparent != oldparent)
+ if (newparent != oldparent) {
VOP_UNLOCK(fdvp, 0, p);
+ fdvp_locked = 0;
+ }
if (retval) goto bad;
fdcp->c_entries--;
tdcp->c_nlink++;
tdcp->c_entries++;
- fdcp->c_flag |= C_UPDATE;
- tdcp->c_flag |= C_UPDATE;
+ fdcp->c_flag |= C_CHANGE | C_UPDATE;
+ tdcp->c_flag |= C_CHANGE | C_UPDATE;
tv = time;
CTIMES(fdcp, &tv, &tv);
CTIMES(tdcp, &tv, &tv);
tdcp->c_childhint = out_desc.cd_hint; /* Cache directory's location */
+ // make sure both directories get updated on disk.
+ if (fdvp != tdvp) {
+ (void) VOP_UPDATE(fdvp, &tv, &tv, 0);
+ }
+ (void) VOP_UPDATE(tdvp, &tv, &tv, 0);
+
hfs_volupdate(hfsmp, fvp->v_type == VDIR ? VOL_RMDIR : VOL_RMFILE,
(fdcp->c_cnid == kHFSRootFolderID));
hfs_volupdate(hfsmp, fvp->v_type == VDIR ? VOL_MKDIR : VOL_MKFILE,
vput(tdvp);
vrele(fdvp);
vput(fvp);
+
+ // XXXdbg
+ if (started_tr) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ if (grabbed_lock) {
+ hfs_global_shared_lock_release(hfsmp);
+ }
+
return (0);
bad:
if (fcp)
fcp->c_flag &= ~C_RENAME;
+
+ // XXXdbg make sure both directories get updated on disk
+ // (tv may not have been set if we bailed out early, so grab the
+ // current time here).
+ tv = time;
+ if (fdvp != tdvp) {
+ (void) VOP_UPDATE(fdvp, &tv, &tv, 0);
+ }
+ (void) VOP_UPDATE(tdvp, &tv, &tv, 0);
+
if (tdvp == tvp)
vrele(tdvp);
else
vput(tdvp);
if (tvp)
vput(tvp);
- vrele(fdvp);
- if (VOP_ISLOCKED(fvp))
+ if (fdvp_locked)
+ vput(fdvp);
+ else
+ vrele(fdvp);
+
+ if (fvp_locked)
vput(fvp);
else
vrele(fvp);
+
+ // XXXdbg
+ if (started_tr) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ if (grabbed_lock) {
+ hfs_global_shared_lock_release(hfsmp);
+ }
+
return (retval);
abortop:
VOP_ABORTOP(fdvp, fcnp);
vrele(fdvp);
vrele(fvp);
+
return (retval);
}
} */ *ap;
{
register struct vnode *vp, **vpp = ap->a_vpp;
+ struct hfsmount *hfsmp;
struct filefork *fp;
int len, error;
struct buf *bp = NULL;
return (EINVAL);
}
+
+ hfsmp = VTOHFS(ap->a_dvp);
+
/* Create the vnode */
if ((error = hfs_makenode(S_IFLNK | ap->a_vap->va_mode,
- ap->a_dvp, vpp, ap->a_cnp)))
+ ap->a_dvp, vpp, ap->a_cnp))) {
return (error);
+ }
vp = *vpp;
len = strlen(ap->a_target);
fp = VTOF(vp);
fp->ff_clumpsize = VTOVCB(vp)->blockSize;
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ if (hfsmp->jnl) {
+ if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+ hfs_global_shared_lock_release(hfsmp);
+ VOP_ABORTOP(ap->a_dvp, ap->a_cnp);
+ vput(ap->a_dvp);
+ return (error);
+ }
+ }
+
/* Allocate space for the link */
error = VOP_TRUNCATE(vp, len, IO_NOZEROFILL,
ap->a_cnp->cn_cred, ap->a_cnp->cn_proc);
/* Write the link to disk */
bp = getblk(vp, 0, roundup((int)fp->ff_size, VTOHFS(vp)->hfs_phys_block_size),
0, 0, BLK_META);
+ if (hfsmp->jnl) {
+ journal_modify_block_start(hfsmp->jnl, bp);
+ }
bzero(bp->b_data, bp->b_bufsize);
bcopy(ap->a_target, bp->b_data, len);
- bawrite(bp);
+ if (hfsmp->jnl) {
+ journal_modify_block_end(hfsmp->jnl, bp);
+ } else {
+ bawrite(bp);
+ }
out:
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
vput(vp);
return (error);
}
off_t off = uio->uio_offset;
int retval = 0;
int eofflag = 0;
-
+ void *user_start = NULL;
+ int user_len;
+
/* We assume it's all one big buffer... */
if (uio->uio_iovcnt > 1 || uio->uio_resid < AVERAGE_HFSDIRENTRY_SIZE)
return EINVAL;
+ // XXXdbg
+ // We have to lock the user's buffer here so that we won't
+ // fault on it after we've acquired a shared lock on the
+ // catalog file. The issue is that you can get a 3-way
+ // deadlock if someone else starts a transaction and then
+ // tries to lock the catalog file but can't because we're
+ // here and we can't service our page fault because VM is
+ // blocked trying to start a transaction as a result of
+ // trying to free up pages for our page fault. It's messy
+ // but it does happen on dual-processors that are paging
+ // heavily (see radar 3082639 for more info). By locking
+ // the buffer up-front we prevent ourselves from faulting
+ // while holding the shared catalog file lock.
+ //
+ // Fortunately this and hfs_search() are the only two places
+ // currently (10/30/02) that can fault on user data with a
+ // shared lock on the catalog file.
+ //
+ if (hfsmp->jnl && uio->uio_segflg == UIO_USERSPACE) {
+ user_start = uio->uio_iov->iov_base;
+ user_len = uio->uio_iov->iov_len;
+
+ if ((retval = vslock(user_start, user_len)) != 0) {
+ return retval;
+ }
+ }
+
+
/* Create the entries for . and .. */
if (uio->uio_offset < sizeof(rootdots)) {
caddr_t dep;
}
Exit:;
+ if (hfsmp->jnl && user_start) {
+ vsunlock(user_start, user_len, TRUE);
+ }
+
if (ap->a_eofflag)
*ap->a_eofflag = eofflag;
}
bcopy(bp->b_data, fp->ff_symlinkptr, (size_t)fp->ff_size);
if (bp) {
- bp->b_flags |= B_INVAL; /* data no longer needed */
+ if (VTOHFS(vp)->jnl && (bp->b_flags & B_LOCKED) == 0) {
+ bp->b_flags |= B_INVAL; /* data no longer needed */
+ }
brelse(bp);
}
}
struct cat_fork *rsrcforkp = NULL;
struct cat_fork datafork;
int updateflag;
+ struct hfsmount *hfsmp;
int error;
+ hfsmp = VTOHFS(vp);
+
/* XXX do we really want to clear the sytem cnode flags here???? */
if ((vp->v_flag & VSYSTEM) ||
(VTOVFS(vp)->mnt_flag & MNT_RDONLY) ||
updateflag = cp->c_flag & (C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE);
/* Nothing to update. */
- if (updateflag == 0)
+ if (updateflag == 0) {
return (0);
+ }
/* HFS standard doesn't have access times. */
- if ((updateflag == C_ACCESS) && (VTOVCB(vp)->vcbSigWord == kHFSSigWord))
+ if ((updateflag == C_ACCESS) && (VTOVCB(vp)->vcbSigWord == kHFSSigWord)) {
return (0);
+ }
if (updateflag & C_ACCESS) {
/*
* If only the access time is changing then defer
(dataforkp && cp->c_datafork->ff_unallocblocks) ||
(rsrcforkp && cp->c_rsrcfork->ff_unallocblocks)) {
if (updateflag & (C_CHANGE | C_UPDATE))
- hfs_volupdate(VTOHFS(vp), VOL_UPDATE, 0);
+ hfs_volupdate(hfsmp, VOL_UPDATE, 0);
cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_UPDATE);
cp->c_flag |= C_MODIFIED;
+
return (0);
}
+
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ if (hfsmp->jnl) {
+ if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+ hfs_global_shared_lock_release(hfsmp);
+ return error;
+ }
+ }
+
+
/*
* For files with invalid ranges (holes) the on-disk
* field representing the size of the file (cf_size)
* A shared lock is sufficient since an update doesn't change
* the tree and the lock on vp protects the cnode.
*/
- error = hfs_metafilelocking(VTOHFS(vp), kHFSCatalogFileID, LK_SHARED, p);
- if (error)
+ error = hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_SHARED, p);
+ if (error) {
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
return (error);
+ }
/* XXX - waitfor is not enforced */
- error = cat_update(VTOHFS(vp), &cp->c_desc, &cp->c_attr, dataforkp, rsrcforkp);
+ error = cat_update(hfsmp, &cp->c_desc, &cp->c_attr, dataforkp, rsrcforkp);
/* Unlock the Catalog b-tree file. */
- (void) hfs_metafilelocking(VTOHFS(vp), kHFSCatalogFileID, LK_RELEASE, p);
+ (void) hfs_metafilelocking(hfsmp, kHFSCatalogFileID, LK_RELEASE, p);
if (updateflag & (C_CHANGE | C_UPDATE))
- hfs_volupdate(VTOHFS(vp), VOL_UPDATE, 0);
+ hfs_volupdate(hfsmp, VOL_UPDATE, 0);
+
+ // XXXdbg
+ if (hfsmp->jnl) {
+ journal_end_transaction(hfsmp->jnl);
+ }
+ hfs_global_shared_lock_release(hfsmp);
/* After the updates are finished, clear the flags */
cp->c_flag &= ~(C_ACCESS | C_CHANGE | C_MODIFIED | C_UPDATE | C_ATIMEMOD);
struct proc *p;
struct cat_desc in_desc, out_desc;
struct cat_attr attr;
- int error;
+ int error, started_tr = 0, grabbed_lock = 0;
enum vtype vnodetype;
p = cnp->cn_proc;
in_desc.cd_parentcnid = dcp->c_cnid;
in_desc.cd_flags = S_ISDIR(mode) ? CD_ISDIR : 0;
+ // XXXdbg
+ hfs_global_shared_lock_acquire(hfsmp);
+ grabbed_lock = 1;
+ if (hfsmp->jnl) {
+ if ((error = journal_start_transaction(hfsmp->jnl)) != 0) {
+ goto exit;
+ }
+ started_tr = 1;
+ }
+
/* Lock catalog b-tree */
error = hfs_metafilelocking(VTOHFS(dvp), kHFSCatalogFileID, LK_EXCLUSIVE, p);
if (error)
dcp->c_flag |= C_CHANGE | C_UPDATE;
tv = time;
(void) VOP_UPDATE(dvp, &tv, &tv, 0);
+
hfs_volupdate(hfsmp, vnodetype == VDIR ? VOL_MKDIR : VOL_MKFILE,
(dcp->c_cnid == kHFSRootFolderID));
+ // XXXdbg
+ // have to end the transaction here before we call hfs_getnewvnode()
+ // because that can cause us to try and reclaim a vnode on a different
+ // file system which could cause us to start a transaction which can
+ // deadlock with someone on that other file system (since we could be
+ // holding two transaction locks as well as various vnodes and we did
+ // not obtain the locks on them in the proper order).
+ //
+ // NOTE: this means that if the quota check fails or we have to update
+ // the change time on a block-special device that those changes
+ // will happen as part of independent transactions.
+ //
+ if (started_tr) {
+ journal_end_transaction(hfsmp->jnl);
+ started_tr = 0;
+ }
+ if (grabbed_lock) {
+ hfs_global_shared_lock_release(hfsmp);
+ grabbed_lock = 0;
+ }
+
/* Create a vnode for the object just created: */
error = hfs_getnewvnode(hfsmp, NULL, &out_desc, 0, &attr, NULL, &tvp);
if (error)
goto exit;
+
#if QUOTA
cp = VTOC(tvp);
/*
VOP_RMDIR(dvp,tvp, cnp);
else
VOP_REMOVE(dvp,tvp, cnp);
+
return (error);
}
#endif /* QUOTA */
tvp->v_type = IFTOVT(mode);
cp->c_flag |= C_CHANGE;
tv = time;
- if ((error = VOP_UPDATE(tvp, &tv, &tv, 1))) {
- vput(tvp);
+ if ((error = VOP_UPDATE(tvp, &tv, &tv, 1))) {
+ vput(tvp);
goto exit;
}
}
FREE_ZONE(cnp->cn_pnbuf, cnp->cn_pnlen, M_NAMEI);
vput(dvp);
+ // XXXdbg
+ if (started_tr) {
+ journal_end_transaction(hfsmp->jnl);
+ started_tr = 0;
+ }
+ if (grabbed_lock) {
+ hfs_global_shared_lock_release(hfsmp);
+ grabbed_lock = 0;
+ }
+
return (error);
}
err = ReleaseNode (btreePtr, &nodeRec);
M_ExitOnError (err);
+ /*
+ * Under Mac OS, b-tree nodes can be non-contiguous on disk when the
+ * allocation block size is smaller than the b-tree node size.
+ *
+ * If journaling is turned on for this volume we can't deal with this
+ * situation and so we bail out. If journaling isn't on it's ok as
+ * hfs_strategy_fragmented() deals with it. Journaling can't support
+ * this because it assumes that if you give it a block that it's
+ * contiguous on disk.
+ */
+ if ( FCBTOHFS(filePtr)->jnl && !NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize) ) {
+ return fsBTInvalidNodeErr;
+ }
+
//////////////////////////////// Success ////////////////////////////////////
// align LEOF to multiple of node size? - just on close
if (filePtr == nil) return paramErr;
if (searchIterator == nil) return paramErr;
+ node.buffer = nil;
+ node.blockHeader = nil;
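+ // (buffer and blockHeader start out nil so the error path can
+ // safely call ReleaseNode on a node that was never read)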
+
btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
if (btreePtr == nil) return fsBTInvalidFileErr;
////////////////////////// Priliminary Checks ///////////////////////////////
- left.buffer = nil;
- right.buffer = nil;
- node.buffer = nil;
+ left.buffer = nil;
+ left.blockHeader = nil;
+ right.buffer = nil;
+ right.blockHeader = nil;
+ node.buffer = nil;
+ node.blockHeader = nil;
if (filePtr == nil)
////////////////////////// Priliminary Checks ///////////////////////////////
- left.buffer = nil;
- right.buffer = nil;
- node.buffer = nil;
+ left.buffer = nil;
+ left.blockHeader = nil;
+ right.buffer = nil;
+ right.blockHeader = nil;
+ node.buffer = nil;
+ node.blockHeader = nil;
btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
UInt16 index;
Boolean recordFit;
-
////////////////////////// Priliminary Checks ///////////////////////////////
nodeRec.buffer = nil; // so we can call ReleaseNode
+ nodeRec.blockHeader = nil;
err = CheckInsertParams (filePtr, iterator, record, recordLen);
if (err != noErr)
err = GetNewNode (btreePtr, insertNodeNum, &nodeRec);
M_ExitOnError (err);
+ // XXXdbg
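+ // ModifyBlockStart pulls this node's buffer into the active journal
+ // transaction before it is changed (the b-tree level counterpart of
+ // journal_modify_block_start).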
+ ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+
((NodeDescPtr)nodeRec.buffer)->kind = kBTLeafNode;
((NodeDescPtr)nodeRec.buffer)->height = 1;
btreePtr->rootNode = insertNodeNum;
btreePtr->firstLeafNode = insertNodeNum;
btreePtr->lastLeafNode = insertNodeNum;
+
M_BTreeHeaderDirty (btreePtr);
goto Success;
if (index > 0)
{
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+
recordFit = InsertKeyRecord (btreePtr, nodeRec.buffer, index,
&iterator->key, KeyLength(btreePtr, &iterator->key),
record->bufferAddress, recordLen);
++btreePtr->writeCount;
++btreePtr->leafRecords;
M_BTreeHeaderDirty (btreePtr);
-
+
// create hint
iterator->hint.writeCount = btreePtr->writeCount;
iterator->hint.nodeNum = insertNodeNum;
////////////////////////// Priliminary Checks ///////////////////////////////
nodeRec.buffer = nil; // so we can call ReleaseNode
+ nodeRec.blockHeader = nil;
err = CheckInsertParams (filePtr, iterator, record, recordLen);
if (err != noErr)
err = GetNode (btreePtr, insertNodeNum, &nodeRec);
if( err == noErr )
{
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+
err = TrySimpleReplace (btreePtr, nodeRec.buffer, iterator, record, recordLen, &recordFit);
M_ExitOnError (err);
// optimization - if simple replace will work then don't extend btree
+ // if we tried this before, and failed because it wouldn't fit then we shouldn't try this again...
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+
err = TrySimpleReplace (btreePtr, nodeRec.buffer, iterator, record, recordLen, &recordFit);
M_ExitOnError (err);
}
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+
DeleteRecord (btreePtr, nodeRec.buffer, index); // delete existing key/record
err = InsertTree (btreePtr, treePathTable, &iterator->key, record->bufferAddress,
////////////////////////// Priliminary Checks ///////////////////////////////
nodeRec.buffer = nil; // so we can call ReleaseNode
+ nodeRec.blockHeader = nil;
btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
err = GetRecordByIndex(btreePtr, nodeRec.buffer, index, &keyPtr, &recordPtr, &recordLen);
M_ExitOnError (err);
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+
err = callBackProc(keyPtr, recordPtr, recordLen, callBackState);
M_ExitOnError (err);
err = GetRecordByIndex(btreePtr, nodeRec.buffer, index, &keyPtr, &recordPtr, &recordLen);
M_ExitOnError (err);
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &nodeRec);
+
err = callBackProc(keyPtr, recordPtr, recordLen, callBackState);
M_ExitOnError (err);
////////////////////////// Priliminary Checks ///////////////////////////////
nodeRec.buffer = nil; // so we can call ReleaseNode
+ nodeRec.blockHeader = nil;
M_ReturnErrorIf (filePtr == nil, paramErr);
M_ReturnErrorIf (iterator == nil, paramErr);
++btreePtr->writeCount;
--btreePtr->leafRecords;
M_BTreeHeaderDirty (btreePtr);
-
+
iterator->hint.nodeNum = 0;
return noErr;
return noErr;
}
+// XXXdbg
+__private_extern__
+OSStatus
+BTIsDirty(FCB *filePtr)
+{
+ BTreeControlBlockPtr btreePtr;
+ btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
+ return TreeIsDirty(btreePtr);
+}
/*-------------------------------------------------------------------------------
Routine: BTFlushPath - Flush BTreeControlBlock to Header Node.
BTHeaderRec *header;
+ node.buffer = nil;
+ node.blockHeader = nil;
+
btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
if (btreePtr == nil)
return (fsBTInvalidFileErr);
}
+/*-------------------------------------------------------------------------------
+Routine: BTCheckFreeSpace
+
+Function: Makes sure there is enough free space so that a tree operation
+ will succeed.
+
+Input: filePtr - pointer to file control block
+
+Output: none
+
+Result: noErr - success
+
+-------------------------------------------------------------------------------*/
+
+__private_extern__
+OSStatus BTCheckFreeSpace (FCB *filePtr)
+{
+ BTreeControlBlockPtr btreePtr;
+ int nodesNeeded, err = noErr;
+
+
+ M_ReturnErrorIf (filePtr == nil, paramErr);
+
+ btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
+
+ M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr);
+
+ REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true);
+
+ // XXXdbg this is highly conservative but so much better than
+ // winding up with turds on your disk.
+ //
+ nodesNeeded = (btreePtr->treeDepth + 1) * 10;
+
+ if (btreePtr->freeNodes < nodesNeeded) {
+ err = ExtendBTree(btreePtr, nodesNeeded + btreePtr->totalNodes - btreePtr->freeNodes);
+ }
+
+ return err;
+}
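For reference, the HFS callers later in this patch preflight each b-tree operation with this routine and flush the header node when the operation is done. A minimal sketch of that pattern, using the extents tree and the same names that appear in the hunks below:

    /* preflight free space, do the b-tree work, then flush the header node */
    err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum));
    if (err)
        return err;                 /* bail out before touching the tree */

    /* ... BTDeleteRecord / BTReplaceRecord / etc. ... */

    (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum));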
+
+
+__private_extern__
+OSStatus BTHasContiguousNodes (FCB *filePtr)
+{
+ BTreeControlBlockPtr btreePtr;
+ int nodesNeeded, err = noErr;
+
+
+ M_ReturnErrorIf (filePtr == nil, paramErr);
+
+ btreePtr = (BTreeControlBlockPtr) filePtr->fcbBTCBPtr;
+
+ M_ReturnErrorIf (btreePtr == nil, fsBTInvalidFileErr);
+
+ REQUIRE_FILE_LOCK(btreePtr->fileRefNum, true);
+
+ return NodesAreContiguous(FCBTOVCB(filePtr), filePtr, btreePtr->nodeSize);
+}
nodeNumber = 0; // first node number of header map record
node.buffer = nil; // clear node.buffer to get header node
// - and for ErrorExit
+ node.blockHeader = nil;
while (true)
{
err = GetMapNode (btreePtr, &node, &mapPtr, &mapSize);
M_ExitOnError (err);
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &node);
+
//////////////////////// Find Word with Free Bit ////////////////////////////
pos = mapPtr;
//////////////////////////// Find Map Record ////////////////////////////////
nodeIndex = 0; // first node number of header map record
node.buffer = nil; // invalidate node.buffer to get header node
+ node.blockHeader = nil;
while (nodeNum >= nodeIndex)
{
//////////////////////////// Mark Node Free /////////////////////////////////
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &node);
+
nodeNum -= (nodeIndex - (mapSize << 3)); // relative to this map record
bitOffset = 15 - (nodeNum & 0x0000000F); // last 4 bits are bit offset
mapPos += nodeNum >> 4; // point to word containing map bit
filePtr = GetFileControlBlock(btreePtr->fileRefNum);
mapNode.buffer = nil;
+ mapNode.blockHeader = nil;
newNode.buffer = nil;
+ newNode.blockHeader = nil;
mapNodeRecSize = nodeSize - sizeof(BTNodeDescriptor) - 6; // 2 bytes of free space (see note)
/////////////////////// Initialize New Map Nodes ////////////////////////////
+ // XXXdbg - this is the correct place for this:
+ ModifyBlockStart(btreePtr->fileRefNum, &mapNode);
((BTNodeDescriptor*)mapNode.buffer)->fLink = firstNewMapNodeNum;
err = GetNewNode (btreePtr, nodeNum, &newNode);
M_ExitOnError (err);
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &newNode);
+
((NodeDescPtr)newNode.buffer)->numRecords = 1;
((NodeDescPtr)newNode.buffer)->kind = kBTMapNode;
err = GetNode (btreePtr, nextNodeNum, &mapNode);
M_ExitOnError (err);
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &mapNode);
+
mapIndex = 0;
mapStart = (UInt16 *) GetRecordAddress (btreePtr, mapNode.buffer, mapIndex);
////////////////////////////// Error Exit ///////////////////////////////////
ErrorExit:
-
+
(void) ReleaseNode (btreePtr, &mapNode);
(void) ReleaseNode (btreePtr, &newNode);
+__private_extern__
+OSStatus TreeIsDirty(BTreeControlBlockPtr btreePtr)
+{
+ return (btreePtr->flags & kBTHeaderDirty);
+}
+
+
+
/*-------------------------------------------------------------------------------
Routine: UpdateHeader - Write BTreeInfoRec fields to Header node.
BTHeaderRec *header;
UInt32 options;
-
if ((btreePtr->flags & kBTHeaderDirty) == 0) // btree info already flushed
return noErr;
err = GetNode (btreePtr, kHeaderNodeNum, &node );
- if (err != noErr)
+ if (err != noErr) {
return err;
+ }
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &node);
+
header = (BTHeaderRec*) ((char *)node.buffer + sizeof(BTNodeDescriptor));
header->treeDepth = btreePtr->treeDepth;
// assume foundRecord points to Boolean
left->buffer = nil;
+ left->blockHeader = nil;
middle->buffer = nil;
+ middle->blockHeader = nil;
right->buffer = nil;
+ right->blockHeader = nil;
foundIt = false;
// release old buffer if we have one
if ( theScanStatePtr->bufferPtr != NULL )
{
- theScanStatePtr->bufferPtr->b_flags |= (B_INVAL | B_AGE);
+ theScanStatePtr->bufferPtr->b_flags |= (B_INVAL | B_AGE);
brelse( theScanStatePtr->bufferPtr );
theScanStatePtr->bufferPtr = NULL;
theScanStatePtr->currentNodePtr = NULL;
// now read blocks from the device
myErr = bread( myDevPtr,
- myPhyBlockNum,
- myBufferSize,
- NOCRED,
- &theScanStatePtr->bufferPtr );
+ myPhyBlockNum,
+ myBufferSize,
+ NOCRED,
+ &theScanStatePtr->bufferPtr );
if ( myErr != E_NONE )
{
goto ExitThisRoutine;
if ( scanState->bufferPtr != NULL )
{
scanState->bufferPtr->b_flags |= (B_INVAL | B_AGE);
- brelse( scanState->bufferPtr );
+ brelse( scanState->bufferPtr );
scanState->bufferPtr = NULL;
scanState->currentNodePtr = NULL;
}
PanicIf ((level == 1) && (((NodeDescPtr)targetNode->buffer)->kind != kBTLeafNode), "\P InsertLevel: non-leaf at level 1! ");
#endif
leftNode.buffer = nil;
+ leftNode.blockHeader = nil;
targetNodeNum = treePathTable [level].node;
insertParent = false;
updateParent = false;
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, targetNode);
+
////// process first insert //////
-
+
err = InsertNode (btreePtr, primaryKey, targetNode, targetNodeNum, index,
&newNodeNum, &newIndex, &leftNode, &updateParent, &insertParent, &newRoot );
M_ExitOnError (err);
UInt8 * recPtr;
UInt16 recSize;
+ parentNode.buffer = nil;
+ parentNode.blockHeader = nil;
+
secondaryKey = nil;
PanicIf ( (level == btreePtr->treeDepth), "\p InsertLevel: unfinished insert!?");
if ( updateParent )
{
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &parentNode);
+
// debug: check if ptr == targetNodeNum
GetRecordByIndex (btreePtr, parentNode.buffer, index, &keyPtr, &recPtr, &recSize);
PanicIf( (*(UInt32 *) recPtr) != targetNodeNum, "\p InsertLevel: parent ptr doesn't match target node!");
{
err = GetNode (btreePtr, leftNodeNum, leftNode); // will be released by caller or a split below
M_ExitOnError (err);
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, leftNode);
}
PanicIf ( ((NodeDescPtr) leftNode->buffer)->fLink != node, "\p InsertNode, RotateLeft: invalid sibling link!" );
return noErr;
ErrorExit:
-
(void) ReleaseNode (btreePtr, leftNode);
return err;
Boolean deleteRequired;
Boolean updateRequired;
-
+ // XXXdbg - initialize these to null in case we get an
+ // error and try to exit before they're initialized
+ parentNode.buffer = nil;
+ parentNode.blockHeader = nil;
+
deleteRequired = false;
updateRequired = false;
targetNodePtr = targetNode->buffer;
PanicIf (targetNodePtr == nil, "\pDeleteTree: targetNode has nil buffer!");
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, targetNode);
+
DeleteRecord (btreePtr, targetNodePtr, index);
// coalesce remaining records?
deleteRequired = true;
+ siblingNode.buffer = nil;
+ siblingNode.blockHeader = nil;
+
////////////////// Get Siblings & Update Links //////////////////////////
siblingNodeNum = targetNodePtr->bLink; // Left Sibling Node
{
err = GetNode (btreePtr, siblingNodeNum, &siblingNode);
M_ExitOnError (err);
+
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &siblingNode);
+
((NodeDescPtr)siblingNode.buffer)->fLink = targetNodePtr->fLink;
err = UpdateNode (btreePtr, &siblingNode, 0, kLockTransaction);
M_ExitOnError (err);
{
err = GetNode (btreePtr, siblingNodeNum, &siblingNode);
M_ExitOnError (err);
+
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &siblingNode);
+
((NodeDescPtr)siblingNode.buffer)->bLink = targetNodePtr->bLink;
err = UpdateNode (btreePtr, &siblingNode, 0, kLockTransaction);
M_ExitOnError (err);
err = UpdateNode (btreePtr, targetNode, 0, kLockTransaction);
M_ExitOnError (err);
+
err = FreeNode (btreePtr, targetNodeNum);
M_ExitOnError (err);
}
UInt16 recSize;
UInt32 insertNode;
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &parentNode);
+
// debug: check if ptr == targetNodeNum
GetRecordByIndex (btreePtr, parentNode.buffer, index, &keyPtr, &recPtr, &recSize);
PanicIf( (*(UInt32 *) recPtr) != targetNodeNum, "\p DeleteTree: parent ptr doesn't match targetNodeNum!!");
return noErr;
ErrorExit:
-
+
(void) ReleaseNode (btreePtr, targetNode);
(void) ReleaseNode (btreePtr, &parentNode);
originalRoot = btreePtr->rootNode;
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, blockPtr);
+
while (true)
{
if ( ((NodeDescPtr)blockPtr->buffer)->numRecords > 1)
//// Get New Root Node
err = GetNode (btreePtr, btreePtr->rootNode, blockPtr);
M_ExitOnError (err);
+
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, blockPtr);
}
if (btreePtr->rootNode != originalRoot)
if ( left != nil )
{
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, leftNode);
+
left->fLink = newNodeNum;
err = UpdateNode (btreePtr, leftNode, 0, kLockTransaction);
M_ExitOnError (err);
err = GetNewNode (btreePtr, newNodeNum, leftNode);
M_ExitOnError (err);
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, leftNode);
+
left = leftNode->buffer;
left->fLink = rightNodeNum;
err = RotateLeft (btreePtr, left, right, index, keyPtr, recPtr, recSize,
insertIndex, insertNodeNum, &recordFit, recsRotated);
- M_ExitOnError (err);
+ M_ExitOnError (err);
+
return noErr;
ErrorExit:
Boolean didItFit;
UInt16 keyLength;
+ rootNode.buffer = nil;
+ rootNode.blockHeader = nil;
+
PanicIf (leftNode == nil, "\pAddNewRootNode: leftNode == nil");
PanicIf (rightNode == nil, "\pAddNewRootNode: rightNode == nil");
err = GetNewNode (btreePtr, rootNum, &rootNode);
M_ExitOnError (err);
+ // XXXdbg
+ ModifyBlockStart(btreePtr->fileRefNum, &rootNode);
+
((NodeDescPtr)rootNode.buffer)->kind = kBTIndexNode;
((NodeDescPtr)rootNode.buffer)->height = ++btreePtr->treeDepth;
err = BuildCatalogKeyUTF8(vcb, destID, destName, kUndefinedStrLen, &destKey, NULL);
ReturnIfError(err);
+ err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum));
+ ReturnIfError(err);
+
if ( isHFSPlus )
{
//-- Step 1: Check the catalog nodes for extents
err = noErr;
*hint = 0;
+
+ // XXXdbg - preflight that there's enough space
+ err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum));
+ if (err)
+ return err;
+
MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK);
bzero(btIterator, sizeof(*btIterator));
if (err == noErr)
*hint = btIterator->hint.nodeNum;
+ (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum));
+
FREE(btIterator, M_TEMP);
return err;
}
OSErr err;
err = noErr;
+
+ // XXXdbg - preflight that there's enough space
+ err = BTCheckFreeSpace(GetFileControlBlock(vcb->extentsRefNum));
+ if (err)
+ return err;
+
MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK);
bzero(btIterator, sizeof(*btIterator));
}
err = BTDeleteRecord(GetFileControlBlock(vcb->extentsRefNum), btIterator);
-
+ (void) BTFlushPath(GetFileControlBlock(vcb->extentsRefNum));
+
FREE(btIterator, M_TEMP);
return err;
}
// Need to find and change a record in Extents BTree
//
btFCB = GetFileControlBlock(vcb->extentsRefNum);
+
+ // XXXdbg - preflight that there's enough space
+ err = BTCheckFreeSpace(btFCB);
+ if (err)
+ return err;
+
MALLOC(btIterator, BTreeIterator *, sizeof(*btIterator), M_TEMP, M_WAITOK);
bzero(btIterator, sizeof(*btIterator));
if (err == noErr)
err = BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize);
+ (void) BTFlushPath(btFCB);
}
else { // HFS Plus volume
HFSPlusExtentRecord foundData; // The extent data actually found
BlockMoveData(extentData, &foundData, sizeof(HFSPlusExtentRecord));
err = BTReplaceRecord(btFCB, btIterator, &btRecord, btRecordSize);
}
+ (void) BTFlushPath(btFCB);
}
FREE(btIterator, M_TEMP);
}
return true;
}
+
+
+//_________________________________________________________________________________
+//
+// Routine: NodesAreContiguous
+//
+// Purpose: Ensure that all b-tree nodes are contiguous on disk
+// Called by BTOpenPath during volume mount
+//_________________________________________________________________________________
+
+Boolean NodesAreContiguous(
+ ExtendedVCB *vcb,
+ FCB *fcb,
+ UInt32 nodeSize)
+{
+ UInt32 mask;
+ UInt32 startBlock;
+ UInt32 blocksChecked;
+ UInt32 hint;
+ HFSPlusExtentKey key;
+ HFSPlusExtentRecord extents;
+ OSErr result;
+ Boolean lastExtentReached;
+
+
+ if (vcb->blockSize >= nodeSize)
+ return TRUE;
+
+ mask = (nodeSize / vcb->blockSize) - 1;
+
+ // check the local extents
+ (void) GetFCBExtentRecord(fcb, extents);
+ if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) )
+ return FALSE;
+
+ if (lastExtentReached || (SInt64)((SInt64)blocksChecked * (SInt64)vcb->blockSize) >= fcb->ff_size)
+ return TRUE;
+
+ startBlock = blocksChecked;
+
+ // check the overflow extents (if any)
+ while ( !lastExtentReached )
+ {
+ result = FindExtentRecord(vcb, kDataForkType, fcb->ff_cp->c_fileid, startBlock, FALSE, &key, extents, &hint);
+ if (result) break;
+
+ if ( !ExtentsAreIntegral(extents, mask, &blocksChecked, &lastExtentReached) )
+ return FALSE;
+
+ startBlock += blocksChecked;
+ }
+
+ return TRUE;
+}
+
if (bp) {
if (dirty) {
- bdwrite(bp);
+ // XXXdbg
+ struct hfsmount *hfsmp = VCBTOHFS(vcb);
+
+ if (hfsmp->jnl) {
+ journal_modify_block_end(hfsmp->jnl, bp);
+ } else {
+ bdwrite(bp);
+ }
} else {
brelse(bp);
}
UInt32 bitsPerBlock;
UInt32 wordsPerBlock;
Boolean dirty = false;
+ struct hfsmount *hfsmp = VCBTOHFS(vcb);
// Since this routine doesn't wrap around
if (maxBlocks > (endingBlock - startingBlock)) {
endingBlock = block + maxBlocks; // if we get this far, we've found enough
}
+ // XXXdbg
+ if (hfsmp->jnl) {
+ journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+ }
+
//
// Allocate all of the consecutive blocks
//
if (err != noErr) goto Exit;
buffer = currCache;
+ // XXXdbg
+ if (hfsmp->jnl) {
+ journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+ }
+
wordsLeft = wordsPerBlock;
}
UInt32 blockRef;
UInt32 bitsPerBlock;
UInt32 wordsPerBlock;
+ // XXXdbg
+ struct hfsmount *hfsmp = VCBTOHFS(vcb);
//
// Pre-read the bitmap block containing the first word of allocation
wordsLeft = wordsPerBlock - wordIndexInBlock;
}
+ // XXXdbg
+ if (hfsmp->jnl) {
+ journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+ }
+
//
// If the first block to allocate doesn't start on a word
// boundary in the bitmap, then treat that first word
err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef);
if (err != noErr) goto Exit;
+ // XXXdbg
+ if (hfsmp->jnl) {
+ journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+ }
+
// Readjust currentWord and wordsLeft
currentWord = buffer;
wordsLeft = wordsPerBlock;
err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef);
if (err != noErr) goto Exit;
+ // XXXdbg
+ if (hfsmp->jnl) {
+ journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+ }
+
// Readjust currentWord and wordsLeft
currentWord = buffer;
wordsLeft = wordsPerBlock;
UInt32 blockRef;
UInt32 bitsPerBlock;
UInt32 wordsPerBlock;
+ // XXXdbg
+ struct hfsmount *hfsmp = VCBTOHFS(vcb);
//
// Pre-read the bitmap block containing the first word of allocation
err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef);
if (err != noErr) goto Exit;
+ // XXXdbg
+ if (hfsmp->jnl) {
+ journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+ }
+
//
// Initialize currentWord, and wordsLeft.
//
err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef);
if (err != noErr) goto Exit;
+ // XXXdbg
+ if (hfsmp->jnl) {
+ journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+ }
+
// Readjust currentWord and wordsLeft
currentWord = buffer;
wordsLeft = wordsPerBlock;
err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef);
if (err != noErr) goto Exit;
+ // XXXdbg
+ if (hfsmp->jnl) {
+ journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef);
+ }
+
// Readjust currentWord and wordsLeft
currentWord = buffer;
wordsLeft = wordsPerBlock;
void *blockHeader;
ByteCount blockSize;
Boolean blockReadFromDisk;
- Byte reserved[3];
+ Byte isModified; // XXXdbg - for journaling
+ Byte reserved[2];
};
typedef struct BlockDescriptor BlockDescriptor;
typedef BlockDescriptor *BlockDescPtr;
extern OSStatus BTSetLastSync (FCB *filePtr,
UInt32 lastfsync );
+extern OSStatus BTCheckFreeSpace (FCB *filePtr);
+
+extern OSStatus BTHasContiguousNodes(FCB *filePtr);
+
#endif /* __APPLE_API_PRIVATE */
#endif /* KERNEL */
#endif // __BTREESINTERNAL__
OSStatus TrashNode (BTreeControlBlockPtr btreePtr,
NodePtr nodePtr );
+// XXXdbg
+void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr);
+// XXXdbg
+
OSStatus UpdateNode (BTreeControlBlockPtr btreePtr,
NodePtr nodePtr,
UInt32 transactionID,
if (fp->f_type != DTYPE_VNODE)
return(KERN_INVALID_ARGUMENT);
+
+ if (!(fp->f_flag & FREAD))
+ return (KERN_PROTECTION_FAILURE);
+
vp = (struct vnode *)fp->f_data;
if (vp->v_type != VREG)
#include <sys/types.h>
-#include <stdlib.h>
+//#include <stdlib.h>
static inline char *med3 __P((char *, char *, char *, int (*)()));
static inline void swapfunc __P((char *, char *, int, int));
:(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? a : c ));
}
+__private_extern__
void
qsort(a, n, es, cmp)
void *a;
/*
- * Copyright (c) 1999-2001 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1999-2002 Apple Computer, Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
simple_unlock(&vp->v_interlock);
}
+/*
+ * Serialize the requests to the VM
+ * Returns:
+ * 0 - Failure
+ * 1 - Successful in acquiring the lock
+ * 2 - Successful in acquiring the lock recursively
+ * do not call ubc_unbusy()
+ * [This is strange, but saves 4 bytes in struct ubc_info]
+ */
+static int
+ubc_busy(struct vnode *vp)
+{
+ register struct ubc_info *uip;
+
+ if (!UBCINFOEXISTS(vp))
+ return (0);
+
+ uip = vp->v_ubcinfo;
+
+ while (ISSET(uip->ui_flags, UI_BUSY)) {
+
+ if (uip->ui_owner == (void *)current_thread())
+ return (2);
+
+ SET(uip->ui_flags, UI_WANTED);
+ (void) tsleep((caddr_t)&vp->v_ubcinfo, PINOD, "ubcbusy", 0);
+
+ if (!UBCINFOEXISTS(vp))
+ return (0);
+ }
+ uip->ui_owner = (void *)current_thread();
+
+ SET(uip->ui_flags, UI_BUSY);
+
+ return (1);
+}
+
+static void
+ubc_unbusy(struct vnode *vp)
+{
+ register struct ubc_info *uip;
+
+ if (!UBCINFOEXISTS(vp)) {
+ wakeup((caddr_t)&vp->v_ubcinfo);
+ return;
+ }
+ uip = vp->v_ubcinfo;
+ CLR(uip->ui_flags, UI_BUSY);
+ uip->ui_owner = (void *)NULL;
+
+ if (ISSET(uip->ui_flags, UI_WANTED)) {
+ CLR(uip->ui_flags, UI_WANTED);
+ wakeup((caddr_t)&vp->v_ubcinfo);
+ }
+}
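The callers changed below all wrap their work in the same pattern around this pair; a minimal sketch, mirroring the updated ubc_uncache/ubc_getobject/ubc_hold:

    int recursed;

    if ((recursed = ubc_busy(vp)) == 0)
        return (0);                 /* no ubc_info: vnode is invalid or dying */

    /* ... operate on vp->v_ubcinfo ... */

    if (recursed == 1)              /* a recursive caller must not unbusy */
        ubc_unbusy(vp);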
+
/*
* Initialization of the zone for Unified Buffer Cache.
*/
uip->ui_refcount = 1;
uip->ui_size = 0;
uip->ui_mapped = 0;
+ uip->ui_owner = (void *)NULL;
ubc_lock(vp);
}
#if DIAGNOSTIC
void
ubc_info_deallocate(struct ubc_info *uip)
{
+
assert(uip->ui_refcount > 0);
- if (uip->ui_refcount-- == 1)
+ if (uip->ui_refcount-- == 1) {
+ struct vnode *vp;
+
+ vp = uip->ui_vnode;
+ if (ISSET(uip->ui_flags, UI_WANTED)) {
+ CLR(uip->ui_flags, UI_WANTED);
+ wakeup((caddr_t)&vp->v_ubcinfo);
+ }
+
ubc_info_free(uip);
+ }
}
/*
{
kern_return_t kret;
struct ubc_info *uip;
+ int recursed;
memory_object_control_t control;
memory_object_perf_info_data_t perf;
if (!UBCINFOEXISTS(vp))
return (0);
+ if ((recursed = ubc_busy(vp)) == 0)
+ return (0);
+
uip = vp->v_ubcinfo;
assert(uip != UBC_INFO_NULL);
if (kret != KERN_SUCCESS) {
printf("ubc_uncache: memory_object_change_attributes_named "
"kret = %d", kret);
+ if (recursed == 1)
+ ubc_unbusy(vp);
return (0);
}
ubc_release_named(vp);
+ if (recursed == 1)
+ ubc_unbusy(vp);
return (1);
}
ubc_getobject(struct vnode *vp, int flags)
{
struct ubc_info *uip;
+ int recursed;
memory_object_control_t control;
- uip = vp->v_ubcinfo;
-
if (UBCINVALID(vp))
return (0);
- ubc_lock(vp);
+ if ((recursed = ubc_busy(vp)) == 0)
+ return (0);
+ uip = vp->v_ubcinfo;
control = uip->ui_control;
if ((flags & UBC_HOLDOBJECT) && (!ISSET(uip->ui_flags, UI_HASOBJREF))) {
* Take a temporary reference on the ubc info so that it won't go
* away during our recovery attempt.
*/
+ ubc_lock(vp);
uip->ui_refcount++;
ubc_unlock(vp);
if (memory_object_recover_named(control, TRUE) == KERN_SUCCESS) {
- ubc_lock(vp);
SET(uip->ui_flags, UI_HASOBJREF);
- ubc_unlock(vp);
} else {
control = MEMORY_OBJECT_CONTROL_NULL;
}
+ if (recursed == 1)
+ ubc_unbusy(vp);
ubc_info_deallocate(uip);
} else {
- ubc_unlock(vp);
+ if (recursed == 1)
+ ubc_unbusy(vp);
}
return (control);
ubc_hold(struct vnode *vp)
{
struct ubc_info *uip;
+ int recursed;
memory_object_control_t object;
if (UBCINVALID(vp))
return (0);
- if (!UBCINFOEXISTS(vp)) {
+ if ((recursed = ubc_busy(vp)) == 0) {
/* must be invalid or dying vnode */
assert(UBCINVALID(vp) ||
- ((vp->v_flag & VXLOCK) || (vp->v_flag & VTERMINATE)));
+ ((vp->v_flag & VXLOCK) || (vp->v_flag & VTERMINATE)));
return (0);
}
ubc_lock(vp);
uip->ui_refcount++;
+ ubc_unlock(vp);
if (!ISSET(uip->ui_flags, UI_HASOBJREF)) {
- ubc_unlock(vp);
- if (memory_object_recover_named(uip->ui_control, TRUE) != KERN_SUCCESS) {
+ if (memory_object_recover_named(uip->ui_control, TRUE)
+ != KERN_SUCCESS) {
+ if (recursed == 1)
+ ubc_unbusy(vp);
ubc_info_deallocate(uip);
return (0);
}
- ubc_lock(vp);
SET(uip->ui_flags, UI_HASOBJREF);
- ubc_unlock(vp);
- } else {
- ubc_unlock(vp);
}
+ if (recursed == 1)
+ ubc_unbusy(vp);
assert(uip->ui_refcount > 0);
+
return (1);
}
ubc_release_named(struct vnode *vp)
{
struct ubc_info *uip;
+ int recursed;
memory_object_control_t control;
- kern_return_t kret;
+ kern_return_t kret = KERN_FAILURE;
if (UBCINVALID(vp))
return (0);
- if (!UBCINFOEXISTS(vp))
+ if ((recursed = ubc_busy(vp)) == 0)
return (0);
-
uip = vp->v_ubcinfo;
/* can not release held or mapped vnodes */
if (ISSET(uip->ui_flags, UI_HASOBJREF) &&
- (uip->ui_refcount == 1) && !uip->ui_mapped) {
+ (uip->ui_refcount == 1) && !uip->ui_mapped) {
control = uip->ui_control;
assert(control);
CLR(uip->ui_flags, UI_HASOBJREF);
kret = memory_object_release_name(control,
MEMORY_OBJECT_RESPECT_CACHE);
- return ((kret != KERN_SUCCESS) ? 0 : 1);
- } else
- return (0);
+ }
+
+ if (recursed == 1)
+ ubc_unbusy(vp);
+ return ((kret != KERN_SUCCESS) ? 0 : 1);
}
/*
s = splbio();
for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) {
nbp = bp->b_vnbufs.le_next;
- if ((bp->b_flags & B_BUSY))
+ // XXXdbg - don't flush locked blocks. they may be journaled.
+ if ((bp->b_flags & B_BUSY) || (bp->b_flags & B_LOCKED))
continue;
if ((bp->b_flags & B_DELWRI) == 0)
panic("spec_fsync: not dirty");
int getpages;
{
register struct nfsnode *np = VTONFS(vp);
- register int biosize, diff, i;
+ register int biosize, i;
+ off_t diff;
struct buf *bp = 0, *rabp;
struct vattr vattr;
struct proc *p;
bufsize = biosize;
if ((off_t)(lbn + 1) * biosize > np->n_size &&
(off_t)(lbn + 1) * biosize - np->n_size < biosize) {
- bufsize = np->n_size - lbn * biosize;
+ bufsize = np->n_size - (off_t)lbn * biosize;
bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
}
bp = nfs_getcacheblk(vp, lbn, bufsize, p, operation);
bp = getblk(vp, bn, size, 0, 0, operation);
if( vp->v_type == VREG)
- bp->b_blkno = (bn * biosize) / DEV_BSIZE;
+ bp->b_blkno = ((off_t)bn * biosize) / DEV_BSIZE;
return (bp);
}
register struct mbuf *m, **mpp;
register char *cp1, *cp2;
register int len;
- struct mbuf *om, *m2, *recm = 0;
+ struct mbuf *om, *m2, *recm;
u_long recmark;
if (slp->ns_flag & SLP_GETSTREAM)
/*
* Now get the record part.
+ *
+ * Note that slp->ns_reclen may be 0. Linux sometimes
+ * generates 0-length RPCs
*/
+ recm = NULL;
if (slp->ns_cc == slp->ns_reclen) {
recm = slp->ns_raw;
slp->ns_raw = slp->ns_rawend = (struct mbuf *)0;
#if 0
/* (removed for UBC) */
bufsize = biosize;
- if ((lbn + 1) * biosize > np->n_size) {
- bufsize = np->n_size - lbn * biosize;
+ if ((off_t)(lbn + 1) * biosize > np->n_size) {
+ bufsize = np->n_size - (off_t)lbn * biosize;
bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
}
#endif
biosize = min(vp->v_mount->mnt_stat.f_iosize, PAGE_SIZE); /* nfs_bio.c */
- *ap->a_offset = (off_t)(ap->a_lblkno * biosize);
+ *ap->a_offset = (off_t)ap->a_lblkno * biosize;
return (0);
}
#define b_trans_head b_freelist.tqe_prev
#define b_trans_next b_freelist.tqe_next
#define b_real_bp b_saveaddr
+#define b_iostate b_rcred
+
+/* journaling uses this cluster i/o field for its own
+ * purposes because meta data buf's should never go
+ * through the clustering code.
+ */
+#define b_transaction b_vectorlist
+
+
/*
* These flags are kept in b_flags.
#define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */
#define B_WRITEINPROG 0x01000000 /* Write in progress. */
#define B_HDRALLOC 0x02000000 /* zone allocated buffer header */
-#define B_UNUSED1 0x04000000 /* Unused bit */
+#define B_NORELSE 0x04000000 /* don't brelse() in bwrite() */
#define B_NEED_IODONE 0x08000000
/* need to do a biodone on the */
/* real_bp associated with a cluster_io */
#define DKIOCGETMAXBLOCKCOUNTREAD _IOR('d', 64, u_int64_t)
#define DKIOCGETMAXBLOCKCOUNTWRITE _IOR('d', 65, u_int64_t)
+#define DKIOCGETMAXBYTECOUNTREAD _IOR('d', 70, u_int64_t)
+#define DKIOCGETMAXBYTECOUNTWRITE _IOR('d', 71, u_int64_t)
#define DKIOCGETMAXSEGMENTCOUNTREAD _IOR('d', 66, u_int64_t)
#define DKIOCGETMAXSEGMENTCOUNTWRITE _IOR('d', 67, u_int64_t)
+#define DKIOCGETMAXSEGMENTBYTECOUNTREAD _IOR('d', 68, u_int64_t)
+#define DKIOCGETMAXSEGMENTBYTECOUNTWRITE _IOR('d', 69, u_int64_t)
#ifdef KERNEL
#define DKIOCSETBLOCKSIZE _IOW('d', 24, u_int32_t)
#define M_IP6MISC 88 /* IPv6 misc. memory */
#define M_TSEGQ 89 /* TCP segment queue entry */
#define M_IGMP 90
+#define M_JOURNAL 91 /* VFS Journaling code */
-#define M_LAST 91 /* Must be last type + 1 */
+#define M_LAST 92 /* Must be last type + 1 */
/* Strings corresponding to types of memory */
/* Must be in synch with the #defines above */
"UDF mount" /* 85 M_UDFMNT */ \
"IPv6 NDP", /* 86 M_IP6NDP */ \
"IPv6 options", /* 87 M_IP6OPT */ \
- "IPv6 Misc" /* 88 M_IP6MISC */\
- "TCP Segment Q" /* 89 M_TSEGQ */\
- "IGMP state" /* 90 M_IGMP */\
+ "IPv6 Misc", /* 88 M_IP6MISC */\
+ "TCP Segment Q",/* 89 M_TSEGQ */\
+ "IGMP state", /* 90 M_IGMP */\
+ "Journaling" /* 91 M_JOURNAL */\
}
struct kmemstats {
#define MNT_DONTBROWSE 0x00100000 /* file system is not appropriate path to user data */
#define MNT_UNKNOWNPERMISSIONS 0x00200000 /* no known mapping for uid/gid in permissions information on disk */
#define MNT_AUTOMOUNTED 0x00400000 /* filesystem was mounted by automounter */
+#define MNT_JOURNALED 0x00800000 /* filesystem is journaled */
/*
* NFS export related mount flags.
MNT_DEFEXPORTED | MNT_EXPORTANON| MNT_EXKERB | \
MNT_LOCAL | MNT_QUOTA | \
MNT_ROOTFS | MNT_DOVOLFS | MNT_DONTBROWSE | \
- MNT_UNKNOWNPERMISSIONS | MNT_AUTOMOUNTED | MNT_FIXEDSCRIPTENCODING )
+ MNT_UNKNOWNPERMISSIONS | MNT_AUTOMOUNTED | MNT_JOURNALED | MNT_FIXEDSCRIPTENCODING )
/*
* External filesystem command modifier flags.
* Unmount can use the MNT_FORCE flag.
int ui_refcount;/* ref count on the ubc_info */
off_t ui_size; /* file size for the vnode */
long ui_mapped; /* is it currently mapped */
+ void *ui_owner; /* for recursive ubc_busy */
};
/* Defines for ui_flags */
#define UI_HASOBJREF 0x00000004 /* hold a reference on object */
#define UI_WASMAPPED 0x00000008 /* vnode was mapped */
#define UI_DONTCACHE 0x00000010 /* do not cache object */
+#define UI_BUSY 0x00000020 /* for VM synchronization */
+#define UI_WANTED 0x00000040 /* for VM synchronization */
#endif /* __APPLE_API_PRIVATE */
EXPINC_SUBDIRS_I386 = \
DATAFILES = \
- vfs_support.h
+ vfs_support.h vfs_journal.h
INSTALL_MI_LIST = ${DATAFILES}
/* number of per vnode, "in flight" buffer writes */
#define BUFWRITE_THROTTLE 9
+
/*
* Time in seconds before a buffer on a list is
* considered as a stale buffer
simple_lock(&bufhashlist_slock);
-#if 0
- if(incore(bp->b_vp, bp->b_lblkno))
- panic("binshash: already incore");
+#if 0
+ if((bad = incore(bp->b_vp, bp->b_lblkno)))
+ panic("binshash: already incore bp 0x%x, bad 0x%x\n", bp, bad);
#endif /* 0 */
BHASHENTCHECK(bp);
*/
bp->b_rcred = crdup(cred);
}
+
VOP_STRATEGY(bp);
trace(TR_BREADMISS, pack(vp, size), blkno);
p->p_stats->p_ru.ru_oublock++; /* XXX */
/* Release the buffer. */
- brelse(bp);
+ // XXXdbg - only release the buffer if B_NORELSE isn't set
+ if (!ISSET(bp->b_flags, B_NORELSE)) {
+ brelse(bp);
+ } else {
+ CLR(bp->b_flags, B_NORELSE);
+ }
return (rv);
} else {
if (nbdwrite < 0)
panic("bdwrite: Negative nbdwrite");
- if (nbdwrite > ((nbuf/4)*3)) {
+ // can't do a bawrite() if the LOCKED bit is set because the
+ // buffer is part of a transaction and can't go to disk until
+ // the LOCKED bit is cleared.
+ if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf/4)*3)) {
if (return_error)
return (EAGAIN);
else
trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
+ // if we're invalidating a buffer that has the B_CALL bit
+ // set then call the b_iodone function so it gets cleaned
+ // up properly.
+ //
+ if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
+ if (ISSET(bp->b_flags, B_CALL) && !ISSET(bp->b_flags, B_DELWRI)) {
+ panic("brelse: CALL flag set but not DELWRI! bp 0x%x\n", bp);
+ }
+ if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
+ void (*iodone_func)(struct buf *) = bp->b_iodone;
+
+ CLR(bp->b_flags, B_CALL); /* but note callout done */
+ bp->b_iodone = NULL;
+
+ if (iodone_func == NULL) {
+ panic("brelse: bp @ 0x%x has NULL b_iodone!\n", bp);
+ }
+ (*iodone_func)(bp);
+ }
+ }
+
/* IO is done. Cleanup the UPL state */
if (!ISSET(bp->b_flags, B_META)
&& UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
brelse(bp);
goto start;
}
+ /*
+ * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
+ * CALLED! BE CAREFUL.
+ */
/*
* if it is meta, the queue may be set to other
}
if (ISSET(bp->b_flags, B_META) && (bp->b_data == 0))
- panic("allocbuf: bp->b_data is NULL");
+ panic("allocbuf: bp->b_data is NULL, buf @ 0x%x", bp);
bp->b_bufsize = desired_size;
bp->b_bcount = size;
panic("getnewbuf: null bp");
found:
+ if (ISSET(bp->b_flags, B_LOCKED)) {
+ panic("getnewbuf: bp @ 0x%x is LOCKED! (flags 0x%x)\n", bp, bp->b_flags);
+ }
+
if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
- panic("getnewbuf: le_prev is deadbeef");
+ panic("getnewbuf: le_prev is deadbeef, buf @ 0x%x", bp);
if(ISSET(bp->b_flags, B_BUSY))
- panic("getnewbuf reusing BUSY buf");
+ panic("getnewbuf reusing BUSY buf @ 0x%x", bp);
/* Clean it */
if (bcleanbuf(bp)) {
}
if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
+ void (*iodone_func)(struct buf *) = bp->b_iodone;
+
CLR(bp->b_flags, B_CALL); /* but note callout done */
- (*bp->b_iodone)(bp);
+ bp->b_iodone = NULL;
+
+ if (iodone_func == NULL) {
+ panic("biodone: bp @ 0x%x has NULL b_iodone!\n", bp);
+ } else {
+ (*iodone_func)(bp);
+ }
} else if (ISSET(bp->b_flags, B_ASYNC)) /* if async, release it */
brelse(bp);
else { /* or just wakeup the buffer */
/* clear out various fields */
bp->b_flags = B_BUSY;
bp->b_blkno = bp->b_lblkno = 0;
+
bp->b_iodone = 0;
bp->b_error = 0;
bp->b_resid = 0;
(void) thread_funnel_set(kernel_flock, funnel_state);
}
+
+
+static int
+bp_cmp(void *a, void *b)
+{
+ struct buf *bp_a = *(struct buf **)a,
+ *bp_b = *(struct buf **)b;
+ daddr_t res;
+
+ // don't have to worry about negative block
+ // numbers so this is ok to do.
+ //
+ res = (bp_a->b_blkno - bp_b->b_blkno);
+
+ return (int)res;
+}
+
+#define NFLUSH 32
+
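+// Flush dirty buffers belonging to mount point "mp" from the given
+// buffer queue.  Buffers are gathered up to NFLUSH at a time, sorted by
+// physical block number (bp_cmp via qsort) so the writes go out in disk
+// order, and issued with bawrite().  Returns the number of buffers written.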
+int
+bflushq(int whichq, struct mount *mp)
+{
+ struct buf *bp, *next;
+ int i, buf_count, s;
+ int counter=0, total_writes=0;
+ static struct buf *flush_table[NFLUSH];
+
+ if (whichq < 0 || whichq >= BQUEUES) {
+ return (0);
+ }
+
+
+ restart:
+ bp = TAILQ_FIRST(&bufqueues[whichq]);
+ for(buf_count=0; bp; bp=next) {
+ next = bp->b_freelist.tqe_next;
+
+ if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
+ continue;
+ }
+
+ if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_BUSY) == 0) {
+ if (whichq != BQ_LOCKED && (bp->b_flags & B_LOCKED)) {
+ panic("bflushq: bp @ 0x%x is locked!\n", bp);
+ }
+
+ bremfree(bp);
+ bp->b_flags |= B_BUSY;
+ flush_table[buf_count] = bp;
+ buf_count++;
+ total_writes++;
+
+ if (buf_count >= NFLUSH) {
+ qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
+
+ for(i=0; i < buf_count; i++) {
+ bawrite(flush_table[i]);
+ }
+
+ goto restart;
+ }
+ }
+ }
+
+ if (buf_count > 0) {
+ qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
+ for(i=0; i < buf_count; i++) {
+ bawrite(flush_table[i]);
+ }
+ }
+
+ return total_writes;
+}
-
/*
* Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
*
#define CL_NOZERO 0x80
#define CL_PAGEIN 0x100
#define CL_DEV_MEMORY 0x200
+#define CL_PRESERVE 0x400
+
+struct clios {
+ u_int io_completed; // amount of i/o that has completed so far
+ u_int io_issued; // amount of i/o that was successfully issued
+ off_t io_offset; // file offset of the first error, if any
+ int io_error; // first error encountered
+ int io_wanted; // set when a thread is sleeping for in-flight i/o to finish
+};
+
static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
int size, struct buf *bp);
static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
off_t newEOF, int devblocksize, int flags);
static int cluster_phys_read(struct vnode *vp, struct uio *uio,
- off_t filesize);
-static int cluster_phys_write(struct vnode *vp, struct uio *uio, off_t newEOF);
+ off_t filesize, int devblocksize, int flags);
+static int cluster_phys_write(struct vnode *vp, struct uio *uio,
+ off_t newEOF, int devblocksize, int flags);
+static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
+ vm_offset_t usr_paddr, int xsize, int devblocksize, int flags);
static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
int total_resid;
int upl_offset;
int zero_offset;
+ int l_blkno;
upl_t upl;
struct buf *cbp;
struct buf *cbp_head;
struct buf *cbp_next;
struct buf *real_bp;
struct vnode *vp;
+ struct clios *iostate;
int commit_size;
int pg_offset;
real_bp = cbp->b_real_bp;
vp = cbp->b_vp;
zero_offset= cbp->b_validend;
+ l_blkno = cbp->b_lblkno;
+ iostate = (struct clios *)cbp->b_iostate;
while (cbp) {
if (cbp->b_vectorcount > 1)
cbp = cbp_next;
}
+ if (zero_offset)
+ cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
+
if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
vp->v_flag &= ~VTHROTTLED;
wakeup((caddr_t)&vp->v_numoutput);
}
- if (zero_offset)
- cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
+ if (iostate) {
+ if (error) {
+ off_t error_offset;
+
+ error_offset = (off_t)l_blkno * PAGE_SIZE_64;
+ if (iostate->io_error == 0) {
+ iostate->io_error = error;
+ iostate->io_offset = error_offset;
+ } else {
+ if (error_offset < iostate->io_offset)
+ iostate->io_offset = error_offset;
+ }
+ }
+ iostate->io_completed += total_size;
+
+ if (iostate->io_wanted) {
+ iostate->io_wanted = 0;
+ wakeup((caddr_t)&iostate->io_wanted);
+ }
+ }
if ((b_flags & B_NEED_IODONE) && real_bp) {
if (error) {
real_bp->b_flags |= B_ERROR;
error = EIO;
if (b_flags & B_COMMIT_UPL) {
- pg_offset = upl_offset & PAGE_MASK;
+ pg_offset = upl_offset & PAGE_MASK;
commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
- if (error || (b_flags & B_NOCACHE)) {
+ if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) {
int upl_abort_code;
- if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
+ if (b_flags & B_PHYS)
+ upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
+ else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
else if (b_flags & B_PGIN)
upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
} else {
int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
- if ( !(b_flags & B_PAGEOUT))
+ if (b_flags & B_PHYS)
+ upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
+ else if ( !(b_flags & B_PAGEOUT))
upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
if (b_flags & B_AGE)
upl_commit_flags |= UPL_COMMIT_INACTIVATE;
}
static int
-cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp)
+cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
struct vnode *vp;
upl_t upl;
vm_offset_t upl_offset;
int devblocksize;
int flags;
struct buf *real_bp;
+ struct clios *iostate;
{
struct buf *cbp;
struct iovec *iovp;
- u_int size;
+ u_int size;
+ u_int io_size;
int io_flags;
int error = 0;
int retval = 0;
u_int max_vectors;
int priv;
int zero_offset = 0;
+ u_int first_lblkno;
if (flags & CL_READ) {
io_flags = (B_VECTORLIST | B_READ);
}
pl = ubc_upl_pageinfo(upl);
- if (flags & CL_ASYNC)
- io_flags |= (B_CALL | B_ASYNC);
if (flags & CL_AGE)
io_flags |= B_AGE;
if (flags & CL_DUMP)
io_flags |= B_NOCACHE;
if (flags & CL_PAGEIN)
io_flags |= B_PGIN;
+ if (flags & CL_PAGEOUT)
+ io_flags |= B_PAGEOUT;
+ if (flags & CL_COMMIT)
+ io_flags |= B_COMMIT_UPL;
+ if (flags & CL_PRESERVE)
+ io_flags |= B_PHYS;
if (devblocksize)
size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
zero_offset = upl_offset + non_rounded_size;
}
while (size) {
- size_t io_size;
int vsize;
int i;
int pl_index;
else
io_size = size;
- if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, &io_size, NULL)) {
+ if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
if (error == EOPNOTSUPP)
panic("VOP_CMAP Unimplemented");
break;
if (error)
break;
- if (flags & CL_ASYNC)
- cbp->b_iodone = (void *)cluster_iodone;
+ if (flags & CL_ASYNC) {
+ cbp->b_flags |= (B_CALL | B_ASYNC);
+ cbp->b_iodone = (void *)cluster_iodone;
+ }
cbp->b_flags |= io_flags;
cbp->b_lblkno = lblkno;
cbp->b_uploffset = upl_offset;
cbp->b_trans_next = (struct buf *)0;
+ if (cbp->b_iostate = (void *)iostate)
+ iostate->io_issued += io_size;
+
if (flags & CL_READ)
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
* then go ahead and issue the I/O
*/
start_io:
- if (flags & CL_COMMIT)
- cbp_head->b_flags |= B_COMMIT_UPL;
- if (flags & CL_PAGEOUT)
- cbp_head->b_flags |= B_PAGEOUT;
- if (flags & CL_PAGEIN)
- cbp_head->b_flags |= B_PGIN;
-
if (real_bp) {
cbp_head->b_flags |= B_NEED_IODONE;
cbp_head->b_real_bp = real_bp;
if (error) {
int abort_size;
+ io_size = 0;
+
for (cbp = cbp_head; cbp;) {
struct buf * cbp_next;
_FREE(cbp->b_vectorlist, M_SEGMENT);
upl_offset -= cbp->b_bcount;
size += cbp->b_bcount;
+ io_size += cbp->b_bcount;
cbp_next = cbp->b_trans_next;
free_io_buf(cbp);
cbp = cbp_next;
}
+ if (iostate) {
+ if (iostate->io_error == 0) {
+ iostate->io_error = error;
+ iostate->io_offset = f_offset - (off_t)io_size;
+ }
+ iostate->io_issued -= io_size;
+
+ if (iostate->io_wanted) {
+ iostate->io_wanted = 0;
+ wakeup((caddr_t)&iostate->io_wanted);
+ }
+ }
pg_offset = upl_offset & PAGE_MASK;
abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
if (flags & CL_COMMIT) {
int upl_abort_code;
- if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
+ if (flags & CL_PRESERVE)
+ upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
+ else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
else if (flags & CL_PAGEIN)
- upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
+ upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
else
upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
}
return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
- local_flags, (struct buf *)0));
+ local_flags, (struct buf *)0, (struct clios *)0));
}
int
size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
- local_flags | CL_READ | CL_PAGEIN, (struct buf *)0);
+ local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
if (retval == 0) {
int b_lblkno;
f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
- return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp));
+ return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
}
int
int retval = 0;
- if ((!uio) || (uio->uio_segflg != UIO_USERSPACE) || (!(vp->v_flag & VNOCACHE_DATA)))
+ if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
{
retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
return(retval);
if (upl_flags & UPL_PHYS_CONTIG)
{
- /*
- * since the interface to the IOKit below us uses physical block #'s and
- * block counts to specify the I/O, we can't handle anything that isn't
- * devblocksize aligned
- */
- if ((uio->uio_offset & (devblocksize - 1)) || (uio->uio_resid & (devblocksize - 1)))
- return(EINVAL);
-
if (flags & IO_HEADZEROFILL)
{
flags &= ~IO_HEADZEROFILL;
return(retval);
}
- retval = cluster_phys_write(vp, uio, newEOF);
+ retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
{
return(retval);
}
+
static int
cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
struct vnode *vp;
(int)upl_offset, (int)uio->uio_offset, io_size, 0, 0);
error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
- io_size, devblocksize, 0, (struct buf *)0);
+ io_size, devblocksize, 0, (struct buf *)0, (struct clios *)0);
if (error == 0) {
/*
return (error);
}
+
static int
-cluster_phys_write(vp, uio, newEOF)
+cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
struct vnode *vp;
struct uio *uio;
off_t newEOF;
+ int devblocksize;
+ int flags;
{
+ upl_page_info_t *pl;
+ vm_offset_t src_paddr;
upl_t upl;
vm_offset_t upl_offset;
+ int tail_size;
int io_size;
int upl_size;
int upl_needed_size;
(vm_offset_t)iov->iov_base & ~PAGE_MASK,
&upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
- if (kret != KERN_SUCCESS)
- {
- /* cluster_phys_write: failed to get pagelist */
- /* note: return kret here */
+ if (kret != KERN_SUCCESS) {
+ /*
+ * cluster_phys_write: failed to get pagelist
+ * note: return kret here
+ */
return(EINVAL);
- }
-
+ }
/*
* Consider the possibility that upl_size wasn't satisfied.
* This is a failure in the physical memory case.
*/
- if (upl_size < upl_needed_size)
- {
- kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
- return(EINVAL);
- }
+ if (upl_size < upl_needed_size) {
+ kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+ return(EINVAL);
+ }
+ pl = ubc_upl_pageinfo(upl);
- /*
- * issue a synchronous write to cluster_io
- */
+ src_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
- error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
- io_size, 0, CL_DEV_MEMORY, (struct buf *)0);
+ while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
+ int head_size;
- if (error == 0) {
- /*
- * The cluster_io write completed successfully,
- * update the uio structure and commit.
- */
+ head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
- ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
-
- iov->iov_base += io_size;
- iov->iov_len -= io_size;
- uio->uio_resid -= io_size;
- uio->uio_offset += io_size;
+ if (head_size > io_size)
+ head_size = io_size;
+
+ error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
+
+ if (error) {
+ ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+
+ return(EINVAL);
+ }
+ upl_offset += head_size;
+ src_paddr += head_size;
+ io_size -= head_size;
}
- else
- ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+ tail_size = io_size & (devblocksize - 1);
+ io_size -= tail_size;
+
+ if (io_size) {
+ /*
+ * issue a synchronous write to cluster_io
+ */
+ error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
+ io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
+ }
+ if (error == 0) {
+ /*
+ * The cluster_io write completed successfully,
+ * update the uio structure
+ */
+ uio->uio_resid -= io_size;
+ iov->iov_len -= io_size;
+ iov->iov_base += io_size;
+ uio->uio_offset += io_size;
+ src_paddr += io_size;
+
+ if (tail_size)
+ error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
+ }
+ /*
+ * just release our hold on the physically contiguous
+ * region without changing any state
+ */
+ ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
return (error);
}
+
static int
cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
struct vnode *vp;
read_size = newEOF - upl_f_offset;
retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
- CL_READ, (struct buf *)0);
+ CL_READ, (struct buf *)0, (struct clios *)0);
if (retval) {
/*
* we had an error during the read which causes us to abort
read_size = newEOF - (upl_f_offset + upl_offset);
retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
- CL_READ, (struct buf *)0);
+ CL_READ, (struct buf *)0, (struct clios *)0);
if (retval) {
/*
* we had an error during the read which causes us to abort
if (last_blkno > vp->v_lastw)
vp->v_lastw = last_blkno;
- ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
+ ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
continue;
issue_io:
/*
tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
}
retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
- io_flags, (struct buf *)0);
+ io_flags, (struct buf *)0, (struct clios *)0);
}
}
KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
if (upl_flags & UPL_PHYS_CONTIG)
{
- retval = cluster_phys_read(vp, uio, filesize);
+ retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
}
else if (uio->uio_resid < 4 * PAGE_SIZE)
{
return(retval);
}
+
static int
cluster_read_x(vp, uio, filesize, devblocksize, flags)
struct vnode *vp;
*/
error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
- io_size, devblocksize, CL_READ, (struct buf *)0);
+ io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0);
}
if (error == 0) {
/*
return (retval);
}
+
static int
cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
struct vnode *vp;
(int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
error = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
- io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0);
+ io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0, (struct clios *)0);
if (error == 0) {
/*
}
+
static int
-cluster_phys_read(vp, uio, filesize)
+cluster_phys_read(vp, uio, filesize, devblocksize, flags)
struct vnode *vp;
struct uio *uio;
off_t filesize;
+ int devblocksize;
+ int flags;
{
+ upl_page_info_t *pl;
upl_t upl;
vm_offset_t upl_offset;
+ vm_offset_t dst_paddr;
off_t max_size;
int io_size;
+ int tail_size;
int upl_size;
int upl_needed_size;
int pages_in_pl;
int upl_flags;
kern_return_t kret;
struct iovec *iov;
+ struct clios iostate;
int error;
/*
max_size = filesize - uio->uio_offset;
- if (max_size < (off_t)((unsigned int)iov->iov_len))
- io_size = max_size;
+ if (max_size > (off_t)((unsigned int)iov->iov_len))
+ io_size = iov->iov_len;
else
- io_size = iov->iov_len;
+ io_size = max_size;
upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
upl_needed_size = upl_offset + io_size;
+ error = 0;
pages_in_pl = 0;
upl_size = upl_needed_size;
upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
(vm_offset_t)iov->iov_base & ~PAGE_MASK,
&upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
- if (kret != KERN_SUCCESS)
- {
- /* cluster_phys_read: failed to get pagelist */
- return(EINVAL);
- }
+ if (kret != KERN_SUCCESS) {
+ /*
+ * cluster_phys_read: failed to get pagelist
+ */
+ return(EINVAL);
+ }
+ if (upl_size < upl_needed_size) {
+ /*
+ * The upl_size wasn't satisfied.
+ */
+ ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+
+ return(EINVAL);
+ }
+ pl = ubc_upl_pageinfo(upl);
+
+ dst_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
+ while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
+ int head_size;
+
+ head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
+
+ if (head_size > io_size)
+ head_size = io_size;
+
+ error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
+
+ if (error) {
+ ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+
+ return(EINVAL);
+ }
+ upl_offset += head_size;
+ dst_paddr += head_size;
+ io_size -= head_size;
+ }
+ tail_size = io_size & (devblocksize - 1);
+ io_size -= tail_size;
+
+ iostate.io_completed = 0;
+ iostate.io_issued = 0;
+ iostate.io_error = 0;
+ iostate.io_wanted = 0;
+
+ while (io_size && error == 0) {
+ int xsize;
+
+ if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
+ xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
+ else
+ xsize = io_size;
+ /*
+ * request asynchronously so that we can overlap
+ * the preparation of the next I/O... we'll do
+ * the commit after all the I/O has completed
+ * since it's all issued against the same UPL.
+ * If there are already too many outstanding reads,
+ * throttle back until we reach a more reasonable level.
+ */
+ while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
+ iostate.io_wanted = 1;
+ tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
+ }
+
+ error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
+ CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
+ (struct buf *)0, &iostate);
+ /*
+ * The cluster_io read was issued successfully,
+ * update the uio structure
+ */
+ if (error == 0) {
+ uio->uio_resid -= xsize;
+ iov->iov_len -= xsize;
+ iov->iov_base += xsize;
+ uio->uio_offset += xsize;
+ dst_paddr += xsize;
+ upl_offset += xsize;
+ io_size -= xsize;
+ }
+ }
/*
- * Consider the possibility that upl_size wasn't satisfied.
+ * make sure any async reads have completed before
+ * we proceed
*/
- if (upl_size < upl_needed_size)
- {
- ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
- return(EINVAL);
- }
+ while (iostate.io_issued != iostate.io_completed) {
+ iostate.io_wanted = 1;
+ tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
+ }
+ if (iostate.io_error) {
+ error = iostate.io_error;
+ }
+ if (error == 0 && tail_size)
+ error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
/*
- * issue a synchronous read to cluster_io
+ * just release our hold on the physically contiguous
+ * region without changing any state
*/
-
- error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
- io_size, 0, CL_READ| CL_NOZERO | CL_DEV_MEMORY, (struct buf *)0);
-
- if (error == 0)
- {
- /*
- * The cluster_io read completed successfully,
- * update the uio structure and commit.
- */
-
- ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
-
- iov->iov_base += io_size;
- iov->iov_len -= io_size;
- uio->uio_resid -= io_size;
- uio->uio_offset += io_size;
- }
- else
- ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
+ ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
return (error);
}
+
/*
* generate advisory I/O's in the largest chunks possible
* the completed pages will be released into the VM cache
* issue an asynchronous read to cluster_io
*/
retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
- CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0);
+ CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
issued_io = 1;
}
vp->v_flag |= VTHROTTLED;
tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
}
- cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0);
+ cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
size -= io_size;
}
return(1);
}
+
+
+
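+/*
+ * cluster_align_phys_io
+ *
+ * Handle the unaligned head or tail of a physically contiguous transfer
+ * by bouncing it through the UBC page covering the current file offset:
+ * create a one-page UPL, read the page from disk if it isn't valid, copy
+ * between the caller's physical address and the page with copyp2p(), push
+ * the page back out for writes (or if it was already dirty), and advance
+ * the uio by xsize bytes.
+ */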
+static int
+cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, int xsize, int devblocksize, int flags)
+{
+ struct iovec *iov;
+ upl_page_info_t *pl;
+ upl_t upl;
+ vm_offset_t ubc_paddr;
+ kern_return_t kret;
+ int error = 0;
+
+ iov = uio->uio_iov;
+
+ kret = ubc_create_upl(vp,
+ uio->uio_offset & ~PAGE_MASK_64,
+ PAGE_SIZE,
+ &upl,
+ &pl,
+ UPL_FLAGS_NONE);
+
+ if (kret != KERN_SUCCESS)
+ return(EINVAL);
+
+ if (!upl_valid_page(pl, 0)) {
+ /*
+ * issue a synchronous read to cluster_io
+ */
+ error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
+ CL_READ, (struct buf *)0, (struct clios *)0);
+ if (error) {
+ ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
+
+ return(error);
+ }
+ }
+ ubc_paddr = (vm_offset_t)upl_phys_page(pl, 0) + (int)(uio->uio_offset & PAGE_MASK_64);
+
+ if (flags & CL_READ)
+ copyp2p(ubc_paddr, usr_paddr, xsize, 2);
+ else
+ copyp2p(usr_paddr, ubc_paddr, xsize, 1);
+
+ if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) {
+ /*
+ * issue a synchronous write to cluster_io
+ */
+ error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
+ 0, (struct buf *)0, (struct clios *)0);
+ }
+ if (error == 0) {
+ uio->uio_offset += xsize;
+ iov->iov_base += xsize;
+ iov->iov_len -= xsize;
+ uio->uio_resid -= xsize;
+ }
+ ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
+
+ return (error);
+}
--- /dev/null
+/*
+ * Copyright (c) 1995-2002 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+//
+// This file implements a simple write-ahead journaling layer.
+// In theory any file system can make use of it by calling these
+// functions when the fs wants to modify meta-data blocks. See
+// vfs_journal.h for a more detailed description of the api and
+// data structures.
+//
+// Dominic Giampaolo (dbg@apple.com)
+//
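+//
+// For orientation, the calling pattern a file system uses against this
+// layer looks roughly like the sketch below.  journal_modify_block_start()
+// and journal_modify_block_end() are the calls added to the HFS and
+// allocation code earlier in this diff; the transaction begin/end entry
+// points are assumed to be the ones declared in vfs_journal.h.
+//
+//     if (hfsmp->jnl) {
+//         journal_start_transaction(hfsmp->jnl);
+//
+//         // declare intent to modify a metadata buffer before changing it
+//         journal_modify_block_start(hfsmp->jnl, bp);
+//         // ... update the metadata in bp->b_data ...
+//         journal_modify_block_end(hfsmp->jnl, bp);
+//
+//         journal_end_transaction(hfsmp->jnl);
+//     } else {
+//         bdwrite(bp);   // non-journaled volumes keep using delayed writes
+//     }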
+
+#ifdef KERNEL
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/ioctl.h>
+#include <sys/tty.h>
+#include <sys/ubc.h>
+#include <sys/malloc.h>
+#include <kern/thread_act.h>
+#include <sys/disk.h>
+#include <miscfs/specfs/specdev.h>
+
+extern task_t kernel_task;
+
+#else
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <sys/types.h>
+#include "compat.h"
+
+#endif /* KERNEL */
+
+#include "vfs_journal.h"
+
+
+// number of bytes to checksum in a block_list_header
+// NOTE: this should be enough to clear out the header
+// fields as well as the first entry of binfo[]
+#define BLHDR_CHECKSUM_SIZE 32
+
+
+
+static int end_transaction(transaction *tr, int force_it);
+static void abort_transaction(journal *jnl, transaction *tr);
+static void dump_journal(journal *jnl);
+
+
+#define CHECK_JOURNAL(jnl) \
+ do { \
+ if (jnl == NULL) {\
+ panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\
+ }\
+ if (jnl->jdev == NULL) { \
+ panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\
+ } \
+ if (jnl->fsdev == NULL) { \
+ panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\
+ } \
+ if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) {\
+ panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\
+ __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);\
+ }\
+ if ( jnl->jhdr->start <= 0 \
+ || jnl->jhdr->start > jnl->jhdr->size\
+ || jnl->jhdr->start > 128*1024*1024) {\
+ panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \
+ __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);\
+ }\
+ if ( jnl->jhdr->end <= 0 \
+ || jnl->jhdr->end > jnl->jhdr->size\
+ || jnl->jhdr->end > 128*1024*1024) {\
+ panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \
+ __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);\
+ }\
+ if (jnl->jhdr->size > 128*1024*1024) {\
+ panic("%s:%d: jhdr size looks bad (0x%llx)\n",\
+ __FILE__, __LINE__, jnl->jhdr->size);\
+ } \
+ } while(0)
+
+#define CHECK_TRANSACTION(tr) \
+ do {\
+ if (tr == NULL) {\
+ panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\
+ }\
+ if (tr->jnl == NULL) {\
+ panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\
+ }\
+ if (tr->blhdr != (block_list_header *)tr->tbuffer) {\
+ panic("%s:%d: blhdr (0x%x) != tbuffer (0x%x)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer);\
+ }\
+ if (tr->total_bytes < 0) {\
+ panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes);\
+ }\
+ if (tr->journal_start < 0 || tr->journal_start > 128*1024*1024) {\
+ panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start);\
+ }\
+ if (tr->journal_end < 0 || tr->journal_end > 128*1024*1024) {\
+ panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end);\
+ }\
+ if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > 2048)) {\
+ panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);\
+ }\
+ } while(0)
+
+
+
+//
+// this isn't a great checksum routine but it will do for now.
+// we use it to checksum the journal header and the block list
+// headers that are at the start of each transaction.
+//
+static int
+calc_checksum(char *ptr, int len)
+{
+ int i, cksum=0;
+
+ // this is a lame checksum but for now it'll do
+ for(i=0; i < len; i++, ptr++) {
+ cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr);
+ }
+
+ return (~cksum);
+}
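+
+//
+// Example (illustrative only): since the checksum covers the field
+// that stores it, the field has to be zeroed before computing or
+// verifying. A hypothetical verification helper would look like:
+//
+//   static int
+//   journal_header_checksum_ok(journal_header *jhdr)
+//   {
+//       int orig, calc;
+//
+//       orig = jhdr->checksum;
+//       jhdr->checksum = 0;
+//       calc = calc_checksum((char *)jhdr, sizeof(struct journal_header));
+//       jhdr->checksum = orig;    // restore the on-disk value
+//
+//       return (orig == calc);
+//   }
+//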
+
+
+#define JNL_WRITE 1
+#define JNL_READ 2
+
+//
+// This function sets up a fake buf and passes it directly to the
+// journal device strategy routine (so that it won't get cached in
+// the block cache).
+//
+// It also handles range checking the i/o so that we don't write
+// outside the journal boundaries and it will wrap the i/o back
+// to the beginning if necessary (skipping over the journal header)
+//
+static size_t
+do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction)
+{
+ int err, io_sz=0, curlen=len;
+ struct buf *bp;
+ int max_iosize=0, max_vectors;
+
+ if (*offset < 0 || *offset > jnl->jhdr->size) {
+ panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size);
+ }
+
+ again:
+ bp = alloc_io_buf(jnl->jdev, 1);
+
+ if (direction == JNL_WRITE) {
+ bp->b_flags |= 0; // don't have to set any flags (was: B_WRITEINPROG)
+ jnl->jdev->v_numoutput++;
+ vfs_io_attributes(jnl->jdev, B_WRITE, &max_iosize, &max_vectors);
+ } else if (direction == JNL_READ) {
+ bp->b_flags |= B_READ;
+ vfs_io_attributes(jnl->jdev, B_READ, &max_iosize, &max_vectors);
+ }
+
+ if (max_iosize == 0) {
+ max_iosize = 128 * 1024;
+ }
+
+ if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) {
+ if (*offset == jnl->jhdr->size) {
+ *offset = jnl->jhdr->jhdr_size;
+ } else {
+ curlen = (off_t)jnl->jhdr->size - *offset;
+ }
+ }
+
+ if (curlen > max_iosize) {
+ curlen = max_iosize;
+ }
+
+ if (curlen <= 0) {
+ panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %d\n", curlen, *offset, len);
+ }
+
+ bp->b_bufsize = curlen;
+ bp->b_bcount = curlen;
+ bp->b_data = data;
+ bp->b_blkno = (daddr_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size);
+ bp->b_lblkno = (daddr_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size);
+
+ err = VOP_STRATEGY(bp);
+ if (!err) {
+ err = biowait(bp);
+ }
+
+ bp->b_data = NULL;
+ bp->b_bufsize = bp->b_bcount = 0;
+ bp->b_blkno = bp->b_lblkno = -1;
+
+ free_io_buf(bp);
+
+ if (err) {
+ printf("jnl: do_jnl_io: strategy err 0x%x\n", err);
+ return 0;
+ }
+
+ *offset += curlen;
+ io_sz += curlen;
+ if (io_sz != len) {
+ // handle wrap-around
+ data = (char *)data + curlen;
+ curlen = len - io_sz;
+ if (*offset >= jnl->jhdr->size) {
+ *offset = jnl->jhdr->jhdr_size;
+ }
+ goto again;
+ }
+
+ return io_sz;
+}
+
+static size_t
+read_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
+{
+ return do_journal_io(jnl, offset, data, len, JNL_READ);
+}
+
+static size_t
+write_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
+{
+ return do_journal_io(jnl, offset, data, len, JNL_WRITE);
+}
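+
+//
+// For example (illustrative numbers only): with an 8MB journal
+// (jhdr->size == 0x800000) and a 512-byte jhdr_size, a 4096-byte
+// write_journal_data() starting at offset 0x7ffc00 is split by
+// do_journal_io() into a 1024-byte i/o that runs up to the end of
+// the journal followed by a 3072-byte i/o at offset 0x200 (just
+// past the journal header), and *offset comes back as 0xe00.
+//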
+
+
+static int
+write_journal_header(journal *jnl)
+{
+ int ret;
+ off_t jhdr_offset = 0;
+
+ //
+ // XXXdbg note: this ioctl doesn't seem to do anything on firewire disks.
+ //
+ ret = VOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NOCRED, current_proc());
+ if (ret != 0) {
+ printf("jnl: flushing fs disk buffer returned 0x%x\n", ret);
+ }
+
+
+ jnl->jhdr->checksum = 0;
+ jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header));
+ if (write_journal_data(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size) != jnl->jhdr->jhdr_size) {
+ printf("jnl: write_journal_header: error writing the journal header!\n");
+ jnl->flags |= JOURNAL_INVALID;
+ return -1;
+ }
+
+ return 0;
+}
+
+
+
+//
+// this is a work function used to free up transactions that
+// completed. they can't be free'd from buffer_flushed_callback
+// because it is called from deep within the disk driver stack
+// and thus can't do something that would potentially cause
+// paging. it gets called by each of the journal api entry
+// points so stuff shouldn't hang around for too long.
+//
+static void
+free_old_stuff(journal *jnl)
+{
+ transaction *tr, *next;
+
+ for(tr=jnl->tr_freeme; tr; tr=next) {
+ next = tr->next;
+ kmem_free(kernel_map, (vm_offset_t)tr, sizeof(transaction));
+ }
+
+ jnl->tr_freeme = NULL;
+}
+
+
+
+//
+// This is our callback that lets us know when a buffer has been
+// flushed to disk. It's called from deep within the driver stack
+// and thus is quite limited in what it can do. Notably, it can
+// not initiate any new i/o's or allocate/free memory.
+//
+static void
+buffer_flushed_callback(struct buf *bp)
+{
+ transaction *tr;
+ journal *jnl;
+ transaction *ctr, *prev=NULL, *next;
+ int i, bufsize;
+
+
+ //printf("jnl: buf flush: bp @ 0x%x l/blkno %d/%d vp 0x%x tr @ 0x%x\n",
+ // bp, bp->b_lblkno, bp->b_blkno, bp->b_vp, bp->b_transaction);
+
+ // snarf out the bits we want
+ bufsize = bp->b_bufsize;
+ tr = bp->b_transaction;
+
+ bp->b_iodone = NULL; // don't call us for this guy again
+ bp->b_transaction = NULL;
+
+ //
+ // This is what biodone() would do if it didn't call us.
+ // NOTE: THIS CODE *HAS* TO BE HERE!
+ //
+ if (ISSET(bp->b_flags, B_ASYNC)) { /* if async, release it */
+ brelse(bp);
+ } else { /* or just wakeup the buffer */
+ CLR(bp->b_flags, B_WANTED);
+ wakeup(bp);
+ }
+
+ // NOTE: from here on out we do *NOT* touch bp anymore.
+
+
+ // if there's no transaction then we've already dealt with this buffer
+ if (tr == NULL) {
+ return;
+ }
+
+ CHECK_TRANSACTION(tr);
+
+ jnl = tr->jnl;
+ if (jnl->flags & JOURNAL_INVALID) {
+ return;
+ }
+
+ CHECK_JOURNAL(jnl);
+
+ // update the number of blocks that have been flushed.
+ // this buf may represent more than one block so take
+ // that into account.
+ tr->num_flushed += bufsize;
+
+
+ // if this transaction isn't done yet, just return as
+ // there is nothing to do.
+ if ((tr->num_flushed + tr->num_killed) < tr->total_bytes) {
+ return;
+ }
+
+ //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n",
+ // tr, tr->journal_start, tr->journal_end, jnl);
+
+ // find this entry in the old_start[] index and mark it completed
+ simple_lock(&jnl->old_start_lock);
+ for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
+
+ if ((jnl->old_start[i] & ~(0x8000000000000000LL)) == tr->journal_start) {
+ jnl->old_start[i] &= ~(0x8000000000000000LL);
+ break;
+ }
+ }
+ if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
+ panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr 0x%x, jnl 0x%x)\n",
+ tr->journal_start, tr, jnl);
+ }
+ simple_unlock(&jnl->old_start_lock);
+
+
+ // if we are here then we need to update the journal header
+ // to reflect that this transaction is complete
+ if (tr->journal_start == jnl->active_start) {
+ jnl->active_start = tr->journal_end;
+ tr->journal_start = tr->journal_end = (off_t)0;
+ }
+
+ // go through the completed_trs list and try to coalesce
+ // entries, restarting back at the beginning if we have to.
+ for(ctr=jnl->completed_trs; ctr; prev=ctr, ctr=next) {
+ if (ctr->journal_start == jnl->active_start) {
+ jnl->active_start = ctr->journal_end;
+ if (prev) {
+ prev->next = ctr->next;
+ }
+ if (ctr == jnl->completed_trs) {
+ jnl->completed_trs = ctr->next;
+ }
+
+ next = jnl->completed_trs; // this starts us over again
+ ctr->next = jnl->tr_freeme;
+ jnl->tr_freeme = ctr;
+ ctr = NULL;
+ } else if (tr->journal_end == ctr->journal_start) {
+ ctr->journal_start = tr->journal_start;
+ next = jnl->completed_trs; // this starts us over again
+ ctr = NULL;
+ tr->journal_start = tr->journal_end = (off_t)0;
+ } else if (tr->journal_start == ctr->journal_end) {
+ ctr->journal_end = tr->journal_end;
+ next = ctr->next;
+ tr->journal_start = tr->journal_end = (off_t)0;
+ } else {
+ next = ctr->next;
+ }
+ }
+
+ // at this point no one should be using this guy anymore
+ tr->total_bytes = 0xfbadc0de;
+
+ // if this is true then we didn't merge with anyone,
+ // so insert this transaction into the completed list
+ // at its correct sorted position.
+ if (tr->journal_start != 0) {
+
+ prev = NULL;
+ for(ctr=jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) {
+ // just keep looping
+ }
+
+ if (ctr == NULL && prev == NULL) {
+ jnl->completed_trs = tr;
+ tr->next = NULL;
+ } else if (ctr == jnl->completed_trs) {
+ tr->next = jnl->completed_trs;
+ jnl->completed_trs = tr;
+ } else {
+ tr->next = prev->next;
+ prev->next = tr;
+ }
+ } else {
+ // if we're here this tr got merged with someone else so
+ // put it on the list to be free'd
+ tr->next = jnl->tr_freeme;
+ jnl->tr_freeme = tr;
+ }
+}
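+
+//
+// For example (illustrative only): end_transaction() records a
+// transaction starting at journal offset 0x4000 by stashing
+// (0x4000 | 0x8000000000000000LL) in old_start[]. When the last
+// buffer of that transaction makes it to disk, the loop above
+// clears the high bit, which is what allows check_free_space()
+// to later advance jnl->jhdr->start past that transaction.
+//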
+
+static int
+update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize)
+{
+ int ret;
+ struct buf *oblock_bp=NULL;
+
+ // first read the block we want.
+ ret = meta_bread(jnl->fsdev, (daddr_t)fs_block, bsize, NOCRED, &oblock_bp);
+ if (ret != 0) {
+ printf("jnl: update_fs_block: error reading fs block # %lld! (ret %d)\n", fs_block, ret);
+
+ if (oblock_bp) {
+ brelse(oblock_bp);
+ oblock_bp = NULL;
+ }
+
+ // let's try to be aggressive here and just re-write the block
+ oblock_bp = getblk(jnl->fsdev, (daddr_t)fs_block, bsize, 0, 0, BLK_META);
+ if (oblock_bp == NULL) {
+ printf("jnl: update_fs_block: getblk() for %lld failed! failing update.\n", fs_block);
+ return -1;
+ }
+ }
+
+ // make sure it's the correct size.
+ if (oblock_bp->b_bufsize != bsize) {
+ brelse(oblock_bp);
+ return -1;
+ }
+
+ // copy the journal data over top of it
+ memcpy(oblock_bp->b_data, block_ptr, bsize);
+
+ if ((ret = VOP_BWRITE(oblock_bp)) != 0) {
+ printf("jnl: update_fs_block: failed to update block %lld (ret %d)\n", fs_block,ret);
+ brelse(oblock_bp);
+ return ret;
+ }
+
+ // and now invalidate it so that if someone else wants to read
+ // it in a different size they'll be able to do it.
+ ret = meta_bread(jnl->fsdev, (daddr_t)fs_block, bsize, NOCRED, &oblock_bp);
+ if (oblock_bp) {
+ oblock_bp->b_flags |= B_INVAL;
+ brelse(oblock_bp);
+ }
+
+ return 0;
+}
+
+
+static int
+replay_journal(journal *jnl)
+{
+ int i, ret, checksum, max_bsize;
+ struct buf *oblock_bp;
+ block_list_header *blhdr;
+ off_t offset;
+ char *buf, *block_ptr=NULL;
+
+ // wrap the start ptr if it points to the very end of the journal
+ if (jnl->jhdr->start == jnl->jhdr->size) {
+ jnl->jhdr->start = jnl->jhdr->jhdr_size;
+ }
+ if (jnl->jhdr->end == jnl->jhdr->size) {
+ jnl->jhdr->end = jnl->jhdr->jhdr_size;
+ }
+
+ if (jnl->jhdr->start == jnl->jhdr->end) {
+ return 0;
+ }
+
+ // allocate memory for the header_block. we'll read each blhdr into this
+ if (kmem_alloc(kernel_map, (vm_offset_t *)&buf, jnl->jhdr->blhdr_size)) {
+ printf("jnl: replay_journal: no memory for block buffer! (%d bytes)\n",
+ jnl->jhdr->blhdr_size);
+ return -1;
+ }
+
+
+ printf("jnl: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n",
+ jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset);
+
+ while(jnl->jhdr->start != jnl->jhdr->end) {
+ offset = jnl->jhdr->start;
+ ret = read_journal_data(jnl, &offset, buf, jnl->jhdr->blhdr_size);
+ if (ret != jnl->jhdr->blhdr_size) {
+ printf("jnl: replay_journal: Could not read block list header block @ 0x%llx!\n", offset);
+ goto bad_replay;
+ }
+
+ blhdr = (block_list_header *)buf;
+ checksum = blhdr->checksum;
+ blhdr->checksum = 0;
+ if (checksum != calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE)) {
+ printf("jnl: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n",
+ offset, checksum, calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE));
+ goto bad_replay;
+ }
+ if ( blhdr->max_blocks <= 0 || blhdr->max_blocks > 2048
+ || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) {
+ printf("jnl: replay_journal: bad looking journal entry: max: %d num: %d\n",
+ blhdr->max_blocks, blhdr->num_blocks);
+ goto bad_replay;
+ }
+
+ for(i=1,max_bsize=0; i < blhdr->num_blocks; i++) {
+ if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) {
+ printf("jnl: replay_journal: bogus block number 0x%llx\n", blhdr->binfo[i].bnum);
+ goto bad_replay;
+ }
+ if (blhdr->binfo[i].bsize > max_bsize) {
+ max_bsize = blhdr->binfo[i].bsize;
+ }
+ }
+
+ // round it up to a multiple of the page size.
+ if (max_bsize & (PAGE_SIZE - 1)) {
+ max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1);
+ }
+
+ if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) {
+ goto bad_replay;
+ }
+
+ //printf("jnl: replay_journal: %d blocks in journal entry @ 0x%llx\n", blhdr->num_blocks-1,
+ // jnl->jhdr->start);
+ for(i=1; i < blhdr->num_blocks; i++) {
+ int size;
+
+ size = blhdr->binfo[i].bsize;
+
+ ret = read_journal_data(jnl, &offset, block_ptr, size);
+ if (ret != size) {
+ printf("jnl: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", offset);
+ goto bad_replay;
+ }
+
+ // don't replay "killed" blocks
+ if (blhdr->binfo[i].bnum == (off_t)-1) {
+ // printf("jnl: replay_journal: skipping killed fs block (slot %d)\n", i);
+ } else {
+ //printf("jnl: replay_journal: fixing fs block # %lld (%d)\n",
+ // blhdr->binfo[i].bnum, blhdr->binfo[i].bsize);
+
+ if (update_fs_block(jnl, block_ptr, blhdr->binfo[i].bnum, blhdr->binfo[i].bsize) != 0) {
+ goto bad_replay;
+ }
+ }
+
+ // check if we need to wrap offset back to the beginning
+ // (which is just past the journal header)
+ //
+ if (offset >= jnl->jhdr->size) {
+ offset = jnl->jhdr->jhdr_size;
+ }
+ }
+
+ kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
+ block_ptr = NULL;
+
+ jnl->jhdr->start += blhdr->bytes_used;
+ if (jnl->jhdr->start >= jnl->jhdr->size) {
+ // wrap around and skip the journal header block
+ jnl->jhdr->start = (jnl->jhdr->start % jnl->jhdr->size) + jnl->jhdr->jhdr_size;
+ }
+
+ // only update the on-disk journal header if we've reached the
+ // last chunk of updates from this transaction. if binfo[0].bnum
+ // is zero then we know we're at the end.
+ if (blhdr->binfo[0].bnum == 0) {
+ if (write_journal_header(jnl) != 0) {
+ goto bad_replay;
+ }
+ }
+ }
+
+ kmem_free(kernel_map, (vm_offset_t)buf, jnl->jhdr->blhdr_size);
+ return 0;
+
+ bad_replay:
+ if (block_ptr) {
+ kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize);
+ }
+ kmem_free(kernel_map, (vm_offset_t)buf, jnl->jhdr->blhdr_size);
+ return -1;
+}
+
+
+#define DEFAULT_TRANSACTION_BUFFER_SIZE (128*1024)
+//#define DEFAULT_TRANSACTION_BUFFER_SIZE (256*1024) // better performance but uses more mem
+#define MAX_TRANSACTION_BUFFER_SIZE (512*1024)
+
+// XXXdbg - so I can change it in the debugger
+int def_tbuffer_size = 0;
+
+
+//
+// This function sets the size of the tbuffer and the
+// size of the blhdr. It assumes that jnl->jhdr->size
+// and jnl->jhdr->jhdr_size are already valid.
+//
+static void
+size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz)
+{
+ //
+ // one-time initialization based on how much memory
+ // there is in the machine.
+ //
+ if (def_tbuffer_size == 0) {
+ if (mem_size < (256*1024*1024)) {
+ def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE;
+ } else if (mem_size < (512*1024*1024)) {
+ def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 2;
+ } else if (mem_size < (1024*1024*1024)) {
+ def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 3;
+ } else if (mem_size >= (1024*1024*1024)) {
+ def_tbuffer_size = DEFAULT_TRANSACTION_BUFFER_SIZE * 4;
+ }
+ }
+
+ // size up the transaction buffer... can't be larger than the number
+ // of blocks that can fit in a block_list_header block.
+ if (tbuffer_size == 0) {
+ jnl->tbuffer_size = def_tbuffer_size;
+ } else {
+ // make sure that the specified tbuffer_size isn't too small
+ if (tbuffer_size < jnl->jhdr->blhdr_size * 2) {
+ tbuffer_size = jnl->jhdr->blhdr_size * 2;
+ }
+ // and make sure it's an even multiple of the block size
+ if ((tbuffer_size % jnl->jhdr->jhdr_size) != 0) {
+ tbuffer_size -= (tbuffer_size % jnl->jhdr->jhdr_size);
+ }
+
+ jnl->tbuffer_size = tbuffer_size;
+ }
+
+ if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) {
+ jnl->tbuffer_size = (jnl->jhdr->size / 2);
+ }
+
+ if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) {
+ jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE;
+ }
+
+ jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info);
+ if (jnl->jhdr->blhdr_size < phys_blksz) {
+ jnl->jhdr->blhdr_size = phys_blksz;
+ }
+}
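+
+//
+// For example (illustrative numbers only): with the 128K default
+// tbuffer and a 512-byte journal block size, blhdr_size comes out
+// to (131072 / 512) * sizeof(block_info) == 256 * 16 == 4096 bytes
+// (assuming the 16-byte block_info layout in vfs_journal.h), so
+// max_blocks for each block_list_header works out to
+// 4096/16 - 1 == 255 binfo entries.
+//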
+
+
+
+journal *
+journal_create(struct vnode *jvp,
+ off_t offset,
+ off_t journal_size,
+ struct vnode *fsvp,
+ size_t min_fs_blksz,
+ int32_t flags,
+ int32_t tbuffer_size,
+ void (*flush)(void *arg),
+ void *arg)
+{
+ journal *jnl;
+ int ret, phys_blksz;
+
+ /* Get the real physical block size. */
+ if (VOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, FSCRED, NULL)) {
+ return NULL;
+ }
+
+ if (phys_blksz > min_fs_blksz) {
+ printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n",
+ phys_blksz, min_fs_blksz);
+ return NULL;
+ }
+
+ if ((journal_size % phys_blksz) != 0) {
+ printf("jnl: create: journal size 0x%llx is not an even multiple of block size 0x%x\n",
+ journal_size, phys_blksz);
+ return NULL;
+ }
+
+ if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl, sizeof(struct journal))) {
+ return NULL;
+ }
+ memset(jnl, 0, sizeof(*jnl));
+
+ jnl->jdev = jvp;
+ jnl->jdev_offset = offset;
+ jnl->fsdev = fsvp;
+ jnl->flush = flush;
+ jnl->flush_arg = arg;
+ jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
+ simple_lock_init(&jnl->old_start_lock);
+
+ if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
+ printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz);
+ goto bad_kmem_alloc;
+ }
+
+ memset(jnl->header_buf, 0, phys_blksz);
+
+ jnl->jhdr = (journal_header *)jnl->header_buf;
+ jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
+ jnl->jhdr->endian = ENDIAN_MAGIC;
+ jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself
+ jnl->jhdr->end = phys_blksz;
+ jnl->jhdr->size = journal_size;
+ jnl->jhdr->jhdr_size = phys_blksz;
+ size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
+
+ jnl->active_start = jnl->jhdr->start;
+
+ // XXXdbg - for testing you can force the journal to wrap around
+ // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3);
+ // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3);
+
+ if (semaphore_create(kernel_task, &jnl->jsem, SYNC_POLICY_FIFO, 1) != 0) {
+ printf("jnl: journal_create: failed to create journal semaphore..\n");
+ goto bad_sem;
+ }
+
+ if (write_journal_header(jnl) != 0) {
+ printf("jnl: journal_create: failed to write journal header.\n");
+ goto bad_write;
+ }
+
+ return jnl;
+
+
+ bad_write:
+ semaphore_destroy(kernel_task, jnl->jsem);
+ bad_sem:
+ kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
+ bad_kmem_alloc:
+ jnl->jhdr = NULL;
+ kmem_free(kernel_map, (vm_offset_t)jnl, sizeof(struct journal));
+ return NULL;
+}
+
+
+journal *
+journal_open(struct vnode *jvp,
+ off_t offset,
+ off_t journal_size,
+ struct vnode *fsvp,
+ size_t min_fs_blksz,
+ int32_t flags,
+ int32_t tbuffer_size,
+ void (*flush)(void *arg),
+ void *arg)
+{
+ journal *jnl;
+ int orig_blksz=0, phys_blksz, blhdr_size;
+ off_t hdr_offset=0;
+
+ /* Get the real physical block size. */
+ if (VOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, FSCRED, NULL)) {
+ return NULL;
+ }
+
+ if (phys_blksz > min_fs_blksz) {
+ printf("jnl: create: error: phys blksize %d bigger than min fs blksize %d\n",
+ phys_blksz, min_fs_blksz);
+ return NULL;
+ }
+
+ if ((journal_size % phys_blksz) != 0) {
+ printf("jnl: open: journal size 0x%llx is not an even multiple of block size 0x%x\n",
+ journal_size, phys_blksz);
+ return NULL;
+ }
+
+ if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl, sizeof(struct journal))) {
+ return NULL;
+ }
+ memset(jnl, 0, sizeof(*jnl));
+
+ jnl->jdev = jvp;
+ jnl->jdev_offset = offset;
+ jnl->fsdev = fsvp;
+ jnl->flush = flush;
+ jnl->flush_arg = arg;
+ jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK);
+ simple_lock_init(&jnl->old_start_lock);
+
+ if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) {
+ printf("jnl: create: could not allocate space for header buffer (%d bytes)\n", phys_blksz);
+ goto bad_kmem_alloc;
+ }
+
+ jnl->jhdr = (journal_header *)jnl->header_buf;
+ memset(jnl->jhdr, 0, sizeof(journal_header)+4);
+
+ // we have to set this up here so that do_journal_io() will work
+ jnl->jhdr->jhdr_size = phys_blksz;
+
+ if (read_journal_data(jnl, &hdr_offset, jnl->jhdr, phys_blksz) != phys_blksz) {
+ printf("jnl: open: could not read %d bytes for the journal header.\n",
+ phys_blksz);
+ goto bad_journal;
+ }
+
+ if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
+ printf("jnl: open: journal magic is bad (0x%x != 0x%x)\n",
+ jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);
+ goto bad_journal;
+ }
+
+ // only check if we're the current journal header magic value
+ if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) {
+ int orig_checksum = jnl->jhdr->checksum;
+
+ jnl->jhdr->checksum = 0;
+ if (orig_checksum != calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header))) {
+ printf("jnl: open: journal checksum is bad (0x%x != 0x%x)\n", orig_checksum,
+ calc_checksum((char *)jnl->jhdr, sizeof(struct journal_header)));
+ //goto bad_journal;
+ }
+ }
+
+ // XXXdbg - convert old style magic numbers to the new one
+ if (jnl->jhdr->magic == OLD_JOURNAL_HEADER_MAGIC) {
+ jnl->jhdr->magic = JOURNAL_HEADER_MAGIC;
+ }
+
+ if (phys_blksz != jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) {
+ printf("jnl: open: phys_blksz %d does not match journal header size %d\n",
+ phys_blksz, jnl->jhdr->jhdr_size);
+
+ orig_blksz = phys_blksz;
+ phys_blksz = jnl->jhdr->jhdr_size;
+ if (VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&phys_blksz, FWRITE, FSCRED, NULL)) {
+ printf("jnl: could not set block size to %d bytes.\n", phys_blksz);
+ goto bad_journal;
+ }
+// goto bad_journal;
+ }
+
+ if ( jnl->jhdr->start <= 0
+ || jnl->jhdr->start > jnl->jhdr->size
+ || jnl->jhdr->start > 128*1024*1024) {
+ printf("jnl: open: jhdr start looks bad (0x%llx max size 0x%llx)\n",
+ jnl->jhdr->start, jnl->jhdr->size);
+ goto bad_journal;
+ }
+
+ if ( jnl->jhdr->end <= 0
+ || jnl->jhdr->end > jnl->jhdr->size
+ || jnl->jhdr->end > 128*1024*1024) {
+ printf("jnl: open: jhdr end looks bad (0x%llx max size 0x%llx)\n",
+ jnl->jhdr->end, jnl->jhdr->size);
+ goto bad_journal;
+ }
+
+ if (jnl->jhdr->size > 128*1024*1024) {
+ printf("jnl: open: jhdr size looks bad (0x%llx)\n", jnl->jhdr->size);
+ goto bad_journal;
+ }
+
+// XXXdbg - can't do these checks because hfs writes all kinds of
+// non-uniform sized blocks even on devices that have a block size
+// that is larger than 512 bytes (e.g. optical media w/2k blocks).
+// therefore these checks will fail and so we just have to punt and
+// do more relaxed checking...
+// XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) {
+ if ((jnl->jhdr->start % 512) != 0) {
+ printf("jnl: open: journal start (0x%llx) not a multiple of 512?\n",
+ jnl->jhdr->start);
+ goto bad_journal;
+ }
+
+//XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) {
+ if ((jnl->jhdr->end % 512) != 0) {
+ printf("jnl: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n",
+ jnl->jhdr->end, jnl->jhdr->jhdr_size);
+ goto bad_journal;
+ }
+
+ // take care of replaying the journal if necessary
+ if (flags & JOURNAL_RESET) {
+ printf("jnl: journal start/end pointers reset! (jnl 0x%x; s 0x%llx e 0x%llx)\n",
+ jnl, jnl->jhdr->start, jnl->jhdr->end);
+ jnl->jhdr->start = jnl->jhdr->end;
+ } else if (replay_journal(jnl) != 0) {
+ printf("jnl: journal_open: Error replaying the journal!\n");
+ goto bad_journal;
+ }
+
+ if (orig_blksz != 0) {
+ VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, FSCRED, NULL);
+ phys_blksz = orig_blksz;
+ }
+
+ // make sure this is in sync!
+ jnl->active_start = jnl->jhdr->start;
+
+ // set this now, after we've replayed the journal
+ size_up_tbuffer(jnl, tbuffer_size, phys_blksz);
+
+ if (semaphore_create(kernel_task, &jnl->jsem, SYNC_POLICY_FIFO, 1) != 0) {
+ printf("jnl: journal_create: failed to create journal semaphore..\n");
+ goto bad_journal;
+ }
+
+ return jnl;
+
+ bad_journal:
+ if (orig_blksz != 0) {
+ phys_blksz = orig_blksz;
+ VOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, FSCRED, NULL);
+ }
+ kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz);
+ bad_kmem_alloc:
+ kmem_free(kernel_map, (vm_offset_t)jnl, sizeof(struct journal));
+ return NULL;
+}
+
+void
+journal_close(journal *jnl)
+{
+ volatile off_t *start, *end;
+ int counter=0;
+
+ CHECK_JOURNAL(jnl);
+
+ // set this before doing anything that would block so that
+ // we start tearing things down properly.
+ //
+ jnl->flags |= JOURNAL_CLOSE_PENDING;
+
+ if (jnl->owner != current_act()) {
+ int ret;
+
+ while ((ret = semaphore_wait(jnl->jsem)) == KERN_ABORTED) {
+ // just keep trying if we've been ^C'ed
+ }
+ if (ret != 0) {
+ printf("jnl: close: sem wait failed.\n");
+ return;
+ }
+ }
+
+ //
+ // only write stuff to disk if the journal is still valid
+ //
+ if ((jnl->flags & JOURNAL_INVALID) == 0) {
+
+ if (jnl->active_tr) {
+ journal_end_transaction(jnl);
+ }
+
+ // flush any buffered transactions
+ if (jnl->cur_tr) {
+ transaction *tr = jnl->cur_tr;
+
+ jnl->cur_tr = NULL;
+ end_transaction(tr, 1); // force it to get flushed
+ }
+
+ //start = &jnl->jhdr->start;
+ start = &jnl->active_start;
+ end = &jnl->jhdr->end;
+
+ while (*start != *end && counter++ < 500) {
+ printf("jnl: close: flushing the buffer cache (start 0x%llx end 0x%llx)\n", *start, *end);
+ if (jnl->flush) {
+ jnl->flush(jnl->flush_arg);
+ }
+
+ // give the i/o kicked off by the flush a chance to
+ // complete before checking the pointers again
+ tsleep((caddr_t)jnl, PRIBIO, "jnl_close", 1);
+ }
+
+ if (*start != *end) {
+ printf("jnl: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n",
+ *start, *end);
+ }
+
+ // make sure this is in sync when we close the journal
+ jnl->jhdr->start = jnl->active_start;
+
+ // if this fails there's not much we can do at this point...
+ write_journal_header(jnl);
+ } else {
+ // if we're here the journal isn't valid any more.
+ // so make sure we don't leave any locked blocks lying around
+ printf("jnl: close: journal 0x%x, is invalid. aborting outstanding transactions\n", jnl);
+ if (jnl->active_tr || jnl->cur_tr) {
+ transaction *tr;
+ if (jnl->active_tr) {
+ tr = jnl->active_tr;
+ jnl->active_tr = NULL;
+ } else {
+ tr = jnl->cur_tr;
+ jnl->cur_tr = NULL;
+ }
+
+ abort_transaction(jnl, tr);
+ if (jnl->active_tr || jnl->cur_tr) {
+ panic("jnl: close: jnl @ 0x%x had both an active and cur tr\n", jnl);
+ }
+ }
+ }
+
+ free_old_stuff(jnl);
+
+ kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->jhdr->jhdr_size);
+ jnl->jhdr = (void *)0xbeefbabe;
+
+ semaphore_destroy(kernel_task, jnl->jsem);
+ kmem_free(kernel_map, (vm_offset_t)jnl, sizeof(struct journal));
+}
+
+static void
+dump_journal(journal *jnl)
+{
+ transaction *ctr;
+
+ printf("journal:");
+ printf(" jdev_offset %.8llx\n", jnl->jdev_offset);
+ printf(" magic: 0x%.8x\n", jnl->jhdr->magic);
+ printf(" start: 0x%.8llx\n", jnl->jhdr->start);
+ printf(" end: 0x%.8llx\n", jnl->jhdr->end);
+ printf(" size: 0x%.8llx\n", jnl->jhdr->size);
+ printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size);
+ printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size);
+ printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum);
+
+ printf(" completed transactions:\n");
+ for(ctr=jnl->completed_trs; ctr; ctr=ctr->next) {
+ printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end);
+ }
+}
+
+
+
+static off_t
+free_space(journal *jnl)
+{
+ off_t free_space;
+
+ if (jnl->jhdr->start < jnl->jhdr->end) {
+ free_space = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size;
+ } else if (jnl->jhdr->start > jnl->jhdr->end) {
+ free_space = jnl->jhdr->start - jnl->jhdr->end;
+ } else {
+ // journal is completely empty
+ free_space = jnl->jhdr->size - jnl->jhdr->jhdr_size;
+ }
+
+ return free_space;
+}
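+
+//
+// For example (illustrative numbers only): with an 8MB journal,
+// a 512-byte journal header block, start == 0x1000 and end == 0x3000,
+// the journal has not wrapped (start < end), so
+//
+//   free_space == 0x800000 - (0x3000 - 0x1000) - 0x200 == 0x7fde00
+//
+// If the journal has wrapped (start > end), the free space is just
+// the gap between end and start.
+//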
+
+
+//
+// The journal must be locked on entry to this function.
+// The "desired_size" is in bytes.
+//
+static int
+check_free_space(journal *jnl, int desired_size)
+{
+ int i, counter=0;
+
+ //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n",
+// desired_size, free_space(jnl));
+
+ while (1) {
+ if (counter++ == 5000) {
+ dump_journal(jnl);
+ panic("jnl: check_free_space: buffer flushing isn't working "
+ "(jnl @ 0x%x s %lld e %lld f %lld [active start %lld]).\n", jnl,
+ jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start);
+ }
+ if (counter > 7500) {
+ printf("jnl: check_free_space: giving up waiting for free space.\n");
+ return ENOSPC;
+ }
+
+ // make sure there's space in the journal to hold this transaction
+ if (free_space(jnl) > desired_size) {
+ break;
+ }
+
+ //
+ // here's where we lazily bump up jnl->jhdr->start. we'll consume
+ // entries until there is enough space for the next transaction.
+ //
+ simple_lock(&jnl->old_start_lock);
+ for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) {
+ int counter;
+
+ counter = 0;
+ while (jnl->old_start[i] & 0x8000000000000000LL) {
+ if (counter++ > 100) {
+ panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl 0x%x).\n",
+ jnl->old_start[i], jnl);
+ }
+
+ simple_unlock(&jnl->old_start_lock);
+ if (jnl->flush) {
+ jnl->flush(jnl->flush_arg);
+ }
+ tsleep((caddr_t)jnl, PRIBIO, "check_free_space1", 1);
+ simple_lock(&jnl->old_start_lock);
+ }
+
+ if (jnl->old_start[i] == 0) {
+ continue;
+ }
+
+ jnl->jhdr->start = jnl->old_start[i];
+ jnl->old_start[i] = 0;
+ if (free_space(jnl) > desired_size) {
+ write_journal_header(jnl);
+ break;
+ }
+ }
+ simple_unlock(&jnl->old_start_lock);
+
+ // if we bumped the start, loop and try again
+ if (i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) {
+ continue;
+ }
+
+
+ // if the file system gave us a flush function, call it so that
+ // it can flush some blocks which hopefully will cause some transactions
+ // to complete and thus free up space in the journal.
+ if (jnl->flush) {
+ jnl->flush(jnl->flush_arg);
+ }
+
+ // wait for a while to avoid being cpu-bound (this will
+ // put us to sleep for 10 milliseconds)
+ tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1);
+ }
+
+ return 0;
+}
+
+int
+journal_start_transaction(journal *jnl)
+{
+ int ret;
+ transaction *tr;
+
+ CHECK_JOURNAL(jnl);
+
+ if (jnl->flags & JOURNAL_INVALID) {
+ return EINVAL;
+ }
+
+ if (jnl->owner == current_act()) {
+ if (jnl->active_tr == NULL) {
+ panic("jnl: start_tr: active_tr is NULL (jnl @ 0x%x, owner 0x%x, current_act 0x%x\n",
+ jnl, jnl->owner, current_act());
+ }
+ jnl->nested_count++;
+ return 0;
+ }
+
+ while ((ret = semaphore_wait(jnl->jsem)) == KERN_ABORTED) {
+ // just keep looping if we've been ^C'ed
+ }
+ if (ret != 0) {
+ printf("jnl: start_tr: sem wait failed.\n");
+ return EINVAL;
+ }
+
+ if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) {
+ panic("jnl: start_tr: owner 0x%x, nested count 0x%x, active_tr 0x%x jnl @ 0x%x\n",
+ jnl->owner, jnl->nested_count, jnl->active_tr, jnl);
+ }
+
+ jnl->owner = current_act();
+ jnl->nested_count = 1;
+
+ free_old_stuff(jnl);
+
+ // make sure there's room in the journal
+ if (check_free_space(jnl, jnl->tbuffer_size) != 0) {
+ printf("jnl: start transaction failed: no space\n");
+ ret = ENOSPC;
+ goto bad_start;
+ }
+
+ // if there's a buffered transaction, use it.
+ if (jnl->cur_tr) {
+ jnl->active_tr = jnl->cur_tr;
+ jnl->cur_tr = NULL;
+
+ return 0;
+ }
+
+ if (kmem_alloc(kernel_map, (vm_offset_t *)&tr, sizeof(transaction))) {
+ printf("jnl: start transaction failed: no mem\n");
+ ret = ENOMEM;
+ goto bad_start;
+ }
+ memset(tr, 0, sizeof(transaction));
+
+ tr->tbuffer_size = jnl->tbuffer_size;
+ if (kmem_alloc(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) {
+ kmem_free(kernel_map, (vm_offset_t)tr, sizeof(transaction));
+ printf("jnl: start transaction failed: no tbuffer mem\n");
+ ret = ENOMEM;
+ goto bad_start;
+ }
+
+ // journal replay code checksum check depends on this.
+ memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE);
+
+ tr->blhdr = (block_list_header *)tr->tbuffer;
+ tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
+ tr->blhdr->num_blocks = 1; // accounts for this header block
+ tr->blhdr->bytes_used = jnl->jhdr->blhdr_size;
+
+ tr->num_blhdrs = 1;
+ tr->total_bytes = jnl->jhdr->blhdr_size;
+ tr->jnl = jnl;
+
+ jnl->active_tr = tr;
+
+ // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, tr);
+
+ return 0;
+
+ bad_start:
+ jnl->owner = NULL;
+ jnl->nested_count = 0;
+ semaphore_signal(jnl->jsem);
+ return ret;
+}
+
+
+int
+journal_modify_block_start(journal *jnl, struct buf *bp)
+{
+ transaction *tr;
+
+ CHECK_JOURNAL(jnl);
+
+ if (jnl->flags & JOURNAL_INVALID) {
+ return EINVAL;
+ }
+
+ // XXXdbg - for debugging I want this to be true. later it may
+ // not be necessary.
+ if ((bp->b_flags & B_META) == 0) {
+ panic("jnl: modify_block_start: bp @ 0x%x is not a meta-data block! (jnl 0x%x)\n", bp, jnl);
+ }
+
+ tr = jnl->active_tr;
+ CHECK_TRANSACTION(tr);
+
+ if (jnl->owner != current_act()) {
+ panic("jnl: modify_block_start: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+ jnl, jnl->owner, current_act());
+ }
+
+ free_old_stuff(jnl);
+
+ //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d; total bytes %d)\n",
+ // bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes);
+
+ // can't allow blocks that aren't an even multiple of the
+ // underlying block size.
+ if ((bp->b_bufsize % jnl->jhdr->jhdr_size) != 0) {
+ panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n",
+ bp->b_bufsize, jnl->jhdr->jhdr_size);
+ return -1;
+ }
+
+ // make sure that this transaction isn't bigger than the whole journal
+ if (tr->total_bytes+bp->b_bufsize >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) {
+ panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr 0x%x bp 0x%x)\n",
+ tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), bp->b_bufsize, tr, bp);
+ return -1;
+ }
+
+ // if the block is dirty and not already locked we have to write
+ // it out before we muck with it because it has data that belongs
+ // (presumably) to another transaction.
+ //
+ if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_LOCKED) == 0) {
+
+ // this will cause it to not be brelse()'d
+ bp->b_flags |= B_NORELSE;
+ VOP_BWRITE(bp);
+ }
+
+ bp->b_flags |= B_LOCKED;
+
+ return 0;
+}
+
+int
+journal_modify_block_abort(journal *jnl, struct buf *bp)
+{
+ transaction *tr;
+ block_list_header *blhdr;
+ int i;
+
+ CHECK_JOURNAL(jnl);
+
+ tr = jnl->active_tr;
+
+ //
+ // if there's no active transaction then we just want to
+ // call brelse() and return since this is just a block
+ // that happened to be modified as part of another tr.
+ //
+ if (tr == NULL) {
+ brelse(bp);
+ return 0;
+ }
+
+ if (jnl->flags & JOURNAL_INVALID) {
+ return EINVAL;
+ }
+
+ CHECK_TRANSACTION(tr);
+
+ if (jnl->owner != current_act()) {
+ panic("jnl: modify_block_abort: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+ jnl, jnl->owner, current_act());
+ }
+
+ free_old_stuff(jnl);
+
+ // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp);
+
+ // first check if it's already part of this transaction
+ for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
+ for(i=1; i < blhdr->num_blocks; i++) {
+ if (bp == blhdr->binfo[i].bp) {
+ if (bp->b_bufsize != blhdr->binfo[i].bsize) {
+ panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
+ bp, bp->b_bufsize, blhdr->binfo[i].bsize, jnl);
+ }
+ break;
+ }
+ }
+
+ if (i < blhdr->num_blocks) {
+ break;
+ }
+ }
+
+ //
+ // if blhdr is null, then this block has only had modify_block_start
+ // called on it as part of the current transaction. that means that
+ // it is ok to clear the LOCKED bit since it hasn't actually been
+ // modified. if blhdr is non-null then modify_block_end was called
+ // on it and so we need to keep it locked in memory.
+ //
+ if (blhdr == NULL) {
+ bp->b_flags &= ~(B_LOCKED);
+ }
+
+ brelse(bp);
+ return 0;
+}
+
+
+int
+journal_modify_block_end(journal *jnl, struct buf *bp)
+{
+ int i, tbuffer_offset;
+ char *blkptr;
+ block_list_header *blhdr, *prev=NULL;
+ transaction *tr;
+
+ CHECK_JOURNAL(jnl);
+
+ if (jnl->flags & JOURNAL_INVALID) {
+ return EINVAL;
+ }
+
+ tr = jnl->active_tr;
+ CHECK_TRANSACTION(tr);
+
+ if (jnl->owner != current_act()) {
+ panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+ jnl, jnl->owner, current_act());
+ }
+
+ free_old_stuff(jnl);
+
+ //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %d/%d bsz %d, total bytes %d)\n",
+ // bp, bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_bufsize, tr->total_bytes);
+
+ if ((bp->b_flags & B_LOCKED) == 0) {
+ panic("jnl: modify_block_end: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl);
+ bp->b_flags |= B_LOCKED;
+ }
+
+ // first check if it's already part of this transaction
+ for(blhdr=tr->blhdr; blhdr; prev=blhdr,blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
+ tbuffer_offset = jnl->jhdr->blhdr_size;
+
+ for(i=1; i < blhdr->num_blocks; i++) {
+ if (bp == blhdr->binfo[i].bp) {
+ if (bp->b_bufsize != blhdr->binfo[i].bsize) {
+ panic("jnl: bp @ 0x%x changed size on me! (%d vs. %d, jnl 0x%x)\n",
+ bp, bp->b_bufsize, blhdr->binfo[i].bsize, jnl);
+ }
+ break;
+ }
+ tbuffer_offset += blhdr->binfo[i].bsize;
+ }
+
+ if (i < blhdr->num_blocks) {
+ break;
+ }
+ }
+
+ if (blhdr == NULL
+ && prev
+ && (prev->num_blocks+1) <= prev->max_blocks
+ && (prev->bytes_used+bp->b_bufsize) <= tr->tbuffer_size) {
+ blhdr = prev;
+ } else if (blhdr == NULL) {
+ block_list_header *nblhdr;
+
+ if (prev == NULL) {
+ panic("jnl: modify block end: no way man, prev == NULL?!?, jnl 0x%x, bp 0x%x\n", jnl, bp);
+ }
+
+ // we got to the end of the list, didn't find the block and there's
+ // no room in the block_list_header pointed to by prev
+
+ // we allocate another tbuffer and link it in at the end of the list
+ // through prev->binfo[0].bnum. that's a skanky way to do things but
+ // avoids having yet another linked list of small data structures to manage.
+
+ if (kmem_alloc(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) {
+ panic("jnl: end_tr: no space for new block tr @ 0x%x (total bytes: %d)!\n",
+ tr, tr->total_bytes);
+ }
+
+ // journal replay code checksum check depends on this.
+ memset(nblhdr, 0, BLHDR_CHECKSUM_SIZE);
+
+ // initialize the new guy
+ nblhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
+ nblhdr->num_blocks = 1; // accounts for this header block
+ nblhdr->bytes_used = jnl->jhdr->blhdr_size;
+
+ tr->num_blhdrs++;
+ tr->total_bytes += jnl->jhdr->blhdr_size;
+
+ // then link him in at the end
+ prev->binfo[0].bnum = (off_t)((long)nblhdr);
+
+ // and finally switch to using the new guy
+ blhdr = nblhdr;
+ tbuffer_offset = jnl->jhdr->blhdr_size;
+ i = 1;
+ }
+
+
+ if ((i+1) > blhdr->max_blocks) {
+ panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks);
+ }
+
+ // copy the data into the in-memory transaction buffer
+ blkptr = (char *)&((char *)blhdr)[tbuffer_offset];
+ memcpy(blkptr, bp->b_data, bp->b_bufsize);
+
+ // if this is true then this is a new block we haven't seen
+ if (i >= blhdr->num_blocks) {
+ vget(bp->b_vp, 0, current_proc());
+
+ blhdr->binfo[i].bnum = bp->b_blkno;
+ blhdr->binfo[i].bsize = bp->b_bufsize;
+ blhdr->binfo[i].bp = bp;
+
+ blhdr->bytes_used += bp->b_bufsize;
+ tr->total_bytes += bp->b_bufsize;
+
+ blhdr->num_blocks++;
+ }
+
+ bdwrite(bp);
+
+ return 0;
+}
+
+int
+journal_kill_block(journal *jnl, struct buf *bp)
+{
+ int i;
+ block_list_header *blhdr;
+ transaction *tr;
+
+ CHECK_JOURNAL(jnl);
+
+ if (jnl->flags & JOURNAL_INVALID) {
+ return EINVAL;
+ }
+
+ tr = jnl->active_tr;
+ CHECK_TRANSACTION(tr);
+
+ if (jnl->owner != current_act()) {
+ panic("jnl: modify_block_end: called w/out a transaction! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+ jnl, jnl->owner, current_act());
+ }
+
+ free_old_stuff(jnl);
+
+ if ((bp->b_flags & B_LOCKED) == 0) {
+ panic("jnl: kill block: bp 0x%x not locked! jnl @ 0x%x\n", bp, jnl);
+ }
+
+ // first check if it's already part of this transaction
+ for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
+
+ for(i=1; i < blhdr->num_blocks; i++) {
+ if (bp == blhdr->binfo[i].bp) {
+ bp->b_flags &= ~B_LOCKED;
+
+ // this undoes the vget() in journal_modify_block_end()
+ vrele(bp->b_vp);
+
+ // if the block has the DELWRI and CALL bits set, then
+ // things are seriously weird. if it was part of another
+ // transaction then journal_modify_block_start() should
+ // have forced it to be written.
+ //
+ if ((bp->b_flags & B_DELWRI) && (bp->b_flags & B_CALL)) {
+ panic("jnl: kill block: this defies all logic! bp 0x%x\n", bp);
+ } else {
+ tr->num_killed += bp->b_bufsize;
+ }
+
+ if (bp->b_flags & B_BUSY) {
+ brelse(bp);
+ }
+
+ blhdr->binfo[i].bp = NULL;
+ blhdr->binfo[i].bnum = (off_t)-1;
+ break;
+ }
+ }
+
+ if (i < blhdr->num_blocks) {
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+static int
+journal_binfo_cmp(void *a, void *b)
+{
+ block_info *bi_a = (struct block_info *)a,
+ *bi_b = (struct block_info *)b;
+ daddr_t res;
+
+ if (bi_a->bp == NULL) {
+ return 1;
+ }
+ if (bi_b->bp == NULL) {
+ return -1;
+ }
+
+ // don't have to worry about negative block
+ // numbers so this is ok to do.
+ //
+ res = (bi_a->bp->b_blkno - bi_b->bp->b_blkno);
+
+ return (int)res;
+}
+
+
+static int
+end_transaction(transaction *tr, int force_it)
+{
+ int i, ret, amt;
+ off_t end;
+ journal *jnl = tr->jnl;
+ struct buf *bp;
+ block_list_header *blhdr=NULL, *next=NULL;
+
+ if (jnl->cur_tr) {
+ panic("jnl: jnl @ 0x%x already has cur_tr 0x%x, new tr: 0x%x\n",
+ jnl, jnl->cur_tr, tr);
+ }
+
+ // if there weren't any modified blocks in the transaction
+ // just save off the transaction pointer and return.
+ if (tr->total_bytes == jnl->jhdr->blhdr_size) {
+ jnl->cur_tr = tr;
+ return 0;
+ }
+
+ // if our transaction buffer isn't very full, just hang
+ // on to it and don't actually flush anything. this is
+ // what is known as "group commit". we will flush the
+ // transaction buffer if it's full or if we have more than
+ // one of them so we don't start hogging too much memory.
+ //
+ if ( force_it == 0
+ && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
+ && tr->num_blhdrs < 3
+ && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))) {
+
+ jnl->cur_tr = tr;
+ return 0;
+ }
+
+
+ // if we're here we're going to flush the transaction buffer to disk.
+ // make sure there is room in the journal first.
+ check_free_space(jnl, tr->total_bytes);
+
+ // range check the end index
+ if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
+ panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
+ jnl->jhdr->end, jnl->jhdr->size);
+ }
+
+ // this transaction starts where the current journal ends
+ tr->journal_start = jnl->jhdr->end;
+ end = jnl->jhdr->end;
+
+ //
+ // if the first entry in old_start[] isn't free yet, loop calling the
+ // file system flush routine until it is (or we panic).
+ //
+ i = 0;
+ simple_lock(&jnl->old_start_lock);
+ while ((jnl->old_start[0] & 0x8000000000000000LL) != 0) {
+ if (jnl->flush) {
+ simple_unlock(&jnl->old_start_lock);
+
+ if (jnl->flush) {
+ jnl->flush(jnl->flush_arg);
+ }
+
+ // yield the cpu so others can get in to clear the lock bit
+ (void)tsleep((void *)jnl, PRIBIO, "jnl-old-start-sleep", 1);
+
+ simple_lock(&jnl->old_start_lock);
+ }
+ if (i++ >= 100) {
+ panic("jnl: transaction that started at 0x%llx is not completing! jnl 0x%x\n",
+ jnl->old_start[0] & (~0x8000000000000000LL), jnl);
+ }
+ }
+
+ //
+ // slide everyone else down and put our latest guy in the last
+ // entry in the old_start array
+ //
+ memcpy(&jnl->old_start[0], &jnl->old_start[1], sizeof(jnl->old_start)-sizeof(jnl->old_start[0]));
+ jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL;
+
+ simple_unlock(&jnl->old_start_lock);
+
+
+ // for each block, make sure that the physical block # is set
+ for(blhdr=tr->blhdr; blhdr; blhdr=next) {
+
+ for(i=1; i < blhdr->num_blocks; i++) {
+
+ bp = blhdr->binfo[i].bp;
+ if (bp == NULL) { // only true if a block was "killed"
+ if (blhdr->binfo[i].bnum != (off_t)-1) {
+ panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ 0x%x, tr 0x%x)\n",
+ blhdr->binfo[i].bnum, jnl, tr);
+ }
+ continue;
+ }
+
+ if (bp->b_vp == NULL && bp->b_lblkno == bp->b_blkno) {
+ panic("jnl: end_tr: DANGER! bp @ 0x%x w/null vp and l/blkno = %d/%d\n",
+ bp, bp->b_lblkno, bp->b_blkno);
+ }
+
+ // if the lblkno is the same as blkno and this bp isn't
+ // associated with the underlying file system device then
+ // we need to call bmap() to get the actual physical block.
+ //
+ if ((bp->b_lblkno == bp->b_blkno) && (bp->b_vp != jnl->fsdev)) {
+ if (VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL) != 0) {
+ printf("jnl: end_tr: can't bmap the bp @ 0x%x, jnl 0x%x\n", bp, jnl);
+ goto bad_journal;
+ }
+ }
+
+ // update this so we write out the correct physical block number!
+ blhdr->binfo[i].bnum = bp->b_blkno;
+ }
+
+ next = (block_list_header *)((long)blhdr->binfo[0].bnum);
+ }
+
+ for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) {
+
+ amt = blhdr->bytes_used;
+
+ blhdr->checksum = 0;
+ blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE);
+
+ ret = write_journal_data(jnl, &end, blhdr, amt);
+ if (ret != amt) {
+ printf("jnl: end_transaction: only wrote %d of %d bytes to the journal!\n",
+ ret, amt);
+
+ goto bad_journal;
+ }
+ }
+
+ jnl->jhdr->end = end; // update where the journal now ends
+ tr->journal_end = end; // the transaction ends here too
+ if (tr->journal_start == 0 || tr->journal_end == 0) {
+ panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
+ tr->journal_start, tr->journal_end);
+ }
+
+ if (write_journal_header(jnl) != 0) {
+ goto bad_journal;
+ }
+
+ //
+ // setup for looping through all the blhdr's. we null out the
+ // tbuffer and blhdr fields so that they're not used any more.
+ //
+ blhdr = tr->blhdr;
+ tr->tbuffer = NULL;
+ tr->blhdr = NULL;
+
+ // the buffer_flushed_callback will only be called for the
+ // real blocks that get flushed so we have to account for
+ // the block_list_headers here.
+ //
+ tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;
+
+ // for each block, set the iodone callback and unlock it
+ for(; blhdr; blhdr=next) {
+
+ // we can re-order the buf ptrs because everything is written out already
+ qsort(&blhdr->binfo[1], blhdr->num_blocks-1, sizeof(block_info), journal_binfo_cmp);
+
+ for(i=1; i < blhdr->num_blocks; i++) {
+ if (blhdr->binfo[i].bp == NULL) {
+ continue;
+ }
+
+ ret = meta_bread(blhdr->binfo[i].bp->b_vp,
+ (daddr_t)blhdr->binfo[i].bp->b_lblkno,
+ blhdr->binfo[i].bp->b_bufsize,
+ NOCRED,
+ &bp);
+ if (ret == 0 && bp != NULL) {
+ struct vnode *save_vp;
+
+ if (bp != blhdr->binfo[i].bp) {
+ panic("jnl: end_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
+ bp, blhdr->binfo[i].bp, jnl);
+ }
+
+ if ((bp->b_flags & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) {
+ if (jnl->flags & JOURNAL_CLOSE_PENDING) {
+ brelse(bp);
+ continue;
+ } else {
+ panic("jnl: end_tr: !!!DANGER!!! bp 0x%x flags (0x%x) not LOCKED & DELWRI\n", bp, bp->b_flags);
+ }
+ }
+
+ if (bp->b_iodone != NULL) {
+ panic("jnl: bp @ 0x%x (blkno %d, vp 0x%x) has non-null iodone (0x%x) buffflushcb 0x%x\n",
+ bp, bp->b_blkno, bp->b_vp, bp->b_iodone, buffer_flushed_callback);
+ }
+
+ save_vp = bp->b_vp;
+
+ bp->b_iodone = buffer_flushed_callback;
+ bp->b_transaction = tr;
+ bp->b_flags |= B_CALL;
+ bp->b_flags &= ~(B_LOCKED);
+
+ // kicking off the write here helps performance
+ bawrite(bp);
+ // XXXdbg this is good for testing: bdwrite(bp);
+ //bdwrite(bp);
+
+ // this undoes the vget() in journal_modify_block_end()
+ vrele(save_vp);
+
+ } else {
+ printf("jnl: end_transaction: could not find block %Ld vp 0x%x!\n",
+ blhdr->binfo[i].bnum, blhdr->binfo[i].bp);
+ }
+ }
+
+ next = (block_list_header *)((long)blhdr->binfo[0].bnum);
+
+ // we can free blhdr here since we won't need it any more
+ blhdr->binfo[0].bnum = 0xdeadc0de;
+ kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
+ }
+
+ //printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
+ // tr, tr->journal_start, tr->journal_end);
+ return 0;
+
+
+ bad_journal:
+ jnl->flags |= JOURNAL_INVALID;
+ abort_transaction(jnl, tr);
+ return -1;
+}
+
+static void
+abort_transaction(journal *jnl, transaction *tr)
+{
+ int i, ret;
+ block_list_header *blhdr, *next;
+ struct buf *bp;
+
+ // for each block list header, iterate over the blocks then
+ // free up the memory associated with the block list.
+ //
+ // for each block, clear the lock bit and release it.
+ //
+ for(blhdr=tr->blhdr; blhdr; blhdr=next) {
+
+ for(i=1; i < blhdr->num_blocks; i++) {
+ if (blhdr->binfo[i].bp == NULL) {
+ continue;
+ }
+
+ ret = meta_bread(blhdr->binfo[i].bp->b_vp,
+ (daddr_t)blhdr->binfo[i].bp->b_lblkno,
+ blhdr->binfo[i].bp->b_bufsize,
+ NOCRED,
+ &bp);
+ if (ret == 0 && bp != NULL) {
+ if (bp != blhdr->binfo[i].bp) {
+ panic("jnl: abort_tr: got back a different bp! (bp 0x%x should be 0x%x, jnl 0x%x\n",
+ bp, blhdr->binfo[i].bp, jnl);
+ }
+
+ // clear the locked bit and the delayed-write bit. we
+ // don't want these blocks going to disk.
+ bp->b_flags &= ~(B_LOCKED|B_DELWRI);
+ bp->b_flags |= B_INVAL;
+
+ brelse(bp);
+
+ } else {
+ printf("jnl: abort_tr: could not find block %Ld vp 0x%x!\n",
+ blhdr->binfo[i].bnum, blhdr->binfo[i].bp);
+ }
+ }
+
+ next = (block_list_header *)((long)blhdr->binfo[0].bnum);
+
+ // we can free blhdr here since we won't need it any more
+ blhdr->binfo[0].bnum = 0xdeadc0de;
+ kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
+ }
+
+ tr->tbuffer = NULL;
+ tr->blhdr = NULL;
+ tr->total_bytes = 0xdbadc0de;
+ kmem_free(kernel_map, (vm_offset_t)tr, sizeof(transaction));
+}
+
+
+int
+journal_end_transaction(journal *jnl)
+{
+ int ret;
+ transaction *tr;
+
+ CHECK_JOURNAL(jnl);
+
+ if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) {
+ return 0;
+ }
+
+ if (jnl->owner != current_act()) {
+ panic("jnl: end_tr: I'm not the owner! jnl 0x%x, owner 0x%x, curact 0x%x\n",
+ jnl, jnl->owner, current_act());
+ }
+
+ free_old_stuff(jnl);
+
+ jnl->nested_count--;
+ if (jnl->nested_count > 0) {
+ return 0;
+ } else if (jnl->nested_count < 0) {
+ panic("jnl: jnl @ 0x%x has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count);
+ }
+
+ if (jnl->flags & JOURNAL_INVALID) {
+ if (jnl->active_tr) {
+ transaction *tr;
+
+ if (jnl->cur_tr != NULL) {
+ panic("jnl: journal @ 0x%x has active tr (0x%x) and cur tr (0x%x)\n",
+ jnl, jnl->active_tr, jnl->cur_tr);
+ }
+
+ tr = jnl->active_tr;
+ jnl->active_tr = NULL;
+ abort_transaction(jnl, tr);
+ }
+
+ jnl->owner = NULL;
+ semaphore_signal(jnl->jsem);
+
+ return EINVAL;
+ }
+
+ tr = jnl->active_tr;
+ CHECK_TRANSACTION(tr);
+
+ // clear this out here so that when check_free_space() calls
+ // the FS flush function, we don't panic in journal_flush()
+ // if the FS were to call that. note: check_free_space() is
+ // called from end_transaction().
+ //
+ jnl->active_tr = NULL;
+ ret = end_transaction(tr, 0);
+
+ jnl->owner = NULL;
+ semaphore_signal(jnl->jsem);
+
+ return ret;
+}
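+
+//
+// Typical usage (illustrative sketch only; my_fs_update_block and
+// its arguments are hypothetical, not part of this api):
+//
+//   int
+//   my_fs_update_block(journal *jnl, struct vnode *devvp, daddr_t blkno, int bsize)
+//   {
+//       struct buf *bp;
+//       int ret;
+//
+//       if ((ret = journal_start_transaction(jnl)) != 0) {
+//           return ret;
+//       }
+//
+//       // read the meta-data block and tell the journal we will modify it
+//       ret = meta_bread(devvp, blkno, bsize, NOCRED, &bp);
+//       if (ret == 0) {
+//           journal_modify_block_start(jnl, bp);
+//
+//           // ... change bp->b_data here ...
+//
+//           // hand the modified block back to the journal (it bdwrite()'s it)
+//           journal_modify_block_end(jnl, bp);
+//       }
+//
+//       // commits (or group-commits) everything modified since the start
+//       journal_end_transaction(jnl);
+//       return ret;
+//   }
+//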
+
+
+int
+journal_flush(journal *jnl)
+{
+ int need_signal = 0;
+
+ CHECK_JOURNAL(jnl);
+
+ if (jnl->flags & JOURNAL_INVALID) {
+ return -1;
+ }
+
+ if (jnl->owner != current_act()) {
+ int ret;
+
+ while ((ret = semaphore_wait(jnl->jsem)) == KERN_ABORTED) {
+ // just keep looping if we've been ^C'ed
+ }
+ if (ret != 0) {
+ printf("jnl: flush: sem wait failed.\n");
+ return -1;
+ }
+ need_signal = 1;
+ }
+
+ free_old_stuff(jnl);
+
+ // if we're not active, flush any buffered transactions
+ if (jnl->active_tr == NULL && jnl->cur_tr) {
+ transaction *tr = jnl->cur_tr;
+
+ jnl->cur_tr = NULL;
+ end_transaction(tr, 1); // force it to get flushed
+ }
+
+ if (need_signal) {
+ semaphore_signal(jnl->jsem);
+ }
+
+ return 0;
+}
+
+int
+journal_active(journal *jnl)
+{
+ if (jnl->flags & JOURNAL_INVALID) {
+ return -1;
+ }
+
+ return (jnl->active_tr == NULL) ? 0 : 1;
+}
--- /dev/null
+
+/*
+ * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/*
+ * This header contains the structures and function prototypes
+ * for the vfs journaling code. The data types are not meant
+ * to be modified by user code. Just use the functions and do
+ * not mess around with the structs.
+ */
+#ifndef _SYS_VFS_JOURNAL_H_
+#define _SYS_VFS_JOURNAL_H_
+
+#include <sys/appleapiopts.h>
+
+#ifdef __APPLE_API_UNSTABLE
+
+#include <sys/types.h>
+
+typedef struct block_info {
+ off_t bnum; // block # on the file system device
+ size_t bsize; // in bytes
+ struct buf *bp;
+} block_info;
+
+typedef struct block_list_header {
+ u_int16_t max_blocks; // max number of blocks in this chunk
+ u_int16_t num_blocks; // number of valid entries in binfo[]
+ int32_t bytes_used; // how many bytes of this tbuffer are used
+ int32_t checksum; // on-disk: checksum of this header and binfo[0]
+ int32_t pad; // pad out to 16 bytes
+ block_info binfo[1]; // so we can reference them by name
+} block_list_header;
+
+
+struct journal;
+
+typedef struct transaction {
+ int tbuffer_size; // in bytes
+ char *tbuffer; // memory copy of the transaction
+ block_list_header *blhdr; // points to the first byte of tbuffer
+ int num_blhdrs; // how many buffers we've allocated
+ int total_bytes; // total # of bytes in transaction
+ int num_flushed; // how many bytes have been flushed
+ int num_killed; // how many bytes were "killed"
+ off_t journal_start; // where in the journal this transaction starts
+ off_t journal_end; // where in the journal this transaction ends
+ struct journal *jnl; // ptr back to the journal structure
+ struct transaction *next; // list of tr's (either completed or to be free'd)
+} transaction;
+
+
+/*
+ * This is written to block zero of the journal and it
+ * maintains overall state about the journal.
+ */
+typedef struct journal_header {
+ int32_t magic;
+ int32_t endian;
+ volatile off_t start; // zero-based byte offset of the start of the first transaction
+ volatile off_t end; // zero-based byte offset of where free space begins
+ off_t size; // size in bytes of the entire journal
+ int32_t blhdr_size; // size in bytes of each block_list_header in the journal
+ int32_t checksum;
+ int32_t jhdr_size; // block size (in bytes) of the journal header
+} journal_header;
+
+#define JOURNAL_HEADER_MAGIC 0x4a4e4c78 // 'JNLx'
+#define ENDIAN_MAGIC 0x12345678
+
+#define OLD_JOURNAL_HEADER_MAGIC 0x4a484452 // 'JHDR'
+
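A minimal sketch of how these constants might be used to sanity-check an on-disk header (the real validation lives in journal_open() in vfs_journal.c; the helper name and the handling of the old magic below are illustrative assumptions only):

    /* Illustrative only: reject a header whose magic or endian field is unexpected. */
    static int
    jhdr_looks_valid(journal_header *jhdr)
    {
            if (jhdr->magic != JOURNAL_HEADER_MAGIC && jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) {
                    return 0;       /* not a journal header at all */
            }
            if (jhdr->endian != ENDIAN_MAGIC) {
                    return 0;       /* written with the opposite byte order */
            }
            return 1;
    }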
+
+/*
+ * In memory structure about the journal.
+ */
+typedef struct journal {
+ struct vnode *jdev; // vnode of the device where the journal lives
+ off_t jdev_offset; // byte offset to the start of the journal
+
+ struct vnode *fsdev; // vnode of the file system device
+
+ void (*flush)(void *arg); // fs callback to flush meta data blocks
+ void *flush_arg; // arg that's passed to flush()
+
+ int32_t flags;
+ int32_t tbuffer_size; // default transaction buffer size
+
+ char *header_buf; // in-memory copy of the journal header
+ journal_header *jhdr; // points to the first byte of header_buf
+
+ transaction *cur_tr; // for group-commit
+ transaction *completed_trs; // out-of-order transactions that completed
+ transaction *active_tr; // for nested transactions
+ int32_t nested_count; // for nested transactions
+ void *owner; // a ptr that's unique to the calling process
+
+ transaction *tr_freeme; // transaction structs that need to be free'd
+
+ volatile off_t active_start; // the active start that we only keep in memory
+ simple_lock_data_t old_start_lock; // guard access
+ volatile off_t old_start[16]; // this is how we do lazy start update
+
+ semaphore_t jsem;
+} journal;
+
+/* internal-only journal flags (top 16 bits) */
+#define JOURNAL_CLOSE_PENDING 0x00010000
+#define JOURNAL_INVALID 0x00020000
+
+/* journal_open/create options are always in the low-16 bits */
+#define JOURNAL_OPTION_FLAGS_MASK 0x0000ffff
+
+/*
+ * Prototypes.
+ */
+
+/*
+ * Call journal_create() to create a new journal. You only
+ * call this once, typically at file system creation time.
+ *
+ * The "jvp" argument is the vnode where the journal is written.
+ * The journal starts at "offset" and is "journal_size" bytes long.
+ *
+ * The "fsvp" argument is the vnode of your file system. It may be
+ * the same as "jvp".
+ *
+ * The "min_fs_block_size" argument is the minimum block size
+ * (in bytes) that the file system will ever write. Typically
+ * this is the block size of the file system (1k, 4k, etc) but
+ * on HFS+ it is the minimum block size of the underlying device.
+ *
+ * The flags argument lets you disable group commit if you
+ * want tighter guarantees on transactions (in exchange for
+ * lower performance).
+ *
+ * The tbuffer_size is the size of the transaction buffer
+ * used by the journal. If you specify zero, the journal code
+ * will use a reasonable default. The tbuffer_size should
+ * be an integer multiple of the min_fs_block_size.
+ *
+ * Returns a valid journal pointer or NULL if one could not
+ * be created.
+ */
+journal *journal_create(struct vnode *jvp,
+ off_t offset,
+ off_t journal_size,
+ struct vnode *fsvp,
+ size_t min_fs_block_size,
+ int32_t flags,
+ int32_t tbuffer_size,
+ void (*flush)(void *arg),
+ void *arg);
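As an illustrative sketch only (the vnodes, the offset, the 8 MB size, the hfsmp pointer, and the hfs_phys_block_size field are placeholders for this example, not something this header prescribes), a file system creating its journal might call:

    journal *jnl;

    jnl = journal_create(jvp,                        /* device holding the journal */
                         jnl_offset,                 /* byte offset of the journal on jvp */
                         8 * 1024 * 1024,            /* journal size in bytes */
                         devvp,                      /* the file system's device */
                         hfsmp->hfs_phys_block_size, /* smallest block the fs ever writes */
                         0,                          /* flags: leave group commit enabled */
                         0,                          /* tbuffer_size: 0 picks the default */
                         hfs_sync_metadata,          /* flush callback (assumed here) */
                         (void *)hfsmp);             /* argument handed back to the callback */
    if (jnl == NULL) {
            /* creation failed; mount without journaling or fail the operation */
    }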
+
+/*
+ * Call journal_open() when mounting an existing file system
+ * that has a previously created journal. It will take care
+ * of validating the journal and replaying it if necessary.
+ *
+ * See journal_create() for a description of the arguments.
+ *
+ * Returns a valid journal pointer or NULL if it runs into
+ * trouble reading/playing back the journal.
+ */
+journal *journal_open(struct vnode *jvp,
+ off_t offset,
+ off_t journal_size,
+ struct vnode *fsvp,
+ size_t min_fs_block_size,
+ int32_t flags,
+ int32_t tbuffer_size,
+ void (*flush)(void *arg),
+ void *arg);
+
+/*
+ * Call journal_close() just before your file system is unmounted.
+ * It flushes any outstanding transactions and makes sure the
+ * journal is in a consistent state.
+ */
+void journal_close(journal *journal);
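For completeness, a hedged sketch of the mount/unmount lifecycle under the same assumptions as the creation example above: journal_open() validates and replays the existing journal at mount time, and journal_close() is the last journal call made before the file system goes away.

    /* at mount time */
    hfsmp->jnl = journal_open(hfsmp->jvp, jnl_offset, jnl_size,
                              devvp, hfsmp->hfs_phys_block_size,
                              0, 0, hfs_sync_metadata, (void *)hfsmp);
    if (hfsmp->jnl == NULL) {
            /* journal could not be read or replayed; refuse the mount or mount read-only */
    }

    /* at unmount time, after the last transaction has ended */
    journal_close(hfsmp->jnl);
    hfsmp->jnl = NULL;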
+
+/*
+ * Flags for journal_create/open. Only the low 16 bits
+ * may be used for these flags because the internal
+ * bits go in the high 16.
+ */
+#define JOURNAL_NO_GROUP_COMMIT 0x00000001
+#define JOURNAL_RESET 0x00000002
+
+/*
+ * Transaction related functions.
+ *
+ * Before you start modifying file system meta data, you
+ * should call journal_start_transaction(). Then before
+ * you modify each block, call journal_modify_block_start()
+ * and when you're done, journal_modify_block_end(). When
+ * you've modified the last block as part of a transaction,
+ * call journal_end_transaction() to commit the changes.
+ *
+ * If you decide to abort the modifications to a block you
+ * should call journal_modify_block_abort().
+ *
+ * If as part of a transaction you need to throw out
+ * any previous copies of a block (because it got deleted)
+ * then call journal_kill_block(). This will mark it so
+ * that the journal does not play it back (effectively
+ * dropping it).
+ */
+int journal_start_transaction(journal *jnl);
+int journal_modify_block_start(journal *jnl, struct buf *bp);
+int journal_modify_block_abort(journal *jnl, struct buf *bp);
+int journal_modify_block_end(journal *jnl, struct buf *bp);
+int journal_kill_block(journal *jnl, struct buf *bp);
+int journal_end_transaction(journal *jnl);
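A condensed sketch of the protocol described above, with error handling simplified (the buffer bp and the metadata change are placeholders; real callers check the return value of every journal_* call):

    if (journal_start_transaction(jnl) != 0) {
            return EINVAL;                  /* journal is invalid or could not be locked */
    }

    journal_modify_block_start(jnl, bp);    /* declare intent to modify this buffer */
    /* ... change the metadata held in bp->b_data ... */
    journal_modify_block_end(jnl, bp);      /* hand the modified buffer to the journal */

    /* if the block had instead been freed, journal_kill_block(jnl, bp) would drop it */

    if (journal_end_transaction(jnl) != 0) {
            /* commit failed; treat the journal as suspect */
    }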
+
+int journal_active(journal *jnl);
+int journal_flush(journal *jnl);
+
+#endif /* __APPLE_API_UNSTABLE */
+#endif /* !_SYS_VFS_JOURNAL_H_ */
if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) {
return (error);
}
- if (vp->v_dirtyblkhd.lh_first)
- panic("vinvalbuf: dirty bufs");
+
+ // XXXdbg - if there are dirty bufs, wait for 'em if they're busy
+ for (bp=vp->v_dirtyblkhd.lh_first; bp; bp=nbp) {
+ nbp = bp->b_vnbufs.le_next;
+ if (ISSET(bp->b_flags, B_BUSY)) {
+ SET(bp->b_flags, B_WANTED);
+ tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), "vinvalbuf", 0);
+ nbp = vp->v_dirtyblkhd.lh_first;
+ } else {
+ panic("vinvalbuf: dirty buf (vp 0x%x, bp 0x%x)", vp, bp);
+ }
+ }
}
for (;;) {
- if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA)
+ if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
while (blist && blist->b_lblkno < 0)
blist = blist->b_vnbufs.le_next;
if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
for (bp = blist; bp; bp = nbp) {
nbp = bp->b_vnbufs.le_next;
- if (flags & V_SAVEMETA && bp->b_lblkno < 0)
+ if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
continue;
s = splbio();
if (ISSET(bp->b_flags, B_BUSY)) {
(void) VOP_BWRITE(bp);
break;
}
- SET(bp->b_flags, B_INVAL);
+
+ if (bp->b_flags & B_LOCKED) {
+ panic("vinvalbuf: bp @ 0x%x is locked!\n", bp);
+ break;
+ } else {
+ SET(bp->b_flags, B_INVAL);
+ }
brelse(bp);
}
}
#define kIOCommandPoolSizeKey "IOCommandPoolSize" // (OSNumber)
// properties found in services that have transfer constraints
-#define kIOMaximumBlockCountReadKey "IOMaximumBlockCountRead" // (OSNumber)
-#define kIOMaximumBlockCountWriteKey "IOMaximumBlockCountWrite" // (OSNumber)
-#define kIOMaximumSegmentCountReadKey "IOMaximumSegmentCountRead" // (OSNumber)
-#define kIOMaximumSegmentCountWriteKey "IOMaximumSegmentCountWrite" // (OSNumber)
+#define kIOMaximumBlockCountReadKey "IOMaximumBlockCountRead" // (OSNumber)
+#define kIOMaximumBlockCountWriteKey "IOMaximumBlockCountWrite" // (OSNumber)
+#define kIOMaximumByteCountReadKey "IOMaximumByteCountRead" // (OSNumber)
+#define kIOMaximumByteCountWriteKey "IOMaximumByteCountWrite" // (OSNumber)
+#define kIOMaximumSegmentCountReadKey "IOMaximumSegmentCountRead" // (OSNumber)
+#define kIOMaximumSegmentCountWriteKey "IOMaximumSegmentCountWrite" // (OSNumber)
+#define kIOMaximumSegmentByteCountReadKey "IOMaximumSegmentByteCountRead" // (OSNumber)
+#define kIOMaximumSegmentByteCountWriteKey "IOMaximumSegmentByteCountWrite" // (OSNumber)
// properties found in services that wish to describe an icon
//
*/
const char * gIOKernelKmods =
"{
- 'com.apple.kernel' = '6.1';
- 'com.apple.kernel.bsd' = '6.1';
- 'com.apple.kernel.iokit' = '6.1';
- 'com.apple.kernel.libkern' = '6.1';
- 'com.apple.kernel.mach' = '6.1';
- 'com.apple.iokit.IOADBFamily' = '1.1';
- 'com.apple.iokit.IONVRAMFamily' = '1.1';
- 'com.apple.iokit.IOSystemManagementFamily' = '1.1';
- 'com.apple.iokit.ApplePlatformFamily' = '1.0';
- 'com.apple.driver.AppleNMI' = '1.0';
+ 'com.apple.kernel' = '6.2';
+ 'com.apple.kernel.bsd' = '6.2';
+ 'com.apple.kernel.iokit' = '6.2';
+ 'com.apple.kernel.libkern' = '6.2';
+ 'com.apple.kernel.mach' = '6.2';
+ 'com.apple.iokit.IOADBFamily' = '6.2';
+ 'com.apple.iokit.IONVRAMFamily' = '6.2';
+ 'com.apple.iokit.IOSystemManagementFamily' = '6.2';
+ 'com.apple.iokit.ApplePlatformFamily' = '6.2';
+ 'com.apple.driver.AppleNMI' = '6.2';
}";
*/
+/*
+ * copy 'size' bytes from physical to physical address
+ * the caller must validate the physical ranges
+ *
+ * if flush_action == 0, no cache flush necessary
+ * if flush_action == 1, flush the source
+ * if flush_action == 2, flush the dest
+ * if flush_action == 3, flush both source and dest
+ */
+
+kern_return_t copyp2p(vm_offset_t source, vm_offset_t dest, unsigned int size, unsigned int flush_action) {
+
+ switch(flush_action) {
+ case 1:
+ flush_dcache(source, size, 1);
+ break;
+ case 2:
+ flush_dcache(dest, size, 1);
+ break;
+ case 3:
+ flush_dcache(source, size, 1);
+ flush_dcache(dest, size, 1);
+ break;
+
+ }
+ bcopy_phys((char *)source, (char *)dest, size); /* Do a physical copy */
+
+ switch(flush_action) {
+ case 1:
+ flush_dcache(source, size, 1);
+ break;
+ case 2:
+ flush_dcache(dest, size, 1);
+ break;
+ case 3:
+ flush_dcache(source, size, 1);
+ flush_dcache(dest, size, 1);
+ break;
+
+ }
+
+	return KERN_SUCCESS;		/* no failure paths above */
+}
+
+
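A short usage sketch under the flush_action convention spelled out above (the physical addresses are placeholders; callers are responsible for validating the ranges first):

    /* copy one page between two already-validated physical addresses,
     * flushing the data cache for both the source and the destination */
    copyp2p(src_paddr, dst_paddr, PAGE_SIZE, 3);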
/*
* Copies data from a physical page to a virtual page. This is used to
rlwinm. r0,r8,0,MSR_PR_BIT,MSR_PR_BIT ; See if we are doing this for user state
stw r8,savesrr1(r25) ; Set the msr of the interrupted guy
xor r3,r25,r5 ; Get the real address of the savearea
- bne- fsnuser ; We are not user state...
+ beq- fsnuser ; We are not user state...
stw r10,ACT_MACT_SPF(r17) ; Set the activation copy
stw r10,spcFlags(r26) ; Set per_proc copy
rlwinm. r0,r8,0,MSR_PR_BIT,MSR_PR_BIT ; See if we are doing this for user state
stw r8,savesrr1(r25) ; Set the msr of the interrupted guy
xor r3,r25,r5 ; Get the real address of the savearea
- bne- vrnuser ; We are not user state...
+ beq- vrnuser ; We are not user state...
stw r10,ACT_MACT_SPF(r17) ; Set the activation copy
stw r10,spcFlags(r26) ; Set per_proc copy
#endif
vm_map_t mapping_map = VM_MAP_NULL;
+#define MAPPING_MAP_SIZE 33554432 /* 32MB address space */
unsigned int incrVSID = 0; /* VSID increment value */
unsigned int mappingdeb0 = 0;
mappingblok *mbn;
vm_offset_t mapping_min;
- retr = kmem_suballoc(kernel_map, &mapping_min, mem_size / 16,
+ retr = kmem_suballoc(kernel_map, &mapping_min, MAPPING_MAP_SIZE,
FALSE, TRUE, &mapping_map);
if (retr != KERN_SUCCESS)
}
+/*
+ * copy 'size' bytes from physical to physical address
+ * the caller must validate the physical ranges
+ *
+ * if flush_action == 0, no cache flush necessary
+ * if flush_action == 1, flush the source
+ * if flush_action == 2, flush the dest
+ * if flush_action == 3, flush both source and dest
+ */
+
+kern_return_t copyp2p(vm_offset_t source, vm_offset_t dest, unsigned int size, unsigned int flush_action) {
+
+ switch(flush_action) {
+ case 1:
+ flush_dcache(source, size, 1);
+ break;
+ case 2:
+ flush_dcache(dest, size, 1);
+ break;
+ case 3:
+ flush_dcache(source, size, 1);
+ flush_dcache(dest, size, 1);
+ break;
+
+ }
+ bcopy_phys((char *)source, (char *)dest, size); /* Do a physical copy */
+
+ switch(flush_action) {
+ case 1:
+ flush_dcache(source, size, 1);
+ break;
+ case 2:
+ flush_dcache(dest, size, 1);
+ break;
+ case 3:
+ flush_dcache(source, size, 1);
+ flush_dcache(dest, size, 1);
+ break;
+
+ }
+
+	return KERN_SUCCESS;		/* no failure paths above */
+}
+
+
+
#if DEBUG
/*
* Dumps out the mapping stuff associated with a virtual address
hash_table_size *= 2)
continue;
+ if (num > (sizeof(pte_t) * 524288))
+ hash_table_size = hash_table_size/2; /* reduce by half above 512MB */
+
/* Scale to within any physical memory layout constraints */
do {
num = atop(mem_size); /* num now holds mem_size in pages */
kmem_init(start, end);
pmap_init();
- zsize = mem_size >> 2; /* Get target zone size as 1/4 of physical memory */
+ if (PE_parse_boot_arg("zsize", &zsize))
+ zsize = zsize * 1024 * 1024;
+ else {
+ zsize = mem_size >> 2; /* Get target zone size as 1/4 of physical memory */
+ }
if(zsize < ZONE_MAP_MIN) zsize = ZONE_MAP_MIN; /* Clamp to min */
if(zsize > ZONE_MAP_MAX) zsize = ZONE_MAP_MAX; /* Clamp to max */
zone_init(zsize); /* Allocate address space for zones */
extern kern_return_t kmem_alloc_pages(
register vm_object_t object,
register vm_object_offset_t offset,
- register vm_offset_t start,
- register vm_offset_t end,
- vm_prot_t protection);
+ register vm_size_t size);
extern void kmem_remap_pages(
register vm_object_t object,
/*
* Since we have not given out this address yet,
- * it is safe to unlock the map.
+ * it is safe to unlock the map, except that we must
+ * make certain no one coalesces our address range or
+ * does a blind vm_deallocate and removes the object.
+ * An extra object reference suffices to protect
+ * against both contingencies.
*/
+ vm_object_reference(object);
vm_map_unlock(map);
vm_object_lock(object);
offset + (vm_object_offset_t)i);
vm_object_unlock(object);
vm_map_remove(map, addr, addr + size, 0);
+ vm_object_deallocate(object);
return KERN_RESOURCE_SHORTAGE;
}
vm_object_unlock(object);
vm_object_unlock(object);
}
vm_map_remove(map, addr, addr + size, 0);
+ vm_object_deallocate(object);
return (kr);
}
+ /* now that the page is wired, we no longer have to fear coalesce */
+ vm_object_deallocate(object);
if (object == kernel_object)
vm_map_simplify(map, addr);
vm_offset_t *newaddrp,
vm_size_t newsize)
{
- vm_offset_t oldmin, oldmax;
- vm_offset_t newaddr;
- vm_object_t object;
- vm_map_entry_t oldentry, newentry;
- kern_return_t kr;
+ vm_offset_t oldmin, oldmax;
+ vm_offset_t newaddr;
+ vm_offset_t offset;
+ vm_object_t object;
+ vm_map_entry_t oldentry, newentry;
+ vm_page_t mem;
+ kern_return_t kr;
oldmin = trunc_page(oldaddr);
oldmax = round_page(oldaddr + oldsize);
oldsize = oldmax - oldmin;
newsize = round_page(newsize);
- /*
- * Find space for the new region.
- */
-
- kr = vm_map_find_space(map, &newaddr, newsize, (vm_offset_t) 0,
- &newentry);
- if (kr != KERN_SUCCESS) {
- return kr;
- }
/*
* Find the VM object backing the old region.
*/
+ vm_map_lock(map);
+
if (!vm_map_lookup_entry(map, oldmin, &oldentry))
panic("kmem_realloc");
object = oldentry->object.vm_object;
*/
vm_object_reference(object);
+ /* By grabbing the object lock before unlocking the map, */
+ /* we guarantee that we will panic if more than one */
+ /* attempt is made to realloc a kmem_alloc'd area. */
vm_object_lock(object);
+ vm_map_unlock(map);
if (object->size != oldsize)
panic("kmem_realloc");
object->size = newsize;
vm_object_unlock(object);
- newentry->object.vm_object = object;
- newentry->offset = 0;
- assert (newentry->wired_count == 0);
- newentry->wired_count = 1;
+ /* allocate the new pages while the expanded portion of the */
+ /* object is still unmapped */
+ kmem_alloc_pages(object, oldsize, newsize-oldsize);
+
/*
- * Since we have not given out this address yet,
- * it is safe to unlock the map. We are trusting
- * that nobody will play with either region.
+ * Find space for the new region.
*/
+ kr = vm_map_find_space(map, &newaddr, newsize, (vm_offset_t) 0,
+ &newentry);
+ if (kr != KERN_SUCCESS) {
+ vm_object_lock(object);
+ for(offset = oldsize;
+ offset<newsize; offset+=PAGE_SIZE) {
+ if ((mem = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
+ vm_page_lock_queues();
+ vm_page_free(mem);
+ vm_page_unlock_queues();
+ }
+ }
+ object->size = oldsize;
+ vm_object_unlock(object);
+ vm_object_deallocate(object);
+ return kr;
+ }
+ newentry->object.vm_object = object;
+ newentry->offset = 0;
+ assert (newentry->wired_count == 0);
+
+
+ /* add an extra reference in case we have someone doing an */
+ /* unexpected deallocate */
+ vm_object_reference(object);
vm_map_unlock(map);
- /*
- * Remap the pages in the old region and
- * allocate more pages for the new region.
- */
+ if ((kr = vm_map_wire(map, newaddr, newaddr + newsize,
+ VM_PROT_DEFAULT, FALSE)) != KERN_SUCCESS) {
+ vm_map_remove(map, newaddr, newaddr + newsize, 0);
+ vm_object_lock(object);
+ for(offset = oldsize;
+ offset<newsize; offset+=PAGE_SIZE) {
+ if ((mem = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
+ vm_page_lock_queues();
+ vm_page_free(mem);
+ vm_page_unlock_queues();
+ }
+ }
+ object->size = oldsize;
+ vm_object_unlock(object);
+ vm_object_deallocate(object);
+ return (kr);
+ }
+ vm_object_deallocate(object);
- kmem_remap_pages(object, 0,
- newaddr, newaddr + oldsize,
- VM_PROT_DEFAULT);
- kmem_alloc_pages(object, oldsize,
- newaddr + oldsize, newaddr + newsize,
- VM_PROT_DEFAULT);
*newaddrp = newaddr;
return KERN_SUCCESS;
}
/*
- * Allocate new wired pages in an object.
- * The object is assumed to be mapped into the kernel map or
- * a submap.
+ * Allocate new pages in an object.
*/
kern_return_t
kmem_alloc_pages(
register vm_object_t object,
register vm_object_offset_t offset,
- register vm_offset_t start,
- register vm_offset_t end,
- vm_prot_t protection)
+ register vm_size_t size)
{
- /*
- * Mark the pmap region as not pageable.
- */
- pmap_pageable(kernel_pmap, start, end, FALSE);
- while (start < end) {
+ size = round_page(size);
+ vm_object_lock(object);
+ while (size) {
register vm_page_t mem;
- vm_object_lock(object);
/*
* Allocate a page
vm_object_lock(object);
}
- /*
- * Wire it down
- */
- vm_page_lock_queues();
- vm_page_wire(mem);
- vm_page_unlock_queues();
- vm_object_unlock(object);
-
- /*
- * Enter it in the kernel pmap
- */
- PMAP_ENTER(kernel_pmap, start, mem, protection,
- VM_WIMG_USE_DEFAULT, TRUE);
-
- vm_object_lock(object);
- PAGE_WAKEUP_DONE(mem);
- vm_object_unlock(object);
- start += PAGE_SIZE;
- offset += PAGE_SIZE_64;
+ offset += PAGE_SIZE;
+ size -= PAGE_SIZE;
+ mem->busy = FALSE;
}
+ vm_object_unlock(object);
return KERN_SUCCESS;
}