]> git.saurik.com Git - apple/libc.git/blame - db.subproj/btree.subproj/bt_split.c
Libc-166.tar.gz
[apple/libc.git] / db.subproj / btree.subproj / bt_split.c
CommitLineData
e9ce8d39
A
1/*
2 * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22/*
23 * Copyright (c) 1990, 1993
24 * The Regents of the University of California. All rights reserved.
25 *
26 * This code is derived from software contributed to Berkeley by
27 * Mike Olson.
28 *
29 * Redistribution and use in source and binary forms, with or without
30 * modification, are permitted provided that the following conditions
31 * are met:
32 * 1. Redistributions of source code must retain the above copyright
33 * notice, this list of conditions and the following disclaimer.
34 * 2. Redistributions in binary form must reproduce the above copyright
35 * notice, this list of conditions and the following disclaimer in the
36 * documentation and/or other materials provided with the distribution.
37 * 3. All advertising materials mentioning features or use of this software
38 * must display the following acknowledgement:
39 * This product includes software developed by the University of
40 * California, Berkeley and its contributors.
41 * 4. Neither the name of the University nor the names of its contributors
42 * may be used to endorse or promote products derived from this software
43 * without specific prior written permission.
44 *
45 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * SUCH DAMAGE.
56 */
57
58
59#include <sys/types.h>
60
61#include <limits.h>
62#include <stdio.h>
63#include <stdlib.h>
64#include <string.h>
65
66#include <db.h>
67#include "btree.h"
68
69static int bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
70static PAGE *bt_page
71 __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
72static int bt_preserve __P((BTREE *, pgno_t));
73static PAGE *bt_psplit
74 __P((BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t));
75static PAGE *bt_root
76 __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
77static int bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
78static recno_t rec_total __P((PAGE *));
79
80#ifdef STATISTICS
81u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
82#endif
83
84/*
85 * __BT_SPLIT -- Split the tree.
86 *
87 * Parameters:
88 * t: tree
89 * sp: page to split
90 * key: key to insert
91 * data: data to insert
92 * flags: BIGKEY/BIGDATA flags
93 * ilen: insert length
94 * skip: index to leave open
95 *
96 * Returns:
97 * RET_ERROR, RET_SUCCESS
98 */
99int
100__bt_split(t, sp, key, data, flags, ilen, skip)
101 BTREE *t;
102 PAGE *sp;
103 const DBT *key, *data;
104 int flags;
105 size_t ilen;
106 indx_t skip;
107{
108 BINTERNAL *bi;
109 BLEAF *bl, *tbl;
110 DBT a, b;
111 EPGNO *parent;
112 PAGE *h, *l, *r, *lchild, *rchild;
113 indx_t nxtindex;
114 size_t n, nbytes, nksize;
115 int parentsplit;
116 char *dest;
117
118 /*
119 * Split the page into two pages, l and r. The split routines return
120 * a pointer to the page into which the key should be inserted and with
121 * skip set to the offset which should be used. Additionally, l and r
122 * are pinned.
123 */
124 h = sp->pgno == P_ROOT ?
125 bt_root(t, sp, &l, &r, &skip, ilen) :
126 bt_page(t, sp, &l, &r, &skip, ilen);
127 if (h == NULL)
128 return (RET_ERROR);
129
130 /*
131 * Insert the new key/data pair into the leaf page. (Key inserts
132 * always cause a leaf page to split first.)
133 */
134 h->linp[skip] = h->upper -= ilen;
135 dest = (char *)h + h->upper;
136 if (ISSET(t, R_RECNO))
137 WR_RLEAF(dest, data, flags)
138 else
139 WR_BLEAF(dest, key, data, flags)
140
141 /* If the root page was split, make it look right. */
142 if (sp->pgno == P_ROOT &&
143 (ISSET(t, R_RECNO) ?
144 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
145 goto err2;
146
147 /*
148 * Now we walk the parent page stack -- a LIFO stack of the pages that
149 * were traversed when we searched for the page that split. Each stack
150 * entry is a page number and a page index offset. The offset is for
151 * the page traversed on the search. We've just split a page, so we
152 * have to insert a new key into the parent page.
153 *
154 * If the insert into the parent page causes it to split, may have to
155 * continue splitting all the way up the tree. We stop if the root
156 * splits or the page inserted into didn't have to split to hold the
157 * new key. Some algorithms replace the key for the old page as well
158 * as the new page. We don't, as there's no reason to believe that the
159 * first key on the old page is any better than the key we have, and,
160 * in the case of a key being placed at index 0 causing the split, the
161 * key is unavailable.
162 *
163 * There are a maximum of 5 pages pinned at any time. We keep the left
164 * and right pages pinned while working on the parent. The 5 are the
165 * two children, left parent and right parent (when the parent splits)
166 * and the root page or the overflow key page when calling bt_preserve.
167 * This code must make sure that all pins are released other than the
168 * root page or overflow page which is unlocked elsewhere.
169 */
170 while ((parent = BT_POP(t)) != NULL) {
171 lchild = l;
172 rchild = r;
173
174 /* Get the parent page. */
175 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
176 goto err2;
177
178 /*
179 * The new key goes ONE AFTER the index, because the split
180 * was to the right.
181 */
182 skip = parent->index + 1;
183
184 /*
185 * Calculate the space needed on the parent page.
186 *
187 * Prefix trees: space hack when inserting into BINTERNAL
188 * pages. Retain only what's needed to distinguish between
189 * the new entry and the LAST entry on the page to its left.
190 * If the keys compare equal, retain the entire key. Note,
191 * we don't touch overflow keys, and the entire key must be
192 * retained for the next-to-left most key on the leftmost
193 * page of each level, or the search will fail. Applicable
194 * ONLY to internal pages that have leaf pages as children.
195 * Further reduction of the key between pairs of internal
196 * pages loses too much information.
197 */
198 switch (rchild->flags & P_TYPE) {
199 case P_BINTERNAL:
200 bi = GETBINTERNAL(rchild, 0);
201 nbytes = NBINTERNAL(bi->ksize);
202 break;
203 case P_BLEAF:
204 bl = GETBLEAF(rchild, 0);
205 nbytes = NBINTERNAL(bl->ksize);
206 if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
207 (h->prevpg != P_INVALID || skip > 1)) {
208 tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
209 a.size = tbl->ksize;
210 a.data = tbl->bytes;
211 b.size = bl->ksize;
212 b.data = bl->bytes;
213 nksize = t->bt_pfx(&a, &b);
214 n = NBINTERNAL(nksize);
215 if (n < nbytes) {
216#ifdef STATISTICS
217 bt_pfxsaved += nbytes - n;
218#endif
219 nbytes = n;
220 } else
221 nksize = 0;
222 } else
223 nksize = 0;
224 break;
225 case P_RINTERNAL:
226 case P_RLEAF:
227 nbytes = NRINTERNAL;
228 break;
229 default:
230 abort();
231 }
232
233 /* Split the parent page if necessary or shift the indices. */
234 if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
235 sp = h;
236 h = h->pgno == P_ROOT ?
237 bt_root(t, h, &l, &r, &skip, nbytes) :
238 bt_page(t, h, &l, &r, &skip, nbytes);
239 if (h == NULL)
240 goto err1;
241 parentsplit = 1;
242 } else {
243 if (skip < (nxtindex = NEXTINDEX(h)))
244 memmove(h->linp + skip + 1, h->linp + skip,
245 (nxtindex - skip) * sizeof(indx_t));
246 h->lower += sizeof(indx_t);
247 parentsplit = 0;
248 }
249
250 /* Insert the key into the parent page. */
251 switch(rchild->flags & P_TYPE) {
252 case P_BINTERNAL:
253 h->linp[skip] = h->upper -= nbytes;
254 dest = (char *)h + h->linp[skip];
255 memmove(dest, bi, nbytes);
256 ((BINTERNAL *)dest)->pgno = rchild->pgno;
257 break;
258 case P_BLEAF:
259 h->linp[skip] = h->upper -= nbytes;
260 dest = (char *)h + h->linp[skip];
261 WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
262 rchild->pgno, bl->flags & P_BIGKEY);
263 memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
264 if (bl->flags & P_BIGKEY &&
265 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
266 goto err1;
267 break;
268 case P_RINTERNAL:
269 /*
270 * Update the left page count. If split
271 * added at index 0, fix the correct page.
272 */
273 if (skip > 0)
274 dest = (char *)h + h->linp[skip - 1];
275 else
276 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
277 ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
278 ((RINTERNAL *)dest)->pgno = lchild->pgno;
279
280 /* Update the right page count. */
281 h->linp[skip] = h->upper -= nbytes;
282 dest = (char *)h + h->linp[skip];
283 ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
284 ((RINTERNAL *)dest)->pgno = rchild->pgno;
285 break;
286 case P_RLEAF:
287 /*
288 * Update the left page count. If split
289 * added at index 0, fix the correct page.
290 */
291 if (skip > 0)
292 dest = (char *)h + h->linp[skip - 1];
293 else
294 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
295 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
296 ((RINTERNAL *)dest)->pgno = lchild->pgno;
297
298 /* Update the right page count. */
299 h->linp[skip] = h->upper -= nbytes;
300 dest = (char *)h + h->linp[skip];
301 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
302 ((RINTERNAL *)dest)->pgno = rchild->pgno;
303 break;
304 default:
305 abort();
306 }
307
308 /* Unpin the held pages. */
309 if (!parentsplit) {
310 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
311 break;
312 }
313
314 /* If the root page was split, make it look right. */
315 if (sp->pgno == P_ROOT &&
316 (ISSET(t, R_RECNO) ?
317 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
318 goto err1;
319
320 mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
321 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
322 }
323
324 /* Unpin the held pages. */
325 mpool_put(t->bt_mp, l, MPOOL_DIRTY);
326 mpool_put(t->bt_mp, r, MPOOL_DIRTY);
327
328 /* Clear any pages left on the stack. */
329 return (RET_SUCCESS);
330
331 /*
332 * If something fails in the above loop we were already walking back
333 * up the tree and the tree is now inconsistent. Nothing much we can
334 * do about it but release any memory we're holding.
335 */
336err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
337 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
338
339err2: mpool_put(t->bt_mp, l, 0);
340 mpool_put(t->bt_mp, r, 0);
341 __dbpanic(t->bt_dbp);
342 return (RET_ERROR);
343}
344
345/*
346 * BT_PAGE -- Split a non-root page of a btree.
347 *
348 * Parameters:
349 * t: tree
350 * h: root page
351 * lp: pointer to left page pointer
352 * rp: pointer to right page pointer
353 * skip: pointer to index to leave open
354 * ilen: insert length
355 *
356 * Returns:
357 * Pointer to page in which to insert or NULL on error.
358 */
359static PAGE *
360bt_page(t, h, lp, rp, skip, ilen)
361 BTREE *t;
362 PAGE *h, **lp, **rp;
363 indx_t *skip;
364 size_t ilen;
365{
366 PAGE *l, *r, *tp;
367 pgno_t npg;
368
369#ifdef STATISTICS
370 ++bt_split;
371#endif
372 /* Put the new right page for the split into place. */
373 if ((r = __bt_new(t, &npg)) == NULL)
374 return (NULL);
375 r->pgno = npg;
376 r->lower = BTDATAOFF;
377 r->upper = t->bt_psize;
378 r->nextpg = h->nextpg;
379 r->prevpg = h->pgno;
380 r->flags = h->flags & P_TYPE;
381
382 /*
383 * If we're splitting the last page on a level because we're appending
384 * a key to it (skip is NEXTINDEX()), it's likely that the data is
385 * sorted. Adding an empty page on the side of the level is less work
386 * and can push the fill factor much higher than normal. If we're
387 * wrong it's no big deal, we'll just do the split the right way next
388 * time. It may look like it's equally easy to do a similar hack for
389 * reverse sorted data, that is, split the tree left, but it's not.
390 * Don't even try.
391 */
392 if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
393#ifdef STATISTICS
394 ++bt_sortsplit;
395#endif
396 h->nextpg = r->pgno;
397 r->lower = BTDATAOFF + sizeof(indx_t);
398 *skip = 0;
399 *lp = h;
400 *rp = r;
401 return (r);
402 }
403
404 /* Put the new left page for the split into place. */
405 if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) {
406 mpool_put(t->bt_mp, r, 0);
407 return (NULL);
408 }
409 l->pgno = h->pgno;
410 l->nextpg = r->pgno;
411 l->prevpg = h->prevpg;
412 l->lower = BTDATAOFF;
413 l->upper = t->bt_psize;
414 l->flags = h->flags & P_TYPE;
415
416 /* Fix up the previous pointer of the page after the split page. */
417 if (h->nextpg != P_INVALID) {
418 if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
419 free(l);
420 /* XXX mpool_free(t->bt_mp, r->pgno); */
421 return (NULL);
422 }
423 tp->prevpg = r->pgno;
424 mpool_put(t->bt_mp, tp, 0);
425 }
426
427 /*
428 * Split right. The key/data pairs aren't sorted in the btree page so
429 * it's simpler to copy the data from the split page onto two new pages
430 * instead of copying half the data to the right page and compacting
431 * the left page in place. Since the left page can't change, we have
432 * to swap the original and the allocated left page after the split.
433 */
434 tp = bt_psplit(t, h, l, r, skip, ilen);
435
436 /* Move the new left page onto the old left page. */
437 memmove(h, l, t->bt_psize);
438 if (tp == l)
439 tp = h;
440 free(l);
441
442 *lp = h;
443 *rp = r;
444 return (tp);
445}
446
447/*
448 * BT_ROOT -- Split the root page of a btree.
449 *
450 * Parameters:
451 * t: tree
452 * h: root page
453 * lp: pointer to left page pointer
454 * rp: pointer to right page pointer
455 * skip: pointer to index to leave open
456 * ilen: insert length
457 *
458 * Returns:
459 * Pointer to page in which to insert or NULL on error.
460 */
461static PAGE *
462bt_root(t, h, lp, rp, skip, ilen)
463 BTREE *t;
464 PAGE *h, **lp, **rp;
465 indx_t *skip;
466 size_t ilen;
467{
468 PAGE *l, *r, *tp;
469 pgno_t lnpg, rnpg;
470
471#ifdef STATISTICS
472 ++bt_split;
473 ++bt_rootsplit;
474#endif
475 /* Put the new left and right pages for the split into place. */
476 if ((l = __bt_new(t, &lnpg)) == NULL ||
477 (r = __bt_new(t, &rnpg)) == NULL)
478 return (NULL);
479 l->pgno = lnpg;
480 r->pgno = rnpg;
481 l->nextpg = r->pgno;
482 r->prevpg = l->pgno;
483 l->prevpg = r->nextpg = P_INVALID;
484 l->lower = r->lower = BTDATAOFF;
485 l->upper = r->upper = t->bt_psize;
486 l->flags = r->flags = h->flags & P_TYPE;
487
488 /* Split the root page. */
489 tp = bt_psplit(t, h, l, r, skip, ilen);
490
491 *lp = l;
492 *rp = r;
493 return (tp);
494}
495
496/*
497 * BT_RROOT -- Fix up the recno root page after it has been split.
498 *
499 * Parameters:
500 * t: tree
501 * h: root page
502 * l: left page
503 * r: right page
504 *
505 * Returns:
506 * RET_ERROR, RET_SUCCESS
507 */
508static int
509bt_rroot(t, h, l, r)
510 BTREE *t;
511 PAGE *h, *l, *r;
512{
513 char *dest;
514
515 /* Insert the left and right keys, set the header information. */
516 h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
517 dest = (char *)h + h->upper;
518 WR_RINTERNAL(dest,
519 l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
520
521 h->linp[1] = h->upper -= NRINTERNAL;
522 dest = (char *)h + h->upper;
523 WR_RINTERNAL(dest,
524 r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
525
526 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
527
528 /* Unpin the root page, set to recno internal page. */
529 h->flags &= ~P_TYPE;
530 h->flags |= P_RINTERNAL;
531 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
532
533 return (RET_SUCCESS);
534}
535
536/*
537 * BT_BROOT -- Fix up the btree root page after it has been split.
538 *
539 * Parameters:
540 * t: tree
541 * h: root page
542 * l: left page
543 * r: right page
544 *
545 * Returns:
546 * RET_ERROR, RET_SUCCESS
547 */
548static int
549bt_broot(t, h, l, r)
550 BTREE *t;
551 PAGE *h, *l, *r;
552{
553 BINTERNAL *bi;
554 BLEAF *bl;
555 size_t nbytes;
556 char *dest;
557
558 /*
559 * If the root page was a leaf page, change it into an internal page.
560 * We copy the key we split on (but not the key's data, in the case of
561 * a leaf page) to the new root page.
562 *
563 * The btree comparison code guarantees that the left-most key on any
564 * level of the tree is never used, so it doesn't need to be filled in.
565 */
566 nbytes = NBINTERNAL(0);
567 h->linp[0] = h->upper = t->bt_psize - nbytes;
568 dest = (char *)h + h->upper;
569 WR_BINTERNAL(dest, 0, l->pgno, 0);
570
571 switch(h->flags & P_TYPE) {
572 case P_BLEAF:
573 bl = GETBLEAF(r, 0);
574 nbytes = NBINTERNAL(bl->ksize);
575 h->linp[1] = h->upper -= nbytes;
576 dest = (char *)h + h->upper;
577 WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
578 memmove(dest, bl->bytes, bl->ksize);
579
580 /*
581 * If the key is on an overflow page, mark the overflow chain
582 * so it isn't deleted when the leaf copy of the key is deleted.
583 */
584 if (bl->flags & P_BIGKEY &&
585 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
586 return (RET_ERROR);
587 break;
588 case P_BINTERNAL:
589 bi = GETBINTERNAL(r, 0);
590 nbytes = NBINTERNAL(bi->ksize);
591 h->linp[1] = h->upper -= nbytes;
592 dest = (char *)h + h->upper;
593 memmove(dest, bi, nbytes);
594 ((BINTERNAL *)dest)->pgno = r->pgno;
595 break;
596 default:
597 abort();
598 }
599
600 /* There are two keys on the page. */
601 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
602
603 /* Unpin the root page, set to btree internal page. */
604 h->flags &= ~P_TYPE;
605 h->flags |= P_BINTERNAL;
606 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
607
608 return (RET_SUCCESS);
609}
610
611/*
612 * BT_PSPLIT -- Do the real work of splitting the page.
613 *
614 * Parameters:
615 * t: tree
616 * h: page to be split
617 * l: page to put lower half of data
618 * r: page to put upper half of data
619 * pskip: pointer to index to leave open
620 * ilen: insert length
621 *
622 * Returns:
623 * Pointer to page in which to insert.
624 */
625static PAGE *
626bt_psplit(t, h, l, r, pskip, ilen)
627 BTREE *t;
628 PAGE *h, *l, *r;
629 indx_t *pskip;
630 size_t ilen;
631{
632 BINTERNAL *bi;
633 BLEAF *bl;
634 RLEAF *rl;
635 EPGNO *c;
636 PAGE *rval;
637 void *src;
638 indx_t full, half, nxt, off, skip, top, used;
639 size_t nbytes;
640 int bigkeycnt, isbigkey;
641
642 /*
643 * Split the data to the left and right pages. Leave the skip index
644 * open. Additionally, make some effort not to split on an overflow
645 * key. This makes internal page processing faster and can save
646 * space as overflow keys used by internal pages are never deleted.
647 */
648 bigkeycnt = 0;
649 skip = *pskip;
650 full = t->bt_psize - BTDATAOFF;
651 half = full / 2;
652 used = 0;
653 for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
654 if (skip == off) {
655 nbytes = ilen;
656 isbigkey = 0; /* XXX: not really known. */
657 } else
658 switch (h->flags & P_TYPE) {
659 case P_BINTERNAL:
660 src = bi = GETBINTERNAL(h, nxt);
661 nbytes = NBINTERNAL(bi->ksize);
662 isbigkey = bi->flags & P_BIGKEY;
663 break;
664 case P_BLEAF:
665 src = bl = GETBLEAF(h, nxt);
666 nbytes = NBLEAF(bl);
667 isbigkey = bl->flags & P_BIGKEY;
668 break;
669 case P_RINTERNAL:
670 src = GETRINTERNAL(h, nxt);
671 nbytes = NRINTERNAL;
672 isbigkey = 0;
673 break;
674 case P_RLEAF:
675 src = rl = GETRLEAF(h, nxt);
676 nbytes = NRLEAF(rl);
677 isbigkey = 0;
678 break;
679 default:
680 abort();
681 }
682
683 /*
684 * If the key/data pairs are substantial fractions of the max
685 * possible size for the page, it's possible to get situations
686 * where we decide to try and copy too much onto the left page.
687 * Make sure that doesn't happen.
688 */
689 if (skip <= off && used + nbytes >= full) {
690 --off;
691 break;
692 }
693
694 /* Copy the key/data pair, if not the skipped index. */
695 if (skip != off) {
696 ++nxt;
697
698 l->linp[off] = l->upper -= nbytes;
699 memmove((char *)l + l->upper, src, nbytes);
700 }
701
702 used += nbytes;
703 if (used >= half) {
704 if (!isbigkey || bigkeycnt == 3)
705 break;
706 else
707 ++bigkeycnt;
708 }
709 }
710
711 /*
712 * Off is the last offset that's valid for the left page.
713 * Nxt is the first offset to be placed on the right page.
714 */
715 l->lower += (off + 1) * sizeof(indx_t);
716
717 /*
718 * If splitting the page that the cursor was on, the cursor has to be
719 * adjusted to point to the same record as before the split. If the
720 * cursor is at or past the skipped slot, the cursor is incremented by
721 * one. If the cursor is on the right page, it is decremented by the
722 * number of records split to the left page.
723 *
724 * Don't bother checking for the B_SEQINIT flag, the page number will
725 * be P_INVALID.
726 */
727 c = &t->bt_bcursor;
728 if (c->pgno == h->pgno) {
729 if (c->index >= skip)
730 ++c->index;
731 if (c->index < nxt) /* Left page. */
732 c->pgno = l->pgno;
733 else { /* Right page. */
734 c->pgno = r->pgno;
735 c->index -= nxt;
736 }
737 }
738
739 /*
740 * If the skipped index was on the left page, just return that page.
741 * Otherwise, adjust the skip index to reflect the new position on
742 * the right page.
743 */
744 if (skip <= off) {
745 skip = 0;
746 rval = l;
747 } else {
748 rval = r;
749 *pskip -= nxt;
750 }
751
752 for (off = 0; nxt < top; ++off) {
753 if (skip == nxt) {
754 ++off;
755 skip = 0;
756 }
757 switch (h->flags & P_TYPE) {
758 case P_BINTERNAL:
759 src = bi = GETBINTERNAL(h, nxt);
760 nbytes = NBINTERNAL(bi->ksize);
761 break;
762 case P_BLEAF:
763 src = bl = GETBLEAF(h, nxt);
764 nbytes = NBLEAF(bl);
765 break;
766 case P_RINTERNAL:
767 src = GETRINTERNAL(h, nxt);
768 nbytes = NRINTERNAL;
769 break;
770 case P_RLEAF:
771 src = rl = GETRLEAF(h, nxt);
772 nbytes = NRLEAF(rl);
773 break;
774 default:
775 abort();
776 }
777 ++nxt;
778 r->linp[off] = r->upper -= nbytes;
779 memmove((char *)r + r->upper, src, nbytes);
780 }
781 r->lower += off * sizeof(indx_t);
782
783 /* If the key is being appended to the page, adjust the index. */
784 if (skip == top)
785 r->lower += sizeof(indx_t);
786
787 return (rval);
788}
789
790/*
791 * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
792 *
793 * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
794 * record that references them gets deleted. Chains pointed to by internal
795 * pages never get deleted. This routine marks a chain as pointed to by an
796 * internal page.
797 *
798 * Parameters:
799 * t: tree
800 * pg: page number of first page in the chain.
801 *
802 * Returns:
803 * RET_SUCCESS, RET_ERROR.
804 */
805static int
806bt_preserve(t, pg)
807 BTREE *t;
808 pgno_t pg;
809{
810 PAGE *h;
811
812 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
813 return (RET_ERROR);
814 h->flags |= P_PRESERVE;
815 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
816 return (RET_SUCCESS);
817}
818
819/*
820 * REC_TOTAL -- Return the number of recno entries below a page.
821 *
822 * Parameters:
823 * h: page
824 *
825 * Returns:
826 * The number of recno entries below a page.
827 *
828 * XXX
829 * These values could be set by the bt_psplit routine. The problem is that the
830 * entry has to be popped off of the stack etc. or the values have to be passed
831 * all the way back to bt_split/bt_rroot and it's not very clean.
832 */
833static recno_t
834rec_total(h)
835 PAGE *h;
836{
837 recno_t recs;
838 indx_t nxt, top;
839
840 for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
841 recs += GETRINTERNAL(h, nxt)->nrecs;
842 return (recs);
843}