Logo Search packages:      
Sourcecode: db version File versions

db_dispatch.c

/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2007 Oracle.  All rights reserved.
 */
/*
 * Copyright (c) 1995, 1996
 *    The President and Fellows of Harvard University.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Margo Seltzer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $Id: db_dispatch.c,v 12.37 2007/05/17 15:14:56 bostic Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/hash.h"
#include "dbinc/fop.h"
#include "dbinc/lock.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

static int __db_limbo_fix __P((DB *, DB_TXN *,
            DB_TXNLIST *, db_pgno_t *, DBMETA *, db_limbo_state));
static int __db_limbo_bucket __P((DB_ENV *,
           DB_TXN *, DB_TXNLIST *, db_limbo_state));
static int __db_limbo_move __P((DB_ENV *, DB_TXN *, DB_TXN *, DB_TXNLIST *));
static int __db_limbo_prepare __P((DB *, DB_TXN *, DB_TXNLIST *));
static int __db_lock_move __P((DB_ENV *,
            u_int8_t *, db_pgno_t, db_lockmode_t, DB_TXN *, DB_TXN *));
static int __db_txnlist_pgnoadd __P((DB_ENV *, DB_TXNHEAD *,
            int32_t, u_int8_t *, char *, db_pgno_t));
static int __db_txnlist_find_internal __P((DB_ENV *, DB_TXNHEAD *,
            db_txnlist_type, u_int32_t, u_int8_t *, DB_TXNLIST **,
            int, u_int32_t *));

/*
 * __db_dispatch --
 *
 * This is the transaction dispatch function used by the db access methods.
 * It is designed to handle the record format used by all the access
 * methods (the one automatically generated by the db_{h,log,read}.sh
 * scripts in the tools directory).  An application using a different
 * recovery paradigm will supply a different dispatch function to txn_open.
 *
 * PUBLIC: int __db_dispatch __P((DB_ENV *,
 * PUBLIC:     int (**)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)),
 * PUBLIC:     size_t, DBT *, DB_LSN *, db_recops, DB_TXNHEAD *));
 */
int
__db_dispatch(dbenv, dtab, dtabsize, db, lsnp, redo, info)
      DB_ENV *dbenv;          /* The environment. */
      int (**dtab)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
      size_t dtabsize;  /* Size of the dtab. */
      DBT *db;          /* The log record upon which to dispatch. */
      DB_LSN *lsnp;           /* The lsn of the record being dispatched. */
      db_recops redo;         /* Redo this op (or undo it). */
      DB_TXNHEAD *info; /* Transaction list. */
{
      DB_LSN prev_lsn;
      u_int32_t rectype, status, txnid;
      int make_call, ret;

      memcpy(&rectype, db->data, sizeof(rectype));
      memcpy(&txnid, (u_int8_t *)db->data + sizeof(rectype), sizeof(txnid));
      make_call = ret = 0;

      /* If we don't have a dispatch table, it's hard to dispatch. */
      DB_ASSERT(dbenv, dtab != NULL);

      /*
       * If we find a record that is in the user's number space and they
       * have specified a recovery routine, let them handle it.  If they
       * didn't specify a recovery routine, then we expect that they've
       * followed all our rules and registered new recovery functions.
       */
      switch (redo) {
      case DB_TXN_ABORT:
      case DB_TXN_APPLY:
      case DB_TXN_PRINT:
            make_call = 1;
            break;
      case DB_TXN_OPENFILES:
            /*
             * We collect all the transactions that have
             * "begin" records, those with no previous LSN,
             * so that we do not abort partial transactions.
             * These are known to be undone, otherwise the
             * log would not have been freeable.
             */
            memcpy(&prev_lsn, (u_int8_t *)db->data +
                sizeof(rectype) + sizeof(txnid), sizeof(prev_lsn));
            if (txnid != 0 && prev_lsn.file == 0 && (ret =
                __db_txnlist_add(dbenv, info, txnid, TXN_OK, NULL)) != 0)
                  return (ret);

            /* FALLTHROUGH */
      case DB_TXN_POPENFILES:
            if (rectype == DB___dbreg_register ||
                rectype == DB___txn_child ||
                rectype == DB___txn_ckp || rectype == DB___txn_recycle)
                  return (dtab[rectype](dbenv, db, lsnp, redo, info));
            break;
      case DB_TXN_BACKWARD_ROLL:
            /*
             * Running full recovery in the backward pass. In general,
             * we only process records during this pass that belong
             * to aborted transactions.  Unfortunately, there are several
             * exceptions:
             * 1. If this is a meta-record, one not associated with
             *    a transaction, then we must always process it.
             * 2. If this is a transaction commit/abort, we must
             *    always process it, so that we know the status of
             *    every transaction.
             * 3. If this is a child commit, we need to process it
             *    because the outcome of the child transaction depends
             *    on the outcome of the parent.
             * 4. If this is a dbreg_register record, we must always
             *    process is because they contain non-transactional
             *    closes that must be properly handled.
             * 5. If this is a noop, we must always undo it so that we
             *    properly handle any aborts before a file was closed.
             * 6. If this a file remove, we need to process it to
             *    determine if the on-disk file is the same as the
             *    one being described.
             */
            switch (rectype) {
            /*
             * These either do not belong to a transaction or (regop)
             * must be processed regardless of the status of the
             * transaction.
             */
            case DB___txn_regop:
            case DB___txn_recycle:
            case DB___txn_ckp:
                  make_call = 1;
                  break;
            /*
             * These belong to a transaction whose status must be
             * checked.
             */
            case DB___txn_child:
            case DB___db_noop:
            case DB___fop_file_remove:
            case DB___dbreg_register:
                  make_call = 1;

                  /* FALLTHROUGH */
            default:
                  if (txnid == 0)
                        break;

                  ret = __db_txnlist_find(dbenv, info, txnid, &status);

                  /* If not found, this is an incomplete abort.  */
                  if (ret == DB_NOTFOUND)
                        return (__db_txnlist_add(dbenv,
                            info, txnid, TXN_IGNORE, lsnp));
                  if (ret != 0)
                        return (ret);

                  /*
                   * If we ignore the transaction, ignore the operation
                   * UNLESS this is a child commit in which case we need
                   * to make sure that the child also gets marked as
                   * ignore.
                   */
                  if (status == TXN_IGNORE && rectype != DB___txn_child) {
                        make_call = 0;
                        break;
                  }
                  if (status == TXN_COMMIT)
                        break;

                  /* Set make_call in case we came through default */
                  make_call = 1;
                  if (status == TXN_OK &&
                      (ret = __db_txnlist_update(dbenv,
                      info, txnid, rectype == DB___txn_xa_regop ?
                      TXN_PREPARE : TXN_ABORT, NULL, &status, 0)) != 0)
                        return (ret);
            }
            break;
      case DB_TXN_FORWARD_ROLL:
            /*
             * In the forward pass, if we haven't seen the transaction,
             * do nothing, else recover it.
             *
             * We need to always redo DB___db_noop records, so that we
             * properly handle any commits after the file was closed.
             */
            switch (rectype) {
            case DB___txn_recycle:
            case DB___txn_ckp:
            case DB___db_noop:
            case DB___dbreg_register:
                  make_call = 1;
                  break;

            default:
                  if (txnid == 0)
                        status = 0;
                  else {
                        ret = __db_txnlist_find(dbenv,
                            info, txnid, &status);

                        if (ret == DB_NOTFOUND)
                              /* Break out out of if clause. */
                              ;
                        else if (ret != 0)
                              return (ret);
                        else if (status == TXN_COMMIT) {
                              make_call = 1;
                              break;
                        }
                  }

#ifndef HAVE_FTRUNCATE
                  if (status != TXN_IGNORE &&
                      (rectype == DB___ham_metagroup ||
                      rectype == DB___ham_groupalloc ||
                      rectype == DB___db_pg_alloc)) {
                        /*
                         * Because we do not have truncate
                         * all allocation records must be reprocessed
                         * during rollforward in case the file was
                         * just created.  It may not have been
                         * present during the backward pass.
                         */
                        make_call = 1;
                        redo = DB_TXN_BACKWARD_ALLOC;
                  }
#endif
            }
            break;
      case DB_TXN_BACKWARD_ALLOC:
      default:
            return (__db_unknown_flag(
                dbenv, "__db_dispatch", (u_int32_t)redo));
      }

      if (make_call) {
            /*
             * If the debug flag is set then we are logging
             * records for a non-durable update so that they
             * may be examined for diagnostic purposes.
             * So only make the call if we are printing,
             * otherwise we need to extract the previous
             * lsn so undo will work properly.
             */
            if (rectype & DB_debug_FLAG) {
                  if (redo == DB_TXN_PRINT)
                        rectype &= ~DB_debug_FLAG;
                  else {
                        memcpy(lsnp,
                            (u_int8_t *)db->data +
                            sizeof(rectype) +
                            sizeof(txnid), sizeof(*lsnp));
                        return (0);
                  }
            }
            if (rectype >= DB_user_BEGIN && dbenv->app_dispatch != NULL)
                  return (dbenv->app_dispatch(dbenv, db, lsnp, redo));
            else {
                  /*
                   * The size of the dtab table argument is the same as
                   * the standard table, use the standard table's size
                   * as our sanity check.
                   */
                  if (rectype > dtabsize || dtab[rectype] == NULL) {
                        __db_errx(dbenv,
                            "Illegal record type %lu in log",
                            (u_long)rectype);
                        return (EINVAL);
                  }
                  return (dtab[rectype](dbenv, db, lsnp, redo, info));
            }
      }

      return (0);
}

/*
 * __db_add_recovery --
 *
 * PUBLIC: int __db_add_recovery __P((DB_ENV *,
 * PUBLIC:   int (***)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), size_t *,
 * PUBLIC:   int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
 */
int
__db_add_recovery(dbenv, dtab, dtabsize, func, ndx)
      DB_ENV *dbenv;
      int (***dtab) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
      size_t *dtabsize;
      int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
      u_int32_t ndx;
{
      size_t i, nsize;
      int ret;

      /* Check if we have to grow the table. */
      if (ndx >= *dtabsize) {
            nsize = ndx + 40;
            if ((ret =
                __os_realloc(dbenv, nsize * sizeof((*dtab)[0]), dtab)) != 0)
                  return (ret);
            for (i = *dtabsize; i < nsize; ++i)
                  (*dtab)[i] = NULL;
            *dtabsize = nsize;
      }

      (*dtab)[ndx] = func;
      return (0);
}

/*
 * __db_txnlist_init --
 *    Initialize transaction linked list.
 *
 * PUBLIC: int __db_txnlist_init __P((DB_ENV *,
 * PUBLIC:     u_int32_t, u_int32_t, DB_LSN *, DB_TXNHEAD **));
 */
int
__db_txnlist_init(dbenv, low_txn, hi_txn, trunc_lsn, retp)
      DB_ENV *dbenv;
      u_int32_t low_txn, hi_txn;
      DB_LSN *trunc_lsn;
      DB_TXNHEAD **retp;
{
      DB_TXNHEAD *headp;
      u_int32_t size, tmp;
      int ret;

      /*
       * Size a hash table.
       *    If low is zero then we are being called during rollback
       * and we need only one slot.
       *    Hi maybe lower than low if we have recycled txnid's.
       *    The numbers here are guesses about txn density, we can afford
       * to look at a few entries in each slot.
       */
      if (low_txn == 0)
            size = 1;
      else {
            if (hi_txn < low_txn) {
                  tmp = hi_txn;
                  hi_txn = low_txn;
                  low_txn = tmp;
            }
            tmp = hi_txn - low_txn;
            /* See if we wrapped around. */
            if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2)
                  tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn);
            size = tmp / 5;
            if (size < 100)
                  size = 100;
      }
      if ((ret = __os_malloc(dbenv,
          sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0)
            return (ret);

      memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head));
      headp->maxid = hi_txn;
      headp->generation = 0;
      headp->nslots = size;
      headp->gen_alloc = 8;
      if ((ret = __os_malloc(dbenv, headp->gen_alloc *
          sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) {
            __os_free(dbenv, headp);
            return (ret);
      }
      headp->gen_array[0].generation = 0;
      headp->gen_array[0].txn_min = TXN_MINIMUM;
      headp->gen_array[0].txn_max = TXN_MAXIMUM;
      if (trunc_lsn != NULL) {
            headp->trunc_lsn = *trunc_lsn;
            headp->maxlsn = *trunc_lsn;
      } else {
            ZERO_LSN(headp->trunc_lsn);
            ZERO_LSN(headp->maxlsn);
      }
      ZERO_LSN(headp->ckplsn);

      *retp = headp;
      return (0);
}

#define     FIND_GENERATION(hp, txnid, gen) do {                        \
      u_int32_t __i;                                        \
      for (__i = 0; __i <= (hp)->generation; __i++)               \
            /* The range may wrap around the end. */        \
            if ((hp)->gen_array[__i].txn_min <              \
                (hp)->gen_array[__i].txn_max ?              \
                ((txnid) >= (hp)->gen_array[__i].txn_min &&       \
                (txnid) <= (hp)->gen_array[__i].txn_max) :        \
                ((txnid) >= (hp)->gen_array[__i].txn_min ||       \
                (txnid) <= (hp)->gen_array[__i].txn_max))         \
                  break;                                    \
      DB_ASSERT(dbenv, __i <= (hp)->generation);                  \
      gen = (hp)->gen_array[__i].generation;                      \
} while (0)

/*
 * __db_txnlist_add --
 *    Add an element to our transaction linked list.
 *
 * PUBLIC: int __db_txnlist_add __P((DB_ENV *,
 * PUBLIC:     DB_TXNHEAD *, u_int32_t, u_int32_t, DB_LSN *));
 */
int
__db_txnlist_add(dbenv, hp, txnid, status, lsn)
      DB_ENV *dbenv;
      DB_TXNHEAD *hp;
      u_int32_t txnid, status;
      DB_LSN *lsn;
{
      DB_TXNLIST *elp;
      int ret;

      if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
            return (ret);

      LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links);

      /* Find the most recent generation containing this ID */
      FIND_GENERATION(hp, txnid, elp->u.t.generation);
      elp->type = TXNLIST_TXNID;
      elp->u.t.txnid = txnid;
      elp->u.t.status = status;
      if (txnid > hp->maxid)
            hp->maxid = txnid;
      if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
            hp->maxlsn = *lsn;

      DB_ASSERT(dbenv, lsn == NULL ||
          status != TXN_COMMIT || LOG_COMPARE(&hp->maxlsn, lsn) >= 0);

      return (0);
}

/*
 * __db_txnlist_remove --
 *    Remove an element from our transaction linked list.
 *
 * PUBLIC: int __db_txnlist_remove __P((DB_ENV *, DB_TXNHEAD *, u_int32_t));
 */
int
__db_txnlist_remove(dbenv, hp, txnid)
      DB_ENV *dbenv;
      DB_TXNHEAD *hp;
      u_int32_t txnid;
{
      DB_TXNLIST *entry;
      u_int32_t status;

      return (__db_txnlist_find_internal(dbenv,
          hp, TXNLIST_TXNID, txnid, NULL, &entry, 1, &status));
}

/*
 * __db_txnlist_ckp --
 *    Used to record the maximum checkpoint that will be retained
 * after recovery.  Typically this is simply the max checkpoint, but
 * if we are doing client replication recovery or timestamp-based
 * recovery, we are going to virtually truncate the log and we need
 * to retain the last checkpoint before the truncation point.
 *
 * PUBLIC: void __db_txnlist_ckp __P((DB_ENV *, DB_TXNHEAD *, DB_LSN *));
 */
void
__db_txnlist_ckp(dbenv, hp, ckp_lsn)
      DB_ENV *dbenv;
      DB_TXNHEAD *hp;
      DB_LSN *ckp_lsn;
{

      COMPQUIET(dbenv, NULL);

      if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) &&
          LOG_COMPARE(&hp->maxlsn, ckp_lsn) >= 0)
            hp->ckplsn = *ckp_lsn;
}

/*
 * __db_txnlist_end --
 *    Discard transaction linked list.
 *
 * PUBLIC: void __db_txnlist_end __P((DB_ENV *, DB_TXNHEAD *));
 */
void
__db_txnlist_end(dbenv, hp)
      DB_ENV *dbenv;
      DB_TXNHEAD *hp;
{
      u_int32_t i;
      DB_TXNLIST *p;

      if (hp == NULL)
            return;

      for (i = 0; i < hp->nslots; i++)
            while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) {
                  switch (p->type) {
                  case TXNLIST_LSN:
                        __os_free(dbenv, p->u.l.lsn_stack);
                        break;
                  case TXNLIST_DELETE:
                  case TXNLIST_PGNO:
                  case TXNLIST_TXNID:
                  default:
                        /*
                         * Possibly an incomplete DB_TXNLIST; just
                         * free it.
                         */
                        break;
                  }
                  LIST_REMOVE(p, links);
                  __os_free(dbenv, p);
            }

      if (hp->gen_array != NULL)
            __os_free(dbenv, hp->gen_array);
      __os_free(dbenv, hp);
}

/*
 * __db_txnlist_find --
 *    Checks to see if a txnid with the current generation is in the
 *    txnid list.  This returns DB_NOTFOUND if the item isn't in the
 *    list otherwise it returns (like __db_txnlist_find_internal)
 *    the status of the transaction.  A txnid of 0 means the record
 *    was generated while not in a transaction.
 *
 * PUBLIC: int __db_txnlist_find __P((DB_ENV *,
 * PUBLIC:     DB_TXNHEAD *, u_int32_t, u_int32_t *));
 */
int
__db_txnlist_find(dbenv, hp, txnid, statusp)
      DB_ENV *dbenv;
      DB_TXNHEAD *hp;
      u_int32_t txnid, *statusp;
{
      DB_TXNLIST *entry;

      if (txnid == 0)
            return (DB_NOTFOUND);

      return (__db_txnlist_find_internal(dbenv, hp,
          TXNLIST_TXNID, txnid, NULL, &entry, 0, statusp));
}

/*
 * __db_txnlist_update --
 *    Change the status of an existing transaction entry.
 *    Returns DB_NOTFOUND if no such entry exists.
 *
 * PUBLIC: int __db_txnlist_update __P((DB_ENV *, DB_TXNHEAD *,
 * PUBLIC:     u_int32_t, u_int32_t, DB_LSN *, u_int32_t *, int));
 */
int
__db_txnlist_update(dbenv, hp, txnid, status, lsn, ret_status, add_ok)
      DB_ENV *dbenv;
      DB_TXNHEAD *hp;
      u_int32_t txnid, status;
      DB_LSN *lsn;
      u_int32_t *ret_status;
      int add_ok;
{
      DB_TXNLIST *elp;
      int ret;

      if (txnid == 0)
            return (DB_NOTFOUND);

      ret = __db_txnlist_find_internal(dbenv,
          hp, TXNLIST_TXNID, txnid, NULL, &elp, 0, ret_status);

      if (ret == DB_NOTFOUND && add_ok) {
            *ret_status = status;
            return (__db_txnlist_add(dbenv, hp, txnid, status, lsn));
      }
      if (ret != 0)
            return (ret);

      if (*ret_status == TXN_IGNORE)
            return (0);

      elp->u.t.status = status;

      if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
            hp->maxlsn = *lsn;

      return (ret);
}

/*
 * __db_txnlist_find_internal --
 *    Find an entry on the transaction list.  If the entry is not there or
 *    the list pointer is not initialized we return DB_NOTFOUND.  If the
 *    item is found, we return the status.  Currently we always call this
 *    with an initialized list pointer but checking for NULL keeps it general.
 */
static int
__db_txnlist_find_internal(dbenv,
    hp, type, txnid, uid, txnlistp, delete, statusp)
      DB_ENV *dbenv;
      DB_TXNHEAD *hp;
      db_txnlist_type type;
      u_int32_t  txnid;
      u_int8_t uid[DB_FILE_ID_LEN];
      DB_TXNLIST **txnlistp;
      int delete;
      u_int32_t *statusp;
{
      struct __db_headlink *head;
      DB_TXNLIST *p;
      u_int32_t generation, hash;
      int ret;

      ret = 0;

      if (hp == NULL)
            return (DB_NOTFOUND);

      switch (type) {
      case TXNLIST_TXNID:
            hash = txnid;
            FIND_GENERATION(hp, txnid, generation);
            break;
      case TXNLIST_PGNO:
            memcpy(&hash, uid, sizeof(hash));
            generation = 0;
            break;
      case TXNLIST_DELETE:
      case TXNLIST_LSN:
      default:
            return (__db_panic(dbenv, EINVAL));
      }

      head = &hp->head[DB_TXNLIST_MASK(hp, hash)];
      LIST_FOREACH(p, head, links) {
            if (p->type != type)
                  continue;
            switch (type) {
            case TXNLIST_TXNID:
                  if (p->u.t.txnid != txnid ||
                      generation != p->u.t.generation)
                        continue;
                  *statusp = p->u.t.status;
                  break;

            case TXNLIST_PGNO:
                  if (memcmp(uid, p->u.p.uid, DB_FILE_ID_LEN) != 0)
                        continue;
                  *statusp = TXN_OK;
                  break;
            case TXNLIST_DELETE:
            case TXNLIST_LSN:
            default:
                  return (__db_panic(dbenv, EINVAL));
            }
            if (delete == 1) {
                  LIST_REMOVE(p, links);
                  __os_free(dbenv, p);
                  *txnlistp = NULL;
            } else if (p != LIST_FIRST(head)) {
                  /* Move it to head of list. */
                  LIST_REMOVE(p, links);
                  LIST_INSERT_HEAD(head, p, links);
                  *txnlistp = p;
            } else
                  *txnlistp = p;
            return (ret);
      }

      return (DB_NOTFOUND);
}

/*
 * __db_txnlist_gen --
 *    Change the current generation number.
 *
 * PUBLIC: int __db_txnlist_gen __P((DB_ENV *,
 * PUBLIC:       DB_TXNHEAD *, int, u_int32_t, u_int32_t));
 */
int
__db_txnlist_gen(dbenv, hp, incr, min, max)
      DB_ENV *dbenv;
      DB_TXNHEAD *hp;
      int incr;
      u_int32_t min, max;
{
      int ret;

      /*
       * During recovery generation numbers keep track of "restart"
       * checkpoints and recycle records.  Restart checkpoints occur
       * whenever we take a checkpoint and there are no outstanding
       * transactions.  When that happens, we can reset transaction IDs
       * back to TXNID_MINIMUM.  Currently we only do the reset
       * at then end of recovery.  Recycle records occur when txnids
       * are exhausted during runtime.  A free range of ids is identified
       * and logged.  This code maintains a stack of ranges.  A txnid
       * is given the generation number of the first range it falls into
       * in the stack.
       */
      if (incr < 0) {
            --hp->generation;
            memmove(hp->gen_array, &hp->gen_array[1],
                (hp->generation + 1) * sizeof(hp->gen_array[0]));
      } else {
            ++hp->generation;
            if (hp->generation >= hp->gen_alloc) {
                  hp->gen_alloc *= 2;
                  if ((ret = __os_realloc(dbenv, hp->gen_alloc *
                      sizeof(hp->gen_array[0]), &hp->gen_array)) != 0)
                        return (ret);
            }
            memmove(&hp->gen_array[1], &hp->gen_array[0],
                hp->generation * sizeof(hp->gen_array[0]));
            hp->gen_array[0].generation = hp->generation;
            hp->gen_array[0].txn_min = min;
            hp->gen_array[0].txn_max = max;
      }
      return (0);
}

/*
 * __db_txnlist_lsnadd --
 *    Save the prev_lsn from a txn_child record.
 *
 * PUBLIC: int __db_txnlist_lsnadd __P((DB_ENV *, DB_TXNHEAD *, DB_LSN *));
 */
int
__db_txnlist_lsnadd(dbenv, hp, lsnp)
      DB_ENV *dbenv;
      DB_TXNHEAD *hp;
      DB_LSN *lsnp;
{
      DB_TXNLIST *elp;
      int ret;

      if (IS_ZERO_LSN(*lsnp))
            return (0);

      LIST_FOREACH(elp, &hp->head[0], links)
            if (elp->type == TXNLIST_LSN)
                  break;

      if (elp == NULL) {
            if ((ret = __db_txnlist_lsninit(dbenv, hp, lsnp)) != 0)
                  return (ret);
            return (DB_SURPRISE_KID);
      }

      if (elp->u.l.stack_indx == elp->u.l.stack_size) {
            elp->u.l.stack_size <<= 1;
            if ((ret = __os_realloc(dbenv, sizeof(DB_LSN) *
                 elp->u.l.stack_size, &elp->u.l.lsn_stack)) != 0) {
                  __db_txnlist_end(dbenv, hp);
                  return (ret);
            }
      }
      elp->u.l.lsn_stack[elp->u.l.stack_indx++] = *lsnp;

      return (0);
}

/*
 * __db_txnlist_lsnget --
 *
 * PUBLIC: int __db_txnlist_lsnget __P((DB_ENV *,
 * PUBLIC:     DB_TXNHEAD *, DB_LSN *, u_int32_t));
 *    Get the lsn saved from a txn_child record.
 */
int
__db_txnlist_lsnget(dbenv, hp, lsnp, flags)
      DB_ENV *dbenv;
      DB_TXNHEAD *hp;
      DB_LSN *lsnp;
      u_int32_t flags;
{
      DB_TXNLIST *elp;

      COMPQUIET(dbenv, NULL);
      COMPQUIET(flags, 0);

      LIST_FOREACH(elp, &hp->head[0], links)
            if (elp->type == TXNLIST_LSN)
                  break;

      if (elp == NULL || elp->u.l.stack_indx == 0) {
            ZERO_LSN(*lsnp);
            return (0);
      }

      *lsnp = elp->u.l.lsn_stack[--elp->u.l.stack_indx];

      return (0);
}

/*
 * __db_txnlist_lsninit --
 *    Initialize a transaction list with an lsn array entry.
 *
 * PUBLIC: int __db_txnlist_lsninit __P((DB_ENV *, DB_TXNHEAD *, DB_LSN *));
 */
int
__db_txnlist_lsninit(dbenv, hp, lsnp)
      DB_ENV *dbenv;
      DB_TXNHEAD *hp;
      DB_LSN *lsnp;
{
      DB_TXNLIST *elp;
      int ret;

      elp = NULL;

      if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
            goto err;
      LIST_INSERT_HEAD(&hp->head[0], elp, links);

      elp->type = TXNLIST_LSN;
      if ((ret = __os_malloc(dbenv,
          sizeof(DB_LSN) * DB_LSN_STACK_SIZE, &elp->u.l.lsn_stack)) != 0)
            goto err;
      elp->u.l.stack_indx = 1;
      elp->u.l.stack_size = DB_LSN_STACK_SIZE;
      elp->u.l.lsn_stack[0] = *lsnp;

      return (0);

err:  __db_txnlist_end(dbenv, hp);
      return (ret);
}

/*
 * __db_add_limbo -- add pages to the limbo list.
 *    Get the file information and call pgnoadd for each page.
 *
 * PUBLIC: int __db_add_limbo __P((DB_ENV *,
 * PUBLIC:      DB_TXNHEAD *, int32_t, db_pgno_t, u_int32_t));
 */
int
__db_add_limbo(dbenv, hp, fileid, pgno, count)
      DB_ENV *dbenv;
      DB_TXNHEAD *hp;
      int32_t fileid;
      db_pgno_t pgno;
      u_int32_t count;
{
      DB_LOG *dblp;
      FNAME *fnp;
      int ret;

      dblp = dbenv->lg_handle;
      if ((ret = __dbreg_id_to_fname(dblp, fileid, 0, &fnp)) != 0)
            return (ret);

      do {
            if ((ret =
                __db_txnlist_pgnoadd(dbenv, hp, fileid, fnp->ufid,
                R_ADDR(&dblp->reginfo, fnp->name_off), pgno)) != 0)
                  return (ret);
            pgno++;
      } while (--count != 0);

      return (0);
}

/*
 * __db_do_the_limbo -- move pages from limbo to free.
 *
 * Limbo processing is what ensures that we correctly handle and
 * recover from page allocations.  During recovery, for each database,
 * we process each in-question allocation, link them into the free list
 * and then write out the new meta-data page that contains the pointer
 * to the new beginning of the free list.  On an abort, we use our
 * standard __db_free mechanism in a compensating transaction which logs
 * the specific modifications to the free list.
 *
 * If we run out of log space during an abort, then we can't write the
 * compensating transaction, so we abandon the idea of a compensating
 * transaction, and go back to processing how we do during recovery.
 * The reason that this is not the norm is that it's expensive: it requires
 * that we flush any database with an in-question allocation.  Thus if
 * a compensating transaction fails, we never try to restart it.
 *
 * Since files may be open and closed within transactions (in particular,
 * the master database for subdatabases), we must be prepared to open
 * files during this process.  If there is a compensating transaction, we
 * can open the files in that transaction.  If this was an abort and there
 * is no compensating transaction, then we've got to perform these opens
 * in the context of the aborting transaction so that we do not deadlock.
 * During recovery, there's no locking, so this isn't an issue.
 *
 * What you want to keep in mind when reading this is that there are two
 * algorithms going on here:  ctxn == NULL, then we're either in recovery
 * or our compensating transaction has failed and we're doing the
 * "create list and write meta-data page" algorithm.  Otherwise, we're in
 * an abort and doing the "use compensating transaction" algorithm.
 *
 * PUBLIC: int __db_do_the_limbo __P((DB_ENV *,
 * PUBLIC:     DB_TXN *, DB_TXN *, DB_TXNHEAD *, db_limbo_state));
 */
int
__db_do_the_limbo(dbenv, ptxn, txn, hp, state)
      DB_ENV *dbenv;
      DB_TXN *ptxn, *txn;
      DB_TXNHEAD *hp;
      db_limbo_state state;
{
      DB_TXNLIST *elp;
      u_int32_t h;
      int ret;

      ret = 0;
      /*
       * The slots correspond to hash buckets.  We've hashed the
       * fileids into hash buckets and need to pick up all affected
       * files. (There will only be a single slot for an abort.)
       */
      for (h = 0; h < hp->nslots; h++) {
            if ((elp = LIST_FIRST(&hp->head[h])) == NULL)
                  continue;
            if (ptxn != NULL) {
                  if ((ret =
                      __db_limbo_move(dbenv, ptxn, txn, elp)) != 0)
                        goto err;
            } else if ((ret =
                __db_limbo_bucket(dbenv, txn, elp, state)) != 0)
                  goto err;
      }

err:  if (ret != 0) {
            __db_errx(dbenv, "Fatal error in abort of an allocation");
            ret = __db_panic(dbenv, ret);
      }

      return (ret);
}

/* Limbo support routines. */

/*
 * __db_lock_move --
 *    Move a lock from child to parent.
 */
static int
__db_lock_move(dbenv, fileid, pgno, mode, ptxn, txn)
      DB_ENV *dbenv;
      u_int8_t *fileid;
      db_pgno_t pgno;
      db_lockmode_t mode;
      DB_TXN *ptxn, *txn;
{
      DBT lock_dbt;
      DB_LOCK lock;
      DB_LOCK_ILOCK lock_obj;
      DB_LOCKREQ req;
      int ret;

      lock_obj.pgno = pgno;
      memcpy(lock_obj.fileid, fileid, DB_FILE_ID_LEN);
      lock_obj.type = DB_PAGE_LOCK;

      memset(&lock_dbt, 0, sizeof(lock_dbt));
      lock_dbt.data = &lock_obj;
      lock_dbt.size = sizeof(lock_obj);

      if ((ret = __lock_get(dbenv,
          txn->locker, 0, &lock_dbt, mode, &lock)) == 0) {
            memset(&req, 0, sizeof(req));
            req.lock = lock;
            req.op = DB_LOCK_TRADE;
            ret = __lock_vec(dbenv, ptxn->locker, 0, &req, 1, NULL);
      }
      return (ret);
}

/*
 * __db_limbo_move
 *    Move just the metapage lock to the parent.
 */
static int
__db_limbo_move(dbenv, ptxn, txn, elp)
      DB_ENV *dbenv;
      DB_TXN *ptxn, *txn;
      DB_TXNLIST *elp;
{
      int ret;

      for (; elp != NULL; elp = LIST_NEXT(elp, links)) {
            if (elp->type != TXNLIST_PGNO || elp->u.p.locked == 1)
                  continue;
            if ((ret = __db_lock_move(dbenv, elp->u.p.uid,
                PGNO_BASE_MD, DB_LOCK_WRITE, ptxn, txn)) != 0)
                  return (ret);
            elp->u.p.locked = 1;
      }

      return (0);
}
/*
 * __db_limbo_bucket
 *    Perform limbo processing for a single hash bucket in the txnlist.
 * txn is the transaction aborting in the case of an abort and ctxn is the
 * compensating transaction.
 */

#define     T_RESTORED(txn)       ((txn) != NULL && F_ISSET(txn, TXN_RESTORED))
static int
__db_limbo_bucket(dbenv, txn, elp, state)
      DB_ENV *dbenv;
      DB_TXN *txn;
      DB_TXNLIST *elp;
      db_limbo_state state;
{
      DB *dbp;
      DB_MPOOLFILE *mpf;
      DBMETA *meta;
      DB_TXN *ctxn, *t;
      FNAME *fname;
      db_pgno_t last_pgno, pgno;
      int dbp_created, in_retry, ret, t_ret;

      ctxn = NULL;
      in_retry = 0;
      meta = NULL;
      mpf = NULL;
      ret = 0;
      for (; elp != NULL; elp = LIST_NEXT(elp, links)) {
            if (elp->type != TXNLIST_PGNO)
                  continue;
retry:            dbp_created = 0;

            /*
             * Pick the transaction in which to potentially
             * log compensations.
             */
            if (state == LIMBO_PREPARE)
                  ctxn = txn;
            else if (!in_retry && state != LIMBO_RECOVER &&
                state != LIMBO_TIMESTAMP && !T_RESTORED(txn) &&
                (ret = __txn_compensate_begin(dbenv, &ctxn)) != 0)
                  return (ret);

            /*
             * Either use the compensating transaction or
             * the one passed in, which will be null if recovering.
             */
            t = ctxn == NULL ? txn : ctxn;

            /* First try to get a dbp by fileid. */
            ret = __dbreg_id_to_db(dbenv, t, &dbp, elp->u.p.fileid, 0);

            /*
             * If the file was closed and reopened its id could change.
             * Look it up the hard way.
             */
            if (ret == DB_DELETED || ret == ENOENT ||
                ((ret == 0 &&
                memcmp(elp->u.p.uid, dbp->fileid, DB_FILE_ID_LEN) != 0))) {
                  if ((ret = __dbreg_fid_to_fname(
                      dbenv->lg_handle, elp->u.p.uid, 0, &fname)) == 0)
                        ret = __dbreg_id_to_db(
                             dbenv, t, &dbp, fname->id, 0);
            }
            /*
             * File is being destroyed.  No need to worry about
             * dealing with recovery of allocations.
             */
            if (ret == DB_DELETED ||
                (ret == 0 && F_ISSET(dbp, DB_AM_DISCARD)))
                  goto next;

            if (ret != 0) {
                  if ((ret = __db_create_internal(&dbp, dbenv, 0)) != 0)
                        goto err;

                  /*
                   * This tells the system not to lock, which is always
                   * OK, whether this is an abort or recovery.
                   */
                  F_SET(dbp, DB_AM_COMPENSATE);
                  dbp_created = 1;

                  /* It is ok if the file is nolonger there. */
                  ret = __db_open(dbp, t, elp->u.p.fname, NULL,
                      DB_UNKNOWN, DB_ODDFILESIZE, __db_omode(OWNER_RW),
                      PGNO_BASE_MD);
                  if (ret == ENOENT)
                        goto next;
            }

            /*
             * Verify that we are opening the same file that we were
             * referring to when we wrote this log record.
             */
            if (memcmp(elp->u.p.uid, dbp->fileid, DB_FILE_ID_LEN) != 0)
                  goto next;

            mpf = dbp->mpf;
            last_pgno = PGNO_INVALID;

            if (meta == NULL &&
                (ctxn == NULL || state == LIMBO_COMPENSATE)) {
                  pgno = PGNO_BASE_MD;
                  if ((ret = __memp_fget(mpf, &pgno, txn,
                      (state != LIMBO_PREPARE) ? DB_MPOOL_DIRTY : 0,
                      &meta)) != 0)
                        goto err;
                  last_pgno = meta->free;
            }

            if (state == LIMBO_PREPARE) {
                  if ((ret = __db_limbo_prepare(dbp, ctxn, elp)) != 0)
                        goto err;
            } else
                  ret = __db_limbo_fix(dbp,
                       ctxn, elp, &last_pgno, meta, state);
            /*
             * If we were doing compensating transactions, then we are
             * going to hope this error was due to running out of space.
             * We'll change modes (into the sync the file mode) and keep
             * trying.  If we weren't doing compensating transactions,
             * then this is a real error and we're sunk.
             */
            if (ret != 0) {
                  if (ret == DB_RUNRECOVERY || ctxn == NULL)
                        goto err;
                  in_retry = 1;
                  if ((ret = __txn_abort(ctxn)) != 0)
                        goto err;
                  ctxn = NULL;
                  goto retry;
            }

            if (state == LIMBO_PREPARE)
                  ctxn = NULL;

            else if (ctxn != NULL) {
                  /*
                   * We only force compensation at the end of recovery.
                   * We want the txn_commit to be logged so turn
                   * off the recovery flag briefly.
                   */
                  if (state == LIMBO_COMPENSATE)
                        F_CLR(dbenv->lg_handle, DBLOG_RECOVER);
                  ret = __txn_commit(ctxn, DB_TXN_NOSYNC);
                  ctxn = NULL;
                  if (state == LIMBO_COMPENSATE)
                        F_SET(dbenv->lg_handle, DBLOG_RECOVER);
                  if (ret != 0)
                        goto retry;
            }

            /*
             * This is where we handle the case where we're explicitly
             * putting together a free list.  We need to decide whether
             * we have to write the meta-data page, and if we do, then
             * we need to sync it as well.
             */
            else if (last_pgno == meta->free) {
                  /* No change to page; just put the page back. */
                  if ((ret =
                      __memp_fput(mpf, meta, DB_PRIORITY_UNCHANGED)) != 0)
                        goto err;
                  meta = NULL;
            } else {
                  /*
                   * These changes are unlogged so we cannot have the
                   * metapage pointing at pages that are not on disk.
                   * Therefore, we flush the new free list, then update
                   * the metapage.  We have to put the meta-data page
                   * first so that it isn't pinned when we try to sync.
                   */
                  if (!IS_RECOVERING(dbenv) && !T_RESTORED(txn))
                        __db_errx(dbenv, "Flushing free list to disk");
                  if ((ret =
                      __memp_fput(mpf, meta, DB_PRIORITY_UNCHANGED)) != 0)
                        goto err;
                  meta = NULL;
                  /*
                   * If the sync fails then we cannot flush the
                   * newly allocated pages.  That is, the file
                   * cannot be extended. Don't let the metapage
                   * point at them.
                   * We may lose these pages from the file if it
                   * can be extended later.  If there is never
                   * space for the pages, then things will be ok.
                   */
                  if ((ret = __db_sync(dbp)) == 0) {
                        pgno = PGNO_BASE_MD;
                        if ((ret = __memp_fget(mpf, &pgno, txn,
                            DB_MPOOL_DIRTY, &meta)) != 0)
                              goto err;
                        meta->free = last_pgno;
                        if ((ret = __memp_fput(mpf,
                            meta, DB_PRIORITY_UNCHANGED)) != 0)
                              goto err;
                        meta = NULL;
                  } else {
                        __db_err(dbenv, ret, "%s", dbp->fname);
                        __db_errx(dbenv, "%s: %s %s", dbp->fname,
                            "allocation flush failed, some free pages",
                            "may not appear in the free list");
                        ret = 0;
                  }
            }

next:
            /*
             * If we get here, either we have processed the list
             * or the db file has been deleted or could not be opened.
             */
            if (ctxn != NULL &&
                (t_ret = __txn_abort(ctxn)) != 0 && ret == 0)
                  ret = t_ret;

            if (dbp_created &&
                (t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
                  ret = t_ret;
            dbp = NULL;
            if (state != LIMBO_PREPARE && state != LIMBO_TIMESTAMP) {
                  __os_free(dbenv, elp->u.p.fname);
                  __os_free(dbenv, elp->u.p.pgno_array);
            }
            if (ret == ENOENT)
                  ret = 0;
            else if (ret != 0)
                  goto err;
      }

err:  if (meta != NULL)
            (void)__memp_fput(mpf, meta, DB_PRIORITY_UNCHANGED);
      return (ret);
}

/*
 * __db_limbo_fix --
 *    Process a single limbo entry which describes all the page allocations
 * for a single file.
 */
static int
__db_limbo_fix(dbp, ctxn, elp, lastp, meta, state)
      DB *dbp;
      DB_TXN *ctxn;
      DB_TXNLIST *elp;
      db_pgno_t *lastp;
      DBMETA *meta;
      db_limbo_state state;
{
      DBC *dbc;
      DBT ldbt;
      DB_MPOOLFILE *mpf;
      PAGE *freep, *pagep;
      db_pgno_t next, pgno;
      u_int32_t i;
      int ret, t_ret;

      /*
       * Loop through the entries for this txnlist element and
       * either link them into the free list or write a compensating
       * record for each.
       */
      dbc = NULL;
      mpf = dbp->mpf;
      pagep = NULL;
      ret = 0;

      for (i = 0; i < elp->u.p.nentries; i++) {
            pgno = elp->u.p.pgno_array[i];

            if (pgno == PGNO_INVALID)
                  continue;

            if ((ret = __memp_fget(mpf, &pgno, ctxn,
                DB_MPOOL_CREATE | DB_MPOOL_EDIT, &pagep)) != 0) {
                  if (ret != ENOSPC)
                        goto err;
                  continue;
            }

            if (state == LIMBO_COMPENSATE || IS_ZERO_LSN(LSN(pagep))) {
                  if (ctxn == NULL) {
                        /*
                         * If this is a fatal recovery which
                         * spans a previous crash this page may
                         * be on the free list already.
                         */
                        for (next = *lastp; next != 0; ) {
                              if (next == pgno)
                                    break;
                              if ((ret = __memp_fget(mpf,
                                  &next, ctxn, 0, &freep)) != 0)
                                    goto err;
                              next = NEXT_PGNO(freep);
                              if ((ret = __memp_fput(mpf,
                                  freep, DB_PRIORITY_UNCHANGED)) != 0)
                                    goto err;
                        }

                        if (next != pgno) {
                              P_INIT(pagep, dbp->pgsize, pgno,
                                  PGNO_INVALID, *lastp, 0, P_INVALID);
                              /* Make the lsn non-zero but generic. */
                              INIT_LSN(LSN(pagep));
                              *lastp = pgno;
                        }
                  } else if (state == LIMBO_COMPENSATE) {
                        /*
                         * Generate a log record for what we did on the
                         * LIMBO_TIMESTAMP pass.  All pages here are
                         * free so P_OVERHEAD is sufficient.
                         */
                        ZERO_LSN(pagep->lsn);
                        memset(&ldbt, 0, sizeof(ldbt));
                        ldbt.data = pagep;
                        ldbt.size = P_OVERHEAD(dbp);
                        if ((ret = __db_pg_new_log(dbp, ctxn,
                             &LSN(meta), 0, pagep->pgno,
                             &LSN(meta), PGNO_BASE_MD,
                             &ldbt, pagep->next_pgno)) != 0)
                              goto err;
                  } else {
                        if (dbc == NULL && (ret =
                            __db_cursor(dbp, ctxn, &dbc, 0)) != 0)
                                    goto err;
                        /*
                         * If the dbp is compensating (because we
                         * opened it), the dbc will automatically be
                         * marked compensating, but in case we didn't
                         * do the open, we have to mark it explicitly.
                         */
                        F_SET(dbc, DBC_DONTLOCK);

                        /*
                         * If aborting a txn for a different process
                         * via XA or failchk, DB_AM_RECOVER will be
                         * set but we need to log the compensating
                         * transactions.
                         */
                        F_CLR(dbc, DBC_RECOVER);

                        ret = __db_free(dbc, pagep);
                        pagep = NULL;

                        /*
                         * On any error, we hope that the error was
                         * caused due to running out of space, and we
                         * switch modes, doing the processing where we
                         * sync out files instead of doing compensating
                         * transactions.  If this was a real error and
                         * not out of space, we assume that some other
                         * call will fail real soon.
                         */
                        if (ret != 0) {
                              /* Assume that this is out of space. */
                              (void)__dbc_close(dbc);
                              dbc = NULL;
                              goto err;
                        }
                  }
            }
            else
                  elp->u.p.pgno_array[i] = PGNO_INVALID;

            if (pagep != NULL) {
                  ret = __memp_fput(mpf, pagep, DB_PRIORITY_UNCHANGED);
                  pagep = NULL;
            }
            if (ret != 0)
                  goto err;
      }

err:  if (pagep != NULL &&
          (t_ret = __memp_fput(mpf, pagep, DB_PRIORITY_UNCHANGED)) != 0 &&
          ret == 0)
            ret = t_ret;
      if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
            ret = t_ret;
      return (ret);
}

static int
__db_limbo_prepare(dbp, txn, elp)
      DB *dbp;
      DB_TXN *txn;
      DB_TXNLIST *elp;
{
      DB_LSN lsn;
      DB_MPOOLFILE *mpf;
      PAGE *pagep;
      db_pgno_t pgno;
      u_int32_t i;
      int ret, t_ret;

      /*
       * Loop through the entries for this txnlist element and
       * output a prepare record for them.
       */
      pagep = NULL;
      ret = 0;
      mpf = dbp->mpf;

      for (i = 0; i < elp->u.p.nentries; i++) {
            pgno = elp->u.p.pgno_array[i];

            if ((ret = __memp_fget(mpf,
                &pgno, txn, DB_MPOOL_CREATE, &pagep)) != 0) {
                  if (ret != ENOSPC)
                        return (ret);
                  continue;
            }

            if (IS_ZERO_LSN(LSN(pagep)))
                  ret = __db_pg_prepare_log(dbp, txn, &lsn, 0, pgno);

            if ((t_ret = __memp_fput(mpf,
                pagep, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0)
                  ret = t_ret;

            if (ret != 0)
                  return (ret);
      }

      return (0);
}

#define     DB_TXNLIST_MAX_PGNO     8           /* A nice even number. */

/*
 * __db_txnlist_pgnoadd --
 *    Find the txnlist entry for a file and add this pgno, or add the list
 *    entry for the file and then add the pgno.
 */
static int
__db_txnlist_pgnoadd(dbenv, hp, fileid, uid, fname, pgno)
      DB_ENV *dbenv;
      DB_TXNHEAD *hp;
      int32_t fileid;
      u_int8_t uid[DB_FILE_ID_LEN];
      char *fname;
      db_pgno_t pgno;
{
      DB_TXNLIST *elp;
      size_t len;
      u_int32_t hash, status;
      int ret;

      elp = NULL;

      if ((ret = __db_txnlist_find_internal(dbenv, hp,
          TXNLIST_PGNO, 0, uid, &elp, 0, &status)) != 0 && ret != DB_NOTFOUND)
            goto err;

      if (ret == DB_NOTFOUND || status != TXN_OK) {
            if ((ret =
                __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
                  goto err;
            memcpy(&hash, uid, sizeof(hash));
            LIST_INSERT_HEAD(
                &hp->head[DB_TXNLIST_MASK(hp, hash)], elp, links);
            memcpy(elp->u.p.uid, uid, DB_FILE_ID_LEN);

            len = strlen(fname) + 1;
            if ((ret = __os_malloc(dbenv, len, &elp->u.p.fname)) != 0)
                  goto err;
            memcpy(elp->u.p.fname, fname, len);

            elp->u.p.maxentry = 0;
            elp->u.p.locked = 0;
            elp->type = TXNLIST_PGNO;
            if ((ret = __os_malloc(dbenv,
                8 * sizeof(db_pgno_t), &elp->u.p.pgno_array)) != 0)
                  goto err;
            elp->u.p.maxentry = DB_TXNLIST_MAX_PGNO;
            elp->u.p.nentries = 0;
      } else if (elp->u.p.nentries == elp->u.p.maxentry) {
            elp->u.p.maxentry <<= 1;
            if ((ret = __os_realloc(dbenv, elp->u.p.maxentry *
                sizeof(db_pgno_t), &elp->u.p.pgno_array)) != 0)
                  goto err;
      }

      elp->u.p.pgno_array[elp->u.p.nentries++] = pgno;
      /* Update to the latest fileid.  Limbo will find it faster. */
      elp->u.p.fileid = fileid;

      return (0);

err:  return (ret);
}

#ifdef DEBUG
/*
 * __db_txnlist_print --
 *    Print out the transaction list.
 *
 * PUBLIC: void __db_txnlist_print __P((DB_TXNHEAD *));
 */
void
__db_txnlist_print(hp)
      DB_TXNHEAD *hp;
{
      DB_TXNLIST *p;
      u_int32_t i;
      char *txntype;

      printf("Maxid: %lu Generation: %lu\n",
          (u_long)hp->maxid, (u_long)hp->generation);
      for (i = 0; i < hp->nslots; i++)
            LIST_FOREACH(p, &hp->head[i], links) {
                  if (p->type != TXNLIST_TXNID) {
                        printf("Unrecognized type: %d\n", p->type);
                        continue;
                  }
                  switch (p->u.t.status) {
                  case TXN_OK:
                        txntype = "OK";
                        break;
                  case TXN_COMMIT:
                        txntype = "commit";
                        break;
                  case TXN_PREPARE:
                        txntype = "prepare";
                        break;
                  case TXN_ABORT:
                        txntype = "abort";
                        break;
                  case TXN_IGNORE:
                        txntype = "ignore";
                        break;
                  case TXN_EXPECTED:
                        txntype = "expected";
                        break;
                  case TXN_UNEXPECTED:
                        txntype = "unexpected";
                        break;
                  default:
                        txntype = "UNKNOWN";
                        break;
                  }
                  printf("TXNID: %lx(%lu): %s\n",
                      (u_long)p->u.t.txnid,
                      (u_long)p->u.t.generation, txntype);
            }
}
#endif

Generated by  Doxygen 1.6.0   Back to index