Re: [patch 3/4] [jbd] Add support for journal guided resync.

From: Andrew Morton
Date: Thu Oct 01 2009 - 19:40:35 EST


On Thu, 01 Oct 2009 18:39:32 -0400
scjody@xxxxxxx wrote:

> Adds support for declare blocks, used by ext3's journal guided resync (declared
> mode.) A declare block is added to the journal to list blocks to be written
> during the current transaction. During journal replay, we perform a RAID
> resync of only these blocks and skip the rest of the resync.
>
> ...
>
> +int wait_for_descriptors(journal_t *journal, transaction_t *trans) {

Opening brace goes on the next line, please. Several instances.

> + struct journal_head *jh;
> + struct buffer_head *bh;
> + int err = 0;
> +
> +wait_for_ctlbuf:
> +
> + while (trans->t_log_list != NULL) {
> +
> + jh = trans->t_log_list->b_tprev;
> + bh = jh2bh(jh);
> + if (buffer_locked(bh)) {
> + wait_on_buffer(bh);
> + goto wait_for_ctlbuf;
> + }
> + if (cond_resched())
> + goto wait_for_ctlbuf;
> +
> + if (unlikely(!buffer_uptodate(bh)))
> + err = -EIO;
> +
> + BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
> + clear_buffer_jwrite(bh);
> + journal_unfile_buffer(journal, jh);
> + journal_put_journal_head(jh);
> + __brelse(bh); /* One for getblk */
> + }
> +
> + return err;
> +}
> +
> +struct journal_head *get_descriptor(journal_t *journal, transaction_t *trans,
> + int blocktype, char **tagp, int *space_left) {

Did all these functions need global scope? Some do, but I didn't check
them all.

Those which cannot be made static should be given appropriate names,
such as jbd_get_descriptor().

> + struct journal_head *descriptor;
> + struct buffer_head *dbh;
> + journal_header_t *header;
> +
> + jbd_debug(4, "JBD: get descriptor\n");
> +
> + descriptor = journal_get_descriptor_buffer(journal);
> + if (!descriptor)
> + return NULL;
> +
> + dbh = jh2bh(descriptor);
> + jbd_debug(4, "JBD: got buffer %llu (%p)\n",
> + (unsigned long long)dbh->b_blocknr, dbh->b_data);
> + header = (journal_header_t *)&dbh->b_data[0];
> + header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
> + header->h_blocktype = cpu_to_be32(blocktype);
> + header->h_sequence = cpu_to_be32(trans->t_tid);
> +
> + *tagp = &dbh->b_data[sizeof(journal_header_t)];
> + *space_left = dbh->b_size - sizeof(journal_header_t);
> +
> + set_buffer_jwrite(dbh);
> + set_buffer_dirty(dbh);
> +
> + /* Record it so that we can wait for it later */
> + BUFFER_TRACE(dbh, "ph3: file as descriptor");
> + journal_file_buffer(descriptor, trans, BJ_LogCtl);
> +
> + return descriptor;
> +}
> +
>
> ...
>
> +void write_declare_blocks(journal_t *journal, transaction_t *transaction,
> + int committing)
> +{
> + struct journal_head *jh, *descriptor = NULL;
> + struct buffer_head *bh;
> + int i, bufs = 0, err;
> + unsigned int n, count = 0, to_write;
> + unsigned long nextblock = 0;
> + char *tagp = NULL;
> + journal_block_tag_t *tag = NULL;
> + int space_left = 0, first_tag = 0, tag_flag;
> + struct radix_tree_root *root;
> +
> + root = &transaction->t_declare_root;
> +
> + spin_lock(&journal->j_list_lock);
> + to_write = transaction->t_declare_request;
> + transaction->t_declare_request = 0;
> + spin_unlock(&journal->j_list_lock);
> +
> + if (to_write == UINT_MAX)
> + jbd_debug (1, "jbd: tid %d write declare request for ALL "
> + "blocks\n", transaction->t_tid);
> + else
> + jbd_debug (1, "jbd: tid %d write declare request for %u "
> + "blocks\n", transaction->t_tid, to_write);
> +write_declare:
> + cond_resched();
> + spin_lock(&journal->j_list_lock);
> +
> + n = radix_tree_gang_lookup(root, journal->j_declare_jhs, nextblock, 1);
> + while (n) {
> + if (!descriptor) {
> + J_ASSERT(bufs == 0);
> +
> + spin_unlock(&journal->j_list_lock);
> +
> + descriptor = get_descriptor(journal, transaction,
> + JFS_DECLARE_BLOCK,
> + &tagp, &space_left);
> +
> + if (!descriptor) {
> + journal_abort(journal, -EIO);
> + return;
> + }
> +
> + first_tag = 1;
> + journal->j_declare_bhs[bufs++] = jh2bh(descriptor);
> +
> + goto write_declare;
> + }
> +
> + jh = (struct journal_head *)journal->j_declare_jhs[0];
> + bh = jh2bh(jh);
> +
> + /* refile the buffer as having been declared */
> + if (!inverted_lock(journal, bh))
> + goto write_declare;
> + __journal_unfile_buffer(jh);
> + __journal_file_buffer(jh, transaction, BJ_DeclareDone);
> +
> + jbd_unlock_bh_state(bh);
> +
> + /* record the block's tag in the current descriptor buffer */
> + tag_flag = 0;
> + if (!first_tag)
> + tag_flag |= JFS_FLAG_SAME_UUID;
> +
> + tag = (journal_block_tag_t *)tagp;
> + tag->t_blocknr = cpu_to_be32(bh->b_blocknr);
> + tag->t_flags = cpu_to_be32(tag_flag);
> + tagp += sizeof(journal_block_tag_t);
> + space_left -= sizeof(journal_block_tag_t);
> +
> + if (first_tag) {
> + memcpy (tagp, journal->j_uuid, 16);

Please pass the patches through scripts/checkpatch.pl and review the
result.

> + tagp += 16;
> + space_left -= 16;
> + first_tag = 0;
> + }
> +
> + count++;
> +
> + /* advance to the next journal head and buffer */
> + nextblock = bh->b_blocknr + 1;
> + n = radix_tree_gang_lookup(root, journal->j_declare_jhs,
> + nextblock, 1);
> +
> + /* If there's no more to do, or if the descriptor is full,
> + let the IO rip! */
> +
> + if (bufs == ARRAY_SIZE(journal->j_declare_bhs) || n == 0 ||
> + count == to_write ||
> + space_left < sizeof(journal_block_tag_t) + 16) {
> +
> + jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
> +
> + /* Write an end-of-descriptor marker before
> + * submitting the IOs. "tag" still points to
> + * the last tag we set up.
> + */
> +
> + tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
> +
>
> ...
>
> +int journal_write_declare(journal_t *journal)
> +{
> + transaction_t *transaction = journal->j_running_transaction;
> + DEFINE_WAIT(wait);
> +
> + if (transaction == NULL)
> + return 0;
> +
> + spin_lock(&journal->j_list_lock);
> +
> + if (transaction->t_declare_root.rnode == NULL) {
> + spin_unlock(&journal->j_list_lock);
> + return 0;
> + }
> +
> + transaction->t_declare_request = UINT_MAX;
> +
> + jbd_debug(1, "waking commit thread for fsync declare\n");
> + wake_up(&journal->j_wait_commit);
> +
> + prepare_to_wait(&journal->j_wait_declare, &wait, TASK_INTERRUPTIBLE);
> + spin_unlock(&journal->j_list_lock);
> + schedule();
> + finish_wait(&journal->j_wait_declare, &wait);

If this function is only ever called by a kernel thread then
TASK_INTERRUPTIBLE is OK.

If this function can be called by a userspace process then
TASK_INTERRUPTIBLE is buggy - a signal_pending() state will cause the
schedule() to fall straight through.

> + return 0;
> +}
> +
>
> ...
>
> +static int resync_range(journal_t *j, unsigned long start,
> + unsigned long end)
> +{
> + int err;
> + struct inode *fake_inode = kmalloc(sizeof(*fake_inode), GFP_KERNEL);
> + mdu_range_t range;
> + sector_t sectors_per_block = j->j_blocksize >> 9;
> + mm_segment_t old_fs;
> +
> + if (fake_inode == NULL) {
> + printk(KERN_ERR "JBD: Out of memory during recovery.\n");
> + return -ENOMEM;
> + }
> +
> + fake_inode->i_bdev = j->j_fs_dev;
> + range.start = start * sectors_per_block;
> + range.end = end * sectors_per_block + sectors_per_block - 1;
> +
> + old_fs = get_fs();
> + set_fs(KERNEL_DS);
> + err = blkdev_driver_ioctl(fake_inode, NULL, j->j_fs_dev->bd_disk,
> + RESYNC_RANGE, (long)&range);
> + set_fs(old_fs);

erk, what's happening here.

I can't find the blkdev_driver_ioctl() definition. If I could, I'd be
asking if it can be refactored so we can call a regular kernel function
and avoid the nasty set_fs(), and do away with the nasty fake_inode.


> + jbd_debug(3, "RESYNC_RANGE of sectors %llu - %llu returned %d\n",
> + range.start, range.end, err);
> +
> + kfree(fake_inode);
> +
> + return err;
> +}
>
>
> ...
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/