Re: soft-update/async write file systems

Jeremy Fitzhardinge (jeremy@zip.com.au)
Fri, 20 Feb 1998 11:32:18 +1100


This is a multi-part message in MIME format.
--------------8DE25416562E9704990B39F7
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

Theodore Y. Ts'o wrote:
> The real problem (and I ran into this when I started looking at trying
> to do metadata logging for ext2fs) is that the Linux block device
> interface is a real disaster right now. It very badly needs to be
> rewritten from scratch. Between what the high-level SCSI code does
> to re-order requests for performance reasons, and the fairly ad hoc
> block device layer, it's pretty much impossible for the filesystem to
> request or expect any kind of write ordering semantics.

That's what I'm finding. I implemented a simple layer over the buffer
system so I could do ordered writes. The code is basically a sketch - I
haven't tested it yet. I've attached it below to see what people have
to say.

The basic idea is that I refrain from marking buffers dirty until
they're due for writing. To do this I put blocks into phases, which may
only be written once all previous phases are complete. I use the I/O
completion callback to count how many blocks remain unwritten in a
phase, to determine when the sequencer can move on to the next one.

There are a couple of other subtleties, but it allows a filesystem to
submit blocks in groups while still specifying necessary write
orderings. The phase structure allows the drivers/devices to still see
many blocks and write them in any order they see fit, so long as they
don't call the completion callback before the buffer is really on disk.
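
To make the intended use concrete, here's a rough (and equally
untested) sketch of how a filesystem might drive it - the buffer_head
names are made up, but the calls are the ones from the attached code:

	struct sequencer *seq = seq_init();
	struct seq_phase *ph1 = make_seq_phase(NULL, NULL);
	struct seq_phase *ph2 = make_seq_phase(NULL, NULL);

	seq_add_phase(seq, ph1);
	seq_add_phase(seq, ph2);

	/* modified buffers - NOT marked dirty by the filesystem */
	seq_add_block(seq, ph1, data_bh1);
	seq_add_block(seq, ph1, data_bh2);
	seq_add_block(seq, ph2, index_bh);	/* on disk only after ph1 */

	seq_commit_phase(seq, ph1);
	seq_commit_phase(seq, ph2);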

It's pretty simple, and it has the advantage of not needing any changes
to the existing buffer-cache. Of course, it may not work...

J
--------------8DE25416562E9704990B39F7
Content-Type: text/plain; charset=us-ascii; name="seq_write.c"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="seq_write.c"

/*
 * Ordered Writes
 *
 * Writes are ordered by putting them into phases.  All the writes in
 * a phase are written to physical media before any writes from a
 * subsequent phase are written; blocks within a phase can be written
 * in any order.  New blocks can be continually added to a phase
 * until it is committed: at that point all remaining blocks are
 * written and the next phase is started.  Any number of phases may
 * be queued into the future, but their contents will only be written
 * when all previous blocks are.
 *
 * The client of this system does *not* mark buffers dirty - that only
 * encourages fs/buffer.c to write them in any old order.  Instead,
 * once modified buffers are ready to be written they are added to a
 * phase with seq_add_block().  When there are no more to be added,
 * call seq_commit_phase().
 *
 * Once a phase has been committed, the sequencer may free the phase
 * structure at any time, so the caller must not touch it again.
 *
 * This system will control the write ordering of any set of blocks;
 * they do not need to be on the same physical device.
 *
 * NOTE: this uses the b_end_io IO completion callback mechanism
 * implemented by buffer.c.  It will therefore not work with any
 * other system which also uses it.  For example, MD WILL NOT WORK
 * (XXX is this true?).  There are some explicit checks in here to
 * look out for problems, but they are necessarily incomplete.
 *
 * TODO:
 *  - test
 *  - use slabs?
 *  - partial ordering of phases?
 *  - work out better error handling
 *
 * Jeremy Fitzhardinge <jeremy@zip.com.au> Feb 1998
 */

#include <linux/init.h>
#include <linux/malloc.h>

#include <linux/fs.h>
#include <linux/locks.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/major.h>

#include <asm/atomic.h>
#include <asm/spinlock.h>

#define __KERNEL_SYSCALLS__
static int errno;
#include <linux/unistd.h>

#include "seq_write.h"
#include "condvar.h"
#include "assert.h"

struct bh_list
{
	struct buffer_head *bh;
	struct bh_list *next;
	struct seq_phase *phase;
	int dirtied;		/* this buffer dirtied for writing */
};

struct seq_phase
{
	atomic_t count;
	atomic_t errors;
	struct bh_list *list;
	struct seq_phase *next;
	struct sequencer *seq;
	int committed;
	int marked;
	spinlock_t lock;

	seq_handler_t handler;
	void *handler_arg;
};

struct sequencer
{
	struct seq_phase *cur_phase;
	struct seq_phase *end;
	struct semaphore sema;

	struct condvar cv;
	spinlock_t cvlock;

	int thread_quit;
	pid_t thread_pid;
};

/*
 * A block has been written.  Decrement the number of outstanding
 * unwritten blocks, and wake up the thread if it reaches 0.
 */
static void seq_end_io(struct buffer_head *bh, int uptodate)
{
	struct bh_list *bhl = (struct bh_list *)bh->b_dev_id;
	struct seq_phase *ph = bhl->phase;
	struct sequencer *seq = ph->seq;

	if (!uptodate)
		atomic_inc(&ph->errors);

	bhl->bh = NULL;
	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);

	if (atomic_dec_and_test(&ph->count))
	{
		unsigned long flags;

		spin_lock_irqsave(&seq->cvlock, flags);
		condvar_wakeall(&seq->cv);
		spin_unlock_irqrestore(&seq->cvlock, flags);
	}
}

/*
 * Thread to retire phases once they are committed and all blocks are
 * written.
 *
 * If a committed phase gets all its blocks written, free up all
 * the structures we set up, and go on to the next one.
 *
 * Once we switch to the next phase, we mark any existing buffers
 * as dirty to set them off.
 */
static int seq_phase_thread(void *data)
{
	struct sequencer *seq = (struct sequencer *)data;

	current->session = 1;
	current->pgrp = 1;
	sprintf(current->comm, "write_sequencer");

	for(;;)
	{
		struct seq_phase *ph;
		unsigned long flags;

		spin_lock_irqsave(&seq->cvlock, flags);

		/* Sleep until the current phase is committed and fully
		   written, or until we've been asked to quit and have
		   run out of phases. */
		while(!(seq->cur_phase != NULL &&
			seq->cur_phase->committed &&
			atomic_read(&seq->cur_phase->count) == 0) &&
		      !(seq->thread_quit && seq->cur_phase == NULL))
			condvar_wait_flags(&seq->cv, &seq->cvlock, &flags);

		ph = seq->cur_phase;

		spin_unlock_irqrestore(&seq->cvlock, flags);

		if (ph == NULL)
			break;	/* quitting, and nothing left to retire */

		/* Retire this phase if all blocks are written
		   and it has been committed, as there can be no
		   more new blocks. */
		if (ph->committed && atomic_read(&ph->count) == 0)
		{
			struct bh_list *bhl, *save;
			struct seq_phase *next;
			int errs;

			errs = atomic_read(&ph->errors);
			if (errs != 0)
				printk("%d errors while writing phase\n", errs);

			if (ph->handler != NULL)
				(*ph->handler)(errs, ph->handler_arg);

			for(bhl = ph->list; bhl != NULL; bhl = save)
			{
				save = bhl->next;
				kfree(bhl);
			}

			next = ph->next;
			kfree(ph);

			spin_lock_irqsave(&seq->cvlock, flags);

			seq->cur_phase = next;
			if (seq->cur_phase == NULL)
				seq->end = NULL;
		}
		else
			continue;

		if (seq->cur_phase != NULL && !seq->cur_phase->marked)
		{
			struct bh_list *bhl;
			ph = seq->cur_phase;

			/* each phase only needs kicking off once; blocks
			   added from now on are dirtied as they arrive
			   in seq_add_block() */
			ph->marked = 1;

			spin_unlock_irqrestore(&seq->cvlock, flags);

			spin_lock(&ph->lock);
			bhl = ph->list;
			spin_unlock(&ph->lock);

			for(; bhl != NULL; bhl = bhl->next)
			{
				int dirtied;

				spin_lock(&ph->lock);
				dirtied = bhl->dirtied;
				bhl->dirtied = 1;
				spin_unlock(&ph->lock);

				if (dirtied)
					continue;

				mark_buffer_dirty(bhl->bh, 1);
				brelse(bhl->bh);
			}
		}
		else
			spin_unlock_irqrestore(&seq->cvlock, flags);
	}

	kfree(seq);
	return 0;
}

struct seq_phase *make_seq_phase(seq_handler_t handler, void *arg)
{
	struct seq_phase *ph;

	ph = kmalloc(sizeof(*ph), GFP_KERNEL);

	if (ph != NULL)
	{
		ph->list = NULL;
		ph->next = NULL;
		ph->committed = 0;
		ph->marked = 0;
		ph->handler = handler;
		ph->handler_arg = arg;

		atomic_set(&ph->count, 0);
		atomic_set(&ph->errors, 0);
		spin_lock_init(&ph->lock);
	}

	return ph;
}

void seq_add_block(struct sequencer *seq, struct seq_phase *ph,
		   struct buffer_head *bh)
{
	struct bh_list *bhl;
	unsigned long flags;

	if (seq->thread_quit)
		printk("seq_add_block: sequencer shutting down\n");

	/* There's no point in adding a dirty buffer: we can't control
	   its sequence. */
	if (assert(!buffer_dirty(bh)))
		return;

	if (MAJOR(bh->b_dev) == MD_MAJOR)
	{
		printk("seq_add_block: can't do sequenced writes onto md device\n");
		return;
	}

	bhl = kmalloc(sizeof(*bhl), GFP_KERNEL);
	if (bhl == NULL)
	{
		printk("seq_add_block: can't allocate buffer list\n");
		return;
	}

	spin_lock(&ph->lock);

	if (assert(!ph->committed))
	{
		/* This is at best a bug warning: there's no certainty
		   that ph hasn't already been freed if it has been
		   committed. */
		spin_unlock(&ph->lock);
		kfree(bhl);
		return;
	}

	bhl->bh = bh;
	bhl->next = ph->list;
	bhl->phase = ph;
	spin_lock_irqsave(&seq->cvlock, flags);
	bhl->dirtied = (ph == seq->cur_phase);
	spin_unlock_irqrestore(&seq->cvlock, flags);

	bh->b_end_io = seq_end_io;
	bh->b_dev_id = (void *)bhl;

	atomic_inc(&ph->count);

	ph->list = bhl;

	spin_unlock(&ph->lock);

	if (bhl->dirtied)
	{
		mark_buffer_dirty(bh, 1);
		brelse(bh);
	}
}

/*
 * Commit a phase.  It's not necessary that prior phases have been
 * committed or written yet - it just means that this one will have no
 * further additions and will be retired as soon as it's all written.
 */
void seq_commit_phase(struct sequencer *seq, struct seq_phase *ph)
{
	unsigned long flags;

	spin_lock(&ph->lock);

	if (ph->committed)
		printk("seq_commit_phase: phase already committed\n");

	ph->committed = 1;

	spin_unlock(&ph->lock);

	spin_lock_irqsave(&seq->cvlock, flags);
	if (ph == seq->cur_phase)
		condvar_wakeall(&seq->cv);
	spin_unlock_irqrestore(&seq->cvlock, flags);
}

/*
 * Append a phase to the sequencer's queue.  Phases are retired in
 * FIFO order, so the new phase is linked onto the tail.
 */
void seq_add_phase(struct sequencer *seq, struct seq_phase *ph)
{
	unsigned long flags;

	if (seq->thread_quit)
		printk("seq_add_phase: sequencer shutting down\n");

	spin_lock_irqsave(&seq->cvlock, flags);

	ph->seq = seq;
	ph->next = NULL;
	if (seq->cur_phase == NULL)
		seq->cur_phase = ph;
	else
		seq->end->next = ph;
	seq->end = ph;

	condvar_wakeall(&seq->cv);
	spin_unlock_irqrestore(&seq->cvlock, flags);
}

/*
 * This waits for all outstanding IO to be processed before actually
 * killing the thread.
 */
void seq_shutdown(struct sequencer *seq)
{
	pid_t pid = seq->thread_pid;
	int stat;
	unsigned long flags;

	spin_lock_irqsave(&seq->cvlock, flags);
	seq->thread_quit = 1;
	condvar_wakeall(&seq->cv);
	spin_unlock_irqrestore(&seq->cvlock, flags);

	waitpid(pid, &stat, 0);
}

struct sequencer *seq_init(void)
{
	struct sequencer *seq;

	seq = kmalloc(sizeof(*seq), GFP_KERNEL);

	if (seq != NULL)
	{
		memset(seq, 0, sizeof(*seq));
		sema_init(&seq->sema, 1);
		spin_lock_init(&seq->cvlock);
		condvar_init(&seq->cv);

		/* XXX This could be problematic by not freeing process
		   memory if we're loaded as a module - ignore for now */
		seq->thread_pid = kernel_thread(seq_phase_thread, seq, 0);
	}

	return seq;
}

--------------8DE25416562E9704990B39F7
Content-Type: text/plain; charset=us-ascii; name="seq_write.h"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="seq_write.h"

#ifndef _FS_FXFS_BUFFER_H
#define _FS_FXFS_BUFFER_H

struct sequencer;
struct seq_phase;

typedef void (*seq_handler_t)(int errors, void *arg);

struct seq_phase *make_seq_phase(seq_handler_t handler, void *arg);
void seq_add_block(struct sequencer *, struct seq_phase *, struct buffer_head *bh);
void seq_add_phase(struct sequencer *, struct seq_phase *);
void seq_commit_phase(struct sequencer *, struct seq_phase *);

struct sequencer *seq_init(void);
void seq_shutdown(struct sequencer *);

#endif /* _FS_FXFS_BUFFER_H */

--------------8DE25416562E9704990B39F7
Content-Type: text/plain; charset=us-ascii; name="condvar.h"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="condvar.h"

#ifndef _CONDVAR_H
#define _CONDVAR_H

#include <asm/spinlock.h>
#include <linux/sched.h>

struct condvar {
	spinlock_t lock;
	struct wait_queue *wait;
};

static inline void condvar_init(struct condvar *cv)
{
	spin_lock_init(&cv->lock);
	init_waitqueue(&cv->wait);
}

static inline void condvar_wakeall(struct condvar *cv)
{
	unsigned long flags;

	spin_lock_irqsave(&cv->lock, flags);
	wake_up(&cv->wait);
	spin_unlock_irqrestore(&cv->lock, flags);
}

/* This should probably be in a .c somewhere */
static void _condvar_wait(struct condvar *cv, spinlock_t *lock,
			  unsigned long *lkflags, int state)
{
	unsigned long flags;
	struct wait_queue wait;
	struct task_struct *tsk = current;

	spin_lock_irqsave(&cv->lock, flags);
	tsk->state = state;
	wait.task = tsk;
	__add_wait_queue(&cv->wait, &wait);
	spin_unlock_irqrestore(&cv->lock, flags);

	if (lkflags != NULL)
		spin_unlock_irqrestore(lock, *lkflags);
	else
		spin_unlock(lock);

	/* If someone wakes us in this gap we become runnable again, and
	   schedule() should return at the first opportunity */

	schedule();

	if (lkflags != NULL)
		spin_lock_irqsave(lock, *lkflags);
	else
		spin_lock(lock);

	tsk->state = TASK_RUNNING;

	spin_lock_irqsave(&cv->lock, flags);
	__remove_wait_queue(&cv->wait, &wait);
	spin_unlock_irqrestore(&cv->lock, flags);
}

static inline void condvar_wait_flags(struct condvar *cv, spinlock_t *lock,
				      unsigned long *flags)
{
	_condvar_wait(cv, lock, flags, TASK_UNINTERRUPTIBLE);
}

static inline void condvar_wait(struct condvar *cv, spinlock_t *lock)
{
	_condvar_wait(cv, lock, NULL, TASK_UNINTERRUPTIBLE);
}
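
/*
 * Usage sketch (this is the pattern seq_phase_thread uses via
 * condvar_wait_flags): the condvar pairs with a caller-held spinlock
 * in the usual monitor style - take the lock, re-test the predicate
 * in a loop, and let condvar_wait() drop and re-take the lock around
 * schedule():
 *
 *	spin_lock(&lock);
 *	while (!condition)
 *		condvar_wait(&cv, &lock);
 *	... condition now holds, lock still held ...
 *	spin_unlock(&lock);
 *
 * Wakers change the condition under the same lock and then call
 * condvar_wakeall().
 */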
#endif /* _CONDVAR_H */

--------------8DE25416562E9704990B39F7--

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu