Re: [PATCH 02/20] io-controller: Common flat fair queuing code inelevaotor layer

From: Balbir Singh
Date: Mon Jun 22 2009 - 04:47:24 EST


* Vivek Goyal <vgoyal@xxxxxxxxxx> [2009-06-19 16:37:20]:

> This is common fair queuing code in elevator layer. This is controlled by
> config option CONFIG_ELV_FAIR_QUEUING. This patch initially only introduces
> flat fair queuing support where there is only one group, "root group" and all
> the tasks belong to root group.
>
> This elevator layer changes are backward compatible. That means any ioscheduler
> using old interfaces will continue to work.
>
> This code is essentially the CFQ code for fair queuing. The primary difference
> is that flat rounding robin algorithm of CFQ has been replaced with BFQ (WF2Q+).
>

The patch is quite long and to be honest requires a long time to
review, which I don't mind. I suspect my frequently diverted mind is
likely to miss a lot in a big patch like this. Could you consider
splitting this further if possible. I think you'll notice the number
of reviews will also increase.

> Signed-off-by: Nauman Rafique <nauman@xxxxxxxxxx>
> Signed-off-by: Fabio Checconi <fabio@xxxxxxxxxxxxxxxx>
> Signed-off-by: Paolo Valente <paolo.valente@xxxxxxxxxx>
> Signed-off-by: Aristeu Rozanski <aris@xxxxxxxxxx>
> Signed-off-by: Gui Jianfeng <guijianfeng@xxxxxxxxxxxxxx>
> Signed-off-by: Vivek Goyal <vgoyal@xxxxxxxxxx>
> ---
> block/Kconfig.iosched | 13 +
> block/Makefile | 1 +
> block/elevator-fq.c | 2015 ++++++++++++++++++++++++++++++++++++++++++++++
> block/elevator-fq.h | 473 +++++++++++
> block/elevator.c | 46 +-
> include/linux/blkdev.h | 5 +
> include/linux/elevator.h | 51 ++
> 7 files changed, 2593 insertions(+), 11 deletions(-)
> create mode 100644 block/elevator-fq.c
> create mode 100644 block/elevator-fq.h
>
> diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
> index 7e803fc..3398134 100644
> --- a/block/Kconfig.iosched
> +++ b/block/Kconfig.iosched
> @@ -2,6 +2,19 @@ if BLOCK
>
> menu "IO Schedulers"
>
> +config ELV_FAIR_QUEUING
> + bool "Elevator Fair Queuing Support"
> + default n
> + ---help---
> + Traditionally only cfq had notion of multiple queues and it did
> + fair queuing at its own. With the cgroups and need of controlling
> + IO, now even the simple io schedulers like noop, deadline, as will
> + have one queue per cgroup and will need hierarchical fair queuing.
> + Instead of every io scheduler implementing its own fair queuing
> + logic, this option enables fair queuing in elevator layer so that
> + other ioschedulers can make use of it.
> + If unsure, say N.
> +
> config IOSCHED_NOOP
> bool
> default y
> diff --git a/block/Makefile b/block/Makefile
> index e9fa4dd..94bfc6e 100644
> --- a/block/Makefile
> +++ b/block/Makefile
> @@ -15,3 +15,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
>
> obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
> obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
> +obj-$(CONFIG_ELV_FAIR_QUEUING) += elevator-fq.o
> diff --git a/block/elevator-fq.c b/block/elevator-fq.c
> new file mode 100644
> index 0000000..9357fb0
> --- /dev/null
> +++ b/block/elevator-fq.c
> @@ -0,0 +1,2015 @@
> +/*
> + * BFQ: Hierarchical B-WF2Q+ scheduler.
> + *
> + * Based on ideas and code from CFQ:
> + * Copyright (C) 2003 Jens Axboe <axboe@xxxxxxxxx>
> + *
> + * Copyright (C) 2008 Fabio Checconi <fabio@xxxxxxxxxxxxxxxx>
> + * Paolo Valente <paolo.valente@xxxxxxxxxx>
> + * Copyright (C) 2009 Vivek Goyal <vgoyal@xxxxxxxxxx>
> + * Nauman Rafique <nauman@xxxxxxxxxx>
> + */
> +
> +#include <linux/blkdev.h>
> +#include "elevator-fq.h"
> +#include <linux/blktrace_api.h>
> +
> +/* Values taken from cfq */
> +const int elv_slice_sync = HZ / 10;
> +int elv_slice_async = HZ / 25;
> +const int elv_slice_async_rq = 2;
> +int elv_slice_idle = HZ / 125;
> +static struct kmem_cache *elv_ioq_pool;
> +
> +#define ELV_SLICE_SCALE (5)
> +#define ELV_HW_QUEUE_MIN (5)
> +#define IO_SERVICE_TREE_INIT ((struct io_service_tree) \
> + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
> +
> +static inline struct io_queue *elv_close_cooperator(struct request_queue *q,
> + struct io_queue *ioq, int probe);
> +struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd,
> + int extract);
> +
> +static inline int elv_prio_slice(struct elv_fq_data *efqd, int sync,
> + unsigned short prio)

Why is the return type int and not unsigned int or unsigned long? Can
the return value ever be negative?

> +{
> + const int base_slice = efqd->elv_slice[sync];
> +
> + WARN_ON(prio >= IOPRIO_BE_NR);
> +
> + return base_slice + (base_slice/ELV_SLICE_SCALE * (4 - prio));
> +}
> +
> +static inline int
> +elv_prio_to_slice(struct elv_fq_data *efqd, struct io_queue *ioq)
> +{
> + return elv_prio_slice(efqd, elv_ioq_sync(ioq), ioq->entity.ioprio);
> +}
> +
> +/* Mainly the BFQ scheduling code Follows */
> +
> +/*
> + * Shift for timestamp calculations. This actually limits the maximum
> + * service allowed in one timestamp delta (small shift values increase it),
> + * the maximum total weight that can be used for the queues in the system
> + * (big shift values increase it), and the period of virtual time wraparounds.
> + */
> +#define WFQ_SERVICE_SHIFT 22
> +
> +/**
> + * bfq_gt - compare two timestamps.
> + * @a: first ts.
> + * @b: second ts.
> + *
> + * Return @a > @b, dealing with wrapping correctly.
> + */
> +static inline int bfq_gt(bfq_timestamp_t a, bfq_timestamp_t b)
> +{
> + return (s64)(a - b) > 0;
> +}
> +

a and b are of type u64, but cast to s64 to deal with wrapping?
Correct?

> +/**
> + * bfq_delta - map service into the virtual time domain.
> + * @service: amount of service.
> + * @weight: scale factor.
> + */
> +static inline bfq_timestamp_t bfq_delta(bfq_service_t service,
> + bfq_weight_t weight)
> +{
> + bfq_timestamp_t d = (bfq_timestamp_t)service << WFQ_SERVICE_SHIFT;
> +

Why is the case required? Does the compiler complain? service is
already of the correct type.

> + do_div(d, weight);

On a 64 system both d and weight are 64 bit, but on a 32 bit system
weight is 32 bits. do_div expects a 64 bit dividend and 32 bit divisor
- no?

> + return d;
> +}
> +
> +/**
> + * bfq_calc_finish - assign the finish time to an entity.
> + * @entity: the entity to act upon.
> + * @service: the service to be charged to the entity.
> + */
> +static inline void bfq_calc_finish(struct io_entity *entity,
> + bfq_service_t service)
> +{
> + BUG_ON(entity->weight == 0);
> +
> + entity->finish = entity->start + bfq_delta(service, entity->weight);
> +}

Should we BUG_ON (entity->finish == entity->start)? Or is that
expected when the entity has no service time left.

> +
> +static inline struct io_queue *io_entity_to_ioq(struct io_entity *entity)
> +{
> + struct io_queue *ioq = NULL;
> +
> + BUG_ON(entity == NULL);
> + if (entity->my_sched_data == NULL)
> + ioq = container_of(entity, struct io_queue, entity);
> + return ioq;
> +}
> +
> +/**
> + * bfq_entity_of - get an entity from a node.
> + * @node: the node field of the entity.
> + *
> + * Convert a node pointer to the relative entity. This is used only
> + * to simplify the logic of some functions and not as the generic
> + * conversion mechanism because, e.g., in the tree walking functions,
> + * the check for a %NULL value would be redundant.
> + */
> +static inline struct io_entity *bfq_entity_of(struct rb_node *node)
> +{
> + struct io_entity *entity = NULL;
> +
> + if (node != NULL)
> + entity = rb_entry(node, struct io_entity, rb_node);
> +
> + return entity;
> +}
> +
> +/**
> + * bfq_extract - remove an entity from a tree.
> + * @root: the tree root.
> + * @entity: the entity to remove.
> + */
> +static inline void bfq_extract(struct rb_root *root, struct io_entity *entity)
> +{

Extract is not common terminology, why not use bfq_remove()?

> + BUG_ON(entity->tree != root);
> +
> + entity->tree = NULL;
> + rb_erase(&entity->rb_node, root);

Don't you want to make entity->tree = NULL after rb_erase?

> +}
> +
> +/**
> + * bfq_idle_extract - extract an entity from the idle tree.
> + * @st: the service tree of the owning @entity.
> + * @entity: the entity being removed.
> + */
> +static void bfq_idle_extract(struct io_service_tree *st,
> + struct io_entity *entity)
> +{
> + struct rb_node *next;
> +
> + BUG_ON(entity->tree != &st->idle);
> +
> + if (entity == st->first_idle) {
> + next = rb_next(&entity->rb_node);

What happens if next is NULL?

> + st->first_idle = bfq_entity_of(next);
> + }
> +
> + if (entity == st->last_idle) {
> + next = rb_prev(&entity->rb_node);

What happens if next is NULL?

> + st->last_idle = bfq_entity_of(next);
> + }
> +
> + bfq_extract(&st->idle, entity);
> +}
> +
> +/**
> + * bfq_insert - generic tree insertion.
> + * @root: tree root.
> + * @entity: entity to insert.
> + *
> + * This is used for the idle and the active tree, since they are both
> + * ordered by finish time.
> + */
> +static void bfq_insert(struct rb_root *root, struct io_entity *entity)
> +{
> + struct io_entity *entry;
> + struct rb_node **node = &root->rb_node;
> + struct rb_node *parent = NULL;
> +
> + BUG_ON(entity->tree != NULL);
> +
> + while (*node != NULL) {
> + parent = *node;
> + entry = rb_entry(parent, struct io_entity, rb_node);
> +
> + if (bfq_gt(entry->finish, entity->finish))
> + node = &parent->rb_left;
> + else
> + node = &parent->rb_right;
> + }
> +
> + rb_link_node(&entity->rb_node, parent, node);
> + rb_insert_color(&entity->rb_node, root);
> +
> + entity->tree = root;
> +}
> +
> +/**
> + * bfq_update_min - update the min_start field of a entity.
> + * @entity: the entity to update.
> + * @node: one of its children.
> + *
> + * This function is called when @entity may store an invalid value for
> + * min_start due to updates to the active tree. The function assumes
> + * that the subtree rooted at @node (which may be its left or its right
> + * child) has a valid min_start value.
> + */
> +static inline void bfq_update_min(struct io_entity *entity,
> + struct rb_node *node)
> +{
> + struct io_entity *child;
> +
> + if (node != NULL) {
> + child = rb_entry(node, struct io_entity, rb_node);
> + if (bfq_gt(entity->min_start, child->min_start))
> + entity->min_start = child->min_start;
> + }
> +}

So.. we check to see if child's min_time is lesser than the root
entities or node entities and set it to the minimum of the two?
Can you use min_t here?

> +
> +/**
> + * bfq_update_active_node - recalculate min_start.
> + * @node: the node to update.
> + *
> + * @node may have changed position or one of its children may have moved,
> + * this function updates its min_start value. The left and right subtrees
> + * are assumed to hold a correct min_start value.
> + */
> +static inline void bfq_update_active_node(struct rb_node *node)
> +{
> + struct io_entity *entity = rb_entry(node, struct io_entity, rb_node);
> +
> + entity->min_start = entity->start;
> + bfq_update_min(entity, node->rb_right);
> + bfq_update_min(entity, node->rb_left);
> +}

I don't like this every much, we set the min_time twice, this can be
easily optimized to look at both left and right child and pick the
minimum.

> +
> +/**
> + * bfq_update_active_tree - update min_start for the whole active tree.
> + * @node: the starting node.
> + *
> + * @node must be the deepest modified node after an update. This function
> + * updates its min_start using the values held by its children, assuming
> + * that they did not change, and then updates all the nodes that may have
> + * changed in the path to the root. The only nodes that may have changed
> + * are the ones in the path or their siblings.
> + */
> +static void bfq_update_active_tree(struct rb_node *node)
> +{
> + struct rb_node *parent;
> +
> +up:
> + bfq_update_active_node(node);
> +
> + parent = rb_parent(node);
> + if (parent == NULL)
> + return;
> +
> + if (node == parent->rb_left && parent->rb_right != NULL)
> + bfq_update_active_node(parent->rb_right);
> + else if (parent->rb_left != NULL)
> + bfq_update_active_node(parent->rb_left);
> +
> + node = parent;
> + goto up;
> +}
> +

For these functions, take a look at the walk function in the group
scheduler that does update_shares

> +/**
> + * bfq_active_insert - insert an entity in the active tree of its group/device.
> + * @st: the service tree of the entity.
> + * @entity: the entity being inserted.
> + *
> + * The active tree is ordered by finish time, but an extra key is kept
> + * per each node, containing the minimum value for the start times of
> + * its children (and the node itself), so it's possible to search for
> + * the eligible node with the lowest finish time in logarithmic time.
> + */
> +static void bfq_active_insert(struct io_service_tree *st,
> + struct io_entity *entity)
> +{
> + struct rb_node *node = &entity->rb_node;
> +
> + bfq_insert(&st->active, entity);
> +
> + if (node->rb_left != NULL)
> + node = node->rb_left;
> + else if (node->rb_right != NULL)
> + node = node->rb_right;
> +
> + bfq_update_active_tree(node);
> +}
> +
> +/**
> + * bfq_ioprio_to_weight - calc a weight from an ioprio.
> + * @ioprio: the ioprio value to convert.
> + */
> +static bfq_weight_t bfq_ioprio_to_weight(int ioprio)
> +{
> + WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
> + return IOPRIO_BE_NR - ioprio;
> +}
> +
> +void bfq_get_entity(struct io_entity *entity)
> +{
> + struct io_queue *ioq = io_entity_to_ioq(entity);
> +
> + if (ioq)
> + elv_get_ioq(ioq);
> +}
> +
> +void bfq_init_entity(struct io_entity *entity, struct io_group *iog)
> +{
> + entity->ioprio = entity->new_ioprio;
> + entity->ioprio_class = entity->new_ioprio_class;
> + entity->sched_data = &iog->sched_data;
> +}
> +
> +/**
> + * bfq_find_deepest - find the deepest node that an extraction can modify.
> + * @node: the node being removed.
> + *
> + * Do the first step of an extraction in an rb tree, looking for the
> + * node that will replace @node, and returning the deepest node that
> + * the following modifications to the tree can touch. If @node is the
> + * last node in the tree return %NULL.
> + */
> +static struct rb_node *bfq_find_deepest(struct rb_node *node)
> +{
> + struct rb_node *deepest;
> +
> + if (node->rb_right == NULL && node->rb_left == NULL)
> + deepest = rb_parent(node);

Why is the parent the deepest? Shouldn't node be the deepest?

> + else if (node->rb_right == NULL)
> + deepest = node->rb_left;
> + else if (node->rb_left == NULL)
> + deepest = node->rb_right;
> + else {
> + deepest = rb_next(node);
> + if (deepest->rb_right != NULL)
> + deepest = deepest->rb_right;
> + else if (rb_parent(deepest) != node)
> + deepest = rb_parent(deepest);
> + }
> +
> + return deepest;
> +}

The function is not clear, could you please define deepest node
better?

> +
> +/**
> + * bfq_active_extract - remove an entity from the active tree.
> + * @st: the service_tree containing the tree.
> + * @entity: the entity being removed.
> + */
> +static void bfq_active_extract(struct io_service_tree *st,
> + struct io_entity *entity)
> +{
> + struct rb_node *node;
> +
> + node = bfq_find_deepest(&entity->rb_node);
> + bfq_extract(&st->active, entity);
> +
> + if (node != NULL)
> + bfq_update_active_tree(node);
> +}
> +

Just to check my understanding, every time an active node is
added/removed, we update the min_time of the entire tree.

> +/**
> + * bfq_idle_insert - insert an entity into the idle tree.
> + * @st: the service tree containing the tree.
> + * @entity: the entity to insert.
> + */
> +static void bfq_idle_insert(struct io_service_tree *st,
> + struct io_entity *entity)
> +{
> + struct io_entity *first_idle = st->first_idle;
> + struct io_entity *last_idle = st->last_idle;
> +
> + if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
> + st->first_idle = entity;
> + if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
> + st->last_idle = entity;
> +
> + bfq_insert(&st->idle, entity);
> +}
> +
> +/**
> + * bfq_forget_entity - remove an entity from the wfq trees.
> + * @st: the service tree.
> + * @entity: the entity being removed.
> + *
> + * Update the device status and forget everything about @entity, putting
> + * the device reference to it, if it is a queue. Entities belonging to
> + * groups are not refcounted.
> + */
> +static void bfq_forget_entity(struct io_service_tree *st,
> + struct io_entity *entity)
> +{
> + struct io_queue *ioq = NULL;
> +
> + BUG_ON(!entity->on_st);
> + entity->on_st = 0;
> + st->wsum -= entity->weight;
> + ioq = io_entity_to_ioq(entity);
> + if (!ioq)
> + return;
> + elv_put_ioq(ioq);
> +}
> +
> +/**
> + * bfq_put_idle_entity - release the idle tree ref of an entity.
> + * @st: service tree for the entity.
> + * @entity: the entity being released.
> + */
> +void bfq_put_idle_entity(struct io_service_tree *st,
> + struct io_entity *entity)
> +{
> + bfq_idle_extract(st, entity);
> + bfq_forget_entity(st, entity);
> +}
> +
> +/**
> + * bfq_forget_idle - update the idle tree if necessary.
> + * @st: the service tree to act upon.
> + *
> + * To preserve the global O(log N) complexity we only remove one entry here;
> + * as the idle tree will not grow indefinitely this can be done safely.
> + */
> +void bfq_forget_idle(struct io_service_tree *st)
> +{
> + struct io_entity *first_idle = st->first_idle;
> + struct io_entity *last_idle = st->last_idle;
> +
> + if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
> + !bfq_gt(last_idle->finish, st->vtime)) {
> + /*
> + * Active tree is empty. Pull back vtime to finish time of
> + * last idle entity on idle tree.
> + * Rational seems to be that it reduces the possibility of
> + * vtime wraparound (bfq_gt(V-F) < 0).
> + */
> + st->vtime = last_idle->finish;
> + }
> +
> + if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
> + bfq_put_idle_entity(st, first_idle);
> +}
> +
> +
> +static struct io_service_tree *
> +__bfq_entity_update_prio(struct io_service_tree *old_st,
> + struct io_entity *entity)
> +{
> + struct io_service_tree *new_st = old_st;
> + struct io_queue *ioq = io_entity_to_ioq(entity);
> +
> + if (entity->ioprio_changed) {
> + entity->ioprio = entity->new_ioprio;
> + entity->ioprio_class = entity->new_ioprio_class;
> + entity->ioprio_changed = 0;
> +
> + /*
> + * Also update the scaled budget for ioq. Group will get the
> + * updated budget once ioq is selected to run next.
> + */
> + if (ioq) {
> + struct elv_fq_data *efqd = ioq->efqd;
> + entity->budget = elv_prio_to_slice(efqd, ioq);
> + }
> +
> + old_st->wsum -= entity->weight;
> + entity->weight = bfq_ioprio_to_weight(entity->ioprio);
> +
> + /*
> + * NOTE: here we may be changing the weight too early,
> + * this will cause unfairness. The correct approach
> + * would have required additional complexity to defer
> + * weight changes to the proper time instants (i.e.,
> + * when entity->finish <= old_st->vtime).
> + */
> + new_st = io_entity_service_tree(entity);
> + new_st->wsum += entity->weight;
> +
> + if (new_st != old_st)
> + entity->start = new_st->vtime;
> + }
> +
> + return new_st;
> +}
> +
> +/**
> + * __bfq_activate_entity - activate an entity.
> + * @entity: the entity being activated.
> + *
> + * Called whenever an entity is activated, i.e., it is not active and one
> + * of its children receives a new request, or has to be reactivated due to
> + * budget exhaustion. It uses the current budget of the entity (and the
> + * service received if @entity is active) of the queue to calculate its
> + * timestamps.
> + */
> +static void __bfq_activate_entity(struct io_entity *entity, int add_front)
> +{
> + struct io_sched_data *sd = entity->sched_data;
> + struct io_service_tree *st = io_entity_service_tree(entity);
> +
> + if (entity == sd->active_entity) {
> + BUG_ON(entity->tree != NULL);
> + /*
> + * If we are requeueing the current entity we have
> + * to take care of not charging to it service it has
> + * not received.
> + */
> + bfq_calc_finish(entity, entity->service);
> + entity->start = entity->finish;
> + sd->active_entity = NULL;
> + } else if (entity->tree == &st->active) {
> + /*
> + * Requeueing an entity due to a change of some
> + * next_active entity below it. We reuse the old
> + * start time.
> + */
> + bfq_active_extract(st, entity);
> + } else if (entity->tree == &st->idle) {
> + /*
> + * Must be on the idle tree, bfq_idle_extract() will
> + * check for that.
> + */
> + bfq_idle_extract(st, entity);
> + entity->start = bfq_gt(st->vtime, entity->finish) ?
> + st->vtime : entity->finish;
> + } else {
> + /*
> + * The finish time of the entity may be invalid, and
> + * it is in the past for sure, otherwise the queue
> + * would have been on the idle tree.
> + */
> + entity->start = st->vtime;
> + st->wsum += entity->weight;
> + bfq_get_entity(entity);
> +
> + BUG_ON(entity->on_st);
> + entity->on_st = 1;
> + }
> +
> + st = __bfq_entity_update_prio(st, entity);
> + /*
> + * This is to emulate cfq like functionality where preemption can
> + * happen with-in same class, like sync queue preempting async queue
> + * May be this is not a very good idea from fairness point of view
> + * as preempting queue gains share. Keeping it for now.
> + */
> + if (add_front) {
> + struct io_entity *next_entity;
> +
> + /*
> + * Determine the entity which will be dispatched next
> + * Use sd->next_active once hierarchical patch is applied
> + */
> + next_entity = bfq_lookup_next_entity(sd, 0);
> +
> + if (next_entity && next_entity != entity) {
> + struct io_service_tree *new_st;
> + bfq_timestamp_t delta;
> +
> + new_st = io_entity_service_tree(next_entity);
> +
> + /*
> + * At this point, both entities should belong to
> + * same service tree as cross service tree preemption
> + * is automatically taken care by algorithm
> + */
> + BUG_ON(new_st != st);
> + entity->finish = next_entity->finish - 1;
> + delta = bfq_delta(entity->budget, entity->weight);
> + entity->start = entity->finish - delta;
> + if (bfq_gt(entity->start, st->vtime))
> + entity->start = st->vtime;
> + }
> + } else {
> + bfq_calc_finish(entity, entity->budget);
> + }
> + bfq_active_insert(st, entity);
> +}
> +
> +/**
> + * bfq_activate_entity - activate an entity.
> + * @entity: the entity to activate.
> + */
> +void bfq_activate_entity(struct io_entity *entity, int add_front)
> +{
> + __bfq_activate_entity(entity, add_front);
> +}
> +
> +/**
> + * __bfq_deactivate_entity - deactivate an entity from its service tree.
> + * @entity: the entity to deactivate.
> + * @requeue: if false, the entity will not be put into the idle tree.
> + *
> + * Deactivate an entity, independently from its previous state. If the
> + * entity was not on a service tree just return, otherwise if it is on
> + * any scheduler tree, extract it from that tree, and if necessary
> + * and if the caller did not specify @requeue, put it on the idle tree.
> + *
> + */
> +int __bfq_deactivate_entity(struct io_entity *entity, int requeue)
> +{
> + struct io_sched_data *sd = entity->sched_data;
> + struct io_service_tree *st = io_entity_service_tree(entity);
> + int was_active = entity == sd->active_entity;
> + int ret = 0;
> +
> + if (!entity->on_st)
> + return 0;
> +
> + BUG_ON(was_active && entity->tree != NULL);
> +
> + if (was_active) {
> + bfq_calc_finish(entity, entity->service);
> + sd->active_entity = NULL;
> + } else if (entity->tree == &st->active)
> + bfq_active_extract(st, entity);
> + else if (entity->tree == &st->idle)
> + bfq_idle_extract(st, entity);
> + else if (entity->tree != NULL)
> + BUG();
> +
> + if (!requeue || !bfq_gt(entity->finish, st->vtime))
> + bfq_forget_entity(st, entity);
> + else
> + bfq_idle_insert(st, entity);
> +
> + BUG_ON(sd->active_entity == entity);
> +
> + return ret;
> +}
> +
> +/**
> + * bfq_deactivate_entity - deactivate an entity.
> + * @entity: the entity to deactivate.
> + * @requeue: true if the entity can be put on the idle tree
> + */
> +void bfq_deactivate_entity(struct io_entity *entity, int requeue)
> +{
> + __bfq_deactivate_entity(entity, requeue);
> +}
> +
> +/**
> + * bfq_update_vtime - update vtime if necessary.
> + * @st: the service tree to act upon.
> + *
> + * If necessary update the service tree vtime to have at least one
> + * eligible entity, skipping to its start time. Assumes that the
> + * active tree of the device is not empty.
> + *
> + * NOTE: this hierarchical implementation updates vtimes quite often,
> + * we may end up with reactivated tasks getting timestamps after a
> + * vtime skip done because we needed a ->first_active entity on some
> + * intermediate node.
> + */
> +static void bfq_update_vtime(struct io_service_tree *st)
> +{
> + struct io_entity *entry;
> + struct rb_node *node = st->active.rb_node;
> +
> + entry = rb_entry(node, struct io_entity, rb_node);
> + if (bfq_gt(entry->min_start, st->vtime)) {
> + st->vtime = entry->min_start;
> + bfq_forget_idle(st);
> + }
> +}
> +
> +/**
> + * bfq_first_active - find the eligible entity with the smallest finish time
> + * @st: the service tree to select from.
> + *
> + * This function searches the first schedulable entity, starting from the
> + * root of the tree and going on the left every time on this side there is
> + * a subtree with at least one eligible (start <= vtime) entity. The path
> + * on the right is followed only if a) the left subtree contains no eligible
> + * entities and b) no eligible entity has been found yet.
> + */
> +static struct io_entity *bfq_first_active_entity(struct io_service_tree *st)
> +{
> + struct io_entity *entry, *first = NULL;
> + struct rb_node *node = st->active.rb_node;
> +
> + while (node != NULL) {
> + entry = rb_entry(node, struct io_entity, rb_node);
> +left:
> + if (!bfq_gt(entry->start, st->vtime))
> + first = entry;
> +
> + BUG_ON(bfq_gt(entry->min_start, st->vtime));
> +
> + if (node->rb_left != NULL) {
> + entry = rb_entry(node->rb_left,
> + struct io_entity, rb_node);
> + if (!bfq_gt(entry->min_start, st->vtime)) {
> + node = node->rb_left;
> + goto left;
> + }
> + }
> + if (first != NULL)
> + break;
> + node = node->rb_right;

Please help me understand this, we sort the tree by finish time, but
search by vtime, start_time. The worst case could easily be O(N),
right?

> + }
> +
> + BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
> + return first;
> +}
> +
> +/**
> + * __bfq_lookup_next_entity - return the first eligible entity in @st.
> + * @st: the service tree.
> + *
> + * Update the virtual time in @st and return the first eligible entity
> + * it contains.
> + */
> +static struct io_entity *__bfq_lookup_next_entity(struct io_service_tree *st)
> +{
> + struct io_entity *entity;
> +
> + if (RB_EMPTY_ROOT(&st->active))
> + return NULL;
> +
> + bfq_update_vtime(st);
> + entity = bfq_first_active_entity(st);
> + BUG_ON(bfq_gt(entity->start, st->vtime));
> +
> + return entity;
> +}
> +
> +/**
> + * bfq_lookup_next_entity - return the first eligible entity in @sd.
> + * @sd: the sched_data.
> + * @extract: if true the returned entity will be also extracted from @sd.
> + *
> + * NOTE: since we cache the next_active entity at each level of the
> + * hierarchy, the complexity of the lookup can be decreased with
> + * absolutely no effort just returning the cached next_active value;
> + * we prefer to do full lookups to test the consistency of * the data
> + * structures.
> + */
> +struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd,
> + int extract)
> +{
> + struct io_service_tree *st = sd->service_tree;
> + struct io_entity *entity;
> + int i;
> +
> + /*
> + * We should not call lookup when an entity is active, as doing lookup
> + * can result in an erroneous vtime jump.
> + */
> + BUG_ON(sd->active_entity != NULL);
> +
> + for (i = 0; i < IO_IOPRIO_CLASSES; i++, st++) {
> + entity = __bfq_lookup_next_entity(st);
> + if (entity != NULL) {
> + if (extract) {
> + bfq_active_extract(st, entity);
> + sd->active_entity = entity;
> + }
> + break;
> + }
> + }
> +
> + return entity;
> +}
> +
> +void entity_served(struct io_entity *entity, bfq_service_t served)
> +{
> + struct io_service_tree *st;
> +
> + st = io_entity_service_tree(entity);
> + entity->service += served;
> + BUG_ON(st->wsum == 0);
> + st->vtime += bfq_delta(served, st->wsum);
> + bfq_forget_idle(st);

Forget idle checks to see if the st->vtime > first_idle->finish, if so
it pushes the first_idle to later, right?

> +}
> +
> +/**
> + * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
> + * @st: the service tree being flushed.
> + */
> +void io_flush_idle_tree(struct io_service_tree *st)
> +{
> + struct io_entity *entity = st->first_idle;
> +
> + for (; entity != NULL; entity = st->first_idle)
> + __bfq_deactivate_entity(entity, 0);
> +}
> +
> +/* Elevator fair queuing function */
> +struct io_queue *rq_ioq(struct request *rq)
> +{
> + return rq->ioq;
> +}
> +
> +static inline struct io_queue *elv_active_ioq(struct elevator_queue *e)
> +{
> + return e->efqd.active_queue;
> +}
> +
> +void *elv_active_sched_queue(struct elevator_queue *e)
> +{
> + return ioq_sched_queue(elv_active_ioq(e));
> +}
> +EXPORT_SYMBOL(elv_active_sched_queue);
> +
> +int elv_nr_busy_ioq(struct elevator_queue *e)
> +{
> + return e->efqd.busy_queues;
> +}
> +EXPORT_SYMBOL(elv_nr_busy_ioq);
> +
> +int elv_hw_tag(struct elevator_queue *e)
> +{
> + return e->efqd.hw_tag;
> +}
> +EXPORT_SYMBOL(elv_hw_tag);
> +
> +/* Helper functions for operating on elevator idle slice timer */
> +int elv_mod_idle_slice_timer(struct elevator_queue *eq, unsigned long expires)
> +{
> + struct elv_fq_data *efqd = &eq->efqd;
> +
> + return mod_timer(&efqd->idle_slice_timer, expires);
> +}
> +EXPORT_SYMBOL(elv_mod_idle_slice_timer);
> +
> +int elv_del_idle_slice_timer(struct elevator_queue *eq)
> +{
> + struct elv_fq_data *efqd = &eq->efqd;
> +
> + return del_timer(&efqd->idle_slice_timer);
> +}
> +EXPORT_SYMBOL(elv_del_idle_slice_timer);
> +
> +unsigned int elv_get_slice_idle(struct elevator_queue *eq)
> +{
> + return eq->efqd.elv_slice_idle;
> +}
> +EXPORT_SYMBOL(elv_get_slice_idle);
> +
> +void elv_ioq_served(struct io_queue *ioq, bfq_service_t served)
> +{
> + entity_served(&ioq->entity, served);
> +}
> +
> +/* Tells whether ioq is queued in root group or not */
> +static inline int is_root_group_ioq(struct request_queue *q,
> + struct io_queue *ioq)
> +{
> + struct elv_fq_data *efqd = &q->elevator->efqd;
> +
> + return (ioq->entity.sched_data == &efqd->root_group->sched_data);
> +}
> +
> +/*
> + * sysfs parts below -->
> + */
> +static ssize_t
> +elv_var_show(unsigned int var, char *page)
> +{
> + return sprintf(page, "%d\n", var);
> +}
> +
> +static ssize_t
> +elv_var_store(unsigned int *var, const char *page, size_t count)
> +{
> + char *p = (char *) page;
> +
> + *var = simple_strtoul(p, &p, 10);
> + return count;
> +}
> +
> +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
> +ssize_t __FUNC(struct elevator_queue *e, char *page) \
> +{ \
> + struct elv_fq_data *efqd = &e->efqd; \
> + unsigned int __data = __VAR; \
> + if (__CONV) \
> + __data = jiffies_to_msecs(__data); \
> + return elv_var_show(__data, (page)); \
> +}
> +SHOW_FUNCTION(elv_slice_idle_show, efqd->elv_slice_idle, 1);
> +EXPORT_SYMBOL(elv_slice_idle_show);
> +SHOW_FUNCTION(elv_slice_sync_show, efqd->elv_slice[1], 1);
> +EXPORT_SYMBOL(elv_slice_sync_show);
> +SHOW_FUNCTION(elv_slice_async_show, efqd->elv_slice[0], 1);
> +EXPORT_SYMBOL(elv_slice_async_show);
> +#undef SHOW_FUNCTION
> +
> +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
> +ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
> +{ \
> + struct elv_fq_data *efqd = &e->efqd; \
> + unsigned int __data; \
> + int ret = elv_var_store(&__data, (page), count); \
> + if (__data < (MIN)) \
> + __data = (MIN); \
> + else if (__data > (MAX)) \
> + __data = (MAX); \
> + if (__CONV) \
> + *(__PTR) = msecs_to_jiffies(__data); \
> + else \
> + *(__PTR) = __data; \
> + return ret; \
> +}
> +STORE_FUNCTION(elv_slice_idle_store, &efqd->elv_slice_idle, 0, UINT_MAX, 1);
> +EXPORT_SYMBOL(elv_slice_idle_store);
> +STORE_FUNCTION(elv_slice_sync_store, &efqd->elv_slice[1], 1, UINT_MAX, 1);
> +EXPORT_SYMBOL(elv_slice_sync_store);
> +STORE_FUNCTION(elv_slice_async_store, &efqd->elv_slice[0], 1, UINT_MAX, 1);
> +EXPORT_SYMBOL(elv_slice_async_store);
> +#undef STORE_FUNCTION
> +
> +void elv_schedule_dispatch(struct request_queue *q)
> +{
> + struct elv_fq_data *efqd = &q->elevator->efqd;
> +
> + if (elv_nr_busy_ioq(q->elevator)) {
> + elv_log(efqd, "schedule dispatch");
> + kblockd_schedule_work(efqd->queue, &efqd->unplug_work);
> + }
> +}
> +EXPORT_SYMBOL(elv_schedule_dispatch);
> +
> +void elv_kick_queue(struct work_struct *work)
> +{
> + struct elv_fq_data *efqd =
> + container_of(work, struct elv_fq_data, unplug_work);
> + struct request_queue *q = efqd->queue;
> + unsigned long flags;
> +
> + spin_lock_irqsave(q->queue_lock, flags);
> + blk_start_queueing(q);
> + spin_unlock_irqrestore(q->queue_lock, flags);
> +}
> +
> +void elv_shutdown_timer_wq(struct elevator_queue *e)
> +{
> + del_timer_sync(&e->efqd.idle_slice_timer);
> + cancel_work_sync(&e->efqd.unplug_work);
> +}
> +EXPORT_SYMBOL(elv_shutdown_timer_wq);
> +
> +void elv_ioq_set_prio_slice(struct request_queue *q, struct io_queue *ioq)
> +{
> + struct elv_fq_data *efqd = &q->elevator->efqd;
> +
> + ioq->slice_end = jiffies + ioq->entity.budget;
> + elv_log_ioq(efqd, ioq, "set_slice=%lu", ioq->entity.budget);
> +}
> +
> +static void elv_ioq_update_io_thinktime(struct io_queue *ioq)
> +{
> + struct elv_fq_data *efqd = ioq->efqd;
> + unsigned long elapsed = jiffies - ioq->last_end_request;
> + unsigned long ttime = min(elapsed, 2UL * efqd->elv_slice_idle);
> +
> + ioq->ttime_samples = (7*ioq->ttime_samples + 256) / 8;
> + ioq->ttime_total = (7*ioq->ttime_total + 256*ttime) / 8;
> + ioq->ttime_mean = (ioq->ttime_total + 128) / ioq->ttime_samples;
> +}

Not sure I understand the magical 7, 8, 2, 128 and 256. Please help me
understand the algorithm.

> +
> +/*
> + * Disable idle window if the process thinks too long.
> + * This idle flag can also be updated by io scheduler.
> + */
> +static void elv_ioq_update_idle_window(struct elevator_queue *eq,
> + struct io_queue *ioq, struct request *rq)
> +{
> + int old_idle, enable_idle;
> + struct elv_fq_data *efqd = ioq->efqd;
> +
> + /*
> + * Don't idle for async or idle io prio class
> + */
> + if (!elv_ioq_sync(ioq) || elv_ioq_class_idle(ioq))
> + return;
> +
> + enable_idle = old_idle = elv_ioq_idle_window(ioq);
> +
> + if (!efqd->elv_slice_idle)
> + enable_idle = 0;
> + else if (ioq_sample_valid(ioq->ttime_samples)) {
> + if (ioq->ttime_mean > efqd->elv_slice_idle)
> + enable_idle = 0;
> + else
> + enable_idle = 1;
> + }
> +
> + /*
> + * From think time perspective idle should be enabled. Check with
> + * io scheduler if it wants to disable idling based on additional
> + * considrations like seek pattern.
> + */
> + if (enable_idle) {
> + if (eq->ops->elevator_update_idle_window_fn)
> + enable_idle = eq->ops->elevator_update_idle_window_fn(
> + eq, ioq->sched_queue, rq);
> + if (!enable_idle)
> + elv_log_ioq(efqd, ioq, "iosched disabled idle");
> + }
> +
> + if (old_idle != enable_idle) {
> + elv_log_ioq(efqd, ioq, "idle=%d", enable_idle);
> + if (enable_idle)
> + elv_mark_ioq_idle_window(ioq);
> + else
> + elv_clear_ioq_idle_window(ioq);
> + }
> +}
> +
> +struct io_queue *elv_alloc_ioq(struct request_queue *q, gfp_t gfp_mask)
> +{
> + struct io_queue *ioq = NULL;
> +
> + ioq = kmem_cache_alloc_node(elv_ioq_pool, gfp_mask, q->node);
> + return ioq;
> +}
> +EXPORT_SYMBOL(elv_alloc_ioq);
> +
> +void elv_free_ioq(struct io_queue *ioq)
> +{
> + kmem_cache_free(elv_ioq_pool, ioq);
> +}
> +EXPORT_SYMBOL(elv_free_ioq);
> +
> +int elv_init_ioq(struct elevator_queue *eq, struct io_queue *ioq,
> + void *sched_queue, int ioprio_class, int ioprio,
> + int is_sync)
> +{
> + struct elv_fq_data *efqd = &eq->efqd;
> + struct io_group *iog = io_lookup_io_group_current(efqd->queue);
> +
> + RB_CLEAR_NODE(&ioq->entity.rb_node);
> + atomic_set(&ioq->ref, 0);
> + ioq->efqd = efqd;
> + elv_ioq_set_ioprio_class(ioq, ioprio_class);
> + elv_ioq_set_ioprio(ioq, ioprio);
> + ioq->pid = current->pid;

Is pid used for cgroup association later? I don't see why we save the
pid otherwise? If yes, why not store the cgroup of the current->pid?

> + ioq->sched_queue = sched_queue;
> + if (is_sync && !elv_ioq_class_idle(ioq))
> + elv_mark_ioq_idle_window(ioq);
> + bfq_init_entity(&ioq->entity, iog);
> + ioq->entity.budget = elv_prio_to_slice(efqd, ioq);
> + if (is_sync)
> + ioq->last_end_request = jiffies;
> +
> + return 0;
> +}
> +EXPORT_SYMBOL(elv_init_ioq);
> +
> +void elv_put_ioq(struct io_queue *ioq)
> +{
> + struct elv_fq_data *efqd = ioq->efqd;
> + struct elevator_queue *e = container_of(efqd, struct elevator_queue,
> + efqd);
> +
> + BUG_ON(atomic_read(&ioq->ref) <= 0);
> + if (!atomic_dec_and_test(&ioq->ref))
> + return;
> + BUG_ON(ioq->nr_queued);
> + BUG_ON(ioq->entity.tree != NULL);
> + BUG_ON(elv_ioq_busy(ioq));
> + BUG_ON(efqd->active_queue == ioq);
> +
> + /* Can be called by outgoing elevator. Don't use q */
> + BUG_ON(!e->ops->elevator_free_sched_queue_fn);
> +
> + e->ops->elevator_free_sched_queue_fn(e, ioq->sched_queue);
> + elv_log_ioq(efqd, ioq, "put_queue");
> + elv_free_ioq(ioq);
> +}
> +EXPORT_SYMBOL(elv_put_ioq);
> +
> +void elv_release_ioq(struct elevator_queue *e, struct io_queue **ioq_ptr)
> +{
> + struct io_queue *ioq = *ioq_ptr;
> +
> + if (ioq != NULL) {
> + /* Drop the reference taken by the io group */
> + elv_put_ioq(ioq);
> + *ioq_ptr = NULL;
> + }
> +}
> +
> +/*
> + * Normally next io queue to be served is selected from the service tree.
> + * This function allows one to choose a specific io queue to run next
> + * out of order. This is primarily to accomodate the close_cooperator
> + * feature of cfq.
> + *
> + * Currently it is done only for root level as to begin with supporting
> + * close cooperator feature only for root group to make sure default
> + * cfq behavior in flat hierarchy is not changed.
> + */
> +void elv_set_next_ioq(struct request_queue *q, struct io_queue *ioq)
> +{
> + struct elv_fq_data *efqd = &q->elevator->efqd;
> + struct io_entity *entity = &ioq->entity;
> + struct io_sched_data *sd = &efqd->root_group->sched_data;
> + struct io_service_tree *st = io_entity_service_tree(entity);
> +
> + BUG_ON(efqd->active_queue != NULL || sd->active_entity != NULL);
> + BUG_ON(!efqd->busy_queues);
> + BUG_ON(sd != entity->sched_data);
> + BUG_ON(!st);
> +
> + bfq_update_vtime(st);
> + bfq_active_extract(st, entity);
> + sd->active_entity = entity;
> + entity->service = 0;
> + elv_log_ioq(efqd, ioq, "set_next_ioq");
> +}
> +
> +/* Get next queue for service. */
> +struct io_queue *elv_get_next_ioq(struct request_queue *q, int extract)
> +{
> + struct elv_fq_data *efqd = &q->elevator->efqd;
> + struct io_entity *entity = NULL;
> + struct io_queue *ioq = NULL;
> + struct io_sched_data *sd;
> +
> + /*
> + * We should not call lookup when an entity is active, as doing
> + * lookup can result in an erroneous vtime jump.
> + */
> + BUG_ON(efqd->active_queue != NULL);
> +
> + if (!efqd->busy_queues)
> + return NULL;
> +
> + sd = &efqd->root_group->sched_data;
> + entity = bfq_lookup_next_entity(sd, 1);
> +
> + BUG_ON(!entity);
> + if (extract)
> + entity->service = 0;
> + ioq = io_entity_to_ioq(entity);
> +
> + return ioq;
> +}
> +
> +/*
> + * coop tells that io scheduler selected a queue for us and we did not

coop?

> + * select the next queue based on fairness.
> + */
> +static void __elv_set_active_ioq(struct elv_fq_data *efqd, struct io_queue *ioq,
> + int coop)
> +{
> + struct request_queue *q = efqd->queue;
> +
> + if (ioq) {
> + elv_log_ioq(efqd, ioq, "set_active, busy=%d",
> + efqd->busy_queues);
> + ioq->slice_end = 0;
> +
> + elv_clear_ioq_wait_request(ioq);
> + elv_clear_ioq_must_dispatch(ioq);
> + elv_mark_ioq_slice_new(ioq);
> +
> + del_timer(&efqd->idle_slice_timer);
> + }
> +
> + efqd->active_queue = ioq;
> +
> + /* Let iosched know if it wants to take some action */
> + if (ioq) {
> + if (q->elevator->ops->elevator_active_ioq_set_fn)
> + q->elevator->ops->elevator_active_ioq_set_fn(q,
> + ioq->sched_queue, coop);
> + }
> +}
> +
> +/* Get and set a new active queue for service. */
> +struct io_queue *elv_set_active_ioq(struct request_queue *q,
> + struct io_queue *ioq)
> +{
> + struct elv_fq_data *efqd = &q->elevator->efqd;
> + int coop = 0;
> +
> + if (!ioq)
> + ioq = elv_get_next_ioq(q, 1);
> + else {
> + elv_set_next_ioq(q, ioq);
> + /*
> + * io scheduler selected the next queue for us. Pass this
> + * this info back to io scheudler. cfq currently uses it
> + * to reset coop flag on the queue.
> + */
> + coop = 1;
> + }
> + __elv_set_active_ioq(efqd, ioq, coop);
> + return ioq;
> +}
> +
> +void elv_reset_active_ioq(struct elv_fq_data *efqd)
> +{
> + struct request_queue *q = efqd->queue;
> + struct io_queue *ioq = elv_active_ioq(efqd->queue->elevator);
> +
> + if (q->elevator->ops->elevator_active_ioq_reset_fn)
> + q->elevator->ops->elevator_active_ioq_reset_fn(q,
> + ioq->sched_queue);
> + efqd->active_queue = NULL;
> + del_timer(&efqd->idle_slice_timer);
> +}
> +
> +void elv_activate_ioq(struct io_queue *ioq, int add_front)
> +{
> + bfq_activate_entity(&ioq->entity, add_front);
> +}
> +
> +void elv_deactivate_ioq(struct elv_fq_data *efqd, struct io_queue *ioq,
> + int requeue)
> +{
> + bfq_deactivate_entity(&ioq->entity, requeue);
> +}
> +
> +/* Called when an inactive queue receives a new request. */
> +void elv_add_ioq_busy(struct elv_fq_data *efqd, struct io_queue *ioq)
> +{
> + BUG_ON(elv_ioq_busy(ioq));
> + BUG_ON(ioq == efqd->active_queue);
> + elv_log_ioq(efqd, ioq, "add to busy");
> + elv_activate_ioq(ioq, 0);
> + elv_mark_ioq_busy(ioq);
> + efqd->busy_queues++;
> + if (elv_ioq_class_rt(ioq)) {
> + struct io_group *iog = ioq_to_io_group(ioq);
> + iog->busy_rt_queues++;
> + }
> +}
> +
> +void elv_del_ioq_busy(struct elevator_queue *e, struct io_queue *ioq,
> + int requeue)
> +{
> + struct elv_fq_data *efqd = &e->efqd;
> +
> + BUG_ON(!elv_ioq_busy(ioq));
> + BUG_ON(ioq->nr_queued);
> + elv_log_ioq(efqd, ioq, "del from busy");
> + elv_clear_ioq_busy(ioq);
> + BUG_ON(efqd->busy_queues == 0);
> + efqd->busy_queues--;
> + if (elv_ioq_class_rt(ioq)) {
> + struct io_group *iog = ioq_to_io_group(ioq);
> + iog->busy_rt_queues--;
> + }
> +
> + elv_deactivate_ioq(efqd, ioq, requeue);
> +}
> +
> +/*
> + * Do the accounting. Determine how much service (in terms of time slices)
> + * current queue used and adjust the start, finish time of queue and vtime
> + * of the tree accordingly.
> + *
> + * Determining the service used in terms of time is tricky in certain
> + * situations. Especially when underlying device supports command queuing
> + * and requests from multiple queues can be there at same time, then it
> + * is not clear which queue consumed how much of disk time.
> + *
> + * To mitigate this problem, cfq starts the time slice of the queue only
> + * after first request from the queue has completed. This does not work
> + * very well if we expire the queue before we wait for first and more
> + * request to finish from the queue. For seeky queues, we will expire the
> + * queue after dispatching few requests without waiting and start dispatching
> + * from next queue.
> + *
> + * Not sure how to determine the time consumed by queue in such scenarios.
> + * Currently as a crude approximation, we are charging 25% of time slice
> + * for such cases. A better mechanism is needed for accurate accounting.
> + */
> +void __elv_ioq_slice_expired(struct request_queue *q, struct io_queue *ioq)
> +{
> + struct elv_fq_data *efqd = &q->elevator->efqd;
> + struct io_entity *entity = &ioq->entity;
> + long slice_unused = 0, slice_used = 0, slice_overshoot = 0;
> +
> + assert_spin_locked(q->queue_lock);
> + elv_log_ioq(efqd, ioq, "slice expired");
> +
> + if (elv_ioq_wait_request(ioq))
> + del_timer(&efqd->idle_slice_timer);
> +
> + elv_clear_ioq_wait_request(ioq);
> +
> + /*
> + * if ioq->slice_end = 0, that means a queue was expired before first
> + * reuqest from the queue got completed. Of course we are not planning
> + * to idle on the queue otherwise we would not have expired it.
> + *
> + * Charge for the 25% slice in such cases. This is not the best thing
> + * to do but at the same time not very sure what's the next best
> + * thing to do.
> + *
> + * This arises from that fact that we don't have the notion of
> + * one queue being operational at one time. io scheduler can dispatch
> + * requests from multiple queues in one dispatch round. Ideally for
> + * more accurate accounting of exact disk time used by disk, one
> + * should dispatch requests from only one queue and wait for all
> + * the requests to finish. But this will reduce throughput.
> + */
> + if (!ioq->slice_end)
> + slice_used = entity->budget/4;
> + else {
> + if (time_after(ioq->slice_end, jiffies)) {
> + slice_unused = ioq->slice_end - jiffies;
> + if (slice_unused == entity->budget) {
> + /*
> + * queue got expired immediately after
> + * completing first request. Charge 25% of
> + * slice.
> + */
> + slice_used = entity->budget/4;
> + } else
> + slice_used = entity->budget - slice_unused;
> + } else {
> + slice_overshoot = jiffies - ioq->slice_end;
> + slice_used = entity->budget + slice_overshoot;
> + }
> + }
> +
> + elv_log_ioq(efqd, ioq, "sl_end=%lx, jiffies=%lx", ioq->slice_end,
> + jiffies);
> + elv_log_ioq(efqd, ioq, "sl_used=%ld, budget=%ld overshoot=%ld",
> + slice_used, entity->budget, slice_overshoot);
> + elv_ioq_served(ioq, slice_used);
> +
> + BUG_ON(ioq != efqd->active_queue);
> + elv_reset_active_ioq(efqd);
> +
> + if (!ioq->nr_queued)
> + elv_del_ioq_busy(q->elevator, ioq, 1);
> + else
> + elv_activate_ioq(ioq, 0);
> +}
> +EXPORT_SYMBOL(__elv_ioq_slice_expired);
> +
> +/*
> + * Expire the ioq.
> + */
> +void elv_ioq_slice_expired(struct request_queue *q)
> +{
> + struct io_queue *ioq = elv_active_ioq(q->elevator);
> +
> + if (ioq)
> + __elv_ioq_slice_expired(q, ioq);
> +}
> +
> +/*
> + * Check if new_cfqq should preempt the currently active queue. Return 0 for
> + * no or if we aren't sure, a 1 will cause a preemption attempt.
> + */
> +int elv_should_preempt(struct request_queue *q, struct io_queue *new_ioq,
> + struct request *rq)
> +{
> + struct io_queue *ioq;
> + struct elevator_queue *eq = q->elevator;
> + struct io_entity *entity, *new_entity;
> +
> + ioq = elv_active_ioq(eq);
> +
> + if (!ioq)
> + return 0;
> +
> + entity = &ioq->entity;
> + new_entity = &new_ioq->entity;
> +
> + /*
> + * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
> + */
> +
> + if (new_entity->ioprio_class == IOPRIO_CLASS_RT
> + && entity->ioprio_class != IOPRIO_CLASS_RT)
> + return 1;
> + /*
> + * Allow an BE request to pre-empt an ongoing IDLE clas timeslice.
> + */
> +
> + if (new_entity->ioprio_class == IOPRIO_CLASS_BE
> + && entity->ioprio_class == IOPRIO_CLASS_IDLE)
> + return 1;
> +
> + /*
> + * Check with io scheduler if it has additional criterion based on
> + * which it wants to preempt existing queue.
> + */
> + if (eq->ops->elevator_should_preempt_fn)
> + return eq->ops->elevator_should_preempt_fn(q,
> + ioq_sched_queue(new_ioq), rq);
> +
> + return 0;
> +}
> +
> +static void elv_preempt_queue(struct request_queue *q, struct io_queue *ioq)
> +{
> + elv_log_ioq(&q->elevator->efqd, ioq, "preempt");
> + elv_ioq_slice_expired(q);
> +
> + /*
> + * Put the new queue at the front of the of the current list,
> + * so we know that it will be selected next.
> + */
> +
> + elv_activate_ioq(ioq, 1);
> + elv_ioq_set_slice_end(ioq, 0);
> + elv_mark_ioq_slice_new(ioq);
> +}
> +
> +void elv_ioq_request_add(struct request_queue *q, struct request *rq)
> +{
> + struct elv_fq_data *efqd = &q->elevator->efqd;
> + struct io_queue *ioq = rq->ioq;
> +
> + if (!elv_iosched_fair_queuing_enabled(q->elevator))
> + return;
> +
> + BUG_ON(!efqd);
> + BUG_ON(!ioq);
> + efqd->rq_queued++;
> + ioq->nr_queued++;
> +
> + if (!elv_ioq_busy(ioq))
> + elv_add_ioq_busy(efqd, ioq);
> +
> + elv_ioq_update_io_thinktime(ioq);
> + elv_ioq_update_idle_window(q->elevator, ioq, rq);
> +
> + if (ioq == elv_active_ioq(q->elevator)) {
> + /*
> + * Remember that we saw a request from this process, but
> + * don't start queuing just yet. Otherwise we risk seeing lots
> + * of tiny requests, because we disrupt the normal plugging
> + * and merging. If the request is already larger than a single
> + * page, let it rip immediately. For that case we assume that
> + * merging is already done. Ditto for a busy system that
> + * has other work pending, don't risk delaying until the
> + * idle timer unplug to continue working.
> + */
> + if (elv_ioq_wait_request(ioq)) {
> + if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
> + efqd->busy_queues > 1) {
> + del_timer(&efqd->idle_slice_timer);
> + blk_start_queueing(q);
> + }
> + elv_mark_ioq_must_dispatch(ioq);
> + }
> + } else if (elv_should_preempt(q, ioq, rq)) {
> + /*
> + * not the active queue - expire current slice if it is
> + * idle and has expired it's mean thinktime or this new queue
> + * has some old slice time left and is of higher priority or
> + * this new queue is RT and the current one is BE
> + */
> + elv_preempt_queue(q, ioq);
> + blk_start_queueing(q);
> + }
> +}
> +
> +void elv_idle_slice_timer(unsigned long data)
> +{
> + struct elv_fq_data *efqd = (struct elv_fq_data *)data;
> + struct io_queue *ioq;
> + unsigned long flags;
> + struct request_queue *q = efqd->queue;
> +
> + elv_log(efqd, "idle timer fired");
> +
> + spin_lock_irqsave(q->queue_lock, flags);
> +
> + ioq = efqd->active_queue;
> +
> + if (ioq) {
> +
> + /*
> + * We saw a request before the queue expired, let it through
> + */
> + if (elv_ioq_must_dispatch(ioq))
> + goto out_kick;
> +
> + /*
> + * expired
> + */
> + if (elv_ioq_slice_used(ioq))
> + goto expire;
> +
> + /*
> + * only expire and reinvoke request handler, if there are
> + * other queues with pending requests
> + */
> + if (!elv_nr_busy_ioq(q->elevator))
> + goto out_cont;
> +
> + /*
> + * not expired and it has a request pending, let it dispatch
> + */
> + if (ioq->nr_queued)
> + goto out_kick;
> + }
> +expire:
> + elv_ioq_slice_expired(q);
> +out_kick:
> + elv_schedule_dispatch(q);
> +out_cont:
> + spin_unlock_irqrestore(q->queue_lock, flags);
> +}
> +
> +void elv_ioq_arm_slice_timer(struct request_queue *q)
> +{
> + struct elv_fq_data *efqd = &q->elevator->efqd;
> + struct io_queue *ioq = elv_active_ioq(q->elevator);
> + unsigned long sl;
> +
> + BUG_ON(!ioq);
> +
> + /*
> + * SSD device without seek penalty, disable idling. But only do so
> + * for devices that support queuing, otherwise we still have a problem
> + * with sync vs async workloads.
> + */
> + if (blk_queue_nonrot(q) && efqd->hw_tag)
> + return;
> +
> + /*
> + * still requests with the driver, don't idle
> + */
> + if (efqd->rq_in_driver)
> + return;
> +
> + /*
> + * idle is disabled, either manually or by past process history
> + */
> + if (!efqd->elv_slice_idle || !elv_ioq_idle_window(ioq))
> + return;
> +
> + /*
> + * may be iosched got its own idling logic. In that case io
> + * schduler will take care of arming the timer, if need be.
> + */
> + if (q->elevator->ops->elevator_arm_slice_timer_fn) {
> + q->elevator->ops->elevator_arm_slice_timer_fn(q,
> + ioq->sched_queue);
> + } else {
> + elv_mark_ioq_wait_request(ioq);
> + sl = efqd->elv_slice_idle;
> + mod_timer(&efqd->idle_slice_timer, jiffies + sl);
> + elv_log_ioq(efqd, ioq, "arm idle: %lu", sl);
> + }
> +}
> +
> +/* Common layer function to select the next queue to dispatch from */
> +void *elv_fq_select_ioq(struct request_queue *q, int force)
> +{
> + struct elv_fq_data *efqd = &q->elevator->efqd;
> + struct io_queue *new_ioq = NULL, *ioq = elv_active_ioq(q->elevator);
> + struct io_group *iog;
> +
> + if (!elv_nr_busy_ioq(q->elevator))
> + return NULL;
> +
> + if (ioq == NULL)
> + goto new_queue;
> +
> + /*
> + * Force dispatch. Continue to dispatch from current queue as long
> + * as it has requests.
> + */
> + if (unlikely(force)) {
> + if (ioq->nr_queued)
> + goto keep_queue;
> + else
> + goto expire;
> + }
> +
> + /*
> + * The active queue has run out of time, expire it and select new.
> + */
> + if (elv_ioq_slice_used(ioq) && !elv_ioq_must_dispatch(ioq))
> + goto expire;
> +
> + /*
> + * If we have a RT cfqq waiting, then we pre-empt the current non-rt
> + * cfqq.
> + */
> + iog = ioq_to_io_group(ioq);
> +
> + if (!elv_ioq_class_rt(ioq) && iog->busy_rt_queues) {
> + /*
> + * We simulate this as cfqq timed out so that it gets to bank
> + * the remaining of its time slice.
> + */
> + elv_log_ioq(efqd, ioq, "preempt");
> + goto expire;
> + }
> +
> + /*
> + * The active queue has requests and isn't expired, allow it to
> + * dispatch.
> + */
> +
> + if (ioq->nr_queued)
> + goto keep_queue;
> +
> + /*
> + * If another queue has a request waiting within our mean seek
> + * distance, let it run. The expire code will check for close
> + * cooperators and put the close queue at the front of the service
> + * tree.
> + */
> + new_ioq = elv_close_cooperator(q, ioq, 0);
> + if (new_ioq)
> + goto expire;
> +
> + /*
> + * No requests pending. If the active queue still has requests in
> + * flight or is idling for a new request, allow either of these
> + * conditions to happen (or time out) before selecting a new queue.
> + */
> +
> + if (timer_pending(&efqd->idle_slice_timer) ||
> + (elv_ioq_nr_dispatched(ioq) && elv_ioq_idle_window(ioq))) {
> + ioq = NULL;
> + goto keep_queue;
> + }
> +
> +expire:
> + elv_ioq_slice_expired(q);
> +new_queue:
> + ioq = elv_set_active_ioq(q, new_ioq);
> +keep_queue:
> + return ioq;
> +}
> +
> +/* A request got removed from io_queue. Do the accounting */
> +void elv_ioq_request_removed(struct elevator_queue *e, struct request *rq)
> +{
> + struct io_queue *ioq;
> + struct elv_fq_data *efqd;
> +
> + if (!elv_iosched_fair_queuing_enabled(e))
> + return;
> +
> + ioq = rq->ioq;
> + BUG_ON(!ioq);
> + ioq->nr_queued--;
> +
> + efqd = ioq->efqd;
> + BUG_ON(!efqd);
> + efqd->rq_queued--;
> +
> + if (elv_ioq_busy(ioq) && (elv_active_ioq(e) != ioq) && !ioq->nr_queued)
> + elv_del_ioq_busy(e, ioq, 1);
> +}
> +
> +/* A request got dispatched. Do the accounting. */
> +void elv_fq_dispatched_request(struct elevator_queue *e, struct request *rq)
> +{
> + struct io_queue *ioq = rq->ioq;
> +
> + if (!elv_iosched_fair_queuing_enabled(e))
> + return;
> +
> + BUG_ON(!ioq);
> + elv_ioq_request_dispatched(ioq);
> + elv_ioq_request_removed(e, rq);
> + elv_clear_ioq_must_dispatch(ioq);
> +}
> +
> +void elv_fq_activate_rq(struct request_queue *q, struct request *rq)
> +{
> + struct elv_fq_data *efqd = &q->elevator->efqd;
> +
> + if (!elv_iosched_fair_queuing_enabled(q->elevator))
> + return;
> +
> + efqd->rq_in_driver++;
> + elv_log_ioq(efqd, rq_ioq(rq), "activate rq, drv=%d",
> + efqd->rq_in_driver);
> +}
> +
> +void elv_fq_deactivate_rq(struct request_queue *q, struct request *rq)
> +{
> + struct elv_fq_data *efqd = &q->elevator->efqd;
> +
> + if (!elv_iosched_fair_queuing_enabled(q->elevator))
> + return;
> +
> + WARN_ON(!efqd->rq_in_driver);
> + efqd->rq_in_driver--;
> + elv_log_ioq(efqd, rq_ioq(rq), "deactivate rq, drv=%d",
> + efqd->rq_in_driver);
> +}
> +
> +/*
> + * Update hw_tag based on peak queue depth over 50 samples under
> + * sufficient load.
> + */
> +static void elv_update_hw_tag(struct elv_fq_data *efqd)
> +{
> + if (efqd->rq_in_driver > efqd->rq_in_driver_peak)
> + efqd->rq_in_driver_peak = efqd->rq_in_driver;
> +
> + if (efqd->rq_queued <= ELV_HW_QUEUE_MIN &&
> + efqd->rq_in_driver <= ELV_HW_QUEUE_MIN)
> + return;
> +
> + if (efqd->hw_tag_samples++ < 50)
> + return;
> +
> + if (efqd->rq_in_driver_peak >= ELV_HW_QUEUE_MIN)
> + efqd->hw_tag = 1;
> + else
> + efqd->hw_tag = 0;
> +
> + efqd->hw_tag_samples = 0;
> + efqd->rq_in_driver_peak = 0;
> +}
> +
> +/*
> + * If ioscheduler has functionality of keeping track of close cooperator, check
> + * with it if it has got a closely co-operating queue.
> + */
> +static inline struct io_queue *elv_close_cooperator(struct request_queue *q,
> + struct io_queue *ioq, int probe)
> +{
> + struct elevator_queue *e = q->elevator;
> + struct io_queue *new_ioq = NULL;
> +
> + /*
> + * Currently this feature is supported only for flat hierarchy or
> + * root group queues so that default cfq behavior is not changed.
> + */
> + if (!is_root_group_ioq(q, ioq))
> + return NULL;
> +
> + if (q->elevator->ops->elevator_close_cooperator_fn)
> + new_ioq = e->ops->elevator_close_cooperator_fn(q,
> + ioq->sched_queue, probe);
> +
> + /* Only select co-operating queue if it belongs to root group */
> + if (new_ioq && !is_root_group_ioq(q, new_ioq))
> + return NULL;
> +
> + return new_ioq;
> +}
> +
> +/* A request got completed from io_queue. Do the accounting. */
> +void elv_ioq_completed_request(struct request_queue *q, struct request *rq)
> +{
> + const int sync = rq_is_sync(rq);
> + struct io_queue *ioq;
> + struct elv_fq_data *efqd = &q->elevator->efqd;
> +
> + if (!elv_iosched_fair_queuing_enabled(q->elevator))
> + return;
> +
> + ioq = rq->ioq;
> +
> + elv_log_ioq(efqd, ioq, "complete");
> +
> + elv_update_hw_tag(efqd);
> +
> + WARN_ON(!efqd->rq_in_driver);
> + WARN_ON(!ioq->dispatched);
> + efqd->rq_in_driver--;
> + ioq->dispatched--;
> +
> + if (sync)
> + ioq->last_end_request = jiffies;
> +
> + /*
> + * If this is the active queue, check if it needs to be expired,
> + * or if we want to idle in case it has no pending requests.
> + */
> +
> + if (elv_active_ioq(q->elevator) == ioq) {
> + if (elv_ioq_slice_new(ioq)) {
> + elv_ioq_set_prio_slice(q, ioq);
> + elv_clear_ioq_slice_new(ioq);
> + }
> + /*
> + * If there are no requests waiting in this queue, and
> + * there are other queues ready to issue requests, AND
> + * those other queues are issuing requests within our
> + * mean seek distance, give them a chance to run instead
> + * of idling.
> + */
> + if (elv_ioq_slice_used(ioq) || elv_ioq_class_idle(ioq))
> + elv_ioq_slice_expired(q);
> + else if (!ioq->nr_queued && !elv_close_cooperator(q, ioq, 1)
> + && sync && !rq_noidle(rq))
> + elv_ioq_arm_slice_timer(q);
> + }
> +
> + if (!efqd->rq_in_driver)
> + elv_schedule_dispatch(q);
> +}
> +
> +struct io_group *io_lookup_io_group_current(struct request_queue *q)
> +{
> + struct elv_fq_data *efqd = &q->elevator->efqd;
> +
> + return efqd->root_group;
> +}
> +EXPORT_SYMBOL(io_lookup_io_group_current);
> +
> +void *io_group_async_queue_prio(struct io_group *iog, int ioprio_class,
> + int ioprio)
> +{
> + struct io_queue *ioq = NULL;
> +
> + switch (ioprio_class) {
> + case IOPRIO_CLASS_RT:
> + ioq = iog->async_queue[0][ioprio];
> + break;
> + case IOPRIO_CLASS_BE:
> + ioq = iog->async_queue[1][ioprio];
> + break;
> + case IOPRIO_CLASS_IDLE:
> + ioq = iog->async_idle_queue;
> + break;
> + default:
> + BUG();
> + }
> +
> + if (ioq)
> + return ioq->sched_queue;
> + return NULL;
> +}
> +EXPORT_SYMBOL(io_group_async_queue_prio);
> +
> +void io_group_set_async_queue(struct io_group *iog, int ioprio_class,
> + int ioprio, struct io_queue *ioq)
> +{
> + switch (ioprio_class) {
> + case IOPRIO_CLASS_RT:
> + iog->async_queue[0][ioprio] = ioq;
> + break;
> + case IOPRIO_CLASS_BE:
> + iog->async_queue[1][ioprio] = ioq;
> + break;
> + case IOPRIO_CLASS_IDLE:
> + iog->async_idle_queue = ioq;
> + break;
> + default:
> + BUG();
> + }
> +
> + /*
> + * Take the group reference and pin the queue. Group exit will
> + * clean it up
> + */
> + elv_get_ioq(ioq);
> +}
> +EXPORT_SYMBOL(io_group_set_async_queue);
> +
> +/*
> + * Release all the io group references to its async queues.
> + */
> +void io_put_io_group_queues(struct elevator_queue *e, struct io_group *iog)
> +{
> + int i, j;
> +
> + for (i = 0; i < 2; i++)
> + for (j = 0; j < IOPRIO_BE_NR; j++)
> + elv_release_ioq(e, &iog->async_queue[i][j]);
> +
> + /* Free up async idle queue */
> + elv_release_ioq(e, &iog->async_idle_queue);
> +}
> +
> +struct io_group *io_alloc_root_group(struct request_queue *q,
> + struct elevator_queue *e, void *key)
> +{
> + struct io_group *iog;
> + int i;
> +
> + iog = kmalloc_node(sizeof(*iog), GFP_KERNEL | __GFP_ZERO, q->node);
> + if (iog == NULL)
> + return NULL;
> +
> + for (i = 0; i < IO_IOPRIO_CLASSES; i++)
> + iog->sched_data.service_tree[i] = IO_SERVICE_TREE_INIT;
> +
> + return iog;
> +}
> +
> +void io_free_root_group(struct elevator_queue *e)
> +{
> + struct io_group *iog = e->efqd.root_group;
> + struct io_service_tree *st;
> + int i;
> +
> + for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
> + st = iog->sched_data.service_tree + i;
> + io_flush_idle_tree(st);
> + }
> +
> + io_put_io_group_queues(e, iog);
> + kfree(iog);
> +}
> +
> +static void elv_slab_kill(void)
> +{
> + /*
> + * Caller already ensured that pending RCU callbacks are completed,
> + * so we should have no busy allocations at this point.
> + */
> + if (elv_ioq_pool)
> + kmem_cache_destroy(elv_ioq_pool);
> +}
> +
> +static int __init elv_slab_setup(void)
> +{
> + elv_ioq_pool = KMEM_CACHE(io_queue, 0);
> + if (!elv_ioq_pool)
> + goto fail;
> +
> + return 0;
> +fail:
> + elv_slab_kill();
> + return -ENOMEM;
> +}
> +
> +/* Initialize fair queueing data associated with elevator */
> +int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e)
> +{
> + struct io_group *iog;
> + struct elv_fq_data *efqd = &e->efqd;
> +
> + if (!elv_iosched_fair_queuing_enabled(e))
> + return 0;
> +
> + iog = io_alloc_root_group(q, e, efqd);
> + if (iog == NULL)
> + return 1;
> +
> + efqd->root_group = iog;
> + efqd->queue = q;
> +
> + init_timer(&efqd->idle_slice_timer);
> + efqd->idle_slice_timer.function = elv_idle_slice_timer;
> + efqd->idle_slice_timer.data = (unsigned long) efqd;
> +
> + INIT_WORK(&efqd->unplug_work, elv_kick_queue);
> +
> + efqd->elv_slice[0] = elv_slice_async;
> + efqd->elv_slice[1] = elv_slice_sync;
> + efqd->elv_slice_idle = elv_slice_idle;
> + efqd->hw_tag = 1;
> +
> + return 0;
> +}
> +
> +/*
> + * elv_exit_fq_data is called before we call elevator_exit_fn. Before
> + * we ask elevator to cleanup its queues, we do the cleanup here so
> + * that all the group and idle tree references to ioq are dropped. Later
> + * during elevator cleanup, ioc reference will be dropped which will lead
> + * to removal of ioscheduler queue as well as associated ioq object.
> + */
> +void elv_exit_fq_data(struct elevator_queue *e)
> +{
> + struct elv_fq_data *efqd = &e->efqd;
> +
> + if (!elv_iosched_fair_queuing_enabled(e))
> + return;
> +
> + elv_shutdown_timer_wq(e);
> +
> + BUG_ON(timer_pending(&efqd->idle_slice_timer));
> + io_free_root_group(e);
> +}
> +
> +/*
> + * This is called after the io scheduler has cleaned up its data structres.
> + * I don't think that this function is required. Right now just keeping it
> + * because cfq cleans up timer and work queue again after freeing up
> + * io contexts. To me io scheduler has already been drained out, and all
> + * the active queue have already been expired so time and work queue should
> + * not been activated during cleanup process.
> + *
> + * Keeping it here for the time being. Will get rid of it later.
> + */
> +void elv_exit_fq_data_post(struct elevator_queue *e)
> +{
> + struct elv_fq_data *efqd = &e->efqd;
> +
> + if (!elv_iosched_fair_queuing_enabled(e))
> + return;
> +
> + elv_shutdown_timer_wq(e);
> + BUG_ON(timer_pending(&efqd->idle_slice_timer));
> +}
> +
> +
> +static int __init elv_fq_init(void)
> +{
> + if (elv_slab_setup())
> + return -ENOMEM;
> +
> + /* could be 0 on HZ < 1000 setups */
> +
> + if (!elv_slice_async)
> + elv_slice_async = 1;
> +
> + if (!elv_slice_idle)
> + elv_slice_idle = 1;
> +
> + return 0;
> +}
> +
> +module_init(elv_fq_init);
> diff --git a/block/elevator-fq.h b/block/elevator-fq.h
> new file mode 100644
> index 0000000..5b6c1cc
> --- /dev/null
> +++ b/block/elevator-fq.h
> @@ -0,0 +1,473 @@
> +/*
> + * BFQ: data structures and common functions prototypes.
> + *
> + * Based on ideas and code from CFQ:
> + * Copyright (C) 2003 Jens Axboe <axboe@xxxxxxxxx>
> + *
> + * Copyright (C) 2008 Fabio Checconi <fabio@xxxxxxxxxxxxxxxx>
> + * Paolo Valente <paolo.valente@xxxxxxxxxx>
> + * Copyright (C) 2009 Vivek Goyal <vgoyal@xxxxxxxxxx>
> + * Nauman Rafique <nauman@xxxxxxxxxx>
> + */
> +
> +#include <linux/blkdev.h>
> +
> +#ifndef _BFQ_SCHED_H
> +#define _BFQ_SCHED_H
> +
> +#define IO_IOPRIO_CLASSES 3
> +
> +typedef u64 bfq_timestamp_t;
> +typedef unsigned long bfq_weight_t;
> +typedef unsigned long bfq_service_t;

Does this abstraction really provide any benefit? Why not directly use
the standard C types, make the code easier to read.

> +struct io_entity;
> +struct io_queue;
> +
> +#ifdef CONFIG_ELV_FAIR_QUEUING
> +
> +#define ELV_ATTR(name) \
> + __ATTR(name, S_IRUGO|S_IWUSR, elv_##name##_show, elv_##name##_store)
> +
> +/**
> + * struct bfq_service_tree - per ioprio_class service tree.

Comment is old, does not reflect the newer name

> + * @active: tree for active entities (i.e., those backlogged).
> + * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
> + * @first_idle: idle entity with minimum F_i.
> + * @last_idle: idle entity with maximum F_i.
> + * @vtime: scheduler virtual time.
> + * @wsum: scheduler weight sum; active and idle entities contribute to it.
> + *
> + * Each service tree represents a B-WF2Q+ scheduler on its own. Each
> + * ioprio_class has its own independent scheduler, and so its own
> + * bfq_service_tree. All the fields are protected by the queue lock
> + * of the containing efqd.
> + */
> +struct io_service_tree {
> + struct rb_root active;
> + struct rb_root idle;
> +
> + struct io_entity *first_idle;
> + struct io_entity *last_idle;
> +
> + bfq_timestamp_t vtime;
> + bfq_weight_t wsum;
> +};
> +
> +/**
> + * struct bfq_sched_data - multi-class scheduler.

Again the naming convention is broken, you need to change several
bfq's to io's :)

> + * @active_entity: entity under service.
> + * @next_active: head-of-the-line entity in the scheduler.
> + * @service_tree: array of service trees, one per ioprio_class.
> + *
> + * bfq_sched_data is the basic scheduler queue. It supports three
> + * ioprio_classes, and can be used either as a toplevel queue or as
> + * an intermediate queue on a hierarchical setup.
> + * @next_active points to the active entity of the sched_data service
> + * trees that will be scheduled next.
> + *
> + * The supported ioprio_classes are the same as in CFQ, in descending
> + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
> + * Requests from higher priority queues are served before all the
> + * requests from lower priority queues; among requests of the same
> + * queue requests are served according to B-WF2Q+.
> + * All the fields are protected by the queue lock of the containing bfqd.
> + */
> +struct io_sched_data {
> + struct io_entity *active_entity;
> + struct io_service_tree service_tree[IO_IOPRIO_CLASSES];
> +};
> +
> +/**
> + * struct bfq_entity - schedulable entity.
> + * @rb_node: service_tree member.
> + * @on_st: flag, true if the entity is on a tree (either the active or
> + * the idle one of its service_tree).
> + * @finish: B-WF2Q+ finish timestamp (aka F_i).
> + * @start: B-WF2Q+ start timestamp (aka S_i).

Could you mention what key is used in the rb_tree? start, finish
sounds like a range, so my suspicion is that start is used.

> + * @tree: tree the entity is enqueued into; %NULL if not on a tree.
> + * @min_start: minimum start time of the (active) subtree rooted at
> + * this entity; used for O(log N) lookups into active trees.

Used for O(log N) makes no sense to me, RBTree has a worst case
lookup time of O(log N), but what is the comment saying?

> + * @service: service received during the last round of service.
> + * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
> + * @weight: weight of the queue, calculated as IOPRIO_BE_NR - @ioprio.
> + * @parent: parent entity, for hierarchical scheduling.
> + * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
> + * associated scheduler queue, %NULL on leaf nodes.
> + * @sched_data: the scheduler queue this entity belongs to.
> + * @ioprio: the ioprio in use.
> + * @new_ioprio: when an ioprio change is requested, the new ioprio value
> + * @ioprio_class: the ioprio_class in use.
> + * @new_ioprio_class: when an ioprio_class change is requested, the new
> + * ioprio_class value.
> + * @ioprio_changed: flag, true when the user requested an ioprio or
> + * ioprio_class change.
> + *
> + * A bfq_entity is used to represent either a bfq_queue (leaf node in the
> + * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
> + * entity belongs to the sched_data of the parent group in the cgroup
> + * hierarchy. Non-leaf entities have also their own sched_data, stored
> + * in @my_sched_data.
> + *
> + * Each entity stores independently its priority values; this would allow
> + * different weights on different devices, but this functionality is not
> + * exported to userspace by now. Priorities are updated lazily, first
> + * storing the new values into the new_* fields, then setting the
> + * @ioprio_changed flag. As soon as there is a transition in the entity
> + * state that allows the priority update to take place the effective and
> + * the requested priority values are synchronized.
> + *
> + * The weight value is calculated from the ioprio to export the same
> + * interface as CFQ. When dealing with ``well-behaved'' queues (i.e.,
> + * queues that do not spend too much time to consume their budget and
> + * have true sequential behavior, and when there are no external factors
> + * breaking anticipation) the relative weights at each level of the
> + * cgroups hierarchy should be guaranteed.
> + * All the fields are protected by the queue lock of the containing bfqd.
> + */
> +struct io_entity {
> + struct rb_node rb_node;
> +
> + int on_st;
> +
> + bfq_timestamp_t finish;
> + bfq_timestamp_t start;
> +
> + struct rb_root *tree;
> +
> + bfq_timestamp_t min_start;
> +
> + bfq_service_t service, budget;
> + bfq_weight_t weight;
> +
> + struct io_entity *parent;
> +
> + struct io_sched_data *my_sched_data;
> + struct io_sched_data *sched_data;
> +
> + unsigned short ioprio, new_ioprio;
> + unsigned short ioprio_class, new_ioprio_class;
> +
> + int ioprio_changed;
> +};
> +
> +/*
> + * A common structure embedded by every io scheduler into their respective
> + * queue structure.
> + */
> +struct io_queue {
> + struct io_entity entity;

So the io_queue has an abstract entity called io_entity that contains
it QoS parameters? Correct?

> + atomic_t ref;
> + unsigned int flags;
> +
> + /* Pointer to generic elevator data structure */
> + struct elv_fq_data *efqd;
> + pid_t pid;

Why do we store the pid?

> +
> + /* Number of requests queued on this io queue */
> + unsigned long nr_queued;
> +
> + /* Requests dispatched from this queue */
> + int dispatched;
> +
> + /* Keep a track of think time of processes in this queue */
> + unsigned long last_end_request;
> + unsigned long ttime_total;
> + unsigned long ttime_samples;
> + unsigned long ttime_mean;
> +
> + unsigned long slice_end;
> +
> + /* Pointer to io scheduler's queue */
> + void *sched_queue;
> +};
> +
> +struct io_group {
> + struct io_sched_data sched_data;
> +
> + /* async_queue and idle_queue are used only for cfq */
> + struct io_queue *async_queue[2][IOPRIO_BE_NR];

Again the 2 is confusing

> + struct io_queue *async_idle_queue;
> +
> + /*
> + * Used to track any pending rt requests so we can pre-empt current
> + * non-RT cfqq in service when this value is non-zero.
> + */
> + unsigned int busy_rt_queues;
> +};
> +
> +struct elv_fq_data {

What does fq stand for?

> + struct io_group *root_group;
> +
> + struct request_queue *queue;
> + unsigned int busy_queues;
> +
> + /* Number of requests queued */
> + int rq_queued;
> +
> + /* Pointer to the ioscheduler queue being served */
> + void *active_queue;
> +
> + int rq_in_driver;
> + int hw_tag;
> + int hw_tag_samples;
> + int rq_in_driver_peak;

Some comments of _in_driver and _in_driver_peak would be nice.

> +
> + /*
> + * elevator fair queuing layer has the capability to provide idling
> + * for ensuring fairness for processes doing dependent reads.
> + * This might be needed to ensure fairness among two processes doing
> + * synchronous reads in two different cgroups. noop and deadline don't
> + * have any notion of anticipation/idling. As of now, these are the
> + * users of this functionality.
> + */
> + unsigned int elv_slice_idle;
> + struct timer_list idle_slice_timer;
> + struct work_struct unplug_work;
> +
> + unsigned int elv_slice[2];

Why [2] makes the code hearder to read

> +};
> +
> +extern int elv_slice_idle;
> +extern int elv_slice_async;
> +
> +/* Logging facilities. */
> +#define elv_log_ioq(efqd, ioq, fmt, args...) \
> + blk_add_trace_msg((efqd)->queue, "elv%d%c " fmt, (ioq)->pid, \
> + elv_ioq_sync(ioq) ? 'S' : 'A', ##args)
> +
> +#define elv_log(efqd, fmt, args...) \
> + blk_add_trace_msg((efqd)->queue, "elv " fmt, ##args)
> +
> +#define ioq_sample_valid(samples) ((samples) > 80)
> +
> +/* Some shared queue flag manipulation functions among elevators */
> +
> +enum elv_queue_state_flags {
> + ELV_QUEUE_FLAG_busy = 0, /* has requests or is under service */
> + ELV_QUEUE_FLAG_sync, /* synchronous queue */
> + ELV_QUEUE_FLAG_idle_window, /* elevator slice idling enabled */
> + ELV_QUEUE_FLAG_wait_request, /* waiting for a request */
> + ELV_QUEUE_FLAG_must_dispatch, /* must be allowed a dispatch */
> + ELV_QUEUE_FLAG_slice_new, /* no requests dispatched in slice */
> + ELV_QUEUE_FLAG_NR,
> +};
> +
> +#define ELV_IO_QUEUE_FLAG_FNS(name) \
> +static inline void elv_mark_ioq_##name(struct io_queue *ioq) \
> +{ \
> + (ioq)->flags |= (1 << ELV_QUEUE_FLAG_##name); \
> +} \
> +static inline void elv_clear_ioq_##name(struct io_queue *ioq) \
> +{ \
> + (ioq)->flags &= ~(1 << ELV_QUEUE_FLAG_##name); \
> +} \
> +static inline int elv_ioq_##name(struct io_queue *ioq) \
> +{ \
> + return ((ioq)->flags & (1 << ELV_QUEUE_FLAG_##name)) != 0; \
> +}
> +
> +ELV_IO_QUEUE_FLAG_FNS(busy)
> +ELV_IO_QUEUE_FLAG_FNS(sync)
> +ELV_IO_QUEUE_FLAG_FNS(wait_request)
> +ELV_IO_QUEUE_FLAG_FNS(must_dispatch)
> +ELV_IO_QUEUE_FLAG_FNS(idle_window)
> +ELV_IO_QUEUE_FLAG_FNS(slice_new)
> +
> +static inline struct io_service_tree *
> +io_entity_service_tree(struct io_entity *entity)
> +{
> + struct io_sched_data *sched_data = entity->sched_data;
> + unsigned int idx = entity->ioprio_class - 1;
> +
> + BUG_ON(idx >= IO_IOPRIO_CLASSES);
> + BUG_ON(sched_data == NULL);
> +
> + return sched_data->service_tree + idx;
> +}
> +
> +/* A request got dispatched from the io_queue. Do the accounting. */
> +static inline void elv_ioq_request_dispatched(struct io_queue *ioq)
> +{
> + ioq->dispatched++;
> +}
> +
> +static inline int elv_ioq_slice_used(struct io_queue *ioq)
> +{
> + if (elv_ioq_slice_new(ioq))
> + return 0;
> + if (time_before(jiffies, ioq->slice_end))
> + return 0;
> +
> + return 1;
> +}
> +
> +/* How many request are currently dispatched from the queue */
> +static inline int elv_ioq_nr_dispatched(struct io_queue *ioq)
> +{
> + return ioq->dispatched;
> +}
> +
> +/* How many request are currently queued in the queue */
> +static inline int elv_ioq_nr_queued(struct io_queue *ioq)
> +{
> + return ioq->nr_queued;
> +}
> +
> +static inline void elv_get_ioq(struct io_queue *ioq)
> +{
> + atomic_inc(&ioq->ref);
> +}
> +
> +static inline void elv_ioq_set_slice_end(struct io_queue *ioq,
> + unsigned long slice_end)
> +{
> + ioq->slice_end = slice_end;
> +}
> +
> +static inline int elv_ioq_class_idle(struct io_queue *ioq)
> +{
> + return ioq->entity.ioprio_class == IOPRIO_CLASS_IDLE;
> +}
> +
> +static inline int elv_ioq_class_rt(struct io_queue *ioq)
> +{
> + return ioq->entity.ioprio_class == IOPRIO_CLASS_RT;
> +}
> +
> +static inline int elv_ioq_ioprio_class(struct io_queue *ioq)
> +{
> + return ioq->entity.new_ioprio_class;
> +}
> +
> +static inline int elv_ioq_ioprio(struct io_queue *ioq)
> +{
> + return ioq->entity.new_ioprio;
> +}
> +
> +static inline void elv_ioq_set_ioprio_class(struct io_queue *ioq,
> + int ioprio_class)
> +{
> + ioq->entity.new_ioprio_class = ioprio_class;
> + ioq->entity.ioprio_changed = 1;
> +}
> +
> +static inline void elv_ioq_set_ioprio(struct io_queue *ioq, int ioprio)
> +{
> + ioq->entity.new_ioprio = ioprio;
> + ioq->entity.ioprio_changed = 1;
> +}
> +
> +static inline void *ioq_sched_queue(struct io_queue *ioq)
> +{
> + if (ioq)
> + return ioq->sched_queue;
> + return NULL;
> +}
> +
> +static inline struct io_group *ioq_to_io_group(struct io_queue *ioq)
> +{
> + return container_of(ioq->entity.sched_data, struct io_group,
> + sched_data);
> +}
> +
> +extern ssize_t elv_slice_idle_show(struct elevator_queue *q, char *name);
> +extern ssize_t elv_slice_idle_store(struct elevator_queue *q, const char *name,
> + size_t count);
> +extern ssize_t elv_slice_sync_show(struct elevator_queue *q, char *name);
> +extern ssize_t elv_slice_sync_store(struct elevator_queue *q, const char *name,
> + size_t count);
> +extern ssize_t elv_slice_async_show(struct elevator_queue *q, char *name);
> +extern ssize_t elv_slice_async_store(struct elevator_queue *q, const char *name,
> + size_t count);
> +
> +/* Functions used by elevator.c */
> +extern int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e);
> +extern void elv_exit_fq_data(struct elevator_queue *e);
> +extern void elv_exit_fq_data_post(struct elevator_queue *e);
> +
> +extern void elv_ioq_request_add(struct request_queue *q, struct request *rq);
> +extern void elv_ioq_request_removed(struct elevator_queue *e,
> + struct request *rq);
> +extern void elv_fq_dispatched_request(struct elevator_queue *e,
> + struct request *rq);
> +
> +extern void elv_fq_activate_rq(struct request_queue *q, struct request *rq);
> +extern void elv_fq_deactivate_rq(struct request_queue *q, struct request *rq);
> +
> +extern void elv_ioq_completed_request(struct request_queue *q,
> + struct request *rq);
> +
> +extern void *elv_fq_select_ioq(struct request_queue *q, int force);
> +extern struct io_queue *rq_ioq(struct request *rq);
> +
> +/* Functions used by io schedulers */
> +extern void elv_put_ioq(struct io_queue *ioq);
> +extern void __elv_ioq_slice_expired(struct request_queue *q,
> + struct io_queue *ioq);
> +extern int elv_init_ioq(struct elevator_queue *eq, struct io_queue *ioq,
> + void *sched_queue, int ioprio_class, int ioprio, int is_sync);
> +extern void elv_schedule_dispatch(struct request_queue *q);
> +extern int elv_hw_tag(struct elevator_queue *e);
> +extern void *elv_active_sched_queue(struct elevator_queue *e);
> +extern int elv_mod_idle_slice_timer(struct elevator_queue *eq,
> + unsigned long expires);
> +extern int elv_del_idle_slice_timer(struct elevator_queue *eq);
> +extern unsigned int elv_get_slice_idle(struct elevator_queue *eq);
> +extern void *io_group_async_queue_prio(struct io_group *iog, int ioprio_class,
> + int ioprio);
> +extern void io_group_set_async_queue(struct io_group *iog, int ioprio_class,
> + int ioprio, struct io_queue *ioq);
> +extern struct io_group *io_lookup_io_group_current(struct request_queue *q);
> +extern int elv_nr_busy_ioq(struct elevator_queue *e);
> +extern struct io_queue *elv_alloc_ioq(struct request_queue *q, gfp_t gfp_mask);
> +extern void elv_free_ioq(struct io_queue *ioq);
> +
> +#else /* CONFIG_ELV_FAIR_QUEUING */
> +
> +static inline int elv_init_fq_data(struct request_queue *q,
> + struct elevator_queue *e)
> +{
> + return 0;
> +}
> +
> +static inline void elv_exit_fq_data(struct elevator_queue *e) {}
> +static inline void elv_exit_fq_data_post(struct elevator_queue *e) {}
> +
> +static inline void elv_fq_activate_rq(struct request_queue *q,
> + struct request *rq)
> +{
> +}
> +
> +static inline void elv_fq_deactivate_rq(struct request_queue *q,
> + struct request *rq)
> +{
> +}
> +
> +static inline void elv_fq_dispatched_request(struct elevator_queue *e,
> + struct request *rq)
> +{
> +}
> +
> +static inline void elv_ioq_request_removed(struct elevator_queue *e,
> + struct request *rq)
> +{
> +}
> +
> +static inline void elv_ioq_request_add(struct request_queue *q,
> + struct request *rq)
> +{
> +}
> +
> +static inline void elv_ioq_completed_request(struct request_queue *q,
> + struct request *rq)
> +{
> +}
> +
> +static inline void *ioq_sched_queue(struct io_queue *ioq) { return NULL; }
> +static inline struct io_queue *rq_ioq(struct request *rq) { return NULL; }
> +static inline void *elv_fq_select_ioq(struct request_queue *q, int force)
> +{
> + return NULL;
> +}
> +#endif /* CONFIG_ELV_FAIR_QUEUING */
> +#endif /* _BFQ_SCHED_H */
> diff --git a/block/elevator.c b/block/elevator.c
> index 7073a90..c2f07f5 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -231,6 +231,9 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q,
> for (i = 0; i < ELV_HASH_ENTRIES; i++)
> INIT_HLIST_HEAD(&eq->hash[i]);
>
> + if (elv_init_fq_data(q, eq))
> + goto err;
> +
> return eq;
> err:
> kfree(eq);
> @@ -301,9 +304,11 @@ EXPORT_SYMBOL(elevator_init);
> void elevator_exit(struct elevator_queue *e)
> {
> mutex_lock(&e->sysfs_lock);
> + elv_exit_fq_data(e);
> if (e->ops->elevator_exit_fn)
> e->ops->elevator_exit_fn(e);
> e->ops = NULL;
> + elv_exit_fq_data_post(e);
> mutex_unlock(&e->sysfs_lock);
>
> kobject_put(&e->kobj);
> @@ -314,6 +319,8 @@ static void elv_activate_rq(struct request_queue *q, struct request *rq)
> {
> struct elevator_queue *e = q->elevator;
>
> + elv_fq_activate_rq(q, rq);
> +
> if (e->ops->elevator_activate_req_fn)
> e->ops->elevator_activate_req_fn(q, rq);
> }
> @@ -322,6 +329,8 @@ static void elv_deactivate_rq(struct request_queue *q, struct request *rq)
> {
> struct elevator_queue *e = q->elevator;
>
> + elv_fq_deactivate_rq(q, rq);
> +
> if (e->ops->elevator_deactivate_req_fn)
> e->ops->elevator_deactivate_req_fn(q, rq);
> }
> @@ -446,6 +455,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
> elv_rqhash_del(q, rq);
>
> q->nr_sorted--;
> + elv_fq_dispatched_request(q->elevator, rq);
>
> boundary = q->end_sector;
> stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED;
> @@ -486,6 +496,7 @@ void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
> elv_rqhash_del(q, rq);
>
> q->nr_sorted--;
> + elv_fq_dispatched_request(q->elevator, rq);
>
> q->end_sector = rq_end_sector(rq);
> q->boundary_rq = rq;
> @@ -553,6 +564,7 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
> elv_rqhash_del(q, next);
>
> q->nr_sorted--;
> + elv_ioq_request_removed(e, next);
> q->last_merge = rq;
> }
>
> @@ -657,12 +669,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
> q->last_merge = rq;
> }
>
> - /*
> - * Some ioscheds (cfq) run q->request_fn directly, so
> - * rq cannot be accessed after calling
> - * elevator_add_req_fn.
> - */
> q->elevator->ops->elevator_add_req_fn(q, rq);
> + elv_ioq_request_add(q, rq);
> break;
>
> case ELEVATOR_INSERT_REQUEUE:
> @@ -872,13 +880,12 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq)
>
> int elv_queue_empty(struct request_queue *q)
> {
> - struct elevator_queue *e = q->elevator;
> -
> if (!list_empty(&q->queue_head))
> return 0;
>
> - if (e->ops->elevator_queue_empty_fn)
> - return e->ops->elevator_queue_empty_fn(q);
> + /* Hopefully nr_sorted works and no need to call queue_empty_fn */
> + if (q->nr_sorted)
> + return 0;
>
> return 1;
> }
> @@ -953,8 +960,11 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
> */
> if (blk_account_rq(rq)) {
> q->in_flight--;
> - if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
> - e->ops->elevator_completed_req_fn(q, rq);
> + if (blk_sorted_rq(rq)) {
> + if (e->ops->elevator_completed_req_fn)
> + e->ops->elevator_completed_req_fn(q, rq);
> + elv_ioq_completed_request(q, rq);
> + }
> }
>
> /*
> @@ -1242,3 +1252,17 @@ struct request *elv_rb_latter_request(struct request_queue *q,
> return NULL;
> }
> EXPORT_SYMBOL(elv_rb_latter_request);
> +
> +/* Get the io scheduler queue pointer. For cfq, it is stored in rq->ioq*/
> +void *elv_get_sched_queue(struct request_queue *q, struct request *rq)
> +{
> + return ioq_sched_queue(rq_ioq(rq));
> +}
> +EXPORT_SYMBOL(elv_get_sched_queue);
> +
> +/* Select an ioscheduler queue to dispatch request from. */
> +void *elv_select_sched_queue(struct request_queue *q, int force)
> +{
> + return ioq_sched_queue(elv_fq_select_ioq(q, force));
> +}
> +EXPORT_SYMBOL(elv_select_sched_queue);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index b4f71f1..96a94c9 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -245,6 +245,11 @@ struct request {
>
> /* for bidi */
> struct request *next_rq;
> +
> +#ifdef CONFIG_ELV_FAIR_QUEUING
> + /* io queue request belongs to */
> + struct io_queue *ioq;
> +#endif
> };
>
> static inline unsigned short req_get_ioprio(struct request *req)
> diff --git a/include/linux/elevator.h b/include/linux/elevator.h
> index c59b769..679c149 100644
> --- a/include/linux/elevator.h
> +++ b/include/linux/elevator.h
> @@ -2,6 +2,7 @@
> #define _LINUX_ELEVATOR_H
>
> #include <linux/percpu.h>
> +#include "../../block/elevator-fq.h"
>
> #ifdef CONFIG_BLOCK
>
> @@ -29,6 +30,18 @@ typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct reques
>
> typedef void *(elevator_init_fn) (struct request_queue *);
> typedef void (elevator_exit_fn) (struct elevator_queue *);
> +#ifdef CONFIG_ELV_FAIR_QUEUING
> +typedef void (elevator_free_sched_queue_fn) (struct elevator_queue*, void *);
> +typedef void (elevator_active_ioq_set_fn) (struct request_queue*, void *, int);
> +typedef void (elevator_active_ioq_reset_fn) (struct request_queue *, void*);
> +typedef void (elevator_arm_slice_timer_fn) (struct request_queue*, void*);
> +typedef int (elevator_should_preempt_fn) (struct request_queue*, void*,
> + struct request*);
> +typedef int (elevator_update_idle_window_fn) (struct elevator_queue*, void*,
> + struct request*);
> +typedef struct io_queue* (elevator_close_cooperator_fn) (struct request_queue*,
> + void*, int probe);
> +#endif
>
> struct elevator_ops
> {
> @@ -56,6 +69,17 @@ struct elevator_ops
> elevator_init_fn *elevator_init_fn;
> elevator_exit_fn *elevator_exit_fn;
> void (*trim)(struct io_context *);
> +
> +#ifdef CONFIG_ELV_FAIR_QUEUING
> + elevator_free_sched_queue_fn *elevator_free_sched_queue_fn;
> + elevator_active_ioq_set_fn *elevator_active_ioq_set_fn;
> + elevator_active_ioq_reset_fn *elevator_active_ioq_reset_fn;
> +
> + elevator_arm_slice_timer_fn *elevator_arm_slice_timer_fn;
> + elevator_should_preempt_fn *elevator_should_preempt_fn;
> + elevator_update_idle_window_fn *elevator_update_idle_window_fn;
> + elevator_close_cooperator_fn *elevator_close_cooperator_fn;
> +#endif
> };
>
> #define ELV_NAME_MAX (16)
> @@ -76,6 +100,9 @@ struct elevator_type
> struct elv_fs_entry *elevator_attrs;
> char elevator_name[ELV_NAME_MAX];
> struct module *elevator_owner;
> +#ifdef CONFIG_ELV_FAIR_QUEUING
> + int elevator_features;
> +#endif
> };
>
> /*
> @@ -89,6 +116,10 @@ struct elevator_queue
> struct elevator_type *elevator_type;
> struct mutex sysfs_lock;
> struct hlist_head *hash;
> +#ifdef CONFIG_ELV_FAIR_QUEUING
> + /* fair queuing data */
> + struct elv_fq_data efqd;
> +#endif
> };
>
> /*
> @@ -209,5 +240,25 @@ enum {
> __val; \
> })
>
> +/* iosched can let elevator know their feature set/capability */
> +#ifdef CONFIG_ELV_FAIR_QUEUING
> +
> +/* iosched wants to use fq logic of elevator layer */
> +#define ELV_IOSCHED_NEED_FQ 1
> +
> +static inline int elv_iosched_fair_queuing_enabled(struct elevator_queue *e)
> +{
> + return (e->elevator_type->elevator_features) & ELV_IOSCHED_NEED_FQ;
> +}
> +
> +#else /* ELV_IOSCHED_FAIR_QUEUING */
> +
> +static inline int elv_iosched_fair_queuing_enabled(struct elevator_queue *e)
> +{
> + return 0;
> +}
> +#endif /* ELV_IOSCHED_FAIR_QUEUING */
> +extern void *elv_get_sched_queue(struct request_queue *q, struct request *rq);
> +extern void *elv_select_sched_queue(struct request_queue *q, int force);
> #endif /* CONFIG_BLOCK */
> #endif
> --
> 1.6.0.6
>

--
Balbir
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/