[PATCH 3/3] f2fs: refactor flush_nat_entries to remove costly reorganizing ops

From: Jaegeuk Kim
Date: Tue Sep 23 2014 - 00:53:30 EST


Previously, f2fs tries to reorganize the dirty nat entries into multiple sets
according to its nid ranges. This can improve the flushing nat pages, however,
if there are a lot of cached nat entries, it becomes a bottleneck.

This patch introduces a new set management flow by removing dirty nat list and
adding a series of set operations when the nat entry becomes dirty.

Signed-off-by: Jaegeuk Kim <jaegeuk@xxxxxxxxxx>
---
fs/f2fs/f2fs.h | 13 +--
fs/f2fs/node.c | 299 +++++++++++++++++++++++++++++----------------------------
fs/f2fs/node.h | 9 +-
3 files changed, 162 insertions(+), 159 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 7b1e1d2..94cfdc4 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -164,6 +164,9 @@ struct fsync_inode_entry {
#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se)
#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno)

+#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
+#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
+
static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
{
int before = nats_in_cursum(rs);
@@ -182,9 +185,8 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
int type)
{
if (type == NAT_JOURNAL)
- return nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES;
-
- return sits_in_cursum(sum) + size <= SIT_JOURNAL_ENTRIES;
+ return size <= MAX_NAT_JENTRIES(sum);
+ return size <= MAX_SIT_JENTRIES(sum);
}

/*
@@ -292,11 +294,10 @@ struct f2fs_nm_info {

/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
+ struct radix_tree_root nat_set_root;/* root of the nat set cache */
rwlock_t nat_tree_lock; /* protect nat_tree_lock */
- unsigned int nat_cnt; /* the # of cached nat entries */
struct list_head nat_entries; /* cached nat entry list (clean) */
- struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
- struct list_head nat_entry_set; /* nat entry set list */
+ unsigned int nat_cnt; /* the # of cached nat entries */
unsigned int dirty_nat_cnt; /* total num of nat entries in set */

/* free node ids management */
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 21ed91b..f5a21f4 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -123,6 +123,57 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
kmem_cache_free(nat_entry_slab, e);
}

+static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+ struct nat_entry *ne)
+{
+ nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
+ struct nat_entry_set *head;
+
+ if (get_nat_flag(ne, IS_DIRTY))
+ return;
+retry:
+ head = radix_tree_lookup(&nm_i->nat_set_root, set);
+ if (!head) {
+ head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
+
+ INIT_LIST_HEAD(&head->entry_list);
+ INIT_LIST_HEAD(&head->set_list);
+ head->set = set;
+ head->entry_cnt = 0;
+
+ if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
+ cond_resched();
+ goto retry;
+ }
+ }
+ list_move_tail(&ne->list, &head->entry_list);
+ nm_i->dirty_nat_cnt++;
+ head->entry_cnt++;
+ set_nat_flag(ne, IS_DIRTY, true);
+}
+
+static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
+ struct nat_entry *ne)
+{
+ nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
+ struct nat_entry_set *head;
+
+ head = radix_tree_lookup(&nm_i->nat_set_root, set);
+ if (head) {
+ list_move_tail(&ne->list, &nm_i->nat_entries);
+ set_nat_flag(ne, IS_DIRTY, false);
+ head->entry_cnt--;
+ nm_i->dirty_nat_cnt--;
+ }
+}
+
+static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
+ nid_t start, unsigned int nr, struct nat_entry_set **ep)
+{
+ return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
+ start, nr);
+}
+
bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -1739,79 +1790,6 @@ skip:
return err;
}

-static struct nat_entry_set *grab_nat_entry_set(void)
-{
- struct nat_entry_set *nes =
- f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
-
- nes->entry_cnt = 0;
- INIT_LIST_HEAD(&nes->set_list);
- INIT_LIST_HEAD(&nes->entry_list);
- return nes;
-}
-
-static void release_nat_entry_set(struct nat_entry_set *nes,
- struct f2fs_nm_info *nm_i)
-{
- nm_i->dirty_nat_cnt -= nes->entry_cnt;
- list_del(&nes->set_list);
- kmem_cache_free(nat_entry_set_slab, nes);
-}
-
-static void adjust_nat_entry_set(struct nat_entry_set *nes,
- struct list_head *head)
-{
- struct nat_entry_set *next = nes;
-
- if (list_is_last(&nes->set_list, head))
- return;
-
- list_for_each_entry_continue(next, head, set_list)
- if (nes->entry_cnt <= next->entry_cnt)
- break;
-
- list_move_tail(&nes->set_list, &next->set_list);
-}
-
-static void add_nat_entry(struct nat_entry *ne, struct list_head *head)
-{
- struct nat_entry_set *nes;
- nid_t start_nid = START_NID(ne->ni.nid);
-
- list_for_each_entry(nes, head, set_list) {
- if (nes->start_nid == start_nid) {
- list_move_tail(&ne->list, &nes->entry_list);
- nes->entry_cnt++;
- adjust_nat_entry_set(nes, head);
- return;
- }
- }
-
- nes = grab_nat_entry_set();
-
- nes->start_nid = start_nid;
- list_move_tail(&ne->list, &nes->entry_list);
- nes->entry_cnt++;
- list_add(&nes->set_list, head);
-}
-
-static void merge_nats_in_set(struct f2fs_sb_info *sbi)
-{
- struct f2fs_nm_info *nm_i = NM_I(sbi);
- struct list_head *dirty_list = &nm_i->dirty_nat_entries;
- struct list_head *set_list = &nm_i->nat_entry_set;
- struct nat_entry *ne, *tmp;
-
- write_lock(&nm_i->nat_tree_lock);
- list_for_each_entry_safe(ne, tmp, dirty_list, list) {
- if (nat_get_blkaddr(ne) == NEW_ADDR)
- continue;
- add_nat_entry(ne, set_list);
- nm_i->dirty_nat_cnt++;
- }
- write_unlock(&nm_i->nat_tree_lock);
-}
-
static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -1846,101 +1824,129 @@ found:
mutex_unlock(&curseg->curseg_mutex);
}

-/*
- * This function is called during the checkpointing process.
- */
-void flush_nat_entries(struct f2fs_sb_info *sbi)
+static void __adjust_nat_entry_set(struct nat_entry_set *nes,
+ struct list_head *head, int max)
{
- struct f2fs_nm_info *nm_i = NM_I(sbi);
- struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
- struct nat_entry_set *nes, *tmp;
- struct list_head *head = &nm_i->nat_entry_set;
- bool to_journal = true;
+ struct nat_entry_set *cur;
+ nid_t dirty_cnt = 0;

- /* merge nat entries of dirty list to nat entry set temporarily */
- merge_nats_in_set(sbi);
+ if (nes->entry_cnt >= max)
+ goto add_out;

- /*
- * if there are no enough space in journal to store dirty nat
- * entries, remove all entries from journal and merge them
- * into nat entry set.
- */
- if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) {
- remove_nats_in_journal(sbi);
-
- /*
- * merge nat entries of dirty list to nat entry set temporarily
- */
- merge_nats_in_set(sbi);
+ list_for_each_entry(cur, head, set_list) {
+ dirty_cnt += cur->entry_cnt;
+ if (dirty_cnt > max)
+ break;
+ if (cur->entry_cnt >= nes->entry_cnt) {
+ list_add(&nes->set_list, cur->set_list.prev);
+ return;
+ }
}
+add_out:
+ list_add_tail(&nes->set_list, head);
+}

- if (!nm_i->dirty_nat_cnt)
- return;
+static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
+ struct nat_entry_set *set)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
+ bool to_journal = true;
+ struct f2fs_nat_block *nat_blk;
+ struct nat_entry *ne, *cur;
+ struct page *page = NULL;

/*
* there are two steps to flush nat entries:
* #1, flush nat entries to journal in current hot data summary block.
* #2, flush nat entries to nat page.
*/
- list_for_each_entry_safe(nes, tmp, head, set_list) {
- struct f2fs_nat_block *nat_blk;
- struct nat_entry *ne, *cur;
- struct page *page;
- nid_t start_nid = nes->start_nid;
+ if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
+ to_journal = false;

- if (to_journal &&
- !__has_cursum_space(sum, nes->entry_cnt, NAT_JOURNAL))
- to_journal = false;
+ if (to_journal) {
+ mutex_lock(&curseg->curseg_mutex);
+ } else {
+ page = get_next_nat_page(sbi, start_nid);
+ nat_blk = page_address(page);
+ f2fs_bug_on(sbi, !nat_blk);
+ }
+
+ /* flush dirty nats in nat entry set */
+ list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
+ struct f2fs_nat_entry *raw_ne;
+ nid_t nid = nat_get_nid(ne);
+ int offset;

if (to_journal) {
- mutex_lock(&curseg->curseg_mutex);
+ offset = lookup_journal_in_cursum(sum,
+ NAT_JOURNAL, nid, 1);
+ f2fs_bug_on(sbi, offset < 0);
+ raw_ne = &nat_in_journal(sum, offset);
+ nid_in_journal(sum, offset) = cpu_to_le32(nid);
} else {
- page = get_next_nat_page(sbi, start_nid);
- nat_blk = page_address(page);
- f2fs_bug_on(sbi, !nat_blk);
+ raw_ne = &nat_blk->entries[nid - start_nid];
}
+ raw_nat_from_node_info(raw_ne, &ne->ni);

- /* flush dirty nats in nat entry set */
- list_for_each_entry_safe(ne, cur, &nes->entry_list, list) {
- struct f2fs_nat_entry *raw_ne;
- nid_t nid = nat_get_nid(ne);
- int offset;
+ write_lock(&NM_I(sbi)->nat_tree_lock);
+ nat_reset_flag(ne);
+ __clear_nat_cache_dirty(NM_I(sbi), ne);
+ write_unlock(&NM_I(sbi)->nat_tree_lock);

- if (to_journal) {
- offset = lookup_journal_in_cursum(sum,
- NAT_JOURNAL, nid, 1);
- f2fs_bug_on(sbi, offset < 0);
- raw_ne = &nat_in_journal(sum, offset);
- nid_in_journal(sum, offset) = cpu_to_le32(nid);
- } else {
- raw_ne = &nat_blk->entries[nid - start_nid];
- }
- raw_nat_from_node_info(raw_ne, &ne->ni);
+ if (nat_get_blkaddr(ne) == NULL_ADDR)
+ add_free_nid(sbi, nid, false);
+ }

- if (nat_get_blkaddr(ne) == NULL_ADDR &&
- add_free_nid(sbi, nid, false) <= 0) {
- write_lock(&nm_i->nat_tree_lock);
- __del_from_nat_cache(nm_i, ne);
- write_unlock(&nm_i->nat_tree_lock);
- } else {
- write_lock(&nm_i->nat_tree_lock);
- nat_reset_flag(ne);
- __clear_nat_cache_dirty(nm_i, ne);
- write_unlock(&nm_i->nat_tree_lock);
- }
- }
+ if (to_journal)
+ mutex_unlock(&curseg->curseg_mutex);
+ else
+ f2fs_put_page(page, 1);

- if (to_journal)
- mutex_unlock(&curseg->curseg_mutex);
- else
- f2fs_put_page(page, 1);
+ f2fs_bug_on(sbi, set->entry_cnt);
+ radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
+ kmem_cache_free(nat_entry_set_slab, set);
+}

- f2fs_bug_on(sbi, !list_empty(&nes->entry_list));
- release_nat_entry_set(nes, nm_i);
+/*
+ * This function is called during the checkpointing process.
+ */
+void flush_nat_entries(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct nat_entry_set *setvec[NATVEC_SIZE];
+ struct nat_entry_set *set, *tmp;
+ unsigned int found;
+ nid_t set_idx = 0;
+ LIST_HEAD(sets);
+
+ /*
+ * if there are no enough space in journal to store dirty nat
+ * entries, remove all entries from journal and merge them
+ * into nat entry set.
+ */
+ if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
+ remove_nats_in_journal(sbi);
+
+ if (!nm_i->dirty_nat_cnt)
+ return;
+
+ while ((found = __gang_lookup_nat_set(nm_i,
+ set_idx, NATVEC_SIZE, setvec))) {
+ unsigned idx;
+ set_idx = setvec[found - 1]->set + 1;
+ for (idx = 0; idx < found; idx++)
+ __adjust_nat_entry_set(setvec[idx], &sets,
+ MAX_NAT_JENTRIES(sum));
}

- f2fs_bug_on(sbi, !list_empty(head));
+ /* flush dirty nats in nat entry set */
+ list_for_each_entry_safe(set, tmp, &sets, set_list)
+ __flush_nat_entry_set(sbi, set);
+
f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
}

@@ -1968,9 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
+ INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->nat_entries);
- INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
- INIT_LIST_HEAD(&nm_i->nat_entry_set);

mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->free_nid_list_lock);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index b8ba63c..bd826d9 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -43,6 +43,7 @@ enum {
IS_CHECKPOINTED, /* is it checkpointed before? */
HAS_FSYNCED_INODE, /* is the inode fsynced before? */
HAS_LAST_FSYNC, /* has the latest node fsync mark? */
+ IS_DIRTY, /* this nat entry is dirty? */
};

struct nat_entry {
@@ -60,10 +61,6 @@ struct nat_entry {
#define nat_get_version(nat) (nat->ni.version)
#define nat_set_version(nat, v) (nat->ni.version = v)

-#define __set_nat_cache_dirty(nm_i, ne) \
- list_move_tail(&ne->list, &nm_i->dirty_nat_entries);
-#define __clear_nat_cache_dirty(nm_i, ne) \
- list_move_tail(&ne->list, &nm_i->nat_entries);
#define inc_node_version(version) (++version)

static inline void set_nat_flag(struct nat_entry *ne,
@@ -113,9 +110,9 @@ enum mem_type {
};

struct nat_entry_set {
- struct list_head set_list; /* link with all nat sets */
+ struct list_head set_list; /* link with other nat sets */
struct list_head entry_list; /* link with dirty nat entries */
- nid_t start_nid; /* start nid of nats in set */
+ nid_t set; /* set number*/
unsigned int entry_cnt; /* the # of nat entries in set */
};

--
1.8.5.2 (Apple Git-48)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/