Re: [PATCH v3] fat: Batched discard support for fat

From: Kyungmin Park
Date: Wed Mar 02 2011 - 03:25:40 EST


On Wed, Mar 2, 2011 at 11:07 AM, Kyungmin Park <kmpark@xxxxxxxxxxxxx> wrote:
> On Mon, Feb 28, 2011 at 9:51 PM, Lukas Czerner <lczerner@xxxxxxxxxx> wrote:
>> On Mon, 28 Feb 2011, Kyungmin Park wrote:
>>
>>> On Fri, Feb 25, 2011 at 8:17 PM, Lukas Czerner <lczerner@xxxxxxxxxx> wrote:
>>> > On Fri, 25 Feb 2011, Kyungmin Park wrote:
>>> >
>>> >> From: Kyungmin Park <kyungmin.park@xxxxxxxxxxx>
>>> >>
>>> >> FAT supports batched discard as ext4.
>>> >>
>>> >> Cited from Lukas words.
>>> >> "The current solution is not ideal because of its bad performance impact.
>>> >> So basic idea to improve things is to avoid discarding every time some
>>> >> blocks are freed. and instead batching is together into bigger trims,
>>> >> which tends to be more effective."
>>> >>
>>> >> You can find an information in detail at following URLs.
>>> >> http://lwn.net/Articles/397538/
>>> >> http://lwn.net/Articles/383933/
>>> >>
>>> >> Clearify the meaning of "len" (Cited form Lukas mail)
>>> >>
>>> >> Let the "O" be free (bytes, blocks, whatever), and "=" be used.
>>> >> Now, we have a filesystem like this.
>>> >>
>>> >>   OOOO==O===OO===OOOOO==O===O===OOOOOOO===
>>> >>   ^                                      ^
>>> >>   0                                      40
>>> >>
>>> >> This is how it supposed to wotk if you have called FITIRM with parameters:
>>> >>
>>> >> start = 0
>>> >> minlen = 2
>>> >> len = 20
>>> >>
>>> >> So you will go through (blocks, bytes...) 0 -> 20
>>> >>
>>> >>   OOOO==O===OO===OOOOO==O===O===OOOOOOO===
>>> >>   ^                   ^
>>> >>   0                   20
>>> >>
>>> >> So, you will call discard on extents:
>>> >>
>>> >> 0-3
>>> >> You'll skip 6 because is smaller than minlen
>>> >> 10-11
>>> >> 15-19
>>> >>
>>> >> instead of
>>> >>
>>> >> 0-3
>>> >> 10-11
>>> >> 15-19
>>> >> 30-36
>>> >
>>> > Hi thanks for the next version. And again I have to ask: Did you test it
>>> > ? and how ? Did you tried xfstest No. 251 ? Couple of comments bellow.
>>>
>>> I tested it with your test program. Of course I modified for our
>>> environment (eMMC).
>>
>> Ok, good.
>>
>>>
>>> #include <errno.h>
>>> #include <fcntl.h>
>>> #include <stdio.h>
>>> #include <stdint.h>
>>> #include <sys/ioctl.h>
>>>
>>> struct fstrim_range {
>>>         uint64_t start;
>>>         uint64_t len;
>>>         uint64_t minlen;
>>> };
>>>
>>> #define FITRIM          _IOWR('X', 121, struct fstrim_range)
>>>
>>> int main(int argc, char **argv)
>>> {
>>>         struct fstrim_range range;
>>>         uint64_t len;
>>>         int fd;
>>>
>>>         if (argc < 2) {
>>>                 fprintf(stderr, "usage: %s mountpoint [size]\n", argv[0]);
>>>                 return 1;
>>>         }
>>>
>>>         if (argc == 3)
>>>                 len = atoll(argv[1]);
>>>         else
>>>                 len = ((1UL<<31) - 1);
>>>
>>>         range.start = 0;
>>>         range.len = len;
>>>         range.minlen = 256 * 1024;      /* Minimum is 256KiB */
>>
>> Why exactly you need to set this ? What will happen if the minlen is 0 ?
>
> It's dependent on eMMC chip. it's for our environment. If it passed
> with 0, the code is working but the less than 256KiB trim command is
> meaningless.
>
>>
>>>
>>>         fd = open(argv[1], O_RDONLY);
>>>         if (fd < 0) {
>>>                 perror("open");
>>>                 return 1;
>>>         }
>>>
>>>         if (ioctl(fd, FITRIM, &range)) {
>>>                 if (errno == EOPNOTSUPP)
>>>                         fprintf(stderr, "TRIM not supported\n");
>>>                 else
>>>                         perror("FITRIM");
>>>                 return 1;
>>>         }
>>>
>>>         fprintf(stderr, "Trimmed size %llu\n", range.len);
>>>
>>>         return 0;
>>> }
>>>
>>> >
>>> > Thanks!
>>> > -Lukas
>>> >
>>> >>
>>> >> Signed-off-by: Kyungmin Park <kyungmin.park@xxxxxxxxxxx>
>>> >> ---
>>> >> Changelog v3:
>>> >>       Adjust the minlen from queue discard_granularity
>>> >>       Use the corrent len usage
>>> >> Changelog v2:
>>> >>       Use the given start and len as Lukas comments
>>> >>       Check the queue supports discard feature
>>> >> ---
>>> >> diff --git a/fs/fat/fat.h b/fs/fat/fat.h
>>> >> index f504089..08b53e1 100644
>>> >> --- a/fs/fat/fat.h
>>> >> +++ b/fs/fat/fat.h
>>> >> @@ -299,6 +299,7 @@ extern int fat_alloc_clusters(struct inode *inode, int *cluster,
>>> >>                             int nr_cluster);
>>> >>  extern int fat_free_clusters(struct inode *inode, int cluster);
>>> >>  extern int fat_count_free_clusters(struct super_block *sb);
>>> >> +extern int fat_trim_fs(struct super_block *sb, struct fstrim_range *range);
>>> >>
>>> >>  /* fat/file.c */
>>> >>  extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
>>> >> diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
>>> >> index b47d2c9..a8e3837 100644
>>> >> --- a/fs/fat/fatent.c
>>> >> +++ b/fs/fat/fatent.c
>>> >> @@ -1,6 +1,8 @@
>>> >>  /*
>>> >>   * Copyright (C) 2004, OGAWA Hirofumi
>>> >>   * Released under GPL v2.
>>> >> + *
>>> >> + * Batched discard support by Kyungmin Park <kyungmin.park@xxxxxxxxxxx>
>>> >>   */
>>> >>
>>> >>  #include <linux/module.h>
>>> >> @@ -541,6 +543,16 @@ out:
>>> >>       return err;
>>> >>  }
>>> >>
>>> >> +static int fat_issue_discard(struct super_block *sb, int cluster, int nr_clus)
>>> >> +{
>>> >> +     struct msdos_sb_info *sbi = MSDOS_SB(sb);
>>> >> +     sector_t block, nr_blocks;
>>> >> +
>>> >> +     block = fat_clus_to_blknr(sbi, cluster);
>>> >> +     nr_blocks = nr_clus * sbi->sec_per_clus;
>>> >> +     return sb_issue_discard(sb, block, nr_blocks, GFP_NOFS, 0);
>>> >> +}
>>> >> +
>>> >>  int fat_free_clusters(struct inode *inode, int cluster)
>>> >>  {
>>> >>       struct super_block *sb = inode->i_sb;
>>> >> @@ -575,11 +587,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
>>> >>                       if (cluster != fatent.entry + 1) {
>>> >>                               int nr_clus = fatent.entry - first_cl + 1;
>>> >>
>>> >> -                             sb_issue_discard(sb,
>>> >> -                                     fat_clus_to_blknr(sbi, first_cl),
>>> >> -                                     nr_clus * sbi->sec_per_clus,
>>> >> -                                     GFP_NOFS, 0);
>>> >> -
>>> >> +                             fat_issue_discard(sb, first_cl, nr_clus);
>>> >>                               first_cl = cluster;
>>> >>                       }
>>> >>               }
>>> >> @@ -683,3 +691,88 @@ out:
>>> >>       unlock_fat(sbi);
>>> >>       return err;
>>> >>  }
>>> >> +
>>> >> +int fat_trim_fs(struct super_block *sb, struct fstrim_range *range)
>>> >> +{
>>> >> +     struct msdos_sb_info *sbi = MSDOS_SB(sb);
>>> >> +     struct fatent_operations *ops = sbi->fatent_ops;
>>> >> +     struct fat_entry fatent;
>>> >> +     unsigned long reada_blocks, reada_mask, cur_block;
>>> >> +     int err = 0, free, count, entry;
>>> >> +     int start, len, minlen, trimmed;
>>> >> +
>>> >> +     start = range->start >> sb->s_blocksize_bits;
>>> >> +     start = start / sbi->sec_per_clus;
>>> >> +     len = range->len >> sb->s_blocksize_bits;
>>> >> +     len = len / sbi->sec_per_clus;
>>> >> +     minlen = range->minlen >> sb->s_blocksize_bits;
>>> >> +     minlen = minlen / sbi->sec_per_clus;
>>> >> +     trimmed = 0;
>>> >> +     count = 0;
>>> >> +
>>> >> +     lock_fat(sbi);
>>> >> +     if (sbi->free_clusters != -1 && sbi->free_clus_valid)
>>> >> +             goto out;
>>> >> +
>>> >> +     reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits;
>>> >> +     reada_mask = reada_blocks - 1;
>>> >> +     cur_block = 0;
>>> >> +
>>> >> +     entry = 0;
>>> >> +     free = 0;
>>> >> +     fatent_init(&fatent);
>>> >> +
>>> >> +     if (start < FAT_START_ENT)
>>> >> +             start = FAT_START_ENT;
>>> >> +
>>> >> +     fatent_set_entry(&fatent, start);
>>> >> +
>>> >> +     while (count < sbi->max_cluster) {
>>> >> +             if (fatent.entry >= sbi->max_cluster)
>>> >> +                     fatent.entry = FAT_START_ENT;
>>> >> +             /* readahead of fat blocks */
>>> >> +             if ((cur_block & reada_mask) == 0) {
>>> >> +                     unsigned long rest = sbi->fat_length - cur_block;
>>> >> +                     fat_ent_reada(sb, &fatent, min(reada_blocks, rest));
>>> >
>>> > You really do not need new variable "rest" just for passing it into one
>>> > function. Get rid of it.
>>>
>>> Umm. I don't want to modify it since it's routine is same as
>>> free_count codes. I just borrowed it from FAT codes.
>>> >
>>> >> +             }
>>> >> +             cur_block++;
>>> >> +
>>> >> +             err = fat_ent_read_block(sb, &fatent);
>>> >> +             if (err)
>>> >> +                     goto out;
>>> >> +
>>> >> +             do {
>>> >> +                     if (ops->ent_get(&fatent) == FAT_ENT_FREE) {
>>> >> +                             free++;
>>> >> +                             if (!entry)
>>> >> +                                     entry = fatent.entry;
>>> >> +                             if (count >= len && free >= minlen) {
>>> >> +                                     fat_issue_discard(sb, entry, free);
>>> >> +                                     trimmed += free;
>>
>> I think you can remove this condition completely is you move "done"
>> label befor the if (free >= minlen) condition.
>>
>>> > I really do not understand FAT code very much, but is this right ?
>>> > Should not you be setting free = 0 ?
>>> Right but count is larger then "len". it will exit at next if (count
>>> >= len) goto done statement.
>>
>> Oh, ok I see. But it proves my point that this code is not very well
>> readable.
>>
>>>
>>> > What will happen if you'll end up
>>> > in the same branch in next iteration ? -- free will be still set to
>>> > previous value+1, bu you'll be discarding next entry. I am sorry but
>>> > this whole thing is not very readable.
>>>
>>> >
>>> >> +                             }
>>> >> +                             if (count >= len)
>>> >> +                                     goto done;
>>                                   ^^^^^^^^^^^^^^^^^^
>> this
>
> Okay I will fix it.
>>
>>> >> +                     } else if (entry) {
>>> >> +                             if (free >= minlen) {
>>> >> +                                     fat_issue_discard(sb, entry, free);
>>> >> +                                     trimmed += free;
>>> >> +                             }
>>> >> +                             if (count >= len)
>>> >> +                                     goto done;
>>                                   ^^^^^^^^^^^^^^^^^^
>> and this
>>
>>> >> +                             free = 0;
>>> >> +                             entry = 0;
>>> >> +                     }
>>> >
>>> > I don't not see why you are testing count all the time since it has not been
>>> > changed since the "if" condition started. how about doing one test
>>> > before the "if" condition ?
>>>
>>> For looping the whole fat max_cluster size. now start can be any
>>> address and if start is a middle point and len is max, then it will
>>> search from middle -> end -> start -> middle. The "count" is used for
>>> this purpose.
>>>
>>> Thank you,
>>> Kyungmin Park
>>> >
>> can go here -->>
>>                            if (count >= len)
>>                                    goto done;
>>
>>> >> +                     count++;
>>> >> +             } while (fat_ent_next(sbi, &fatent));
>>> >> +     }

How about this code?

while (count < sbi->max_cluster) {
if (fatent.entry >= sbi->max_cluster)
fatent.entry = FAT_START_ENT;
/* readahead of fat blocks */
if ((cur_block & reada_mask) == 0) {
unsigned long rest = sbi->fat_length - cur_block;
fat_ent_reada(sb, &fatent, min(reada_blocks, rest));
}
cur_block++;

err = fat_ent_read_block(sb, &fatent);
if (err)
goto out;

do {
if (ops->ent_get(&fatent) == FAT_ENT_FREE) {
free++;
if (!entry)
entry = fatent.entry;
} else if (entry) {
/*
* Discard if free entry is equal or greater
* than minimum length
*/
if (free >= minlen) {
fat_issue_discard(sb, entry, free);
trimmed += free;
}
free = 0;
entry = 0;
}
count++;
/* Check the loop count */
if (count >= len)
goto done;
} while (fat_ent_next(sbi, &fatent));
}
done:
if (free >= minlen) {
fat_issue_discard(sb, entry, free);
trimmed += free;
}
range->len = (trimmed * sbi->sec_per_clus) << sb->s_blocksize_bits;
fatent_brelse(&fatent);
out:
unlock_fat(sbi);
return err;


>>> >> +     if (free >= minlen) {
>>> >> +             fat_issue_discard(sb, entry, free);
>>> >> +             trimmed += free;
>>> >> +     }
>>> >> +done:
>>> >> +     range->len = (trimmed * sbi->sec_per_clus) << sb->s_blocksize_bits;
>>> >> +     fatent_brelse(&fatent);
>>> >> +out:
>>> >> +     unlock_fat(sbi);
>>> >> +     return err;
>>> >> +}
>>> >> diff --git a/fs/fat/file.c b/fs/fat/file.c
>>> >> index 7257752..9910aba 100644
>>> >> --- a/fs/fat/file.c
>>> >> +++ b/fs/fat/file.c
>>> >> @@ -125,6 +125,36 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>>> >>               return fat_ioctl_get_attributes(inode, user_attr);
>>> >>       case FAT_IOCTL_SET_ATTRIBUTES:
>>> >>               return fat_ioctl_set_attributes(filp, user_attr);
>>> >> +     case FITRIM:
>>> >> +     {
>>> >> +             struct super_block *sb = inode->i_sb;
>>> >> +             struct request_queue *q = bdev_get_queue(sb->s_bdev);
>>> >> +             struct fstrim_range range;
>>> >> +             int ret = 0;
>>> >> +
>>> >> +             if (!capable(CAP_SYS_ADMIN))
>>> >> +                     return -EPERM;
>>> >> +
>>> >> +             if (!blk_queue_discard(q))
>>> >> +                     return -EOPNOTSUPP;
>>> >> +
>>> >> +             if (copy_from_user(&range, (struct fstrim_range *)arg,
>>> >> +                                     sizeof(range)))
>>> >> +                     return -EFAULT;
>>> >> +
>>> >> +             range.minlen = max((unsigned int)range.minlen,
>>> >> +                                     q->limits.discard_granularity);
>>> >> +             ret = fat_trim_fs(sb, &range);
>>> >> +             if (ret < 0)
>>> >> +                     return ret;
>>> >> +
>>> >> +             if (copy_to_user((struct fstrim_range *)arg, &range,
>>> >> +                                     sizeof(range)))
>>> >> +                     return -EFAULT;
>>> >> +
>>> >> +             return 0;
>>> >> +     }
>>> >> +
>>> >>       default:
>>> >>               return -ENOTTY; /* Inappropriate ioctl for device */
>>> >>       }
>>> >>
>>> >
>>> > --
>>> >
>>>
>>
>> --
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/