[PATCH 4/4] mm: zswap: create a pseudo device /dev/zram0

From: Bob Liu
Date: Sun Aug 18 2013 - 04:42:22 EST


This is used to replace previous zram.
zram users can enable this feature, then a pseudo device will be created
automaticlly after kernel boot.
Just using "mkswp /dev/zram0; swapon /dev/zram0" to use it as a swap disk.

The size of this pseudeo is controlled by zswap boot parameter
zswap.max_pool_percent.
disksize = (totalram_pages * zswap.max_pool_percent/100)*PAGE_SIZE.

Signed-off-by: Bob Liu <bob.liu@xxxxxxxxxx>
---
mm/Kconfig | 12 ++++
mm/zswap.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 208 insertions(+)

diff --git a/mm/Kconfig b/mm/Kconfig
index d80a575..3778026 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -525,6 +525,18 @@ choice
be refused unless frontswap_get happened and freed some space.
endchoice

+config ZSWAP_PSEUDO_BLKDEV
+ bool "Emulate a pseudo blk-dev based on zswap(previous zram)"
+ depends on ZSWAP && ZSMALLOC
+ default n
+
+ help
+ Enable this option will emulate a pseudo block swapdev /dev/zram0
+ with size zswap.max_pool_percent of total ram size. All writes to this
+ block device will be compressed and cached by zswap as a result no
+ real IO disk operations will happen.
+ This feature can be used to replace drivers/staging/zram.
+
config MEM_SOFT_DIRTY
bool "Track memory changes"
depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
diff --git a/mm/zswap.c b/mm/zswap.c
index 8e8dc99..ae73c9d 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -38,6 +38,11 @@
#include <linux/zbud.h>
#else
#include <linux/zsmalloc.h>
+#ifdef CONFIG_ZSWAP_PSEUDO_BLKDEV
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/genhd.h>
+#endif
#endif
#include <linux/mm_types.h>
#include <linux/page-flags.h>
@@ -968,6 +973,189 @@ static int __init zswap_debugfs_init(void)
static void __exit zswap_debugfs_exit(void) { }
#endif

+#ifdef CONFIG_ZSWAP_PSEUDO_BLKDEV
+#define SECTOR_SHIFT 9
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
+#define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
+#define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT)
+
+struct zram {
+ struct rw_semaphore lock; /* protect concurent reads and writes */
+ struct request_queue *queue;
+ struct gendisk *disk;
+
+ /*
+ * This is the disk size for userland. The size is controlled by
+ * boot parameter zswap.max_pool_percent.
+ * disksize = (totalram_pages * zswap.max_pool_percent/100)*PAGE_SIZE
+ */
+ u64 disksize; /* bytes */
+
+ /*
+ * This page is used to store real data for /dev/zram.
+ * Meanful operation to /dev/zramx is only mkswp and swapon/swapoff.
+ * So use one page to store the real data(written by mkswp).
+ */
+ struct page *metapage;
+};
+
+/*
+ * Only create /dev/zram0, can be extened in future if there is real uercases
+ * need multiple zram devices.
+ */
+static struct zram zram_device;
+static const struct block_device_operations zram_devops = {
+ .owner = THIS_MODULE
+};
+
+static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
+{
+ if (*offset + bvec->bv_len >= PAGE_SIZE)
+ (*index)++;
+ *offset = (*offset + bvec->bv_len) % PAGE_SIZE;
+}
+
+static void zram_make_request(struct request_queue *queue, struct bio *bio)
+{
+ u32 index;
+ struct bio_vec *bvec;
+ unsigned char *src, *dst;
+ int offset, i, rw = bio_data_dir(bio);
+ struct zram *zram = queue->queuedata;
+
+ index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+ offset = (bio->bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
+
+ bio_for_each_segment(bvec, bio, i) {
+ /*
+ * The only operation to pseudo /dev/zramx is mkswp and
+ * swapon/swapoff, so we only need one extra page to store the
+ * real meta data!
+ */
+ BUG_ON(bvec->bv_len != PAGE_SIZE);
+ BUG_ON(offset);
+
+ if (!index) {
+ if (rw == READ) {
+ down_read(&zram->lock);
+ dst = kmap_atomic(bvec->bv_page);
+ src = kmap_atomic(zram->metapage);
+ memcpy(dst, src, bvec->bv_len);
+ kunmap_atomic(dst);
+ kunmap_atomic(src);
+ flush_dcache_page(bvec->bv_page);
+ up_read(&zram->lock);
+ } else {
+ down_write(&zram->lock);
+ src = kmap_atomic(bvec->bv_page);
+ dst = kmap_atomic(zram->metapage);
+ memcpy(dst, src, bvec->bv_len);
+ kunmap_atomic(dst);
+ kunmap_atomic(src);
+ up_write(&zram->lock);
+ }
+ }
+ update_position(&index, &offset, bvec);
+ }
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio_endio(bio, 0);
+ return;
+}
+
+static int create_zram_device(struct zram *zram, int major, int device_id)
+{
+ int ret = -ENOMEM;
+ u64 disksize;
+
+ zram->queue = blk_alloc_queue(GFP_KERNEL);
+ if (!zram->queue) {
+ pr_err("Error allocating disk queue for device%d\n", device_id);
+ goto out;
+ }
+
+ blk_queue_make_request(zram->queue, zram_make_request);
+ zram->queue->queuedata = zram;
+
+ /* gendisk structure */
+ zram->disk = alloc_disk(1);
+ if (!zram->disk) {
+ pr_warn("Error allocating disk structure for device %d\n",
+ device_id);
+ goto out_free_queue;
+ }
+
+ zram->disk->major = major;
+ zram->disk->first_minor = device_id;
+ zram->disk->fops = &zram_devops;
+ zram->disk->queue = zram->queue;
+ snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
+
+ /*
+ * To ensure that we always get PAGE_SIZE aligned
+ * and n*PAGE_SIZED sized I/O requests.
+ */
+ blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
+ blk_queue_logical_block_size(zram->disk->queue, 1<<12);
+ blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
+ blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
+
+ add_disk(zram->disk);
+
+ /* Init blk-dev */
+ disksize = totalram_pages * zswap_max_pool_percent / 100;
+ disksize *= PAGE_SIZE;
+ disksize = PAGE_ALIGN(disksize);
+ zram->disksize = disksize;
+ set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
+
+ /* zram devices sort of resembles non-rotational disks */
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
+
+ zram->metapage = alloc_page(GFP_KERNEL);
+ if (!zram->metapage)
+ goto out_free_disk;
+
+ pr_debug("Initialization done!\n");
+ return 0;
+
+out_free_disk:
+ pr_debug("Init zram meta pages fail!\n");
+ del_gendisk(zram->disk);
+ put_disk(zram->disk);
+out_free_queue:
+ blk_cleanup_queue(zram->queue);
+out:
+ return ret;
+}
+
+static int zswap_blkdev_init(void)
+{
+ int major, ret = 0;
+
+ major = register_blkdev(0, "zram");
+ if (major <= 0) {
+ pr_warn("Unable to get major number\n");
+ ret = -EBUSY;
+ goto out;
+ }
+
+ ret = create_zram_device(&zram_device, major, 0);
+ if (ret) {
+ unregister_blkdev(major, "zram");
+ goto out;
+ }
+
+ pr_info("Created zram device(%d, %d).\n", major, 0);
+out:
+ return ret;
+}
+#else
+static int zswap_blkdev_init(void)
+{
+ return 0;
+}
+#endif
+
/*********************************
* module init and exit
**********************************/
@@ -989,9 +1177,17 @@ static int __init init_zswap(void)
pr_err("per-cpu initialization failed\n");
goto pcpufail;
}
+
+ if (IS_ENABLED(CONFIG_ZSWAP_PSEUDO_BLKDEV))
+ if (zswap_blkdev_init()) {
+ pr_err("emulate blk device failed\n");
+ goto pcpufail;
+ }
+
frontswap_register_ops(&zswap_frontswap_ops);
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
+
return 0;
pcpufail:
zswap_comp_exit();
--
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/