 - Adjust various DM structure members to improve alignment relative to
   4.18 block's mempool_t and bioset changes.
 
 - Add DM writecache target that offers writeback caching to persistent
   memory or SSD.
 
 - Small DM core error message change to give context for why a DM table
   type transition wasn't allowed.
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1
 
 iQEcBAABAgAGBQJbHsFxAAoJEMUj8QotnQNaHAgIAJPTwTOZboTzjQLrdiYEQ6q5
 lk7ZJP44+VlnY+iPRzyf36JyjVgIoZ82gWMW28hJmbq1dWaVphWA9yxYemFqfkSb
 F7oqcWl/C2J7U8Zk5U+gJKGQXRBhhIIYO7W3KWKTfF1cSx1AcqM2Au5IPejBG/sP
 h42Pfil22Rfg1U3kpxU8UQHe/V9cr/3eaRu0rD477HKqob1M08jP+27jdTu1vmNH
 uGGDWz5Dgra2IIxx797f4gn2hHJ825dDgaFF35JkTbKRom/xk8GlREy5wxqFvkbI
 Ti45mMlRdBFxXkFyvToVMtbCfkcZ617hag8KV4/BZ/4zmGBLFQXddHMAgJeYChk=
 =KH0g
 -----END PGP SIGNATURE-----

Merge tag 'for-4.18/dm-changes-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - Adjust various DM structure members to improve alignment relative to
   4.18 block's mempool_t and bioset changes.

 - Add DM writecache target that offers writeback caching to persistent
   memory or SSD.

 - Small DM core error message change to give context for why a DM table
   type transition wasn't allowed.

* tag 'for-4.18/dm-changes-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm: add writecache target
  dm: adjust structure members to improve alignment
  dm: report which conflicting type caused error during table_load()
This commit is contained in:
Linus Torvalds 2018-06-12 18:12:08 -07:00
commit 4597fcff07
14 changed files with 2472 additions and 80 deletions

View File

@ -0,0 +1,68 @@
The writecache target caches writes on persistent memory or on SSD. It
doesn't cache reads because reads are supposed to be cached in the page
cache in normal RAM.
When the device is constructed, the first sector should be zeroed or the
first sector should contain a valid superblock from a previous invocation.
Constructor parameters:
1. type of the cache device - "p" or "s"
p - persistent memory
s - SSD
2. the underlying device that will be cached
3. the cache device
4. block size (4096 is recommended; the maximum block size is the page
size)
5. the number of optional parameters (the parameters with an argument
count as two)
high_watermark n (default: 50)
start writeback when the number of used blocks reaches this
watermark
low_watermark n (default: 45)
stop writeback when the number of used blocks drops below
this watermark
writeback_jobs n (default: unlimited)
limit the number of blocks that are in flight during
writeback. Setting this value reduces writeback
throughput, but it may improve latency of read requests
autocommit_blocks n (default: 64 for pmem, 65536 for ssd)
when the application writes this number of blocks without
issuing the FLUSH request, the blocks are automatically
committed
autocommit_time ms (default: 1000)
autocommit time in milliseconds. The data is automatically
committed if this time passes and no FLUSH request is
received
fua (by default on)
applicable only to persistent memory - use the FUA flag
when writing data from persistent memory back to the
underlying device
nofua
applicable only to persistent memory - don't use the FUA
flag when writing back data and send the FLUSH request
afterwards
- some underlying devices perform better with fua, some
with nofua. The user should test both settings.
Status:
1. error indicator - 0 if there was no error, otherwise error number
2. the number of blocks
3. the number of free blocks
4. the number of blocks under writeback
Messages:
flush
flush the cache device. The message returns successfully
if the cache device was flushed without an error
flush_on_suspend
flush the cache device on next suspend. Use this message
when you are going to remove the cache device. The proper
sequence for removing the cache device is:
1. send the "flush_on_suspend" message
2. load an inactive table with a linear target that maps
to the underlying device
3. suspend the device
4. ask for status and verify that there are no errors
5. resume the device, so that it will use the linear
target
6. the cache device is now inactive and it can be deleted

View File

@ -334,6 +334,17 @@ config DM_CACHE_SMQ
of less memory utilization, improved performance and increased
adaptability in the face of changing workloads.
config DM_WRITECACHE
tristate "Writecache target"
depends on BLK_DEV_DM
---help---
The writecache target caches writes on persistent memory or SSD.
It is intended for databases or other programs that need extremely
low commit latency.
The writecache target doesn't cache reads because reads are supposed
to be cached in standard RAM.
config DM_ERA
tristate "Era target (EXPERIMENTAL)"
depends on BLK_DEV_DM

View File

@ -67,6 +67,7 @@ obj-$(CONFIG_DM_ERA) += dm-era.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o
obj-$(CONFIG_DM_ZONED) += dm-zoned.o
obj-$(CONFIG_DM_WRITECACHE) += dm-writecache.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o

View File

@ -19,8 +19,8 @@
struct dm_bio_prison {
spinlock_t lock;
mempool_t cell_pool;
struct rb_root cells;
mempool_t cell_pool;
};
static struct kmem_cache *_cell_cache;

View File

@ -21,8 +21,8 @@ struct dm_bio_prison_v2 {
struct workqueue_struct *wq;
spinlock_t lock;
mempool_t cell_pool;
struct rb_root cells;
mempool_t cell_pool;
};
static struct kmem_cache *_cell_cache;

View File

@ -371,7 +371,13 @@ struct cache_stats {
struct cache {
struct dm_target *ti;
struct dm_target_callbacks callbacks;
spinlock_t lock;
/*
* Fields for converting from sectors to blocks.
*/
int sectors_per_block_shift;
sector_t sectors_per_block;
struct dm_cache_metadata *cmd;
@ -402,13 +408,11 @@ struct cache {
dm_cblock_t cache_size;
/*
* Fields for converting from sectors to blocks.
* Invalidation fields.
*/
sector_t sectors_per_block;
int sectors_per_block_shift;
spinlock_t invalidation_lock;
struct list_head invalidation_requests;
spinlock_t lock;
struct bio_list deferred_bios;
sector_t migration_threshold;
wait_queue_head_t migration_wait;
atomic_t nr_allocated_migrations;
@ -419,13 +423,11 @@ struct cache {
*/
atomic_t nr_io_migrations;
struct bio_list deferred_bios;
struct rw_semaphore quiesce_lock;
/*
* cache_size entries, dirty if set
*/
atomic_t nr_dirty;
unsigned long *dirty_bitset;
struct dm_target_callbacks callbacks;
/*
* origin_blocks entries, discarded if set.
@ -442,24 +444,20 @@ struct cache {
const char **ctr_args;
struct dm_kcopyd_client *copier;
struct workqueue_struct *wq;
struct work_struct deferred_bio_worker;
struct work_struct migration_worker;
struct workqueue_struct *wq;
struct delayed_work waker;
struct dm_bio_prison_v2 *prison;
struct bio_set bs;
mempool_t migration_pool;
/*
* cache_size entries, dirty if set
*/
unsigned long *dirty_bitset;
atomic_t nr_dirty;
struct dm_cache_policy *policy;
unsigned policy_nr_args;
bool need_tick_bio:1;
bool sized:1;
bool invalidate:1;
bool commit_requested:1;
bool loaded_mappings:1;
bool loaded_discards:1;
struct dm_cache_policy *policy;
/*
* Cache features such as write-through.
@ -468,18 +466,23 @@ struct cache {
struct cache_stats stats;
/*
* Invalidation fields.
*/
spinlock_t invalidation_lock;
struct list_head invalidation_requests;
bool need_tick_bio:1;
bool sized:1;
bool invalidate:1;
bool commit_requested:1;
bool loaded_mappings:1;
bool loaded_discards:1;
struct rw_semaphore background_work_lock;
struct batcher committer;
struct work_struct commit_ws;
struct io_tracker tracker;
struct work_struct commit_ws;
struct batcher committer;
mempool_t migration_pool;
struct rw_semaphore background_work_lock;
struct bio_set bs;
};
struct per_bio_data {

View File

@ -31,6 +31,9 @@ struct dm_kobject_holder {
struct mapped_device {
struct mutex suspend_lock;
struct mutex table_devices_lock;
struct list_head table_devices;
/*
* The current mapping (struct dm_table *).
* Use dm_get_live_table{_fast} or take suspend_lock for
@ -38,17 +41,14 @@ struct mapped_device {
*/
void __rcu *map;
struct list_head table_devices;
struct mutex table_devices_lock;
unsigned long flags;
struct request_queue *queue;
int numa_node_id;
enum dm_queue_mode type;
/* Protect queue and type against concurrent access. */
struct mutex type_lock;
enum dm_queue_mode type;
int numa_node_id;
struct request_queue *queue;
atomic_t holders;
atomic_t open_count;
@ -56,21 +56,21 @@ struct mapped_device {
struct dm_target *immutable_target;
struct target_type *immutable_target_type;
char name[16];
struct gendisk *disk;
struct dax_device *dax_dev;
char name[16];
void *interface_ptr;
/*
* A list of ios that arrived while we were suspended.
*/
atomic_t pending[2];
wait_queue_head_t wait;
struct work_struct work;
wait_queue_head_t wait;
atomic_t pending[2];
spinlock_t deferred_lock;
struct bio_list deferred;
void *interface_ptr;
/*
* Event handling.
*/
@ -83,17 +83,17 @@ struct mapped_device {
/* the number of internal suspends */
unsigned internal_suspend_count;
/*
* Processing queue (flush)
*/
struct workqueue_struct *wq;
/*
* io objects are allocated from here.
*/
struct bio_set io_bs;
struct bio_set bs;
/*
* Processing queue (flush)
*/
struct workqueue_struct *wq;
/*
* freeze/thaw support requires holding onto a super block
*/
@ -102,11 +102,11 @@ struct mapped_device {
/* forced geometry settings */
struct hd_geometry geometry;
struct block_device *bdev;
/* kobject and completion */
struct dm_kobject_holder kobj_holder;
struct block_device *bdev;
/* zero-length flush that will be cloned and submitted to targets */
struct bio flush_bio;

View File

@ -139,25 +139,13 @@ struct crypt_config {
struct dm_dev *dev;
sector_t start;
/*
* pool for per bio private data, crypto requests,
* encryption requests/buffer pages and integrity tags
*/
mempool_t req_pool;
mempool_t page_pool;
mempool_t tag_pool;
unsigned tag_pool_max_sectors;
struct percpu_counter n_allocated_pages;
struct bio_set bs;
struct mutex bio_alloc_lock;
struct workqueue_struct *io_queue;
struct workqueue_struct *crypt_queue;
struct task_struct *write_thread;
wait_queue_head_t write_thread_wait;
struct task_struct *write_thread;
struct rb_root write_tree;
char *cipher;
@ -213,6 +201,18 @@ struct crypt_config {
unsigned int integrity_iv_size;
unsigned int on_disk_tag_size;
/*
* pool for per bio private data, crypto requests,
* encryption requests/buffer pages and integrity tags
*/
unsigned tag_pool_max_sectors;
mempool_t tag_pool;
mempool_t req_pool;
mempool_t page_pool;
struct bio_set bs;
struct mutex bio_alloc_lock;
u8 *authenc_key; /* space for keys in authenc() format (if used) */
u8 key[0];
};

View File

@ -1344,7 +1344,8 @@ static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_si
goto err_unlock_md_type;
}
} else if (!is_valid_type(dm_get_md_type(md), dm_table_get_type(t))) {
DMWARN("can't change device type after initial table load.");
DMWARN("can't change device type (old=%u vs new=%u) after initial table load.",
dm_get_md_type(md), dm_table_get_type(t));
r = -EINVAL;
goto err_unlock_md_type;
}

View File

@ -45,7 +45,6 @@ struct dm_kcopyd_client {
struct dm_io_client *io_client;
wait_queue_head_t destroyq;
atomic_t nr_jobs;
mempool_t job_pool;
@ -54,6 +53,8 @@ struct dm_kcopyd_client {
struct dm_kcopyd_throttle *throttle;
atomic_t nr_jobs;
/*
* We maintain three lists of jobs:
*

View File

@ -63,28 +63,29 @@ struct dm_region_hash {
/* hash table */
rwlock_t hash_lock;
mempool_t region_pool;
unsigned mask;
unsigned nr_buckets;
unsigned prime;
unsigned shift;
struct list_head *buckets;
unsigned max_recovery; /* Max # of regions to recover in parallel */
spinlock_t region_lock;
atomic_t recovery_in_flight;
struct semaphore recovery_count;
struct list_head clean_regions;
struct list_head quiesced_regions;
struct list_head recovered_regions;
struct list_head failed_recovered_regions;
/*
* If there was a flush failure no regions can be marked clean.
*/
int flush_failure;
unsigned max_recovery; /* Max # of regions to recover in parallel */
spinlock_t region_lock;
atomic_t recovery_in_flight;
struct list_head clean_regions;
struct list_head quiesced_regions;
struct list_head recovered_regions;
struct list_head failed_recovered_regions;
struct semaphore recovery_count;
mempool_t region_pool;
void *context;
sector_t target_begin;

View File

@ -240,9 +240,9 @@ struct pool {
struct dm_bio_prison *prison;
struct dm_kcopyd_client *copier;
struct work_struct worker;
struct workqueue_struct *wq;
struct throttle throttle;
struct work_struct worker;
struct delayed_work waker;
struct delayed_work no_space_timeout;
@ -260,7 +260,6 @@ struct pool {
struct dm_deferred_set *all_io_ds;
struct dm_thin_new_mapping *next_mapping;
mempool_t mapping_pool;
process_bio_fn process_bio;
process_bio_fn process_discard;
@ -273,6 +272,8 @@ struct pool {
process_mapping_fn process_prepared_discard_pt2;
struct dm_bio_prison_cell **cell_sort_array;
mempool_t mapping_pool;
};
static enum pool_mode get_pool_mode(struct pool *pool);

2305
drivers/md/dm-writecache.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -52,9 +52,9 @@ struct dmz_target {
struct dmz_reclaim *reclaim;
/* For chunk work */
struct mutex chunk_lock;
struct radix_tree_root chunk_rxtree;
struct workqueue_struct *chunk_wq;
struct mutex chunk_lock;
/* For cloned BIOs to zones */
struct bio_set bio_set;