上一篇只是从理论上说明了Device Mapper(DM)的基本知识,这一篇我们来阅读一下代码。因为Device Mapper涉及较多块设备驱动方面的知识,博主对这一块了解不深,大家将就着看吧。
DM是一个驱动
首先DM是一个驱动,User Space的应用要想建立设备映射就需要打开该驱动,然后通过其提供的IOCTL接口来完成对应的工作。DM驱动注册的开始位置在drivers/md/dm.c文件底下,一个一个尽量分析。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
// 驱动初始化过程调用的函数列表 static int (*_inits[])(void) __initdata = { local_init, dm_target_init, dm_linear_init, dm_stripe_init, dm_io_init, dm_kcopyd_init, dm_interface_init, dm_statistics_init, }; // 驱动退出时调用的函数列表 static void (*_exits[])(void) = { local_exit, dm_target_exit, dm_linear_exit, dm_stripe_exit, dm_io_exit, dm_kcopyd_exit, dm_interface_exit, dm_statistics_exit, }; // 驱动init入口函数 static int __init dm_init(void) { const int count = ARRAY_SIZE(_inits); int r, i; for (i = 0; i < count; i++) { r = _inits[i](); if (r) goto bad; } return 0; bad: while (i--) _exits[i](); return r; } // 驱动exit注销函数 static void __exit dm_exit(void) { int i = ARRAY_SIZE(_exits); while (i--) _exits[i](); /* * Should be empty by this point. */ idr_destroy(&_minor_idr); } |
local_init函数
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
// 定义DM_NAME宏 #define DM_NAME "device-mapper" // 注册的块设备name static const char *_name = DM_NAME; static int __init local_init(void) { int r = -ENOMEM; // 分配IO缓存 _io_cache = KMEM_CACHE(dm_io, 0); if (!_io_cache) return r; _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); if (!_rq_tio_cache) goto out_free_io_cache; // 块设备dm uevent初始化 r = dm_uevent_init(); if (r) goto out_free_rq_tio_cache; // 设备工作队列 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); if (!deferred_remove_workqueue) { r = -ENOMEM; goto out_uevent_exit; } _major = major; // 注册一个名为“device-mapper”块设备驱动 r = register_blkdev(_major, _name); if (r < 0) goto out_free_workqueue; if (!_major) _major = r; return 0; out_free_workqueue: destroy_workqueue(deferred_remove_workqueue); out_uevent_exit: dm_uevent_exit(); out_free_rq_tio_cache: kmem_cache_destroy(_rq_tio_cache); out_free_io_cache: kmem_cache_destroy(_io_cache); return r; } |
dm_target_init
1 2 3 4 5 6 7 8 9 10 11 12 13 |
static struct target_type error_target = { .name = "error", .version = {1, 2, 0}, .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, .map_rq = io_err_map_rq, }; int __init dm_target_init(void) { return dm_register_target(&error_target); } |
看代码这里是注册一个名为“error”的target_type,正如上一篇介绍的,Target_type 结构主要包含了 target device 对应的 target driver 插件的名字、定义的构建和删除该类型target device的方法、该类target device对应的IO请求重映射和结束IO的方法等。
linear_target
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
static struct target_type linear_target = { .name = "linear", .version = {1, 2, 1}, .module = THIS_MODULE, .ctr = dm_linear_ctr, .dtr = dm_linear_dtr, .map = dm_linear_map, .status = dm_linear_status, .ioctl = dm_linear_ioctl, .merge = dm_linear_merge, .iterate_devices = dm_linear_iterate_devices, }; int __init dm_linear_init(void) { int r = dm_register_target(&linear_target); if (r < 0) DMERR("register failed %d", r); return r; } |
和注册“error” target_type一样,这里注册“linear” target_type。剩下的dm_stripe_init、dm_kcopyd_init函数也基本类似,这里就不多做解释了。dm_io_init函数只是简单的分配一些需要用到的IO缓存,也没什么好讲的。
dm_interface_init
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
// dm杂项设备file operations结构体 static const struct file_operations _ctl_fops = { .open = nonseekable_open, .unlocked_ioctl = dm_ctl_ioctl, .compat_ioctl = dm_compat_ctl_ioctl, .owner = THIS_MODULE, .llseek = noop_llseek, }; // dm杂项结构体 static struct miscdevice _dm_misc = { .minor = MAPPER_CTRL_MINOR, .name = DM_NAME, .nodename = DM_DIR "/" DM_CONTROL_NODE, .fops = &_ctl_fops }; int __init dm_interface_init(void) { int r; r = dm_hash_init(); if (r) return r; // 注册杂项设备 r = misc_register(&_dm_misc); if (r) { DMERR("misc_register failed for control device"); dm_hash_exit(); return r; } DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR, DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA, DM_DRIVER_EMAIL); return 0; } |
前面说过,User Space的程序主要是通过DM驱动IOCTL函数进行相关的请求,所以接下来我们着重分析下IOCTL函数。
DM中IOCTL
1 2 3 4 5 6 7 8 9 10 11 12 13 |
/*
 * unlocked_ioctl handler of the control device: treat the argument as
 * a user-space struct dm_ioctl pointer and hand it to ctl_ioctl().
 * The file argument is not used.
 */
static long dm_ctl_ioctl(struct file *file, uint command, ulong u)
{
	return (long)ctl_ioctl(command, (struct dm_ioctl __user *)u);
}

#ifdef CONFIG_COMPAT
/*
 * compat_ioctl handler: convert the argument with compat_ptr() and
 * forward to dm_ctl_ioctl().
 */
static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u)
{
	return (long)dm_ctl_ioctl(file, command, (ulong) compat_ptr(u));
}
#else
/* Without CONFIG_COMPAT there is no compat handler. */
#define dm_compat_ctl_ioctl NULL
#endif
不管dm_compat_ctl_ioctl还是dm_ctl_ioctl,最后都是会调用到ctl_ioctl方法的。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
static int ctl_ioctl(uint command, struct dm_ioctl __user *user) { int r = 0; int ioctl_flags; int param_flags; unsigned int cmd; struct dm_ioctl *uninitialized_var(param); ioctl_fn fn = NULL; size_t input_param_size; struct dm_ioctl param_kernel; // 检查操作权限,仅ROOT可以进行该操作 if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (_IOC_TYPE(command) != DM_IOCTL) return -ENOTTY; cmd = _IOC_NR(command); // 检查版本 r = check_version(cmd, user); if (r) return r; // 如果是DM_VERSION_END命令就什么都不做 if (cmd == DM_VERSION_CMD) return 0; // IOCTL主要的处理函数,下面再分析 fn = lookup_ioctl(cmd, &ioctl_flags); if (!fn) { DMWARN("dm_ctl_ioctl: unknown command 0x%x", command); return -ENOTTY; } // 从User Space中拷贝操作的参数到内核空间 r = copy_params(user, ¶m_kernel, ioctl_flags, ¶m, ¶m_flags); if (r) return r; input_param_size = param->data_size; r = validate_params(cmd, param); if (r) goto out; param->data_size = offsetof(struct dm_ioctl, data); r = fn(param, input_param_size); if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) && unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS)) DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd); // 将结果拷贝会User Space if (!r && copy_to_user(user, param, param->data_size)) r = -EFAULT; out: free_params(param, input_param_size, param_flags); return r; } /*----------------------------------------------------------------- * Implementation of open/close/ioctl on the special char * device. 
*---------------------------------------------------------------*/ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) { static struct { int cmd; int flags; ioctl_fn fn; } _ioctls[] = { {DM_VERSION_CMD, 0, NULL}, /* version is dealt with elsewhere */ {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS, remove_all}, {DM_LIST_DEVICES_CMD, 0, list_devices}, {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create}, {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_remove}, {DM_DEV_RENAME_CMD, 0, dev_rename}, {DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend}, {DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status}, {DM_DEV_WAIT_CMD, 0, dev_wait}, {DM_TABLE_LOAD_CMD, 0, table_load}, {DM_TABLE_CLEAR_CMD, IOCTL_FLAGS_NO_PARAMS, table_clear}, {DM_TABLE_DEPS_CMD, 0, table_deps}, {DM_TABLE_STATUS_CMD, 0, table_status}, {DM_LIST_VERSIONS_CMD, 0, list_versions}, {DM_TARGET_MSG_CMD, 0, target_message}, {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry} }; if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) return NULL; *ioctl_flags = _ioctls[cmd].flags; return _ioctls[cmd].fn; } |
上面的lookup_ioctl函数首先定义一个IOCTL command以及对应操作函数的结构体。支持的command以及对应函数都在这里了。我们挑一些重要的来看看就好。
DM_DEV_CREATE_CMD
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
/*
 * Handler for DM_DEV_CREATE_CMD: create a mapped device and insert it
 * into the dm hash under param->name (and param->uuid when that is
 * non-empty).
 *
 * param:      the ioctl parameter block; its flags and status fields
 *             are updated on success.
 * param_size: not used by this handler.
 *
 * Returns 0 on success, or the error from check_name(), dm_create()
 * or dm_hash_insert().
 */
static int dev_create(struct dm_ioctl *param, size_t param_size)
{
	int r, m = DM_ANY_MINOR;
	struct mapped_device *md;

	/* Check that the name is valid. */
	r = check_name(param->name);
	if (r)
		return r;

	/* Use the minor encoded in param->dev when this flag is set. */
	if (param->flags & DM_PERSISTENT_DEV_FLAG)
		m = MINOR(huge_decode_dev(param->dev));

	/* The main work is done here. */
	r = dm_create(m, &md);
	if (r)
		return r;

	r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md);
	if (r) {
		/* Insertion failed: release and destroy the new device. */
		dm_put(md);
		dm_destroy(md);
		return r;
	}

	param->flags &= ~DM_INACTIVE_PRESENT_FLAG;

	__dev_status(md, param);

	dm_put(md);

	return 0;
}
dm_create ===> alloc_dev
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
int dm_create(int minor, struct mapped_device **result) { struct mapped_device *md; md = alloc_dev(minor); if (!md) return -ENXIO; dm_sysfs_init(md); *result = md; return 0; } static struct mapped_device *alloc_dev(int minor) { int r; // 分配内存 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); void *old_md; if (!md) { DMWARN("unable to allocate device, out of memory."); return NULL; } if (!try_module_get(THIS_MODULE)) goto bad_module_get; /* get a minor number for the dev */ if (minor == DM_ANY_MINOR) r = next_free_minor(&minor); else r = specific_minor(minor); if (r < 0) goto bad_minor; // sleep-RCU初始化 r = init_srcu_struct(&md->io_barrier); if (r < 0) goto bad_io_barrier; // 初始化mapped_device结构体 md->type = DM_TYPE_NONE; mutex_init(&md->suspend_lock); mutex_init(&md->type_lock); mutex_init(&md->table_devices_lock); spin_lock_init(&md->deferred_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); atomic_set(&md->event_nr, 0); atomic_set(&md->uevent_seq, 0); INIT_LIST_HEAD(&md->uevent_list); INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); md->queue = blk_alloc_queue(GFP_KERNEL); if (!md->queue) goto bad_queue; dm_init_md_queue(md); md->disk = alloc_disk(1); if (!md->disk) goto bad_disk; atomic_set(&md->pending[0], 0); atomic_set(&md->pending[1], 0); init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); init_completion(&md->kobj_holder.completion); md->disk->major = _major; md->disk->first_minor = minor; // 设置操作函数结构体,block_device_operations结构体如后面代码所示 md->disk->fops = &dm_blk_dops; md->disk->queue = md->queue; md->disk->private_data = md; sprintf(md->disk->disk_name, "dm-%d", minor); // 添加设备到内核list中 add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); if (!md->wq) goto bad_thread; md->bdev = bdget_disk(md->disk, 0); if (!md->bdev) goto bad_bdev; bio_init(&md->flush_bio); md->flush_bio.bi_bdev = md->bdev; 
md->flush_bio.bi_rw = WRITE_FLUSH; dm_stats_init(&md->stats); /* Populate the mapping, nobody knows we exist yet */ spin_lock(&_minor_lock); old_md = idr_replace(&_minor_idr, md, minor); spin_unlock(&_minor_lock); BUG_ON(old_md != MINOR_ALLOCED); return md; bad_bdev: destroy_workqueue(md->wq); bad_thread: del_gendisk(md->disk); put_disk(md->disk); bad_disk: blk_cleanup_queue(md->queue); bad_queue: cleanup_srcu_struct(&md->io_barrier); bad_io_barrier: free_minor(minor); bad_minor: module_put(THIS_MODULE); bad_module_get: kfree(md); return NULL; } // 设置的块设备操作函数集 static const struct block_device_operations dm_blk_dops = { .open = dm_blk_open, .release = dm_blk_close, .ioctl = dm_blk_ioctl, .getgeo = dm_blk_getgeo, .owner = THIS_MODULE }; |
alloc_dev函数基本就是对mapped_device结构体的内容进行逐项初始化。然后通过add_disk函数将该block device添加到内核中。接下来分析另一个IOCTL command。
DM_TABLE_LOAD_CMD
DM_TABLE_LOAD_CMD根据映射表为目标target和真实的block设备建立映射关系
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
/*
 * Handler for DM_TABLE_LOAD_CMD: build a new table from the ioctl
 * parameters and stage it as the inactive table (hc->new_map) of the
 * mapped device named by param.
 *
 * param:      the ioctl parameter block; its flags and status fields
 *             are updated on success.
 * param_size: size of the parameter data, passed to populate_table().
 *
 * Returns 0 on success; -ENXIO if the device cannot be found or has
 * left the hash; -EINVAL if the new table would replace an immutable
 * target type or change the device type; otherwise the error from
 * dm_table_create(), populate_table() or dm_setup_md_queue().
 */
static int table_load(struct dm_ioctl *param, size_t param_size)
{
	int r;
	struct hash_cell *hc;
	struct dm_table *t, *old_map = NULL;
	struct mapped_device *md;
	struct target_type *immutable_target_type;

	/*
	 * Find the matching mapped_device.  Every exit path below calls
	 * dm_put(md) -- presumably to drop a reference that find_device()
	 * took; confirm against find_device().
	 */
	md = find_device(param);
	if (!md)
		return -ENXIO;

	/* Allocate the mapping table. */
	r = dm_table_create(&t, get_mode(param), param->target_count, md);
	if (r)
		goto err;

	/* Protect md->type and md->queue against concurrent table loads. */
	dm_lock_md_type(md);
	/* Initialise the mapping table from the parameters. */
	r = populate_table(t, param, param_size);
	if (r)
		goto err_unlock_md_type;

	immutable_target_type = dm_get_immutable_target_type(md);
	if (immutable_target_type &&
	    (immutable_target_type != dm_table_get_immutable_target_type(t))) {
		DMWARN("can't replace immutable target type %s",
		       immutable_target_type->name);
		r = -EINVAL;
		goto err_unlock_md_type;
	}

	if (dm_get_md_type(md) == DM_TYPE_NONE)
		/* Initial table load: acquire type of table. */
		dm_set_md_type(md, dm_table_get_type(t));
	else if (dm_get_md_type(md) != dm_table_get_type(t)) {
		DMWARN("can't change device type after initial table load.");
		r = -EINVAL;
		goto err_unlock_md_type;
	}

	/* setup md->queue to reflect md's type (may block) */
	r = dm_setup_md_queue(md);
	if (r) {
		DMWARN("unable to set up device queue for new table.");
		goto err_unlock_md_type;
	}
	dm_unlock_md_type(md);

	/* stage inactive table */
	down_write(&_hash_lock);
	hc = dm_get_mdptr(md);
	if (!hc || hc->md != md) {
		DMWARN("device has been removed from the dev hash table.");
		up_write(&_hash_lock);
		r = -ENXIO;
		goto err_destroy_table;
	}

	/* Keep any previously staged table so it can be destroyed below. */
	if (hc->new_map)
		old_map = hc->new_map;
	hc->new_map = t;
	up_write(&_hash_lock);

	param->flags |= DM_INACTIVE_PRESENT_FLAG;
	__dev_status(md, param);

	if (old_map) {
		dm_sync_table(md);
		dm_table_destroy(old_map);
	}

	dm_put(md);

	return 0;

err_unlock_md_type:
	dm_unlock_md_type(md);
err_destroy_table:
	dm_table_destroy(t);
err:
	dm_put(md);

	return r;
}
正如上一篇中所分析的:用户空间命令通过ioctl调用table_load函数,该函数根据用户空间传来的参数构建指定mapped device的映射表和所映射的target device。该函数先构建相应的dm_table、dm_target结构,再调用dm-table.c中的dm_table_add_target(populate_table—>dm_table_add_target)函数根据用户传入的参数初始化这些结构,并且根据参数所指定的target类型,调用相应的target类型的构建函数ctr在内存中构建target device对应的结构,然后再根据所建立的dm_target结构更新dm_table中维护的B树。上述过程完毕后,再将建立好的dm_table添加到mapped device的全局hash表对应的hash_cell结构中。
这一篇基本就这样了,简单的从代码层面分析下DM。大家在阅读代码的时候可以结合上一篇的内容同步分析,这样应该会好理解很多。Have fun!