Jimmy Chen

A Programmer

(原创)Device Mapper源码分析

  上一篇只是从理论上说明了Device Mapper(DM)的基本知识,这一篇我们来阅读一下代码。因为Device Mapper涉及较多块设备驱动方面的知识,博主对这一块了解不深,大家将就着看吧。

DM是一个驱动

  首先DM是一个驱动,User Space的应用要想建立设备映射就需要打开该驱动,然后通过其提供的IOCTL接口来完成对应的工作。DM驱动注册的开始位置在drivers/md/dm.c文件的末尾,下面尽量逐个分析。

/*
 * Initialisation hooks run by dm_init(), in array order.
 * When one of them fails, dm_init() unwinds by calling _exits[] at the
 * indices below the failing one, so entry N here must pair with entry N
 * of _exits[].
 */
static int (*_inits[])(void) __initdata = {
    local_init,
    dm_target_init,
    dm_linear_init,
    dm_stripe_init,
    dm_io_init,
    dm_kcopyd_init,
    dm_interface_init,
    dm_statistics_init,
};

/*
 * Teardown hooks, index-matched with _inits[].
 * dm_exit() calls every entry from the last one down to the first;
 * dm_init()'s error path calls the entries below the failing index,
 * also in descending order.
 */
static void (*_exits[])(void) = {
    local_exit,
    dm_target_exit,
    dm_linear_exit,
    dm_stripe_exit,
    dm_io_exit,
    dm_kcopyd_exit,
    dm_interface_exit,
    dm_statistics_exit,
};

/*
 * Driver init entry point.
 *
 * Runs each _inits[] hook in order. Returns 0 when all of them succeed;
 * otherwise undoes the hooks that already ran (via the matching _exits[]
 * entries, most recent first) and returns the failing hook's error code.
 */
static int __init dm_init(void)
{
    const int nr_inits = ARRAY_SIZE(_inits);
    int idx;
    int err = 0;

    for (idx = 0; idx < nr_inits; idx++) {
        err = _inits[idx]();
        if (err)
            break;
    }

    if (!err)
        return 0;

    /* idx is the hook that failed: tear down only the ones below it. */
    while (idx--)
        _exits[idx]();

    return err;
}

/*
 * Driver exit entry point: call every _exits[] hook from the last entry
 * down to the first, then destroy the minor-number IDR.
 */
static void __exit dm_exit(void)
{
    int remaining = ARRAY_SIZE(_exits);

    for (; remaining > 0; remaining--)
        _exits[remaining - 1]();

    /* The IDR is expected to hold no entries once all hooks have run. */
    idr_destroy(&_minor_idr);
}

local_init函数

// Driver name string
#define DM_NAME "device-mapper"

// Name handed to register_blkdev() by local_init()
static const char *_name = DM_NAME;

/*
 * First init hook: create the two object caches, initialise dm uevents,
 * allocate the "kdmremove" workqueue and register the block device name.
 *
 * Returns 0 on success. On failure, returns the error of the step that
 * failed after releasing everything set up before it, in reverse order.
 */
static int __init local_init(void)
{
    int ret;

    /* Cache for dm_io objects. */
    _io_cache = KMEM_CACHE(dm_io, 0);
    if (!_io_cache)
        return -ENOMEM;

    /* Cache for dm_rq_target_io objects. */
    _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
    if (!_rq_tio_cache) {
        ret = -ENOMEM;
        goto err_io_cache;
    }

    /* uevent support for dm block devices. */
    ret = dm_uevent_init();
    if (ret)
        goto err_rq_tio_cache;

    /* Workqueue used for deferred removal. */
    deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
    if (!deferred_remove_workqueue) {
        ret = -ENOMEM;
        goto err_uevent;
    }

    /* Register the block device under the name "device-mapper". */
    _major = major;
    ret = register_blkdev(_major, _name);
    if (ret < 0)
        goto err_workqueue;

    /*
     * No major was preset: keep the value register_blkdev() returned
     * (presumably the dynamically assigned major -- confirm).
     */
    if (_major == 0)
        _major = ret;

    return 0;

err_workqueue:
    destroy_workqueue(deferred_remove_workqueue);
err_uevent:
    dm_uevent_exit();
err_rq_tio_cache:
    kmem_cache_destroy(_rq_tio_cache);
err_io_cache:
    kmem_cache_destroy(_io_cache);

    return ret;
}

dm_target_init

/* Target type registered under the name "error" by dm_target_init(). */
static struct target_type error_target = {
    .name    = "error",
    .version = {1, 2, 0},
    .map     = io_err_map,
    .map_rq  = io_err_map_rq,
    .ctr     = io_err_ctr,
    .dtr     = io_err_dtr,
};

/* Register the "error" target type and pass back dm_register_target()'s result. */
int __init dm_target_init(void)
{
    int ret = dm_register_target(&error_target);

    return ret;
}

  看代码这里是注册一个名为“error”的target_type,正如上一篇介绍的,Target_type 结构主要包含了 target device 对应的 target driver 插件的名字、定义的构建和删除该类型target device的方法、该类target device对应的IO请求重映射和结束IO的方法等。

linear_target

/* Target type registered under the name "linear" by dm_linear_init(). */
static struct target_type linear_target = {
    .name            = "linear",
    .module          = THIS_MODULE,
    .version         = {1, 2, 1},
    .ctr             = dm_linear_ctr,
    .dtr             = dm_linear_dtr,
    .map             = dm_linear_map,
    .merge           = dm_linear_merge,
    .status          = dm_linear_status,
    .ioctl           = dm_linear_ioctl,
    .iterate_devices = dm_linear_iterate_devices,
};

/*
 * Register the "linear" target type.
 * Logs an error when registration returns a negative value; the result of
 * dm_register_target() is returned unchanged either way.
 */
int __init dm_linear_init(void)
{
    int ret;

    ret = dm_register_target(&linear_target);
    if (ret >= 0)
        return ret;

    DMERR("register failed %d", ret);
    return ret;
}

  和注册“error” target_type一样,这里注册“linear” target_type。剩下的dm_stripe_init、dm_kcopyd_init函数也基本类似,这里就不多做解释了。dm_io_init函数只是简单的分配一些需要用到的IO缓存,也没什么好讲的。

dm_interface_init

/* File operations of the dm control (misc) device. */
static const struct file_operations _ctl_fops = {
    .owner          = THIS_MODULE,
    .open           = nonseekable_open,
    .llseek         = noop_llseek,
    .unlocked_ioctl = dm_ctl_ioctl,
    .compat_ioctl   = dm_compat_ctl_ioctl,
};

/* Misc-device descriptor for the dm control node, registered in dm_interface_init(). */
static struct miscdevice _dm_misc = {
    .name     = DM_NAME,
    .nodename = DM_DIR "/" DM_CONTROL_NODE,
    .minor    = MAPPER_CTRL_MINOR,
    .fops     = &_ctl_fops,
};

/*
 * Bring up the ioctl interface: initialise the dm hash tables, then
 * register the control misc device.
 *
 * Returns 0 on success. If dm_hash_init() fails its error is returned;
 * if misc_register() fails the hash tables are torn down again and the
 * registration error is returned.
 */
int __init dm_interface_init(void)
{
    int ret = dm_hash_init();

    if (ret)
        return ret;

    /* Register the misc device that carries the control ioctls. */
    ret = misc_register(&_dm_misc);
    if (!ret) {
        DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR,
               DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA,
               DM_DRIVER_EMAIL);
        return 0;
    }

    DMERR("misc_register failed for control device");
    dm_hash_exit();
    return ret;
}

  前面说过,User Space的程序主要是通过DM驱动IOCTL函数进行相关的请求,所以接下来我们着重分析下IOCTL函数。

DM中IOCTL

/*
 * unlocked_ioctl handler of the control device: treat the argument as a
 * user-space struct dm_ioctl pointer and hand off to ctl_ioctl().
 * The file argument is not used.
 */
static long dm_ctl_ioctl(struct file *file, uint command, ulong u)
{
    struct dm_ioctl __user *user_param = (struct dm_ioctl __user *)u;

    return (long)ctl_ioctl(command, user_param);
}

#ifdef CONFIG_COMPAT
/*
 * compat_ioctl handler: pass the argument through compat_ptr() and
 * forward to dm_ctl_ioctl().
 */
static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u)
{
    ulong converted_arg = (ulong) compat_ptr(u);

    return (long)dm_ctl_ioctl(file, command, converted_arg);
}
#else
/* Without CONFIG_COMPAT there is no compat handler. */
#define dm_compat_ctl_ioctl NULL
#endif

  不管dm_compat_ctl_ioctl还是dm_ctl_ioctl,最后都是会调用到ctl_ioctl方法的。

/*
 * Common back end for dm_ctl_ioctl() and dm_compat_ctl_ioctl().
 *
 * command: raw ioctl number; its type field must be DM_IOCTL and its NR
 *          field selects the handler through lookup_ioctl().
 * user:    user-space pointer to the struct dm_ioctl parameter block.
 *
 * Returns 0 on success; -EACCES without CAP_SYS_ADMIN, -ENOTTY for a
 * foreign or unknown command, -EFAULT if the result cannot be copied back
 * to user space, otherwise the error of the helper or handler that failed.
 */
static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
{
    int r = 0;
    int ioctl_flags;
    int param_flags;
    unsigned int cmd;
    struct dm_ioctl *uninitialized_var(param);
    ioctl_fn fn = NULL;
    size_t input_param_size;
    struct dm_ioctl param_kernel;

    /* Permission check: the caller needs CAP_SYS_ADMIN. */
    if (!capable(CAP_SYS_ADMIN))
        return -EACCES;

    if (_IOC_TYPE(command) != DM_IOCTL)
        return -ENOTTY;

    cmd = _IOC_NR(command);

    /* Check the interface version supplied by user space. */
    r = check_version(cmd, user);
    if (r)
        return r;

    /* DM_VERSION_CMD needs nothing beyond the version check above. */
    if (cmd == DM_VERSION_CMD)
        return 0;

    /* Find the handler and its flags for this command. */
    fn = lookup_ioctl(cmd, &ioctl_flags);
    if (!fn) {
        DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
        return -ENOTTY;
    }

    /* Copy the parameter block from user space into kernel memory. */
    r = copy_params(user, &param_kernel, ioctl_flags, &param, &param_flags);

    if (r)
        return r;

    /* Remember the caller's size: data_size is overwritten just below. */
    input_param_size = param->data_size;
    r = validate_params(cmd, param);
    if (r)
        goto out;

    /* Reset data_size to the fixed header size before running the handler. */
    param->data_size = offsetof(struct dm_ioctl, data);
    r = fn(param, input_param_size);

    if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) &&
        unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS))
        DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd);

    /* Copy the result back to user space, only if the handler succeeded. */
    if (!r && copy_to_user(user, param, param->data_size))
        r = -EFAULT;

out:
    free_params(param, input_param_size, param_flags);
    return r;
}


/*-----------------------------------------------------------------
 * Implementation of open/close/ioctl on the special char
 * device.
 *---------------------------------------------------------------*/
/*
 * Map an ioctl command number to its handler.
 *
 * cmd:         command number (the NR field of the ioctl).
 * ioctl_flags: receives the flags of the matching table row; left
 *              untouched when cmd is out of range.
 *
 * Returns the handler, or NULL when cmd is beyond the table or the row
 * has no handler (as for DM_VERSION_CMD).
 */
static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
{
    /*
     * The table is indexed directly by command number (see the
     * _ioctls[cmd] lookup below), so the row order is significant.
     */
    static struct {
        int cmd;
        int flags;
        ioctl_fn fn;
    } _ioctls[] = {
        { DM_VERSION_CMD, 0, NULL }, /* version is dealt with elsewhere */
        { DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS, remove_all },
        { DM_LIST_DEVICES_CMD, 0, list_devices },

        { DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create },
        { DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_remove },
        { DM_DEV_RENAME_CMD, 0, dev_rename },
        { DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend },
        { DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status },
        { DM_DEV_WAIT_CMD, 0, dev_wait },

        { DM_TABLE_LOAD_CMD, 0, table_load },
        { DM_TABLE_CLEAR_CMD, IOCTL_FLAGS_NO_PARAMS, table_clear },
        { DM_TABLE_DEPS_CMD, 0, table_deps },
        { DM_TABLE_STATUS_CMD, 0, table_status },

        { DM_LIST_VERSIONS_CMD, 0, list_versions },

        { DM_TARGET_MSG_CMD, 0, target_message },
        { DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry }
    };
    ioctl_fn handler;

    if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))
        return NULL;

    handler = _ioctls[cmd].fn;
    *ioctl_flags = _ioctls[cmd].flags;

    return handler;
}

  上面的lookup_ioctl函数首先定义了一个由IOCTL command及其对应操作函数组成的结构体数组,支持的command以及对应函数都在这里了。我们挑一些重要的来看看就好。

DM_DEV_CREATE_CMD

/*
 * Handler for DM_DEV_CREATE_CMD: create a mapped device and enter it in
 * the name/uuid hash.
 *
 * param:      ioctl parameter block; name, uuid, flags and dev are read,
 *             and the block is filled with the device status on success.
 * param_size: not used by this handler.
 *
 * Returns 0 on success, otherwise the error from check_name(),
 * dm_create() or dm_hash_insert().
 */
static int dev_create(struct dm_ioctl *param, size_t param_size)
{
    struct mapped_device *md;
    int minor = DM_ANY_MINOR;
    int ret;

    /* Reject invalid device names first. */
    ret = check_name(param->name);
    if (ret)
        return ret;

    /* With DM_PERSISTENT_DEV_FLAG set, take the minor from param->dev. */
    if (param->flags & DM_PERSISTENT_DEV_FLAG)
        minor = MINOR(huge_decode_dev(param->dev));

    /* The device itself is built by dm_create(). */
    ret = dm_create(minor, &md);
    if (ret)
        return ret;

    /* An empty uuid string is passed on as NULL. */
    ret = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md);
    if (ret) {
        dm_put(md);
        dm_destroy(md);
        return ret;
    }

    param->flags &= ~DM_INACTIVE_PRESENT_FLAG;

    __dev_status(md, param);

    dm_put(md);

    return 0;
}

dm_create ===> alloc_dev


/*
 * Create a mapped device for the given minor and set up its sysfs side.
 *
 * minor:  minor number forwarded to alloc_dev().
 * result: receives the new device on success; untouched on failure.
 *
 * Returns 0 on success, -ENXIO when alloc_dev() returns NULL.
 */
int dm_create(int minor, struct mapped_device **result)
{
    struct mapped_device *new_md = alloc_dev(minor);

    if (!new_md)
        return -ENXIO;

    dm_sysfs_init(new_md);
    *result = new_md;

    return 0;
}

/*
 * Allocate and initialise a mapped_device for the given minor.
 *
 * minor: a specific minor number, or DM_ANY_MINOR to take the next free one.
 *
 * Returns the new device, or NULL if any step fails. On failure the
 * resources acquired so far are released through the bad_* label chain
 * at the bottom, in reverse order of acquisition.
 */
static struct mapped_device *alloc_dev(int minor)
{
    int r;
    // zero-filled allocation of the device structure
    struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
    void *old_md;

    if (!md) {
        DMWARN("unable to allocate device, out of memory.");
        return NULL;
    }

    // take a module reference; dropped again at bad_minor on failure
    if (!try_module_get(THIS_MODULE))
        goto bad_module_get;

    /* get a minor number for the dev */
    if (minor == DM_ANY_MINOR)
        r = next_free_minor(&minor);
    else
        r = specific_minor(minor);
    if (r < 0)
        goto bad_minor;

    // initialise the SRCU structure md->io_barrier
    r = init_srcu_struct(&md->io_barrier);
    if (r < 0)
        goto bad_io_barrier;

    // initialise the mapped_device fields
    md->type = DM_TYPE_NONE;
    mutex_init(&md->suspend_lock);
    mutex_init(&md->type_lock);
    mutex_init(&md->table_devices_lock);
    spin_lock_init(&md->deferred_lock);
    atomic_set(&md->holders, 1);
    atomic_set(&md->open_count, 0);
    atomic_set(&md->event_nr, 0);
    atomic_set(&md->uevent_seq, 0);
    INIT_LIST_HEAD(&md->uevent_list);
    INIT_LIST_HEAD(&md->table_devices);
    spin_lock_init(&md->uevent_lock);

    // request queue for this device
    md->queue = blk_alloc_queue(GFP_KERNEL);
    if (!md->queue)
        goto bad_queue;

    dm_init_md_queue(md);

    // gendisk for this device
    md->disk = alloc_disk(1);
    if (!md->disk)
        goto bad_disk;

    atomic_set(&md->pending[0], 0);
    atomic_set(&md->pending[1], 0);
    init_waitqueue_head(&md->wait);
    INIT_WORK(&md->work, dm_wq_work);
    init_waitqueue_head(&md->eventq);
    init_completion(&md->kobj_holder.completion);

    md->disk->major = _major;
    md->disk->first_minor = minor;
    // install the block_device_operations table (dm_blk_dops)
    md->disk->fops = &dm_blk_dops;
    md->disk->queue = md->queue;
    md->disk->private_data = md;
    // the disk is named "dm-<minor>"
    sprintf(md->disk->disk_name, "dm-%d", minor);
    // add the disk to the kernel
    add_disk(md->disk);
    format_dev_t(md->name, MKDEV(_major, minor));

    md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
    if (!md->wq)
        goto bad_thread;

    md->bdev = bdget_disk(md->disk, 0);
    if (!md->bdev)
        goto bad_bdev;

    // prepare the embedded flush bio, targeting this device's bdev
    bio_init(&md->flush_bio);
    md->flush_bio.bi_bdev = md->bdev;
    md->flush_bio.bi_rw = WRITE_FLUSH;

    dm_stats_init(&md->stats);

    /* Populate the mapping, nobody knows we exist yet */
    spin_lock(&_minor_lock);
    old_md = idr_replace(&_minor_idr, md, minor);
    spin_unlock(&_minor_lock);

    // the slot must still have held the MINOR_ALLOCED placeholder
    BUG_ON(old_md != MINOR_ALLOCED);

    return md;

    // error unwinding: each label releases one resource and falls through
bad_bdev:
    destroy_workqueue(md->wq);
bad_thread:
    del_gendisk(md->disk);
    put_disk(md->disk);
bad_disk:
    blk_cleanup_queue(md->queue);
bad_queue:
    cleanup_srcu_struct(&md->io_barrier);
bad_io_barrier:
    free_minor(minor);
bad_minor:
    module_put(THIS_MODULE);
bad_module_get:
    kfree(md);
    return NULL;
}

// 设置的块设备操作函数集
/* Block-device operations table; alloc_dev() installs it as md->disk->fops. */
static const struct block_device_operations dm_blk_dops = {
    .owner   = THIS_MODULE,
    .open    = dm_blk_open,
    .release = dm_blk_close,
    .ioctl   = dm_blk_ioctl,
    .getgeo  = dm_blk_getgeo,
};

alloc_dev函数基本就是对mapped_device结构体的内容进行逐项初始化。然后通过add_disk函数将该block device添加到内核中。接下来分析另一个IOCTL command。

DM_TABLE_LOAD_CMD

  DM_TABLE_LOAD_CMD根据映射表为目标target和真实的block设备建立映射关系。

/*
 * Handler for DM_TABLE_LOAD_CMD: build a new table from the ioctl
 * parameters and stage it as the device's inactive table (hc->new_map).
 *
 * param:      ioctl parameter block identifying the device and carrying
 *             the target specification; filled with status on success.
 * param_size: size value forwarded to populate_table().
 *
 * Returns 0 on success; -ENXIO if the device cannot be found or has left
 * the hash table, -EINVAL on an immutable-target or device-type mismatch,
 * otherwise the error of the helper that failed. The device reference
 * obtained from find_device() is dropped with dm_put() on every path.
 */
static int table_load(struct dm_ioctl *param, size_t param_size)
{
    int r;
    struct hash_cell *hc;
    struct dm_table *t, *old_map = NULL;
    struct mapped_device *md;
    struct target_type *immutable_target_type;

    // look up the mapped_device this request refers to
    md = find_device(param);
    if (!md)
        return -ENXIO;

    // allocate the table
    r = dm_table_create(&t, get_mode(param), param->target_count, md);
    if (r)
        goto err;

    /* Protect md->type and md->queue against concurrent table loads. */
    dm_lock_md_type(md);
    // fill the table from the ioctl parameters
    r = populate_table(t, param, param_size);
    if (r)
        goto err_unlock_md_type;

    // a device with an immutable target type only accepts tables of that type
    immutable_target_type = dm_get_immutable_target_type(md);
    if (immutable_target_type &&
        (immutable_target_type != dm_table_get_immutable_target_type(t))) {
        DMWARN("can't replace immutable target type %s",
               immutable_target_type->name);
        r = -EINVAL;
        goto err_unlock_md_type;
    }

    if (dm_get_md_type(md) == DM_TYPE_NONE)
        /* Initial table load: acquire type of table. */
        dm_set_md_type(md, dm_table_get_type(t));
    else if (dm_get_md_type(md) != dm_table_get_type(t)) {
        DMWARN("can't change device type after initial table load.");
        r = -EINVAL;
        goto err_unlock_md_type;
    }

    /* setup md->queue to reflect md's type (may block) */
    r = dm_setup_md_queue(md);
    if (r) {
        DMWARN("unable to set up device queue for new table.");
        goto err_unlock_md_type;
    }
    dm_unlock_md_type(md);

    /* stage inactive table */
    down_write(&_hash_lock);
    hc = dm_get_mdptr(md);
    if (!hc || hc->md != md) {
        DMWARN("device has been removed from the dev hash table.");
        up_write(&_hash_lock);
        r = -ENXIO;
        // the type lock was already released above, so skip err_unlock_md_type
        goto err_destroy_table;
    }

    // remember a previously staged table so it can be destroyed below
    if (hc->new_map)
        old_map = hc->new_map;
    hc->new_map = t;
    up_write(&_hash_lock);

    param->flags |= DM_INACTIVE_PRESENT_FLAG;
    __dev_status(md, param);

    // destroy the replaced table, after dm_sync_table() and outside _hash_lock
    if (old_map) {
        dm_sync_table(md);
        dm_table_destroy(old_map);
    }

    dm_put(md);

    return 0;

err_unlock_md_type:
    dm_unlock_md_type(md);
err_destroy_table:
    dm_table_destroy(t);
err:
    dm_put(md);

    return r;
}

  正如上一篇中所说:用户空间命令通过ioctl调用table_load函数,该函数根据用户空间传来的参数构建指定mapped device的映射表和所映射的target device。该函数先构建相应的dm_table、dm_target结构,再调用dm-table.c中的dm_table_add_target(populate_table--->dm_table_add_target)函数根据用户传入的参数初始化这些结构,并且根据参数所指定的target类型,调用相应的target类型的构建函数ctr在内存中构建target device对应的结构,然后再根据所建立的dm_target结构更新dm_table中维护的B树。上述过程完毕后,再将建立好的dm_table添加到mapped device的全局hash表对应的hash_cell结构中。

  这一篇基本就这样了,简单的从代码层面分析下DM。大家在阅读代码的时候可以结合上一篇的内容同步分析,这样应该会好理解很多。Have fun!

发表评论

电子邮件地址不会被公开。 必填项已用*标注

This site uses Akismet to reduce spam. Learn how your comment data is processed.

%d 博主赞过: