前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >聊聊lustre中mgc的实现和思考

聊聊lustre中mgc的实现和思考

作者头像
用户4700054
发布2022-08-17 12:49:21
4210
发布2022-08-17 12:49:21
举报
文章被收录于专栏:存储内核技术交流

mgc实现代码走读

  • lustre_mount : 客户端挂载lustre的时候,内核执行do_mount系统调用,最终是调用lustre_mount来完成。lustre_mount并无实际的物理磁盘设备,因此执行mount_nodev,其中核心的是lustre_fill_super函数
代码语言:javascript
复制
// Entry point reached from do_mount when a Lustre client is mounted.
// Delegates to mount_nodev(), passing lustre_fill_super as the callback
// that fills in the super block. devname is not used here.
static struct dentry *lustre_mount(struct file_system_type *fs_type, int flags,
				   const char *devname, void *data)
{
	return mount_nodev(fs_type, flags, data, lustre_fill_super);
}
  • lustre_fill_super:客户端执行mount时候,该函数完成lustre客户端超级块的初始化,初始化会完成lustre客户端中vvp/lov/lovsub/osc/mdc/mgc等模块初始化
代码语言:javascript
复制
/*
 * Fill in the client super block at mount time (abbreviated excerpt:
 * the declarations of lsi, lmd and rc, plus error handling and the
 * return statement, are not shown here).
 *
 * Visible steps: allocate the Lustre super-block info, parse the mount
 * data into lmd, start the MGC, then call ll_fill_super().
 */
static int lustre_fill_super(struct super_block *sb, void *lmd2_data,
			     int silent)
{
	// Initialize the Lustre super-block info; returns struct lustre_sb_info *
	lsi = lustre_init_lsi(sb);

	// Parse the mount data into lmd
	lmd_parse(lmd2_data, lmd);
	// Start the MGC
	rc = lustre_start_mgc(sb);

	rc = ll_fill_super(sb);
}
  • lustre_init_lsi :lustre_sb_info申请空间,并且把超级块的s_fs_info设置为lustre_sb_info,lustre_sb_info包括了struct lustre_mount_data *lsi_lmd,lsi_lmd是保存客户端mount的信息
代码语言:javascript
复制
/***************** lustre superblock **************/

/*
 * Allocate a lustre_sb_info together with its lsi_lmd member, attach it
 * to the super block through s2lsi_nocast(sb), and return it.
 *
 * NOTE(review): this excerpt shows no check of either allocation before
 * lsi is dereferenced -- presumably elided from the listing; confirm
 * against the full source.
 */
struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
{
	struct lustre_sb_info *lsi;
	OBD_ALLOC_PTR(lsi);
	OBD_ALLOC_PTR(lsi->lsi_lmd);
	s2lsi_nocast(sb) = lsi;
	RETURN(lsi);
}
  • lmd_parse :解析mount命令的参数信息,并保存在struct lustre_mount_data *lmd结构中。这里的mount命令是:mount -t lustre CentOS-Lustre-Server@tcp0:/perrynfs /mnt/lustre/
代码语言:javascript
复制
// perrynfs-client 的格式为 {fsname}-client.这是mgt上保存的固定配置文件
mount data:
		profile: perrynfs-client
		device:  10.211.55.4@tcp:/perrynfs
		flags:   2

  • lustre_start_mgc :设置mgc obd处理层,通过传入的mount信息连接到mgs获取整个文件系统的信息。
代码语言:javascript
复制
// obd_ops function table for the MGC; class_process_config invokes the
// matching handler according to the configuration command.
static const struct obd_ops mgc_obd_ops = {
        .o_owner        = THIS_MODULE,
        .o_setup        = mgc_setup,
        .o_precleanup   = mgc_precleanup,
        .o_cleanup      = mgc_cleanup,
        .o_add_conn     = client_import_add_conn,
        .o_del_conn     = client_import_del_conn,
        .o_connect      = client_connect_import,
        .o_disconnect   = client_disconnect_export,
        .o_set_info_async = mgc_set_info_async,
        .o_get_info       = mgc_get_info,
        .o_import_event = mgc_import_event,
        .o_process_config = mgc_process_config,
};


/*
 * Bring up the MGC obd for a mount and connect it to the MGS
 * (abbreviated excerpt: declarations, the parsing of mgcname/niduuid,
 * error handling and the return statement are not shown).
 */
int lustre_start_mgc(struct super_block *sb)
{
	// mgcname as parsed from the mount data, e.g. 'MGC10.211.55.4@tcp'
	// nid as parsed from the mount data
	CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname);

	/* Start the MGC */
	rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME,
				 (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
				 niduuid, NULL, NULL);
	// With the MGC obd initialized, add a connection through a config
	// command; this ends up calling client_import_add_conn
	rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN,
				     niduuid, NULL, NULL, NULL);

	// Look up the MGC obd by name
	obd = class_name2obd(mgcname);

	// Set info on the MGC obd; this calls mgc_set_info_async
	rc = obd_set_info_async(NULL, obd->obd_self_export,
				strlen(KEY_MGSSEC), KEY_MGSSEC,
				strlen(mgssec), mgssec, NULL);

	// Connect to the MGS to obtain the configuration logs of the whole
	// filesystem: this calls client_connect_import, after which the MGS
	// pushes the filesystem's configuration logs.
	rc = obd_connect(NULL, &exp, obd, uuid, data, NULL);
}
  • lustre_start_simple :根据配置注册、申请、初始化mgc的struct obd_device
代码语言:javascript
复制
/**
 * Attach an OBD device and then set it up by issuing LCFG_ATTACH followed
 * by LCFG_SETUP through do_lcfg().  If the setup step fails, the device is
 * detached again before returning.
 *
 * Returns 0 on success, otherwise the non-zero code of the do_lcfg() call
 * that failed.
 */
int lustre_start_simple(char *obdname, char *type, char *uuid,
			char *s1, char *s2, char *s3, char *s4)
{
	int rc;

	/* For the MGC this prints: Starting OBD MGC10.211.55.4@tcp (typ=mgc) */
	CDEBUG(D_MOUNT, "Starting OBD %s (typ=%s)\n", obdname, type);

	rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, NULL, NULL);
	if (rc != 0) {
		CERROR("%s attach error %d\n", obdname, rc);
		return rc;
	}

	rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4);
	if (rc == 0)
		return 0;

	/* Setup failed: report it and undo the attach. */
	CERROR("%s setup error %d\n", obdname, rc);
	do_lcfg(obdname, 0, LCFG_DETACH, NULL, NULL, NULL, NULL);
	return rc;
}
  • do_lcfg :根据传入的参数,注册和初始化mgc的OBD。mgc的OBD会经历LCFG_ATTACH->LCFG_SETUP,其中class_process_config是处理整个逻辑的核心
代码语言:javascript
复制
/**************** OBD start *******************/

/**
 * lustre_cfg_bufs are a holdover from 1.4; we can still set these up from
 * lctl (and do for echo cli/srv).
 *
 * Abbreviated excerpt: the construction of lcfg from cfgname, nid, cmd and
 * s1..s4 is not shown; the visible part hands lcfg to
 * class_process_config() and returns its result.
 */
static int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
		   char *s1, char *s2, char *s3, char *s4)
{
	
	rc = class_process_config(lcfg);
	return rc;
}
  • class_process_config :根据配置信息处理mgc的信息,首先是attach,其次是setup.setup实际调用的是mgc_setup完成
代码语言:javascript
复制
// obd_ops function table for the MGC; class_process_config invokes the
// matching handler according to the configuration command.
static const struct obd_ops mgc_obd_ops = {
        .o_owner        = THIS_MODULE,
        .o_setup        = mgc_setup,
        .o_precleanup   = mgc_precleanup,
        .o_cleanup      = mgc_cleanup,
        .o_add_conn     = client_import_add_conn,
        .o_del_conn     = client_import_del_conn,
        .o_connect      = client_connect_import,
        .o_disconnect   = client_disconnect_export,
        .o_set_info_async = mgc_set_info_async,
        .o_get_info       = mgc_get_info,
        .o_import_event = mgc_import_event,
        .o_process_config = mgc_process_config,
};


/*
 * Dispatch a configuration command (abbreviated excerpt: declarations,
 * the lookup of obd, the `out` label and the return are not shown).
 *
 * LCFG_ATTACH is handled in the first switch; LCFG_SETUP and
 * LCFG_ADD_CONN, which operate on an obd, are handled in the second.
 */
int class_process_config(struct lustre_cfg *lcfg)
{
	switch (lcfg->lcfg_command) {
		case LCFG_ATTACH: {
		// Register an MGC OBD and also export the device through
		// which the MGC receives messages
			err = class_attach(lcfg);
			GOTO(out, err);
		}
	}
	switch(lcfg->lcfg_command) {
		case LCFG_SETUP: {
			// For the MGC this actually ends up calling mgc_setup
			err = class_setup(obd, lcfg);
			GOTO(out, err);
		}
		case LCFG_ADD_CONN: {
			err = class_add_conn(obd, lcfg);
			// The result of class_add_conn is overwritten with 0 here
			GOTO(out, err = 0);
		}
	}
}

// Create the MGC OBD's self export used to receive messages, register the
// MGC OBD with the kernel, and mark it attached (abbreviated excerpt:
// declarations, creation of obd, error handling and the return are not
// shown).
int class_attach(struct lustre_cfg *lcfg)
{
	exp = class_new_export_self(obd, &obd->obd_uuid);
	rc = class_register_device(obd);
	obd->obd_attached = 1;
}
  • mgc_setup: mgc的初始化设置
代码语言:javascript
复制
/*
 * Set up the MGC obd (abbreviated excerpt: the err_decref / err_cleanup
 * labels targeted by GOTO are not shown, and the results of
 * mgc_tunables_init() and kthread_run() are not checked in this listing).
 */
static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
{
	struct task_struct	*task;
	int			 rc;

	// Initialize the client RPC data structures and set up the MGC import
	// used to send messages to the MGS
	rc = client_obd_setup(obd, lcfg);
	if (rc)
		GOTO(err_decref, rc);
	// Initialize the Lustre log (llog) context of the MGC
	rc = mgc_llog_init(NULL, obd);
	if (rc) {
		CERROR("failed to setup llogging subsystems\n");
		GOTO(err_cleanup, rc);
	}
	
	rc = mgc_tunables_init(obd);

	// Start a kernel thread running mgc_requeue_thread, which watches for
	// Lustre configuration changes
	task = kthread_run(mgc_requeue_thread, NULL, "ll_cfg_requeue");
	RETURN(rc);
}

// Function executed by the kernel thread started in mgc_setup.
//
// Loops until RQ_STOP is set in rq_state. Each iteration waits on rq_waitq
// (until RQ_STOP/RQ_PRECLEANUP is set or the timeout expires), then walks
// config_llog_list and calls do_requeue() for every entry whose
// cld_lostlock is set and which is not stopping.
//
// NOTE(review): abbreviated excerpt -- the declarations of to/rand, the
// initialization of cld_prev, the spin_lock() calls matching the
// spin_unlock() calls below, and the return statement are not shown.
static int mgc_requeue_thread(void *data)
{
	while (!(rq_state & RQ_STOP)) {
		struct config_llog_data *cld, *cld_prev;
		/* Always wait a few seconds to allow the server who
		 * caused the lock revocation to finish its setup, plus some
		 * random so everyone doesn't try to reconnect at once.
		 */
		to = cfs_time_seconds(MGC_TIMEOUT_MIN_SECONDS * 100 + rand);
		/* rand is centi-seconds */
		wait_event_idle_timeout(rq_waitq,
					rq_state & (RQ_STOP | RQ_PRECLEANUP),
					to/100);

		list_for_each_entry(cld, &config_llog_list,
				    cld_list_chain) {
			/* Only entries that lost their lock and are not
			 * being stopped need to be requeued. */
			if (!cld->cld_lostlock || cld->cld_stopping)
				continue;

			/* hold reference to avoid being freed during
			 * subsequent processing. */
			config_log_get(cld);
			cld->cld_lostlock = 0;
			spin_unlock(&config_list_lock);

			/* Drop the reference taken on the previously
			 * processed entry and remember the current one. */
			config_log_put(cld_prev);
			cld_prev = cld;
			
			if (likely(!(rq_state & RQ_STOP))) {
				do_requeue(cld);
			} else {
				break;
			}
		}
		spin_unlock(&config_list_lock);
	}
}
/*
 * Reprocess one config log by calling mgc_process_log() on the obd behind
 * the log's MGC export (abbreviated excerpt: the declaration of rc and any
 * handling of its value are not shown).
 */
static void do_requeue(struct config_llog_data *cld)
{
	rc = mgc_process_log(cld->cld_mgcexp->exp_obd, cld);
}
关于MGC实现的思考
  • 整个lustre文件系统的配置系统,都是通过mgc定期检查从mgs获取的。这里可以看到获取到的是mgt上的CONFIGS/{fsname}-client文件。我们通过以ldiskfs挂载mgt,查看{fsname}-client如下:
代码语言:javascript
复制
[perrynzhou@CentOS-Lustre-Server /mnt/mgt_mdt/CONFIGS]$ llog_reader  ./perrynfs-client 
Header size : 8192
Time : Wed Feb  9 01:17:13 2022
Number of records: 35
Target uuid : config_uuid
-----------------------
#01 (224)marker   4 (flags=0x01, v2.14.0.0) perrynfs-clilov 'lov setup' Wed Feb  9 01:17:13 2022-
#02 (120)attach    0:perrynfs-clilov  1:lov  2:perrynfs-clilov_UUID  
#03 (168)lov_setup 0:perrynfs-clilov  1:(struct lov_desc)
                uuid=perrynfs-clilov_UUID  stripe:cnt=1 size=1048576 offset=18446744073709551615 pattern=0x1
#04 (224)END   marker   4 (flags=0x02, v2.14.0.0) perrynfs-clilov 'lov setup' Wed Feb  9 01:17:13 2022-
#05 (224)marker   5 (flags=0x01, v2.14.0.0) perrynfs-clilmv 'lmv setup' Wed Feb  9 01:17:13 2022-
#06 (120)attach    0:perrynfs-clilmv  1:lmv  2:perrynfs-clilmv_UUID  
#07 (168)lov_setup 0:perrynfs-clilmv  1:(struct lov_desc)
                uuid=perrynfs-clilmv_UUID  stripe:cnt=0 size=0 offset=0 pattern=0
#08 (224)END   marker   5 (flags=0x02, v2.14.0.0) perrynfs-clilmv 'lmv setup' Wed Feb  9 01:17:13 2022-
#09 (224)marker   6 (flags=0x01, v2.14.0.0) perrynfs-MDT0000 'add mdc' Wed Feb  9 01:17:13 2022-
#10 (080)add_uuid  nid=10.211.55.4@tcp(0x200000ad33704)  0:  1:10.211.55.4@tcp  
#11 (128)attach    0:perrynfs-MDT0000-mdc  1:mdc  2:perrynfs-clilmv_UUID  
#12 (136)setup     0:perrynfs-MDT0000-mdc  1:perrynfs-MDT0000_UUID  2:10.211.55.4@tcp  
#13 (168)modify_mdc_tgts add 0:perrynfs-clilmv  1:perrynfs-MDT0000_UUID  2:0  3:1  4:perrynfs-MDT0000-mdc_UUID  
#14 (224)END   marker   6 (flags=0x02, v2.14.0.0) perrynfs-MDT0000 'add mdc' Wed Feb  9 01:17:13 2022-
#15 (224)marker   7 (flags=0x01, v2.14.0.0) perrynfs-client 'mount opts' Wed Feb  9 01:17:13 2022-
#16 (120)mount_option 0:  1:perrynfs-client  2:perrynfs-clilov  3:perrynfs-clilmv  
#17 (224)END   marker   7 (flags=0x02, v2.14.0.0) perrynfs-client 'mount opts' Wed Feb  9 01:17:13 2022-
#18 (224)marker  10 (flags=0x01, v2.14.0.0) perrynfs-OST0000 'add osc' Wed Feb  9 01:17:19 2022-
#19 (080)add_uuid  nid=10.211.55.4@tcp(0x200000ad33704)  0:  1:10.211.55.4@tcp  
#20 (128)attach    0:perrynfs-OST0000-osc  1:osc  2:perrynfs-clilov_UUID  
#21 (136)setup     0:perrynfs-OST0000-osc  1:perrynfs-OST0000_UUID  2:10.211.55.4@tcp  
#22 (128)lov_modify_tgts add 0:perrynfs-clilov  1:perrynfs-OST0000_UUID  2:0  3:1  
#23 (224)END   marker  10 (flags=0x02, v2.14.0.0) perrynfs-OST0000 'add osc' Wed Feb  9 01:17:19 2022-
#24 (224)marker  13 (flags=0x01, v2.14.0.0) perrynfs-OST0001 'add osc' Wed Feb  9 01:17:19 2022-
#25 (080)add_uuid  nid=10.211.55.4@tcp(0x200000ad33704)  0:  1:10.211.55.4@tcp  
#26 (128)attach    0:perrynfs-OST0001-osc  1:osc  2:perrynfs-clilov_UUID  
#27 (136)setup     0:perrynfs-OST0001-osc  1:perrynfs-OST0001_UUID  2:10.211.55.4@tcp  
#28 (128)lov_modify_tgts add 0:perrynfs-clilov  1:perrynfs-OST0001_UUID  2:1  3:1  
#29 (224)END   marker  13 (flags=0x02, v2.14.0.0) perrynfs-OST0001 'add osc' Wed Feb  9 01:17:19 2022-
#30 (224)marker  16 (flags=0x01, v2.14.0.0) perrynfs-OST0002 'add osc' Wed Feb  9 01:17:19 2022-
#31 (080)add_uuid  nid=10.211.55.4@tcp(0x200000ad33704)  0:  1:10.211.55.4@tcp  
#32 (128)attach    0:perrynfs-OST0002-osc  1:osc  2:perrynfs-clilov_UUID  
#33 (136)setup     0:perrynfs-OST0002-osc  1:perrynfs-OST0002_UUID  2:10.211.55.4@tcp  
#34 (128)lov_modify_tgts add 0:perrynfs-clilov  1:perrynfs-OST0002_UUID  2:2  3:1  
#35 (224)END   marker  16 (flags=0x02, v2.14.0.0) perrynfs-OST0002 'add osc' Wed Feb  9 01:17:19 2022-
  • 如果在整个lustre文件系统中添加/删除mds或者oss,已经挂载的客户端能在很短的时间内感知到这个变化,这是由客户端mgc中的mgc_requeue_thread函数实现的。所以在lustre文件系统中,任何针对参数或者组件的变动,所有客户端都能很快感知。
本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。
原始发表:2022-02-11,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 存储内核技术交流 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • mgc实现代码走读
    • 关于MGC实现的思考
    领券
    问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档