Round Robin(RR)
和Qos
.qos_threshold_rr
是设置选择ost的算法从RR
切换到加权算法的阈值。默认情况下,如果任意两个ost的空闲空间相差超过17%,会自动切换到加权算法。如果设置qos_threshold_rr=0
会选择Qos
算法;如果设置qos_threshold_rr=100
会选择RR
算法。qos中的17%是在LOV_QOS_DEF_THRESHOLD_RR_PCT
中定义的。qos_prio_free
是设置ost空闲空间在加权算法中所占的权重,当qos_prio_free=100
时,不再按照ost的顺序分配stripe对象,而是完全基于ost空闲空间的权重进行分配。比如ost2的空闲空间是ost1的两倍,这时候ost2被选中的概率大约是ost1的两倍。但是如果设置了qos_threshold_rr=100
,这时候设置qos_prio_free=100
是不会生效的。
reserved_mb_low
,这个参数单位是MB,如果ost上的可用空间低于reserved_mb_low(默认值是ost总大小的0.1%),会停止在这个ost上分配对象分片。
reserved_mb_high
,这个参数单位是MB,如果ost上的可用空间高于reserved_mb_high(默认值是ost总大小的0.2%),会重新开始在这个ost上分配对象分片。
qos_threshold_rr
和qos_prio_free
需要在mds上进行设置。
// 在mds节点上查看 qos_threshold_rr
[root@CentOS-Lustre-MDS_MGS ~]$ lctl get_param lod.bigfs*.qos_threshold_rr
lod.bigfs-MDT0000-mdtlov.qos_threshold_rr=17%
// 在mds节点上设置 qos_threshold_rr
[root@CentOS-Lustre-MDS_MGS ~]$ lctl set_param lod.bigfs*.qos_threshold_rr=25
lod.bigfs-MDT0000-mdtlov.qos_threshold_rr=25%
// 在mds节点上查看
[root@CentOS-Lustre-MDS_MGS ~]$ lctl get_param lod.bigfs-MDT*.qos_prio_free
lod.bigfs-MDT0000-mdtlov.qos_prio_free=91%
// 在mds节点上设置 qos_prio_free
[root@CentOS-Lustre-MDS_MGS ~]$ lctl set_param lod.bigfs-MDT*.qos_prio_free=85
lod.bigfs-MDT0000-mdtlov.qos_prio_free=85
reserved_mb_low
和reserved_mb_high
需要在mds上进行设置,其中osp
是mds访问oss上对象存储的接口层。
// 在mds节点上查看 reserved_mb_low
[root@CentOS-Lustre-MDS_MGS ~]$ lctl get_param osp.*.reserved_mb_low
osp.bigfs-OST0000-osc-MDT0000.reserved_mb_low=61
osp.bigfs-OST0001-osc-MDT0000.reserved_mb_low=61
// 在mds节点上设置 reserved_mb_low
[root@CentOS-Lustre-MDS_MGS ~]$ lctl set_param osp.*.reserved_mb_low=10
osp.bigfs-OST0000-osc-MDT0000.reserved_mb_low=10
osp.bigfs-OST0001-osc-MDT0000.reserved_mb_low=10
// 在mds节点上查看 reserved_mb_high
[root@CentOS-Lustre-MDS_MGS ~]$ lctl get_param osp.*.reserved_mb_high
osp.bigfs-OST0000-osc-MDT0000.reserved_mb_high=123
osp.bigfs-OST0001-osc-MDT0000.reserved_mb_high=123
// 在mds节点上设置 reserved_mb_high
[root@CentOS-Lustre-MDS_MGS ~]$ lctl set_param osp.*.reserved_mb_high=40
osp.bigfs-OST0000-osc-MDT0000.reserved_mb_high=40
osp.bigfs-OST0001-osc-MDT0000.reserved_mb_high=40
lod_ost_alloc_qos
:计算每个后端存储的容量相关的元数据,选择合适的ost。
// lustre可以配置多个mdt和多个ost,所以定义了LOV_QOS_DEF_THRESHOLD_RR_PCT和LMV_QOS_DEF_THRESHOLD_RR_PCT,分别对应ost和mdt的容量差阈值;加权算法和rr算法同样适用于后端的mdt
/*
 * Default QoS tunables. Per the accompanying notes, the LOV_ values apply to
 * OST selection and the LMV_ values to MDT selection.
 */
/* Default free-space imbalance, in percent, at which OST selection switches
 * from round-robin to the weighted allocator (qos_threshold_rr). */
#define LOV_QOS_DEF_THRESHOLD_RR_PCT 17
/* Same threshold, in percent, for MDT selection. */
#define LMV_QOS_DEF_THRESHOLD_RR_PCT 5
/* NOTE(review): presumably the default for qos_prio_free on OSTs, in percent
 * - confirm against where this macro is consumed. */
#define LOV_QOS_DEF_PRIO_FREE 90
/* NOTE(review): presumably the MDT counterpart of the value above - confirm. */
#define LMV_QOS_DEF_PRIO_FREE 90
// Allocate a striping using an algorithm with weights.
// NOTE(review): the body shown here appears to be an abridged outline, not the
// full implementation: it only names the two helpers involved (no arguments,
// no statement terminators, no return value) and is not compilable as written.
static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo,
struct dt_object **stripe, __u32 *ost_indices,
int flags, struct thandle *th, int comp_idx,
__u64 reserve)
{
ltd_qos_penalties_calc()
lu_tgt_qos_weight_calc()
}
/*
 * Recompute the QoS weight of one target and store it in
 * tgt->ltd_qos.ltq_weight.
 *
 * The raw figure is the product of tgt_statfs_bavail(tgt) shifted right by 16
 * and tgt_statfs_iavail(tgt) shifted right by 8. The target's own penalty
 * plus the penalty of the server it belongs to is then subtracted; when the
 * penalties exceed the raw figure the weight is set to 0 instead of being
 * allowed to wrap around.
 */
void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt)
{
	struct lu_tgt_qos *qos = &tgt->ltd_qos;
	__u64 capacity;
	__u64 penalty;

	/* tgt carries the capacity-related statistics of the backend target. */
	capacity = (tgt_statfs_bavail(tgt) >> 16) * (tgt_statfs_iavail(tgt) >> 8);
	penalty = qos->ltq_penalty + qos->ltq_svr->lsq_penalty;

	qos->ltq_weight = capacity >= penalty ? capacity - penalty : 0;
}
lod_ost_alloc_rr
决策:遍历所有可用的ost,以轮询的方式选择ost。
// Allocate a striping using round-robin algorithm.
// NOTE(review): the body shown here appears to be an abridged outline, not the
// full implementation: it only names the helper that builds the round-robin
// ordering (no arguments, no statement terminator, no return value) and is not
// compilable as written.
static int lod_ost_alloc_rr(const struct lu_env *env, struct lod_object *lo,
struct dt_object **stripe, __u32 *ost_indices,
int flags, struct thandle *th, int comp_idx,
__u64 reserve)
{
lod_qos_calc_rr()
}
// 每次选择一个ost server,循环选择,里面的bitmap就是记录选择的哪些的ost
static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_descs *ltd,
const struct lu_tgt_pool *src_pool,
struct lu_qos_rr *lqr)
{
struct lu_svr_qos *svr;
struct lu_tgt_desc *tgt;
unsigned placed, real_count;
unsigned int i;
int rc;
ENTRY;
if (!test_bit(LQ_DIRTY, &lqr->lqr_flags)) {
LASSERT(lqr->lqr_pool.op_size);
RETURN(0);
}
/* Do actual allocation. */
down_write(<d->ltd_qos.lq_rw_sem);
if (!test_bit(LQ_DIRTY, &lqr->lqr_flags)) {
LASSERT(lqr->lqr_pool.op_size);
up_write(<d->ltd_qos.lq_rw_sem);
RETURN(0);
}
real_count = src_pool->op_count;
lqr->lqr_pool.op_count = real_count;
rc = lu_tgt_pool_extend(&lqr->lqr_pool, real_count);
if (rc) {
up_write(<d->ltd_qos.lq_rw_sem);
RETURN(rc);
}
for (i = 0; i < lqr->lqr_pool.op_count; i++)
lqr->lqr_pool.op_array[i] = LOV_QOS_EMPTY;
/* Place all the tgts from 1 svr at the same time. */
placed = 0;
list_for_each_entry(svr, <d->ltd_qos.lq_svr_list, lsq_svr_list) {
int j = 0;
for (i = 0; i < lqr->lqr_pool.op_count; i++) {
int next;
if (!test_bit(src_pool->op_array[i],
ltd->ltd_tgt_bitmap))
continue;
tgt = LTD_TGT(ltd, src_pool->op_array[i]);
LASSERT(tgt && tgt->ltd_tgt);
if (tgt->ltd_qos.ltq_svr != svr)
continue;
// 遍历当前所有的,RR方式选择一个作为mds选择的ost,注意一般一个oss管理一个ost,但是有时候可以配置多个ost被一个oss管理
next = j * lqr->lqr_pool.op_count / svr->lsq_tgt_count;
while (lqr->lqr_pool.op_array[next] != LOV_QOS_EMPTY)
next = (next + 1) % lqr->lqr_pool.op_count;
lqr->lqr_pool.op_array[next] = src_pool->op_array[i];
j++;
placed++;
}
}
clear_bit(LQ_DIRTY, &lqr->lqr_flags);
up_write(<d->ltd_qos.lq_rw_sem);
if (placed != real_count) {
/* This should never happen */
LCONSOLE_ERROR_MSG(0x14e, "Failed to place all tgts in the "
"round-robin list (%d of %d).\n",
placed, real_count);
for (i = 0; i < lqr->lqr_pool.op_count; i++) {
LCONSOLE(D_WARNING, "rr #%d tgt idx=%d\n", i,
lqr->lqr_pool.op_array[i]);
}
set_bit(LQ_DIRTY, &lqr->lqr_flags);
RETURN(-EAGAIN);
}
#if 0
for (i = 0; i < lqr->lqr_pool.op_count; i++)
QOS_CONSOLE("rr #%d ost idx=%d\n", i, lqr->lqr_pool.op_array[i]);
#endif
RETURN(0);
}