1
bucket index是整个RGW里面一个非常关键的数据结构,用于存储bucket的索引数据,默认情况下单个bucket的index全部存储在一个shard文件(shard数量为0,主要以OMAP-keys方式存储在leveldb中),随着单个bucket内的Object数量增加,整个shard文件的体积也在不断增长,当shard文件体积过大就会引发各种问题,常见的问题有:
本文重点介绍一种合理优化单个bucket的shard文件体积的方法。目前受限于RGW的index架构设计,shard问题只能优化、无法根治(当然你也可以使用Indexless bucket)。
indexless bucket介绍和使用可以参考下面内容 http://www.ksingh.co.in/blog/2017/01/30/ceph-indexless-bucket-part-2/
big index问题介绍可以参考下文
http://cephnotes.ksperis.com/blog/2015/05/12/radosgw-big-index
2
下面这些都是经验之谈,仅供各位参考:
3
root@demo:/home/user# ceph df
GLOBAL:
SIZE AVAIL RAW USED %RAW USED
92114M 88218M 3895M 4.23
POOLS:
NAME ID USED %USED MAX AVAIL OBJECTS
rbd 0 131 0 87674M 2
.rgw.root 16 324 0 87674M 2
.zone.rgw.root 17 1279 0 87674M 2
.zone.rgw.domain 18 1480 0 87674M 8
.zone.rgw.control 19 0 0 87674M 8
.zone.rgw.gc 20 0 0 87674M 32
.zone.rgw.buckets.index 21 0 0 87674M 32 #index pool
.zone.rgw.buckets.extra 22 0 0 87674M 54
.zone.rgw.buckets 23 768M 0.83 87674M 254
.zone.log 24 0 0 87674M 11
.zone.intent-log 25 0 0 87674M 0
.zone.usage 26 0 0 87674M 2
.zone.users 27 34 0 87674M 3
.zone.users.email 28 0 0 87674M 0
.zone.users.swift 29 0 0 87674M 0
.zone.users.uid 30 1013 0 87674M 5
shard参数介绍
rgw_override_bucket_index_max_shards #用于单机模式
bucket_index_max_shards #用于集群模式
注意默认0表示不做shard切分,最大设置为7877,该参数在集群初始化阶段设置,并且需要重启所有rgw服务才生效,特别注意的是非特殊情况不要在已经上线的生产系统进行调整。
下面以集群模式为例
获取集群模式下的shard设置
root@demo:/home/user# radosgw-admin region get --name client.radosgw.zone1
{
"name": "zone",
"api_name": "zone",
"is_master": "true",
"endpoints": [
"http:\/\/demo.ceph.work:80\/"
],
"hostnames": [],
"master_zone": "zone1",
"zones": [
{
"name": "zone1",
"endpoints": [
"http:\/\/demo.ceph.work:80\/"
],
"log_meta": "true",
"log_data": "true",
"bucket_index_max_shards": 8 #shard数为8
}
],
"placement_targets": [
{
"name": "default-placement",
"tags": []
}
],
"default_placement": "default-placement"
}
获取单机模式下的shard设置
root@demo:/home/user# ceph --admin-daemon /home/ceph/var/run/ceph-client.radosgw.zone1.asok config show|grep rgw_override_bucket_index_max_shards
"rgw_override_bucket_index_max_shards": "0",
查看bucket列表
root@demo:/home/user# radosgw-admin bucket list --name client.radosgw.zone1
[
"multi-upload",
"demo-abc",
"test1",
"user-bucket1"
]
获取multi-upload这个bucket 的ID
root@demo:/home/user# radosgw-admin bucket stats --bucket=multi-upload --name client.radosgw.zone1
{
"bucket": "multi-upload",
"pool": ".zone.rgw.buckets",
"index_pool": ".zone.rgw.buckets.index",
"id": "zone1.14214.10", #bucket ID
"marker": "zone1.14214.10",
"owner": "u-user",
"ver": "0#1,1#3,2#345,3#1,4#1,5#1,6#681,7#5",
"master_ver": "0#0,1#0,2#0,3#0,4#0,5#0,6#0,7#0",
"mtime": "2017-05-05 10:23:12.000000",
"max_marker": "0#,1#00000000002.20.3,2#00000000344.367.3,3#,4#,5#,6#00000000680.711.3,7#00000000004.23.3",
"usage": {
"rgw.main": {
"size_kb": 724947,
"size_kb_actual": 725308,
"num_objects": 114
},
"rgw.multimeta": {
"size_kb": 0,
"size_kb_actual": 0,
"num_objects": 51
}
},
"bucket_quota": {
"enabled": false,
"max_size_kb": -1,
"max_objects": -1
}
}
获取multi-upload这个bucket的shard文件对应的Object列表 上面查到shard为8,所以下面有编号为0~7的几个文件。
root@demo:/home/user# rados ls -p .zone.rgw.buckets.index |grep "zone1.14214.10"
.dir.zone1.14214.10.5
.dir.zone1.14214.10.1
.dir.zone1.14214.10.7
.dir.zone1.14214.10.6
.dir.zone1.14214.10.4
.dir.zone1.14214.10.3
.dir.zone1.14214.10.0
.dir.zone1.14214.10.2
统计该bucket所有shard的omapkeys条目数,总共1329个。每条omapkey预计占用约200 bytes的存储空间,因此1329个omapkeys总共占用的磁盘空间约为 1329*200 = 265800 bytes。
root@demo:/home/ceph/var/lib/osd/ceph-2# rados ls -p .zone.rgw.buckets.index |grep "zone1.14214.10"|awk '{print "rados listomapkeys -p .zone.rgw.buckets.index "$1 }'|sh -x|wc -l
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.5
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.1
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.7
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.6
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.4
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.3
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.0
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.2
1329
查看每个bucket shard文件的omapkeys条目数。以".dir.zone1.14214.10.6"为例,926*200 = 185200 bytes ≈ 180KB,满足需求。
root@demo:/home/ceph/var/lib/osd/ceph-2# rados ls -p .zone.rgw.buckets.index |grep "zone1.14214.10"|awk '{print "rados listomapkeys -p .zone.rgw.buckets.index "$1 " |wc -l"}'|sh -x
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.5
+ wc -l
0
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.1
+ wc -l
3
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.7
+ wc -l
5
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.6
+ wc -l
926
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.4
+ wc -l
0
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.3
+ wc -l
0
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.0
+ wc -l
0
+ rados listomapkeys -p .zone.rgw.buckets.index .dir.zone1.14214.10.2
+ wc -l
395
细心的各位会发现上面的shard分布极不均匀,那是由于每个Object存储的时候查找对应的shard用的是hash(Object_name)再取余的方式,具体参考下面代码,如果Object名称非常接近,就容易造成bucket shard分布不够均匀的情况。
// Quoted from Ceph RGWRados: maps an object key to the RADOS object that holds
// its bucket index shard (".dir.<bucket_id>.<sid>").
//   bucket_oid_base - base oid of the bucket index object (".dir.<bucket_id>")
//   obj_key         - name of the object being indexed
//   num_shards      - configured shard count; 0 means no sharding
//   hash_type       - shard-selection scheme; only MOD is implemented here
//   bucket_obj      - out: oid of the shard object to use
//   shard_id        - out (optional): numeric shard id, or -1 when unsharded
// Returns 0 on success, -ENOTSUP for an unknown hash_type.
int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
{
int r = 0;
switch (hash_type) {
case RGWBucketInfo::MOD:
if (!num_shards) {
// By default with no sharding, we use the bucket oid as itself
(*bucket_obj) = bucket_oid_base;
if (shard_id) {
// -1 signals "unsharded" to the caller
*shard_id = -1;
}
} else {
// Hash the full object name; similar names may still hash close together,
// which is why shard distribution can be uneven (see discussion above).
uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
// Fold the low byte into the high bits to spread the hash further.
uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
// Double modulo: first by the prime MAX_BUCKET_INDEX_SHARDS_PRIME,
// then by the configured shard count, yielding the shard index.
sid = sid2 % MAX_BUCKET_INDEX_SHARDS_PRIME % num_shards;
// Shard oid is "<base>.<sid>", e.g. ".dir.zone1.14214.10.6".
char buf[bucket_oid_base.size() + 32];
snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
(*bucket_obj) = buf;
if (shard_id) {
*shard_id = (int)sid;
}
}
break;
default:
// Any hash type other than MOD is not supported.
r = -ENOTSUP;
}
return r;
}