While load-testing a single bucket with 20 million objects (shard count left at the default of 16), a large omap warning appeared at around the 18-million-object mark. This post walks through locating the error and dealing with it.
The cluster state looks like this:
[root@demo123 cephuser]# ceph health detail
HEALTH_WARN 16 large omap objects
LARGE_OMAP_OBJECTS 16 large omap objects
16 large objects found in pool 'cn-bj-test2.rgw.buckets.index'
Search the cluster log for 'Large omap object found' for more details.
A script is used to find the affected PGs; see the earlier post on handling large omap objects for the script itself (a minimal sketch of the same idea follows the output below).
[root@demo123 cephuser]# python large_omap.py
Large omap objects poolname = cn-bj-test2.rgw.buckets.index
pgid=13.1f OSDs=[78, 9, 59] num_large_omap_objects=1
pgid=13.33 OSDs=[59, 79, 19] num_large_omap_objects=1
pgid=13.3c OSDs=[49, 29, 78] num_large_omap_objects=1
pgid=13.3d OSDs=[48, 69, 9] num_large_omap_objects=1
pgid=13.45 OSDs=[88, 39, 28] num_large_omap_objects=1
pgid=13.4d OSDs=[38, 29, 89] num_large_omap_objects=1
pgid=13.50 OSDs=[68, 19, 59] num_large_omap_objects=1
pgid=13.6b OSDs=[39, 79, 8] num_large_omap_objects=1
pgid=13.8e OSDs=[38, 9, 78] num_large_omap_objects=1
pgid=13.d1 OSDs=[9, 88, 38] num_large_omap_objects=1
pgid=13.d2 OSDs=[59, 88, 28] num_large_omap_objects=1
pgid=13.e1 OSDs=[19, 88, 49] num_large_omap_objects=1
pgid=13.e4 OSDs=[38, 19, 89] num_large_omap_objects=1
pgid=13.e7 OSDs=[19, 89, 38] num_large_omap_objects=1
pgid=13.ec OSDs=[89, 28, 48] num_large_omap_objects=1
pgid=13.f5 OSDs=[38, 88, 19] num_large_omap_objects=1
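The script from the earlier post is not reproduced here, but the idea behind it is simple: num_large_omap_objects is a per-PG counter in the PG statistics, so a helper only needs to walk the output of ceph pg dump and report the PGs where the counter is non-zero. A minimal sketch of such a helper (field names assume a Luminous-style JSON layout; the real large_omap.py may differ):

#!/usr/bin/env python
# Sketch of a large_omap.py-style helper: print every PG whose
# num_large_omap_objects counter is non-zero, grouped by pool.
import json
import subprocess
from collections import defaultdict

def pool_names():
    # Map pool id -> pool name.
    out = subprocess.check_output(
        ["ceph", "osd", "pool", "ls", "detail", "--format", "json"])
    return {p["pool_id"]: p["pool_name"] for p in json.loads(out)}

def main():
    dump = json.loads(subprocess.check_output(
        ["ceph", "pg", "dump", "--format", "json"]))
    # Luminous keeps pg_stats at the top level; newer releases nest it in pg_map.
    pg_stats = dump.get("pg_stats") or dump.get("pg_map", {}).get("pg_stats", [])
    pools = pool_names()
    found = defaultdict(list)
    for pg in pg_stats:
        count = pg["stat_sum"].get("num_large_omap_objects", 0)
        if count > 0:
            pool_id = int(pg["pgid"].split(".")[0])
            found[pools.get(pool_id, pool_id)].append(
                (pg["pgid"], pg["acting"], count))
    for pool, pgs in found.items():
        print("Large omap objects poolname = %s" % pool)
        for pgid, osds, count in pgs:
            print("pgid=%s OSDs=%s num_large_omap_objects=%d" % (pgid, osds, count))

if __name__ == "__main__":
    main()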
Search the OSD logs to determine the object name (".dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.11"); its omap entry count has reached 2378492, which exceeds the default warning threshold.
[root@demo123 cephuser]# zcat /var/log/ceph/ceph-osd.19.log-20181231.gz |grep "omap"
2018-12-30 23:00:42.334766 7f6583f44700 0 log_channel(cluster) log [WRN] : Large omap object found. Object: 13:87443b2d:::.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.11:head Key count: 2378492 Size (bytes): 491722758
The default threshold is 2000000, and 2378492 > 2000000. Raising this default is not recommended: setting it too high only hides the problem while increasing the risk of cluster trouble later.
[root@demo123 cephuser]# ceph daemon /var/run/ceph/ceph-osd.19.asok config show |grep large
"osd_bench_large_size_max_throughput": "104857600",
"osd_deep_scrub_large_omap_object_key_threshold": "2000000",
"osd_deep_scrub_large_omap_object_value_sum_threshold": "1073741824",
Next, look at the bucket whose omap has grown too large and confirm its details:
[root@demo123 cephuser]# radosgw-admin bucket stats --bucket=demo1
{
"bucket": "demo1",
"zonegroup": "68f1dcf5-0470-4a48-8cd2-51c837a2cafb",
"placement_rule": "default-placement",
"explicit_placement": {
"data_pool": "",
"data_extra_pool": "",
"index_pool": ""
},
"id": "afd874cd-f976-4007-a77c-be6fca298b71.34209.1", #当前bucket instance ID,
"marker": "afd874cd-f976-4007-a77c-be6fca298b71.34209.1",
"index_type": "Normal",
"owner": "s3test",
"ver": "0#2638037,1#2637965,2#2632835,3#2632869,4#2632799,5#2632597,6#2633289,7#2633175,8#2637227,9#2637609,10#2637997,11#2632455,12#2631337,13#2631624,14#2631983,15#2632359",
"master_ver": "0#0,1#0,2#0,3#0,4#0,5#0,6#0,7#0,8#0,9#0,10#0,11#0,12#0,13#0,14#0,15#0", #16个shard
"mtime": "2018-11-28 16:47:45.560039",
"max_marker": "0#00002638036.2638608.5,1#00002637964.2638536.5,2#00002632834.2649479.5,3#00002632868.2633634.5,4#00002632798.2633370.5,5#00002632596.2633168.5,6#00002633288.2633860.5,7#00002633174.2633747.5,8#00002637226.2637798.5,9#00002637608.2638181.5,10#00002637996.2638569.5,11#00002632454.2633026.5,12#00002631336.2631914.5,13#00002631623.2632195.5,14#00002631982.2632554.5,15#00002632358.2632930.5",
"usage": {
"rgw.main": {
"size": 1975757355553,
"size_actual": 2047893610496,
"size_utilized": 1975757355553,
"size_kb": 1929450543,
"size_kb_actual": 1999896104,
"size_kb_utilized": 1929450543,
"num_objects": 19998962 #近2000Wobject
}
},
"bucket_quota": {
"enabled": false,
"check_on_raw": false,
"max_size": -1,
"max_size_kb": 0,
"max_objects": -1
}
}
Run a bucket reshard to re-split the bucket index, raising the shard count from 16 to 64. Note that resharding is risky: it is best to stop client reads and writes before doing it, and if you use multisite, disable the dynamic resharding feature (rgw_dynamic_resharding = false) right away, as the official documentation explains.
Dynamic resharding documentation: http://docs.ceph.com/docs/mimic/radosgw/dynamicresharding/
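Why 64 shards? A rough back-of-the-envelope estimate (assuming the reshard spreads the existing keys evenly across the new shards) shows that quadrupling the shard count brings even the worst shard well below the 2000000-key threshold:

# Rough estimate only: assume the ~2.38M keys on the worst shard are spread
# four times thinner when going from 16 to 64 shards.
worst_shard_keys = 2378492  # from the OSD log above
threshold = 2000000         # osd_deep_scrub_large_omap_object_key_threshold
estimated = worst_shard_keys * 16 // 64
print(estimated, estimated < threshold)  # 594623 True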
After the reshard, the old index objects have to be removed manually; the tool itself prints a notice to that effect, as shown below.
[root@demo123 cephuser]# radosgw-admin bucket reshard --bucket demo1 --num-shards 64
*** NOTICE: operation will not remove old bucket index objects ***
*** these will need to be removed manually ***
tenant:
bucket name: demo1
old bucket instance id: afd874cd-f976-4007-a77c-be6fca298b71.34209.1
new bucket instance id: afd874cd-f976-4007-a77c-be6fca298b71.45786.1
total entries: 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 19998962
2019-01-03 11:42:33.741314 7f74d15c6dc0 0 WARNING: RGWReshard::add failed to drop lock on demo1:afd874cd-f976-4007-a77c-be6fca298b71.34209.1 ret=-2
Check the result of the reshard:
[root@demo123 cephuser]# radosgw-admin bucket stats --bucket=demo1
{
"bucket": "demo1",
"zonegroup": "68f1dcf5-0470-4a48-8cd2-51c837a2cafb",
"placement_rule": "default-placement",
"explicit_placement": {
"data_pool": "",
"data_extra_pool": "",
"index_pool": ""
},
"id": "afd874cd-f976-4007-a77c-be6fca298b71.45786.1", #bucket instance ID发生变化
"marker": "afd874cd-f976-4007-a77c-be6fca298b71.34209.1",
"index_type": "Normal",
"owner": "s3test",
"ver": "0#4920,1#4920,2#4883,3#4877,4#4882,5#4883,6#4885,7#4880,8#4882,9#4880,10#4878,11#4883,12#4923,13#4883,14#4882,15#4874,16#4878,17#4880,18#4884,19#4881,20#4882,21#4881,22#4876,23#4922,24#4883,25#4887,26#4881,27#4879,28#4879,29#4879,30#4882,31#4884,32#4880,33#4879,34#4917,35#4876,36#4883,37#4885,38#4884,39#4879,40#4883,41#4880,42#4880,43#4882,44#4884,45#4877,46#4879,47#4877,48#4881,49#4880,50#4881,51#4881,52#4883,53#4876,54#4880,55#4884,56#4881,57#4885,58#4882,59#4881,60#4881,61#4881,62#4883,63#4882",#shard 数量变为了64
"master_ver": "0#0,1#0,2#0,3#0,4#0,5#0,6#0,7#0,8#0,9#0,10#0,11#0,12#0,13#0,14#0,15#0,16#0,17#0,18#0,19#0,20#0,21#0,22#0,23#0,24#0,25#0,26#0,27#0,28#0,29#0,30#0,31#0,32#0,33#0,34#0,35#0,36#0,37#0,38#0,39#0,40#0,41#0,42#0,43#0,44#0,45#0,46#0,47#0,48#0,49#0,50#0,51#0,52#0,53#0,54#0,55#0,56#0,57#0,58#0,59#0,60#0,61#0,62#0,63#0",
"mtime": "2019-01-03 11:32:50.349905",
"max_marker": "0#,1#,2#,3#,4#,5#,6#,7#,8#,9#,10#,11#,12#,13#,14#,15#,16#,17#,18#,19#,20#,21#,22#,23#,24#,25#,26#,27#,28#,29#,30#,31#,32#,33#,34#,35#,36#,37#,38#,39#,40#,41#,42#,43#,44#,45#,46#,47#,48#,49#,50#,51#,52#,53#,54#,55#,56#,57#,58#,59#,60#,61#,62#,63#",
"usage": {
"rgw.main": {
"size": 1975757355553,
"size_actual": 2047893610496,
"size_utilized": 1975757355553,
"size_kb": 1929450543,
"size_kb_actual": 1999896104,
"size_kb_utilized": 1929450543,
"num_objects": 19998962
}
},
"bucket_quota": {
"enabled": false,
"check_on_raw": false,
"max_size": -1,
"max_size_kb": 0,
"max_objects": -1
}
}
Following the notice printed by the reshard tool, the residual data left in both the index pool and the meta pool now needs to be reclaimed.
Clean up the residual data in the index pool:
[root@demo123 cephuser]# rados ls -p cn-bj-test2.rgw.buckets.index|grep "afd874cd-f976-4007-a77c-be6fca298b71.34209.1"
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.5
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.15
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.2
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.1
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.0
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.4
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.11
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.13
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.6
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.3
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.7
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.9
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.14
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.10
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.12
.dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.8
Delete the stale index objects with the rados rm command:
[root@demo123 supdev]# rados ls -p cn-bj-test2.rgw.buckets.index|grep "afd874cd-f976-4007-a77c-be6fca298b71.34209.1"|awk '{print "rados rm -p cn-bj-test2.rgw.buckets.index "$1}'|sh -x
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.5
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.15
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.2
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.1
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.0
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.4
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.11
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.13
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.6
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.3
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.7
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.9
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.14
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.10
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.12
+ rados rm -p cn-bj-test2.rgw.buckets.index .dir.afd874cd-f976-4007-a77c-be6fca298b71.34209.1.8
Then clean up the residual data in the meta pool:
[root@demo123 cephuser]# rados ls -p cn-bj-test2.rgw.meta --all
root demo1
root .bucket.meta.demo1:afd874cd-f976-4007-a77c-be6fca298b71.45786.1
root .bucket.meta.demo1:afd874cd-f976-4007-a77c-be6fca298b71.34209.1 # leftover
root my-new-container_segments
root .bucket.meta.demo2:afd874cd-f976-4007-a77c-be6fca298b71.34353.1
root .bucket.meta.my-new-container:afd874cd-f976-4007-a77c-be6fca298b71.7991.1
users.uid s3test.buckets
users.uid swiftuser
users.swift swiftuser:swiftuser1
users.keys SNACA4LX9DS21NGMSRX4
root .bucket.meta.my-new-container_segments:afd874cd-f976-4007-a77c-be6fca298b71.7991.4
users.uid s3test
root demo2
users.keys XP8E2452AB6EBU3RPD0C
root my-new-container
users.uid swiftuser.buckets
users.uid synchronization-user
Note that this cluster runs the Ceph Luminous (L) release, which stores these metadata objects in RADOS namespaces, so the namespace has to be specified for the delete to succeed.
[root@demo123 cephuser]# rados rm -p cn-bj-test2.rgw.meta .bucket.meta.demo1:afd874cd-f976-4007-a77c-be6fca298b71.34209.1 --namespace=root
[root@demo123 cephuser]# rados ls -p cn-bj-test2.rgw.meta --all
root demo1
root .bucket.meta.demo1:afd874cd-f976-4007-a77c-be6fca298b71.45786.1
root my-new-container_segments
root .bucket.meta.demo2:afd874cd-f976-4007-a77c-be6fca298b71.34353.1
root .bucket.meta.my-new-container:afd874cd-f976-4007-a77c-be6fca298b71.7991.1
users.uid s3test.buckets
users.uid swiftuser
users.swift swiftuser:swiftuser1
users.keys SNACA4LX9DS21NGMSRX4
root .bucket.meta.my-new-container_segments:afd874cd-f976-4007-a77c-be6fca298b71.7991.4
users.uid s3test
root demo2
users.keys XP8E2452AB6EBU3RPD0C
root my-new-container
users.uid swiftuser.buckets
users.uid synchronization-user
Deleting the objects does not clear the warning by itself: the num_large_omap_objects statistics are only refreshed during a deep scrub, so each affected PG has to be deep-scrubbed manually, as follows (a loop covering all of the affected PGs is sketched after the single-PG example):
[root@demo123 cephuser]# python large_omap.py
Large omap objects poolname = cn-bj-test2.rgw.buckets.index
pgid=13.33 OSDs=[59, 79, 19] num_large_omap_objects=1
pgid=13.3c OSDs=[49, 29, 78] num_large_omap_objects=1
pgid=13.3d OSDs=[48, 69, 9] num_large_omap_objects=1
pgid=13.45 OSDs=[88, 39, 28] num_large_omap_objects=1
pgid=13.4d OSDs=[38, 29, 89] num_large_omap_objects=1
pgid=13.50 OSDs=[68, 19, 59] num_large_omap_objects=1
pgid=13.6b OSDs=[39, 79, 8] num_large_omap_objects=1
pgid=13.8e OSDs=[38, 9, 78] num_large_omap_objects=1
pgid=13.d1 OSDs=[9, 88, 38] num_large_omap_objects=1
pgid=13.d2 OSDs=[59, 88, 28] num_large_omap_objects=1
pgid=13.e1 OSDs=[19, 88, 49] num_large_omap_objects=1
pgid=13.e4 OSDs=[38, 19, 89] num_large_omap_objects=1
pgid=13.e7 OSDs=[19, 89, 38] num_large_omap_objects=1
pgid=13.ec OSDs=[89, 28, 48] num_large_omap_objects=1
pgid=13.f5 OSDs=[38, 88, 19] num_large_omap_objects=1
[root@demo123 cephuser]# ceph pg deep-scrub 13.33
instructing pg 13.33 on osd.59 to deep-scrub
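Repeating this command by hand for every remaining PG gets tedious; a small loop over the script's output can kick them all off in one go (this assumes large_omap.py prints pgid=... lines exactly as shown above):

# Deep-scrub every PG that still reports a large omap object, based on the
# pgid=... lines printed by large_omap.py.
import subprocess
out = subprocess.check_output(["python", "large_omap.py"]).decode()
for line in out.splitlines():
    if line.startswith("pgid="):
        pgid = line.split()[0].split("=", 1)[1]
        subprocess.check_call(["ceph", "pg", "deep-scrub", pgid])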
Once the deep scrubs are issued, a PG can be seen in the deep-scrub state, and the status then recovers:
[root@demo123 cephuser]# ceph -s
cluster:
id: 21cc0dcd-06f3-4d5d-82c2-dbd411ef0ed9
health: HEALTH_WARN
16 large omap objects
services:
mon: 3 daemons, quorum demo122,demo131,demo141
mgr: demo141(active)
osd: 90 osds: 90 up, 90 in
rgw: 1 daemon active
data:
pools: 7 pools, 3712 pgs
objects: 20.13M objects, 1.80TiB
usage: 7.28TiB used, 408TiB / 415TiB avail
pgs: 3711 active+clean
1 active+clean+scrubbing+deep # deep scrub in progress
io:
client: 5.29MiB/s rd, 935B/s wr, 69op/s rd, 28op/s wr
[root@demo123 cephuser]# ceph -s
cluster:
id: 21cc0dcd-06f3-4d5d-82c2-dbd411ef0ed9
health: HEALTH_WARN
15 large omap objects # one fewer than before
services:
mon: 3 daemons, quorum demo122,demo131,demo141
mgr: demo141(active)
osd: 90 osds: 90 up, 90 in
rgw: 1 daemon active
data:
pools: 7 pools, 3712 pgs
objects: 20.13M objects, 1.80TiB
usage: 7.28TiB used, 408TiB / 415TiB avail
pgs: 3712 active+clean
io:
client: 5.33MiB/s rd, 680B/s wr, 36op/s rd, 6op/s wr
omap warnings on the index pool generally fall into two categories: