故事背景
说在前面:
使用mlogfilter过滤文件时报错说文件非mongodb的日志文件
[root@sh_01 booboo]# mlogfilter shard.log.2018-08-01 --from 2018-08-01T15:00:00.000+0800 --to "+1h" --slow 1000 > /alidata/booboo/tf.1
报错:非mongodb日志格式
回到客户服务器检查日志文件格式,明细如下:
[root@MONGO-SHARD-18 logs]# file *
shard.log: PCX ver. 2.5 image data
shard.log.2018-07-25: PCX ver. 2.5 image data
shard.log.2018-07-26: PCX ver. 2.5 image data
shard.log.2018-07-27: PCX ver. 2.5 image data
shard.log.2018-07-28: PCX ver. 2.5 image data
shard.log.2018-07-29: PCX ver. 2.5 image data
shard.log.2018-07-30: PCX ver. 2.5 image data
shard.log.2018-07-31: PCX ver. 2.5 image data
shard.log.2018-08-01: PCX ver. 2.5 image data
mongodb的日志正常应该为:ASCII text, with very long lines,但是现在却变成了PCX ver. 2.5 image data。需要弄清楚原因。
怀疑与echo >有关,进行验证。
到此验证成功:通过echo > log的方式会往文件头部新增’0a’
在测试环境中复现日志从ASCII text变成PCX ver. 2.5 image data
[root@sh_01 ~]# mongo
MongoDB shell version: 3.2.16
connecting to: test
> db.auth('test_dev','uplooking')
1
> db.t1.find()
{ "_id" : ObjectId("5b5ebb6796b8b74a73ee30f6"), "a" : 1, "b" : 2 }
{ "_id" : ObjectId("5b5ebb6996b8b74a73ee30f7"), "a" : 1, "b" : 1 }
> for (i=1;i<200000;i++){
... db.t1.insert({id:i})}
WriteResult({ "nInserted" : 1 })
[root@sh_01 ~]# cd /alidata/mongodb/logs
[root@sh_01 log]# ll
-rw-r--r-- 1 root root 2182 Aug 3 10:28 mongod27017.log
-rw-r--r-- 1 root root 3100 Aug 2 14:19 mongod27017.log.2018-08-02T07-18-27
-rw-r--r-- 1 root root 1526 Aug 2 15:18 mongod27017.log.2018-08-02T07-18-54
[root@sh_01 log]# file *
mongod27017.log: ASCII text, with very long lines
mongod27017.log.2018-08-02T07-18-27: ASCII text, with very long lines
mongod27017.log.2018-08-02T07-18-54: ASCII text, with very long lines
[root@sh_01 log]# echo > mongod27017.log
[root@sh_01 log]# echo > mongod27017.log
[root@sh_01 log]# ll
total 12
-rw-r--r-- 1 root root 1 Aug 3 10:29 mongod27017.log
-rw-r--r-- 1 root root 3100 Aug 2 14:19 mongod27017.log.2018-08-02T07-18-27
-rw-r--r-- 1 root root 1526 Aug 2 15:18 mongod27017.log.2018-08-02T07-18-54
[root@sh_01 log]# file *
mongod27017.log: very short file (no magic)
mongod27017.log.2018-08-02T07-18-27: ASCII text, with very long lines
mongod27017.log.2018-08-02T07-18-54: ASCII text, with very long lines
[root@sh_01 log]# tail -f mongod27017.log
2018-08-03T10:30:21.936+0800 I NETWORK [initandlisten] connection accepted from 127.0.0.1:43720 #2 (2 connections now open)
[root@sh_01 log]# ll -h
total 12K
-rw-r--r-- 1 root root 2.8K Aug 3 10:30 mongod27017.log
-rw-r--r-- 1 root root 3.1K Aug 2 14:19 mongod27017.log.2018-08-02T07-18-27
-rw-r--r-- 1 root root 1.5K Aug 2 15:18 mongod27017.log.2018-08-02T07-18-54
[root@sh_01 log]# file *
mongod27017.log: PCX ver. 2.5 image data
mongod27017.log.2018-08-02T07-18-27: ASCII text, with very long lines
mongod27017.log.2018-08-02T07-18-54: ASCII text, with very long lines
# 一边不断产生日志,一边多次执行echo>
[root@sh_01 log]# echo > mongod27017.log
[root@sh_01 log]# echo > mongod27017.log
[root@sh_01 log]# file *
mongod27017.log: PCX ver. 2.5 image data
mongod27017.log.2018-08-02T07-18-27: ASCII text, with very long lines
mongod27017.log.2018-08-02T07-18-54: ASCII text, with very long lines
mongod27017.log.2018-08-03T03-09-08: PCX ver. 2.5 image data
[root@sh_01 log]# echo > mongod27017.log
[root@sh_01 log]# file *
mongod27017.log: very short file (no magic)
mongod27017.log.2018-08-02T07-18-27: ASCII text, with very long lines
mongod27017.log.2018-08-02T07-18-54: ASCII text, with very long lines
mongod27017.log.2018-08-03T03-09-08: PCX ver. 2.5 image data
[root@sh_01 log]# ll -h
total 16K
-rw-r--r-- 1 root root 1 Aug 3 11:28 mongod27017.log
-rw-r--r-- 1 root root 3.1K Aug 2 14:19 mongod27017.log.2018-08-02T07-18-27
-rw-r--r-- 1 root root 1.5K Aug 2 15:18 mongod27017.log.2018-08-02T07-18-54
-rw-r--r-- 1 root root 2.9K Aug 3 11:05 mongod27017.log.2018-08-03T03-09-08
[root@sh_01 log]# file *
mongod27017.log: very short file (no magic)
mongod27017.log.2018-08-02T07-18-27: ASCII text, with very long lines
mongod27017.log.2018-08-02T07-18-54: ASCII text, with very long lines
mongod27017.log.2018-08-03T03-09-08: PCX ver. 2.5 image data
[root@sh_01 log]# file *
mongod27017.log: PCX ver. 2.5 image data
mongod27017.log.2018-08-02T07-18-27: ASCII text, with very long lines
mongod27017.log.2018-08-02T07-18-54: ASCII text, with very long lines
mongod27017.log.2018-08-03T03-09-08: PCX ver. 2.5 image data
[root@sh_01 log]# hexdump -c mongod27017.log
0000000 \n \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0
0000010 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0
*
0000850 \0 \0 \0 \0 \0 \0 \0 2 0 1 8 - 0 8 - 0
0000860 3 T 1 1 : 2 9 : 0 6 . 7 2 9 + 0
0000870 8 0 0 I A C C E S S [
0000880 c o n n 3 ] U n a u t h o r i
0000890 z e d : n o t a u t h o r i
00008a0 z e d o n t e s t t o e
00008b0 x e c u t e c o m m a n d {
00008c0 f i n d : " t 1 " , f i l
00008d0 t e r : { } } \n
成功复现了客户的情况:
第一次echo > log,文件头部新增\n
多次echo > log,文件头部如下:
0000000 \n \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0
0000010 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0
*
成功复现了客户的情况:
0000000 \n \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0
0000010 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0 \0
*
确实是mongodb日志轮询出了问题:
[root@sh_01 log]# pidof mongod
1828
[root@sh_01 log]# kill -SIGUSER1 1828
-bash: kill: SIGUSER1: invalid signal specification
[root@sh_01 log]# kill -SIGUSR1 1828
[root@sh_01 log]# ll
total 16
-rw-r--r-- 1 root root 1526 Aug 3 11:09 mongod27017.log
-rw-r--r-- 1 root root 3100 Aug 2 14:19 mongod27017.log.2018-08-02T07-18-27
-rw-r--r-- 1 root root 1526 Aug 2 15:18 mongod27017.log.2018-08-02T07-18-54
-rw-r--r-- 1 root root 2942 Aug 3 11:05 mongod27017.log.2018-08-03T03-09-08
[root@sh_01 log]# file *
mongod27017.log: ASCII text, with very long lines
mongod27017.log.2018-08-02T07-18-27: ASCII text, with very long lines
mongod27017.log.2018-08-02T07-18-54: ASCII text, with very long lines
mongod27017.log.2018-08-03T03-09-08: PCX ver. 2.5 image data
修改日志轮询脚本:
[root@MONGO-SHARD-18 logs]# cat /etc/init.d/mongo_logspit.sh
#!/bin/bash
# 2018/08/02 Apple
#Rotate the MongoDB logs to prevent a single logfile from consuming too much disk space.
cmd=mongod
mongodpath=/opt/mongodb/bin
pidarray=`pidof ${mongodpath}/$cmd`
LOGPATH_SHARD=/data/mongodb/shard1/logs
for pid in $pidarray;do
if [ $pid ]
then
kill -SIGUSR1 $pid
fi
done
#clear logfile more than 7 days
cd $LOGPATH_SHARD
find ./ -xdev -mtime +7 -name "shard.log.*" -exec rm -f {} \;
所有的分片都去执行:
kill -SIGUSR1 pidof mongod
分片15操作如下:其他分片同样
[zyadmin@MONGO-SHARD-15 ~]$ sudo -i
[root@MONGO-SHARD-15 ~]# cd /data/mongodb/shard1/logs/
[root@MONGO-SHARD-15 logs]# ll -h
total 1.7G
-rw-r--r-- 1 root root 20G Aug 2 15:27 shard.log
-rw-r--r-- 1 root root 19G Jul 25 23:50 shard.log.2018-07-25
-rw-r--r-- 1 root root 19G Jul 26 23:50 shard.log.2018-07-26
-rw-r--r-- 1 root root 19G Jul 27 23:50 shard.log.2018-07-27
-rw-r--r-- 1 root root 20G Jul 28 23:50 shard.log.2018-07-28
-rw-r--r-- 1 root root 20G Jul 29 23:50 shard.log.2018-07-29
-rw-r--r-- 1 root root 20G Jul 30 23:50 shard.log.2018-07-30
-rw-r--r-- 1 root root 20G Jul 31 23:50 shard.log.2018-07-31
-rw-r--r-- 1 root root 20G Aug 1 23:50 shard.log.2018-08-01
[root@MONGO-SHARD-15 logs]# file *
shard.log: PCX ver. 2.5 image data
shard.log.2018-07-25: PCX ver. 2.5 image data
shard.log.2018-07-26: PCX ver. 2.5 image data
shard.log.2018-07-27: PCX ver. 2.5 image data
shard.log.2018-07-28: PCX ver. 2.5 image data
shard.log.2018-07-29: PCX ver. 2.5 image data
shard.log.2018-07-30: PCX ver. 2.5 image data
shard.log.2018-07-31: PCX ver. 2.5 image data
shard.log.2018-08-01: PCX ver. 2.5 image data
[root@MONGO-SHARD-15 logs]# kill -SIGUSR1 `pidof mongod`
[root@MONGO-SHARD-15 logs]# file *
shard.log: ASCII text, with very long lines
shard.log.2018-07-25: PCX ver. 2.5 image data
shard.log.2018-07-26: PCX ver. 2.5 image data
shard.log.2018-07-27: PCX ver. 2.5 image data
shard.log.2018-07-28: PCX ver. 2.5 image data
shard.log.2018-07-29: PCX ver. 2.5 image data
shard.log.2018-07-30: PCX ver. 2.5 image data
shard.log.2018-07-31: PCX ver. 2.5 image data
shard.log.2018-08-01: PCX ver. 2.5 image data
shard.log.2018-08-02T07-27-29: PCX ver. 2.5 image data
[root@MONGO-SHARD-15 logs]# ll -h
total 1.7G
-rw-r--r-- 1 root root 2.0K Aug 2 15:27 shard.log
-rw-r--r-- 1 root root 19G Jul 25 23:50 shard.log.2018-07-25
-rw-r--r-- 1 root root 19G Jul 26 23:50 shard.log.2018-07-26
-rw-r--r-- 1 root root 19G Jul 27 23:50 shard.log.2018-07-27
-rw-r--r-- 1 root root 20G Jul 28 23:50 shard.log.2018-07-28
-rw-r--r-- 1 root root 20G Jul 29 23:50 shard.log.2018-07-29
-rw-r--r-- 1 root root 20G Jul 30 23:50 shard.log.2018-07-30
-rw-r--r-- 1 root root 20G Jul 31 23:50 shard.log.2018-07-31
-rw-r--r-- 1 root root 20G Aug 1 23:50 shard.log.2018-08-01
-rw-r--r-- 1 root root 20G Aug 2 15:27 shard.log.2018-08-02T07-27-29
弄清楚日志变更的原因以及复现过程后,不难发现,日志因为头部变化从而导致文件格式变更。因此推测异常日志的组成如下:
因此修复的思路如下:
日志轮询的部署距离现在大概4个月
xaa: PCX ver. 2.5 image data
xab: PCX ver. 2.5 image data
xac: PCX ver. 2.5 image data
xad: PCX ver. 2.5 image data
xaa: PCX ver. 2.5 image data
此处省略。。。
xdo: PCX ver. 2.5 image data
xdp: ASCII text, with very long lines
xdq: ASCII text, with very long lines
xdr: ASCII text, with very long lines
xds: ASCII text, with very long lines
xdt: ASCII text, with very long lines
xdu: ASCII text, with very long lines
xdv: UTF-8 Unicode text, with very long lines
xdw: ASCII text, with very long lines
xdx: ASCII text, with very long lines
xdy: ASCII text, with very long lines
[root@sh_01 mongolog_20180801]# head -n 2 xdv
re: "x86_64", version: "Kernel 3.10.0-693.2.2.el7.x86_64" } }
2018-08-01T16:13:44.958+0800 I ACCESS [conn4972925] Successfully authenticated as principal __system on local
[root@sh_01 mongolog_20180801]# head -n 2 xdu
38422 #4967772 (445 connections now open)
2018-08-01T14:00:10.575+0800 I NETWORK [thread1] connection accepted from 172.16.0.44:38430 #4967773 (446 connections now open)
# 获取08月01号下午3点开始到4点执行时间超过5秒的查询
mlogfilter mongolog.18.14_16 --from 2018-08-01T15 --to "+1h" --slow 5000 > slowlog.txt
# 获取08月01号下午3点开始到4点语句的执行次数、用时等统计信息
mloginfo slowlog.txt --queries > an_slowlog.txt
# 通过mplotqueries进行慢查询散点分布图绘制,且只返回前10个
mplotqueries slowlog.txt --output-file 01.png --logscale --group-limit 10
[root@sh_01 booboo]# mplotqueries slowlog.txt --output-file 01.png --logscale --group-limit 10
SCATTER plot
id #points group
1 692 order.order
2 615 omdmain.item_region_erp
3 1 omdmain.customer
()
echo > log
会往文件头部插入\n
即16进制的0a
echo > log
是无法进行覆盖的,会将所有的数据全部置为0
kill -SIGUSER1 [mongodpid]
该case花了一整天,从怀疑被攻击到确认是日志轮询引起文件格式变更是一个关键转折点;
另外PCX格式是第一次碰到,疑惑了半天~最后是@培尧发现了echo的端倪,@衾袭@大宝去验证最终确认了问题的根源。
来源:https://www.toberoot.com/,作者:BooBooWei 转自:高效运维公众号