|
fastdfs基本的服务有nginx,fdfs_storaged,fdfs_trackerd服务,那么如何对这些进程进行监控并保证其正常工作就很重要了。首先可以接入zabbix,检查机器的进程的状态以及端口号,如果进程down掉会发出报警。同时,还可以部署monit和自己编写监控脚本来保证服务进程是正常工作的。
第一步:部署monit
monit是一款功能非常丰富的进程、文件、目录和设备的检测软件,适用于Linux/Unix平台。它可以自动修复那些已经停止运作的程序。
在centos7上进行部署。
安装monit:
yum -y install monit
增加文件/lib/systemd/system/fdfs_storaged.service:
[Unit]
Description=FastDFS Storage Server
[Service]
Type=forking
PIDFile=/srv/node/storage/data/fdfs_storaged.pid
ExecStart=/etc/init.d/fdfs_storaged start
ExecReload=/etc/init.d/fdfs_storaged restart
ExecStop=/etc/init.d/fdfs_storaged stop
PrivateTmp=true
[Install]
WantedBy=multi-user.target
在/etc/monit.d/*目录下面编辑需要监控的进程的文件,下面以fdfs_storaged来进行示范:
/etc/monit.d/fdfs_storaged:
check process fdfs_storaged with pidfile /srv/node/storage/data/fdfs_storaged.pid
start program "/usr/bin/systemctl start fdfs_storaged.service"
stop program "/usr/bin/systemctl stop fdfs_storaged.service"
if failed host localhost port 9092 then restart
根据pid文件来检查进程状态。如果进程端口9092失效那么重启进程。
启动monit监控服务:
systemctl start monit.service
第二步:解决进程无法正常提供服务问题
还是以fdfs_storaged进程为例,能够正常的将文件上传到本地服务说明服务正常,如果上传失败,说明服务不可用,那么重启服务。具体步骤:
>先上传一个文件到当前服务器(指定storage ip:port)
>如果上传成功则继续;如果上传不成功,那么报警出来,并且重启fdfs_storaged服务。如果进程hang住了,是没有办法stop或者restart的,需用kill命令强制将其杀死。
为测试需要,我们需要了解一下如何将一个进程hang住:
kill -SIGSTOP pid:停止进程
kill -SIGCONT pid:继续运行被停住的进程
hang住fdfs_storaged进程:ps -ef | grep /usr/bin/fdfs_storaged | grep -v grep | awk '{print $2}' | xargs kill -SIGSTOP
恢复fdfs_storaged进程(让被停住的进程继续运行): ps -ef | grep /usr/bin/fdfs_storaged | grep -v grep | awk '{print $2}' | xargs kill -SIGCONT
hang住nginx进程:ps -ef | grep nginx | grep process | grep -v grep | awk '{print $2}' | xargs kill -SIGSTOP
恢复nginx进程(让被停住的进程继续运行):ps -ef | grep nginx | grep process | grep -v grep | awk '{print $2}' | xargs kill -SIGCONT
按照命令行上传来测试:
上传文件到指定节点:fdfs_upload_file /etc/fdfs/client.conf /opt/FastDFS/fastdfs-process-protector/storaged_protector/smallfile ip:port
下面就看一下监控进程hang住并处理的脚本:
storaged.conf:
[general]
time_interval = 5
storage_port = 9092
client_conf_file = /etc/fdfs/client.conf
[notice]
isemail = 1
issms = 1
ishotchat = 1
noticeusers= niuliguo
### Logging configuration
[loggers]
keys=root
[handlers]
keys=file_handler
[formatters]
keys=formatter
[logger_root]
level=DEBUG
handlers=file_handler
[handler_file_handler]
class=FileHandler
level=DEBUG
formatter=formatter
args=('/var/log/fdfs_storaged_protector.logs','a+')
[formatter_formatter]
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
run.sh:
#!/bin/bash
nohup python fdfs_storaged_protector.py &
fdfs_storaged_protector.py:
import os
import sys
import logging
import logging.handlers
import logging.config
import httplib
import json
import urllib
import socket
import time
import fcntl
from random import randint
from subprocess import PIPE,Popen
from signal import alarm,signal,SIGALRM,SIGKILL
from ConfigParser import ConfigParser
# Python 2 only: force UTF-8 as the implicit str/unicode conversion codec.
# reload() is needed because site.py deletes sys.setdefaultencoding at
# startup; neither name exists on Python 3, hence the version guard.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Directory containing this script; appended to sys.path so that modules
# living next to the script can be imported regardless of the current cwd.
__rootdir = os.path.realpath(os.path.dirname(__file__))
if __rootdir not in sys.path:
    sys.path.append(__rootdir)
class Protector(object):
    """Watchdog for the fdfs_storaged daemon running on this host.

    The health probe uploads a small random file to the local storage
    server with ``fdfs_upload_file``.  When the upload fails or times out,
    the daemon is killed and restarted through systemd.
    """

    def __init__(self, conf_path, rootdir):
        """Load settings from *conf_path* and configure logging.

        conf_path -- INI file holding the [general] and [notice] sections
                     plus the logging.config sections.
        rootdir   -- directory used for the probe file ("smallfile") and
                     expected to contain run.sh.
        """
        self.conf_general = 'general'
        self.conf_notice = 'notice'
        self.rootdir = rootdir
        self.ipaddr = socket.gethostbyname(socket.gethostname())

        config_parser = ConfigParser()
        # read() returns an empty list when the file is unreadable; the
        # defaults below then apply instead of leaving attributes unset.
        config_parser.read(conf_path)
        get = self._get_option
        general, notice = self.conf_general, self.conf_notice
        self.time_interval = int(get(config_parser, general, 'time_interval', '30'))
        self.storage_port = int(get(config_parser, general, 'storage_port', '9092'))
        self.client_conf_file = get(config_parser, general, 'client_conf_file',
                                    '/etc/fdfs/client.conf')
        self.isemail = get(config_parser, notice, 'isemail', '1')
        self.issms = get(config_parser, notice, 'issms', '1')
        self.ishotchat = get(config_parser, notice, 'ishotchat', '1')
        self.noticeusers = get(config_parser, notice, 'noticeusers', 'niuliguo')

        logging.config.fileConfig(conf_path)
        self.logger = logging.getLogger('fdfs_storaged_protector')

    @staticmethod
    def _get_option(parser, section, option, default):
        """Return parser[section][option], or *default* when it is missing.

        The third positional argument of ConfigParser.get() is ``raw``, not
        a default value, so the fallback has to be handled explicitly.
        """
        if parser.has_option(section, option):
            return parser.get(section, option)
        return default

    def handler(self, signum, frame):
        """Signal handler that turns a signal into an IOError."""
        raise IOError("Timeout!")

    def run(self, args, cwd=None, shell=False, kill_tree=True, timeout=-1, env=None):
        '''
        Run a command with an optional timeout.

        args      -- command (string when shell=True, otherwise a list).
        timeout   -- seconds to wait; -1 disables the timeout.
        kill_tree -- on timeout also kill the direct children of the command.

        Returns (returncode, stdout, stderr).  On timeout the process is
        killed and (-9, '', '') is returned.  The timeout relies on SIGALRM,
        so it only works when called from the main thread.
        '''
        class Alarm(Exception):
            pass

        def alarm_handler(signum, frame):
            raise Alarm

        p = Popen(args, shell=shell, cwd=cwd, stdout=PIPE, stderr=PIPE, env=env)
        if timeout != -1:
            signal(SIGALRM, alarm_handler)
            alarm(timeout)
        try:
            stdout, stderr = p.communicate()
            if timeout != -1:
                alarm(0)
        except Alarm:
            # Collect the children before the parent dies and they get
            # re-parented to init.
            pids = [p.pid]
            if kill_tree:
                pids.extend(self.get_process_children(p.pid))
            for pid in pids:
                try:
                    # SIGKILL cannot be caught or ignored and also terminates
                    # processes that are currently stopped (SIGSTOP).
                    os.kill(pid, SIGKILL)
                except OSError:
                    pass  # already gone
            p.wait()  # reap the killed child so it does not stay a zombie
            return -9, '', ''
        return p.returncode, stdout, stderr

    def get_process_children(self, pid):
        """Return the PIDs of the direct children of *pid* as reported by ps."""
        ps = Popen(['ps', '--no-headers', '-o', 'pid', '--ppid', str(pid)],
                   stdout=PIPE, stderr=PIPE)
        stdout, _ = ps.communicate()
        return [int(child) for child in stdout.split()]

    def generate_small_file(self):
        """Create <rootdir>/smallfile filled with 80-180 KiB of random bytes.

        Failures are logged rather than raised; monit() retries the
        generation whenever the file is missing.
        """
        path = os.path.join(self.rootdir, "smallfile")
        try:
            with open(path, "wb") as fout:
                fout.write(os.urandom(1024 * randint(80, 180)))
        except (IOError, OSError):
            self.logger.exception('generate small file fail!')
            return
        if os.path.exists(path):
            self.logger.info("generate small file success")
        else:
            self.logger.error('generate small file fail!')

    def upload(self):
        """Upload the probe file to the local storage server.

        Returns the (returncode, stdout, stderr) triple of run(), or
        (-1, "", "file missed") when the probe file does not exist.
        """
        path = os.path.join(self.rootdir, "smallfile")
        if not os.path.exists(path):
            self.logger.error("file %s not exist" % path)
            return -1, "", "file missed"
        cmd = 'fdfs_upload_file %s %s %s:%d > /dev/null' % (
            self.client_conf_file, path, self.ipaddr, self.storage_port)
        return self.run(cmd, shell=True, timeout=10)

    def monit(self):
        """Probe the storage server in a loop; return when a probe fails."""
        while True:
            retcode, stdout, stderr = self.upload()
            if 0 == retcode:
                self.logger.info("upload file success and the fdfs process is healthy")
                time.sleep(self.time_interval)
            elif stderr == "file missed":
                self.logger.error(stderr)
                self.generate_small_file()
                # Avoid spinning at full speed when the file cannot be created.
                time.sleep(self.time_interval)
            else:
                self.logger.error("upload failed (retcode=%s): %s", retcode, stderr)
                break

    def restart_process(self):
        '''
        1.kill the fdfs_storaged process
        2.try to restart fdfs_storaged process for three times

        Exits the protector with status 1 when the restart itself raises.
        '''
        kill_cmd = ('ps -ef | grep \'/usr/bin/fdfs_storaged\'| grep -v grep '
                    '| awk \'{print $2}\' | xargs kill -9')
        restart_cmd = "systemctl stop fdfs_storaged.service && systemctl start fdfs_storaged.service"
        try:
            # Step one: a hung daemon ignores stop/restart, so kill it hard.
            self.run(kill_cmd, shell=True, timeout=10)
            # Step two: bring it back through systemd, at most three attempts.
            for _ in range(3):
                retcode, _out, _err = self.run(restart_cmd, shell=True, timeout=10)
                if 0 == retcode:
                    self.logger.info("restart fdfs_storaged success!")
                    break
            else:
                self.logger.error("restart fdfs_storaged fail!")
        except Exception:
            self.logger.exception("restart fdfs_storaged fail!")
            sys.exit(1)

    def alarm(self):
        '''
        insert your own alarm code
        '''

    def restart_self(self):
        """Start a fresh protector process through run.sh in rootdir."""
        cmd = 'sh run.sh'
        # run.sh and the script it launches use relative paths, so execute
        # from the directory that contains them.
        retcode, stdout, stderr = self.run(cmd, cwd=self.rootdir, shell=True, timeout=10)
        if 0 == retcode:
            self.logger.info("restart fdfs_storaged_protector process success")
        else:
            self.logger.error("restart fdfs_storaged_protector process fail")
if __name__ == '__main__':
conf_path = os.sep.join([__rootdir,"storaged.conf"])
protector = Protector(conf_path,__rootdir)
try:
protector.generate_small_file()
protector.monit()
raise Exception
except Exception:
protector.restart_process()
protector.alarm()
protector.restart_self()
except KeyboardInterrupt:
exit(1)
except IOError:
exit(1) 启动服务:
>sh run.sh
进程监控及僵死恢复就介绍到这里,剩下说一下Python读取配置文件和日志配置。
上述代码使用ConfigParser来读取配置文件,logging来配置日志:
def __init__(self,conf_path):
self.conf_general = 'general'
self.conf_notice = 'notice'
self.ipaddr = socket.gethostbyname( socket.gethostname() )
config_parser = ConfigParser()
if config_parser.read(conf_path):
self.time_interval = int( config_parser.get(self.conf_general,'time_interval','30') )
self.storage_port = int( config_parser.get(self.conf_general,'storage_port','9092') )
self.client_conf_file = config_parser.get(self.conf_general,'client_conf_file','/etc/fdfs/client.conf')
logging.config.fileConfig(conf_path)
self.logger = logging.getLogger('fdfs_storaged_protector') conf_path就是配置文件的路径,也就是上面的配置文件storaged.conf。
Author:忆之独秀
Email:leaguenew@qq.com
注明出处:http://blog.csdn.net/lavorange/article/details/51721330
|