77import socket
88import struct
99import subprocess
10+ import sys
1011import tempfile
1112import time
1213import rbd
2930# always
3031# present, although this is the case on a nicely ceph-deploy'd system
3132RADOS_NAME = 'client.admin'
32-
33+ SRC_DIR = '/etc/ceph'
3334
def fire_event(data, tag):
    """Wrap ``data`` in a single-entry dict keyed by ``tag``.

    :param data: payload stored as the value
    :param tag: hashable key under which the payload is stored
    :returns: ``{tag: data}``
    """
    event = {}
    event[tag] = data
    return event
@@ -53,6 +54,18 @@ class AdminSocketError(MonitoringError):
5354 pass
5455
5556
def get_ceph_version():
    """Return the version token reported by ``ceph --version``.

    Calls ``ceph_command(None, ['--version'])`` and returns the third
    whitespace-separated token of the result's ``'out'`` value.
    NOTE(review): assumes the output looks like
    ``ceph version <X.Y.Z> (<sha>)`` -- confirm against the installed CLI.

    :returns: the version token as a string
    :raises KeyError, AttributeError, IndexError, TypeError: when the
        command result has no usable ``'out'`` text; the error is reported
        on stdout and then re-raised unchanged.
    """
    result = ceph_command(None, ['--version'])
    try:
        # split() with no separator tolerates repeated spaces and a trailing
        # newline, which split(' ') would leak into the returned token.
        return result['out'].split()[2]
    except (KeyError, AttributeError, IndexError, TypeError) as ex:
        # TypeError covers a None / non-subscriptable result, which would
        # otherwise skip the error report below.
        sys.stdout.write("Error getting ceph --version")
        sys.stdout.write(str(ex))
        # Bare raise keeps the original traceback intact.
        raise
66+
67+
68+
5669def rados_command (cluster_handle , prefix , args = None , decode = True ):
5770 """Safer wrapper for ceph_argparse.json_command, which raises
5871
@@ -280,8 +293,12 @@ def rados_commands(fsid, cluster_name, commands):
280293 import rados
281294
282295 # Open a RADOS session
296+ if cluster_name is None :
297+ cluster_name = "ceph"
298+
299+ _conf_file = os .path .join (SRC_DIR , cluster_name + ".conf" )
283300 cluster_handle = rados .Rados (
284- name = RADOS_NAME , clustername = cluster_name , conffile = ''
301+ name = RADOS_NAME , clustername = cluster_name , conffile = _conf_file
285302 )
286303 cluster_handle .connect ()
287304
@@ -529,8 +546,12 @@ def get_cluster_object(cluster_name, sync_type):
529546 assert sync_type in SYNC_TYPES
530547
531548 # Open a RADOS session
549+ if cluster_name is None :
550+ cluster_name = "ceph"
551+
552+ _conf_file = os .path .join (SRC_DIR , cluster_name + ".conf" )
532553 cluster_handle = rados .Rados (
533- name = RADOS_NAME , clustername = cluster_name , conffile = ''
554+ name = RADOS_NAME , clustername = cluster_name , conffile = _conf_file
534555 )
535556 cluster_handle .connect ()
536557
@@ -698,10 +719,12 @@ def get_heartbeats():
698719 if "client" in filename :
699720 continue
700721 service_data = service_status (filename )
701- except (rados .Error , MonitoringError ):
722+ except (rados .Error , MonitoringError ) as ex :
702723 # Failed to get info for this service, stale socket or
703724 # unresponsive, exclude it from report
704- pass
725+ sys .stdout .write ("Error getting ceph service status from admin "
726+ "socket %s" % filename )
727+ sys .stdout .write (str (ex ))
705728 else :
706729 if not service_data :
707730 continue
@@ -713,51 +736,46 @@ def get_heartbeats():
714737 # A mon in quorum is eligible to emit a cluster heartbeat
714737 mon_sockets [service_data ['fsid' ]] = filename
715738
716- # Installed Ceph version (as oppose to per-service running ceph version)
717- try :
718- ceph_version_str = subprocess .check_output (
719- "rpm -qa | grep ceph-[0-1]" , shell = True
720- )
721- ceph_version_str = ceph_version_str .split ("-" )[1 ]
722- except subprocess .CalledProcessError :
723- ceph_version_str = None
724- if ceph_version_str :
725- ceph_version = ceph_version_str
726- else :
727- ceph_version = None
739+ ceph_version = get_ceph_version ()
728740
729741 # For each ceph cluster with an in-quorum mon on this node, interrogate
730742 # the cluster
731743 cluster_heartbeat = {}
732744 for fsid , socket_path in mon_sockets .items ():
745+ cluster_handle = None
733746 try :
747+ _conf_file = os .path .join (SRC_DIR , fsid_names [fsid ] + ".conf" )
734748 cluster_handle = rados .Rados (
735- name = RADOS_NAME , clustername = fsid_names [fsid ], conffile = ''
749+ name = RADOS_NAME , clustername = fsid_names [fsid ],
750+ conffile = _conf_file
736751 )
737752 cluster_handle .connect ()
738753 cluster_heartbeat [fsid ] = cluster_status (
739754 cluster_handle , fsid_names [fsid ]
740755 )
741- except (rados .Error , MonitoringError ):
742- # Something went wrong getting data for this cluster, exclude it
743- # from our report
744- pass
756+ except (rados .Error , MonitoringError ) as ex :
757+ # Something went wrong getting data for this cluster
758+ sys .stdout .write ("Error fetching ceph (fsid: %s) cluster maps "
759+ "from "
760+ "admin socket %s" % (fsid_names [fsid ],
761+ socket_path ))
762+ sys .stdout .write (str (ex ))
763+ raise ex
764+ finally :
765+ if cluster_handle :
766+ cluster_handle .shutdown ()
745767
746- cluster_handle .shutdown ()
747768 return ceph_version , cluster_heartbeat
748769
749770
750771def service_status (socket_path ):
751772 """Given an admin socket path, learn all we can about that service
752773
753774 """
754- try :
755- cluster_name , service_type , service_id = \
756- re .match (
757- "^(.+?)-(.+?)\.(.+)\.asok$" ,
758- os .path .basename (socket_path )).groups ()
759- except AttributeError :
760- return None
775+ cluster_name , service_type , service_id = \
776+ re .match (
777+ "^(.+?)-(.+?)\.(.+)\.asok$" ,
778+ os .path .basename (socket_path )).groups ()
761779
762780 status = None
763781 # Interrogate the service for its FSID
@@ -890,11 +908,20 @@ def _heartbeat(fsid):
890908
891909
def heartbeat(fsid=None):
    """Collect a heartbeat via ``_heartbeat``, reporting failures on stdout.

    :param fsid: passed straight through to ``_heartbeat``
    :returns: whatever ``_heartbeat(fsid)`` returns; ``(None, {})`` when the
        ``rados`` module cannot be imported; ``None`` when ``_heartbeat``
        fails with an exception that is not a Ceph/monitoring error.
    :raises rados.Error, MonitoringError: (including subclasses such as
        ``AdminSocketError``) re-raised after being reported on stdout.
    """
    try:
        import rados
    except ImportError:
        # Ceph isn't installed, report no services or clusters
        return None, {}

    try:
        return _heartbeat(fsid)
    except Exception as ex:
        sys.stdout.write("Error getting heartbeat for ceph cluster fsid %s"
                         % fsid)
        sys.stdout.write(str(ex))
        # isinstance (rather than an exact type() match) so that subclasses
        # of rados.Error / MonitoringError are propagated too instead of
        # being silently dropped.
        if isinstance(ex, (rados.Error, MonitoringError, AdminSocketError)):
            # Bare raise keeps the original traceback intact.
            raise
        # Any other failure is treated as best-effort: already reported
        # above, nothing to return.
        return None
898925
899926
900927def json_load_byteified (file_handle ):
0 commit comments