This repository has been archived by the owner on Sep 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Frameworkcontroller training service on training service v3 interface #5384
Open
QuanluZhang
wants to merge
5
commits into
microsoft:master
Choose a base branch
from
QuanluZhang:dev-frameworkcontroller
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
143593c
fix frameworkcontroller bugs
QuanluZhang cbcd744
frameworkcontroller v3
QuanluZhang 80600b2
Merge branch 'master' of github.com:microsoft/nni into dev-frameworkc…
QuanluZhang 179bf21
update
QuanluZhang eb8fcf7
update
QuanluZhang File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,263 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT license. | ||
|
||
import argparse | ||
import json | ||
import os | ||
import random | ||
import re | ||
import sys | ||
import time | ||
import traceback | ||
from datetime import datetime, timedelta | ||
|
||
import pkg_resources | ||
|
||
from .gpu import collect_gpu_usage | ||
|
||
# The main loop exits after this many seconds without any running trial.
idle_timeout_seconds = 10 * 60
# Minimum number of seconds between two GPU usage reports.
# (The spelling of this name is kept because main_loop refers to it.)
gpu_refressh_interval_seconds = 5
# Extracts "major" or "major.minor" from a version string with an optional
# leading "v".  Every component may have several digits, so "2.10" is not
# truncated to "2.1"; the pattern is a raw string so that "\." reaches the
# regex engine without an invalid-escape warning.
regular = re.compile(r'v?(?P<version>[0-9]+(\.[0-9]+)?).*')
# Replaced by a RemoteLogger instance when the module is run as a script.
trial_runner_syslogger = None
|
||
|
||
def main_loop(args):
    '''Run the trial runner's command loop until it has been idle for too long.

    Every iteration receives one command from ``args.command_channel`` and
    dispatches it (start a trial, kill a trial, or pass a new parameter set
    to a trial), forgets trials that are no longer running, optionally
    reports GPU usage, and then sleeps for half a second.  The loop ends once
    no trial has been running for more than ``idle_timeout_seconds``.

    On the way out, whether by normal exit or by exception, every trial
    still tracked is killed and the command channel is given up to ten
    seconds to flush before it is closed.

    Parameters
    ----------
    args
        Merged runner arguments.  This function reads ``job_pid_file``,
        ``command_channel``, ``enable_gpu_collect`` and ``node_id`` from it
        and passes the whole object to every ``Trial`` it creates.

    Raises
    ------
    Exception
        If a ``NewTrialJob`` command names a trial that is still running, or
        if a command of an unknown type is received.  Any other exception
        raised inside the loop is printed and re-raised unchanged.
    '''
    # Elapsed time is measured on the monotonic clock: a wall-clock
    # adjustment must neither trigger nor postpone the idle timeout.
    idle_last_time = time.monotonic()
    # Back-dated by one minute so that the first iteration reports GPU usage.
    gpu_refresh_last_time = time.monotonic() - 60
    # Both names are bound before the try block because the finally clause
    # reads them even when the very first statement inside the try fails.
    trials = dict()
    command_channel = None
    try:
        if args.job_pid_file:
            with open(args.job_pid_file, 'w') as job_file:
                job_file.write("%d" % os.getpid())

        command_channel = args.command_channel
        # command loop
        while True:
            command_type, command_data = command_channel.receive()
            if command_type == CommandType.NewTrialJob:
                nni_log(LogType.Info, 'New trial job with {0}'.format(command_data))
                trial_id = command_data["trialId"]
                if trial_id in trials:
                    trial = trials[trial_id]
                    if trial.is_running():
                        raise Exception('trial %s is running already, cannot start a new one' % trial.id)
                    # A finished trial with the same id is replaced below.
                    del trials[trial_id]
                trial = Trial(args, command_data)
                trial.run()
                trials[trial_id] = trial
            elif command_type == CommandType.KillTrialJob:
                nni_log(LogType.Info, 'Kill trial job with {0}'.format(command_data))
                # For this command the payload itself is used as the trial id.
                trial_id = command_data
                if trial_id in trials:
                    trials[trial_id].kill(command_data)
            elif command_type == CommandType.SendTrialJobParameter:
                nni_log(LogType.Info, 'Receive trial job parameter: {0}'.format(command_data))
                trial_id = command_data["trialId"]
                if trial_id in trials:
                    trials[trial_id].save_parameter_file(command_data)
            elif command_type is not None:
                raise Exception("unknown command %s" % command_type)

            # Drop trials that stopped running; a running trial resets the
            # idle timer.  Entries are removed by dictionary key, so nothing
            # is dereferenced on an entry that is being discarded.
            for trial_id, trial in list(trials.items()):
                if trial is not None and trial.is_running():
                    idle_last_time = time.monotonic()
                else:
                    del trials[trial_id]

            if time.monotonic() - idle_last_time > idle_timeout_seconds:
                nni_log(LogType.Info, "trial runner is idle more than {0} seconds, so exit.".format(
                    idle_timeout_seconds))
                break

            if args.enable_gpu_collect and \
                    time.monotonic() - gpu_refresh_last_time > gpu_refressh_interval_seconds:
                # collect gpu information
                gpu_info = collect_gpu_usage(args.node_id)
                command_channel.send(CommandType.ReportGpuInfo, gpu_info)
                gpu_refresh_last_time = time.monotonic()
            time.sleep(0.5)
    except Exception:
        traceback.print_exc()
        # A bare raise keeps the original traceback intact.
        raise
    finally:
        nni_log(LogType.Info, "main_loop exits.")

        for trial_id, trial in list(trials.items()):
            trial.kill()
            del trials[trial_id]
        if command_channel is not None:
            # wait (at most ten seconds) for queued commands to be sent
            for _ in range(10):
                if command_channel.sent():
                    break
                time.sleep(1)
            command_channel.close()
|
||
|
||
def trial_runner_help_info(*args):
    '''Print a one-line hint that points the user at ``--help``.

    Any positional arguments are accepted and ignored.
    '''
    hint = 'please run --help to see guidance'
    print(hint)
|
||
|
||
def check_version(args):
    '''Compare the installed nni version with the NNI manager's and report the outcome.

    The installed version comes from the ``nni`` distribution metadata; the
    manager's version comes from ``args.nni_manager_version``.  Both strings
    are reduced with the module-level ``regular`` pattern before they are
    compared.  The outcome is sent over ``args.command_channel`` as a
    ``VersionCheck`` command tagged ``VCSuccess`` or ``VCFail``; on a mismatch
    the function waits until the channel reports the message as sent.

    If the ``nni`` distribution cannot be resolved the process is terminated
    with exit status 1.  If ``args.nni_manager_version`` is empty the check is
    skipped with a warning.  An ``AttributeError`` raised while checking (for
    example when a version string does not match the pattern, so ``search``
    returns ``None``) is logged and not propagated.

    Parameters
    ----------
    args
        Runner arguments; ``nni_manager_version``, ``command_channel`` and
        ``node_id`` are read.
    '''
    try:
        installed_version = pkg_resources.get_distribution('nni').version
    except pkg_resources.ResolutionError:
        # The nni package itself is missing: log it and stop the process.
        nni_log(LogType.Error, 'Package nni does not exist!')
        os._exit(1)

    if not args.nni_manager_version:
        # No manager version to compare against.
        nni_log(LogType.Warning, 'Skipping version check!')
        return

    try:
        channel = args.command_channel
        runner_version = regular.search(installed_version).group('version')
        nni_log(LogType.Info, '{0}: runner_version is {1}'.format(args.node_id, runner_version))
        manager_version = regular.search(args.nni_manager_version).group('version')
        nni_log(LogType.Info, '{0}: nni_manager_version is {1}'.format(args.node_id, manager_version))

        if runner_version == manager_version:
            nni_log(LogType.Info, '{0}: Version match!'.format(args.node_id))
            channel.send(CommandType.VersionCheck, {'tag': 'VCSuccess'})
            return

        nni_log(LogType.Warning, '{0}: Version does not match!'.format(args.node_id))
        mismatch_message = '{0}: NNIManager version is {1}, Trial runner version is {2}, NNI version does not match!'.format(
            args.node_id, manager_version, runner_version)
        channel.send(CommandType.VersionCheck, {'tag': 'VCFail', 'msg': mismatch_message})
        # Block until the failure report has actually left the channel.
        while not channel.sent():
            time.sleep(1)
    except AttributeError as err:
        nni_log(LogType.Error, '{0}: {1}'.format(args.node_id, err))
|
||
if __name__ == '__main__':
    # NNI Trial Runner entry point.
    #
    # Steps, in order: parse the command line, load settings.json (from the
    # current directory or its parent), fill in arguments that were not given
    # on the command line, export the NNI_* environment variables, import the
    # runner's own modules, open the command channel, redirect stdout/stderr
    # to the remote logger, check versions and finally run the main loop.
    #
    # Everything stays at module level on purpose: main_loop and check_version
    # look up nni_log, LogType, CommandType and Trial as module globals, and
    # those names are bound by the imports inside this block.
    PARSER = argparse.ArgumentParser()
    PARSER.set_defaults(func=trial_runner_help_info)
    PARSER.add_argument('--trial_command', type=str, help='Command to launch trial process')
    PARSER.add_argument('--nnimanager_ip', type=str, help='NNI manager rest server IP')
    PARSER.add_argument('--nnimanager_port', type=str, help='NNI manager rest server port')
    PARSER.add_argument('--nni_manager_version', type=str, help='the nni version transmitted from nniManager')
    PARSER.add_argument('--log_collection', type=str, help='set the way to collect log in trial runner')
    PARSER.add_argument('--node_count', type=int, help='number of nodes, it determines how to consume command and save code file')
    PARSER.add_argument('--job_pid_file', type=str, help='save trial runner process pid')
    args, unknown = PARSER.parse_known_args()

    setting_file = "settings.json"
    if not os.path.exists(setting_file):
        setting_file = "../{}".format(setting_file)
    if os.path.exists(setting_file):
        with open(setting_file, 'r') as fp:
            settings = json.load(fp)
        print("setting is {}".format(settings))
    else:
        # Every step below reads from the settings, so stop here with a
        # non-zero status instead of failing later on an undefined name.
        print("not found setting file")
        sys.exit(1)

    args.exp_id = settings["experimentId"]
    args.platform = settings["platform"]
    # runner_id is unique runner in experiment
    args.runner_id = os.path.basename(os.path.realpath(os.path.curdir))
    args.runner_name = "runner_" + args.runner_id
    args.enable_gpu_collect = settings["enableGpuCollector"]
    args.command_channel = settings["commandChannel"]

    # Command-line values win; the settings file supplies what is missing.
    if args.trial_command is None:
        # FIXME: deal with distributed trial which has more than one trial command
        args.trial_command = settings["command"][0]
    if args.nnimanager_ip is None:
        args.nnimanager_ip = settings["nniManagerIP"]
    if args.nnimanager_port is None:
        args.nnimanager_port = settings["nniManagerPort"]
    if args.nni_manager_version is None:
        args.nni_manager_version = settings["nniManagerVersion"]
    if args.log_collection is None:
        args.log_collection = settings["logCollection"]
    if args.node_count is None:
        # default has only one node.
        args.node_count = 1

    # FIXME: in reuse mode, multiple trials would have the same output dir
    # NOTE: NNI_OUTPUT_DIR is updated in trial.py
    os.environ['NNI_OUTPUT_DIR'] = os.curdir + "/nnioutput"
    os.environ['NNI_PLATFORM'] = args.platform
    os.environ['NNI_SYS_DIR'] = os.curdir
    os.environ['NNI_EXP_ID'] = args.exp_id
    # FIXME: multi-phase is true?
    os.environ['MULTI_PHASE'] = "true"
    # FIXME: trial job id is runner?
    os.environ['NNI_TRIAL_JOB_ID'] = "runner"
    os.environ['REUSE_MODE'] = "true"

    # NOTE(review): these imports deliberately follow the environment setup
    # above; presumably the imported modules read the NNI_* variables at
    # import time - confirm before moving them to the top of the file.
    from .log_utils import LogType, RemoteLogger, StdOutputType, nni_log
    from .trialv3 import Trial
    from .file_channel import FileChannel
    from .web_channel import WebChannel
    from .commands import CommandType

    is_multi_node = args.node_count > 1

    if is_multi_node:
        # FIXME: not supported yet!!!
        # for multiple nodes, create a file to get a unique id.
        while True:
            node_id = random.randint(0, 10000)
            unique_check_file_name = "node_%s" % (node_id)
            if not os.path.exists(unique_check_file_name):
                break
        with open(unique_check_file_name, "w") as unique_check_file:
            unique_check_file.write("%s" % (int(datetime.now().timestamp() * 1000)))
        args.node_id = node_id
    else:
        # node id is unique in the runner
        args.node_id = None

    # init command channel; anything other than "file" or "aml" uses the web channel
    command_channel = None
    if args.command_channel == "file":
        command_channel = FileChannel(args)
    elif args.command_channel == 'aml':
        from .aml_channel import AMLChannel
        command_channel = AMLChannel(args)
    else:
        command_channel = WebChannel(args)
    command_channel.open()

    nni_log(LogType.Info, "command channel is {}, actual type is {}".format(args.command_channel, type(command_channel)))
    # From here on args.command_channel holds the channel object, not its name.
    args.command_channel = command_channel

    trial_runner_syslogger = RemoteLogger(args.nnimanager_ip, args.nnimanager_port, 'runner',
                                          StdOutputType.Stdout, args.log_collection, args.runner_name, command_channel)
    sys.stdout = sys.stderr = trial_runner_syslogger
    nni_log(LogType.Info, "{}: merged args is {}".format(args.node_id, args))

    if args.trial_command is None:
        nni_log(LogType.Error, "{}: no command is found.".format(args.node_id))
        os._exit(1)
    check_version(args)
    try:
        main_loop(args)
    except SystemExit as se:
        nni_log(LogType.Info, '{}: NNI trial runner exit with code {}'.format(args.node_id, se.code))

        # try best to send latest errors to server
        timeout = 10
        while not command_channel.sent() and timeout > 0:
            timeout -= 1
            time.sleep(1)
        # os._exit only accepts an int, while SystemExit.code may be None or
        # any object; map it the way sys.exit does (None -> 0, non-int -> 1).
        if se.code is None:
            exit_status = 0
        elif isinstance(se.code, int):
            exit_status = se.code
        else:
            exit_status = 1
        os._exit(exit_status)
    finally:
        if trial_runner_syslogger is not None:
            if trial_runner_syslogger.pipeReader is not None:
                trial_runner_syslogger.pipeReader.set_process_exit()
            trial_runner_syslogger.close()

    # the process doesn't exit even main loop exit. So exit it explicitly.
    os._exit(0)
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
trial_runner_v3