Add infos into target window
[ryzomcore.git] / ryzom / server / src / admin_modules / aes_module.cpp
blobaee6da68a592bb9719321790bf800461aef785e4
1 // Ryzom - MMORPG Framework <http://dev.ryzom.com/projects/ryzom/>
2 // Copyright (C) 2010 Winch Gate Property Limited
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU Affero General Public License as
6 // published by the Free Software Foundation, either version 3 of the
7 // License, or (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU Affero General Public License for more details.
14 // You should have received a copy of the GNU Affero General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
18 #include "stdpch.h"
19 #include "nel/misc/singleton.h"
20 #include <time.h>
21 #include "nel/misc/path.h"
22 #include "nel/misc/common.h"
23 #include "nel/net/module.h"
24 #include "nel/net/module_builder_parts.h"
25 #include "nel/net/unified_network.h"
26 #include "nel/net/service.h"
28 #include "game_share/utils.h"
30 #include "admin_modules_itf.h"
32 using namespace std;
33 using namespace NLMISC;
34 using namespace NLNET;
/// Intentionally empty function.
/// NOTE(review): the name suggests it only exists so that other code can
/// reference this translation unit and force it to be linked - confirm against callers.
void aes_forceLink()
{
}
38 namespace ADMIN
40 const char* LAUNCH_CTRL_START = "LAUNCH";
41 const char* LAUNCH_CTRL_STOP = "STOP";
43 const char *AESPersistentStateFilename = "aes_state.txt";
45 /// We want 10 slot (you can change this, but need at least 3 slots)
46 const uint32 CRASH_COUNTER_SLOT = 10;
47 /// The delay (in second) between slots roll. This value * CRASH_COUNTER_SLOT give the total sampling period
48 const uint32 CRASH_COUNTER_ROLL_DELAY = 10*60; // 10 mn
49 /// If we have more than 5 start of a service in the sampling period, we tag the service as 'chain crashing'
50 const uint32 CRASH_COUNTER_CHAIN_THRESHOLD = 5;
52 /** the name of the file written by the patch man to request a global shutdown
53 * of all registered the services before switching to a new version.
55 CVariable<string> ShutdownRequestFileName("aes","ShutdownRequestFileName", "name of the file to use for shutdown requests", "./global.launch_ctrl", 0, true);
57 /** A kind rolling buffer used to count services start from the runner
58 * script.
60 class CRunnerLoopCounter
62 /// The slot table. Each slot accumulate the service start for a time frame
63 uint32 _Slots[CRASH_COUNTER_SLOT];
64 /** The last value read from the runner script. This is used to compute
65 * the delta value to add to the first slot
67 uint32 _LastValueRead;
68 /// The total sum of all slot (could be recomputed on demand, but a little more efficient)
69 uint32 _CounterSum;
70 public:
72 CRunnerLoopCounter()
74 // we need at least 3 slots
75 nlctassert(CRASH_COUNTER_SLOT >= 3);
77 // init all slots with 0
78 for (uint i=0; i<CRASH_COUNTER_SLOT; ++i)
80 _Slots[i] = 0;
83 // init the last value with a magic value so that the first
84 // update will not compute a delta but only take
85 // the first value as initial reference
86 _LastValueRead = 0xffffffff;
87 _CounterSum = 0;
90 /** Updat the counter by submitting the current start counter
91 * written by the runner script.
92 * Note that the runner script only increment the counter
93 * so we need to compute the delta from _LastValueRead
94 * before accumulating in the first slot.
96 void updateCounter(uint32 lastValue)
98 if (_LastValueRead == 0xffffffff || lastValue < _LastValueRead)
100 // this is the first sample, just init the last value read
101 // or the counter have been reset to a smaller value
102 _LastValueRead = lastValue;
104 else
106 // not the first sample, compute the delta and accumulate
107 uint32 delta = lastValue - _LastValueRead;
108 _Slots[0] += delta;
109 _LastValueRead = lastValue;
110 // update summ
111 _CounterSum += delta;
115 /// Roll the slots. The last slot is ejected and
116 /// each slot are copied in the next one (in
117 /// inverse order obviously)
118 /// The first slot in then set to 0
119 void rollCounter()
122 _CounterSum -= _Slots[CRASH_COUNTER_SLOT-1];
124 for (uint i=CRASH_COUNTER_SLOT-1; i>0; --i)
126 _Slots[i] = _Slots[i-1];
128 _Slots[0] = 0;
131 /// Return the sum of all the slots
132 uint32 getSum()
134 return _CounterSum;
137 /// Return the sum of the first slot, the tree first slot and
138 /// the total of all slots.
139 /// This is useful to understand the behavoir of a crashing
140 /// service over the sampling period.
141 void getCounters(uint32 &oneSlot, uint32 &treeSlots, uint32 &allSlots)
143 oneSlot = _Slots[0];
144 treeSlots = _Slots[0]+_Slots[1]+_Slots[2];
145 allSlots = _CounterSum;
149 /// Reset all counter to zero
150 void resetCounter()
152 for (uint i=0; i<CRASH_COUNTER_SLOT; ++i)
154 _Slots[i] = 0;
156 _CounterSum = 0;
162 class CAdminExecutorService
163 : /*public CManualSingleton<CAdminExecutorService>,*/
164 public CEmptyModuleServiceBehav<CEmptyModuleCommBehav<CEmptySocketBehav<CModuleBase> > >,
165 public CAdminExecutorServiceSkel,
166 public IModuleTrackerCb
168 public:
169 enum
171 SLOW_TO_START_THRESHOLD = 60, // 1 mn
172 SLOW_TO_STOP_THRESHOLD = 60, // 1 mn
173 _NagiosReportDelay = 60, // 1 mn
176 private:
178 typedef CModuleTracker<TModuleClassPred> TServiceTracker;
179 // tracker for admin executor client modules
180 TServiceTracker _ServiceTracker;
182 /// Admin service module
183 TModuleProxyPtr _AdminService;
185 /// Date of last state reporting to AS
186 uint32 _LastStateReport;
188 /// Date of last nagios report output
189 uint32 _LastNagiosReport;
191 typedef string TAliasName;
192 typedef string TShardName;
193 typedef set<TAliasName> TRegisteredServices;
194 /// List of 'registered service', ie. those that are configured in aes cfg.
195 TRegisteredServices _RegisteredServices;
197 /// A set of data for each registered or connected service
198 struct TServiceState
200 string ShardName;
201 bool DontUseShardOrders;
202 TRunningState RunningState;
203 set<TRunningTag> RunningTags;
204 string LongName;
205 string ShortName;
206 uint32 PID;
207 string State;
208 uint32 LastStateDate;
209 uint32 StopRequestDate;
210 uint32 StartRequestDate;
211 TModuleProxyPtr ServiceModule;
212 CRunnerLoopCounter RunnerLoopCounter;
214 TServiceState()
215 : DontUseShardOrders(false),
216 RunningState(TRunningState::rs_stopped),
217 PID(0),
218 LastStateDate(0),
219 StopRequestDate(0),
220 StartRequestDate(0)
224 typedef map<TAliasName, TServiceState> TServiceStates;
225 /// States for each connected or registered service
226 TServiceStates _ServiceStates;
228 typedef map<TModuleProxyPtr, TAliasName> TConnectedServiceIndex;
229 /// Index of connected service proxy to alias name
230 TConnectedServiceIndex _ConnectedServiceIndex;
232 typedef map<TAliasName, TRunningOrders> TPersistentServiceOrders;
233 /// Persistent service state, i.e state that are restored after a stop/start of the aes
234 TPersistentServiceOrders _PersistentServiceOrders;
236 typedef map<TShardName, TShardOrders> TShardsOrders;
237 /// Shard orders (set by AS)
238 TShardsOrders _ShardOrders;
240 /// flag for shutdown request form patch manager.
241 bool _ShutdownForPatch;
243 /// A flag that mean we need to save the persistent state file
244 bool _NeedToWriteStateFile;
246 /// Date of last roll of the runner loop counters
247 uint32 _LastRunnerLoopCounterRoll;
249 /// Data for each command pending result from a service
250 struct TPendingWebCommand
252 /// Date of reception of the command for timeout
253 uint32 ReceptionDate;
254 /// Name of the target service
255 string ServiceAlias;
256 /// Command
257 string Command;
259 typedef uint32 TCommandId;
260 typedef map<TCommandId, TPendingWebCommand> TPendingWebCommands;
261 /// A list of pending command sent to service and waiting result
262 TPendingWebCommands _PendingWebCommands;
264 /// information about shard being stopped
265 struct TStopingShardInfo
267 /// Name of the shard to stop
268 string ShardName;
269 /// Delay before stop
270 uint32 Delay;
271 /// Begin date of countdown
272 uint32 BeginDate;
275 typedef vector<TStopingShardInfo> TStopingShardInfos;
277 /// The vector of shard to stop.
278 TStopingShardInfos _StopingShards;
281 public:
282 CAdminExecutorService()
283 : _ServiceTracker(TModuleClassPred("AdminExecutorServiceClient")),
284 _LastStateReport(0),
285 _LastNagiosReport(0),
286 _ShutdownForPatch(false),
287 _NeedToWriteStateFile(false),
288 _LastRunnerLoopCounterRoll(0)
290 CAdminExecutorServiceSkel::init(this);
291 _ServiceTracker.init(this, this);
295 bool initModule(const TParsedCommandLine &pcl)
297 CModuleBase::initModule(pcl);
299 // read the persistent state file if any
300 string filename = CPath::standardizePath(IService::getInstance()->SaveFilesDirectory.toString(), true)+AESPersistentStateFilename;
301 FILE *fp = nlfopen(filename, "rt");
302 if (fp != NULL)
304 char buffer[1024];
305 char *ret;
306 while ((ret=fgets(buffer, 1024, fp)) != NULL)
308 CSString line(buffer);
309 CSString cmd(line.firstWord(true));
311 if (cmd == "ServiceState")
313 CSString serviceAlias = line.firstWord(true);
314 CSString serviceOrders = line.firstWord(true);
316 TRunningOrders runningOrders(serviceOrders);
317 if (!serviceAlias.empty() && runningOrders != TRunningOrders::invalid_val)
319 // add this one in the list of persistent state
320 _PersistentServiceOrders[serviceAlias] = runningOrders;
323 else if (cmd == "ShardOrders")
325 string shardName(line.firstWord(true));
326 TShardOrders shardOrders(line.firstWord(true));
327 if (shardOrders != TShardOrders::invalid_val)
328 _ShardOrders[shardName] = shardOrders;
331 // clear the flag because 'setGlobalState' has set it
332 _NeedToWriteStateFile = false;
334 fclose(fp);
337 return true;
340 void onModuleUp(IModuleProxy *proxy)
342 if (proxy->getModuleClassName() == "AdminService")
344 nldebug("CAdminExecutorService : admin service up as '%s'", proxy->getModuleName().c_str());
345 // we found the manager of AES
346 if (_AdminService != NULL)
348 nlwarning("CAdminExecutorService : admin service already known as '%s', replacing with new one", _AdminService->getModuleName().c_str());
350 _AdminService = proxy;
352 // cleanup the persistent service state by removing any state not in registered or connected service
354 set<string> removeList;
356 // first, fill the list with all the persistent state service name
358 TPersistentServiceOrders::iterator first(_PersistentServiceOrders.begin()), last(_PersistentServiceOrders.end());
359 for (; first != last; ++first)
361 removeList.insert(first->first);
365 // remove the registered service from the removelist
367 TRegisteredServices::iterator first(_RegisteredServices.begin()), last(_RegisteredServices.end());
368 for (; first != last; ++first)
370 removeList.erase(*first);
373 // remove any connected service (even unregistered one)
375 TServiceStates::iterator first(_ServiceStates.begin()), last(_ServiceStates.end());
376 for (; first != last; ++first)
378 removeList.erase(first->first);
382 // no remove persistent state that left in the remove list
383 while (!removeList.empty())
385 _PersistentServiceOrders.erase(*(removeList.begin()));
387 _NeedToWriteStateFile = true;
389 removeList.erase(removeList.begin());
393 // send the current status
394 sendUpServiceUpdate();
397 uint32 now = NLMISC::CTime::getSecondsSince1970();
398 // check pending command timeout
399 TPendingWebCommands::iterator first(_PendingWebCommands.begin()), last(_PendingWebCommands.end());
400 for (; first != last; ++first)
402 TPendingWebCommand &pwc = first->second;
404 if (now - pwc.ReceptionDate > 10)
406 // timeout
407 if (_AdminService != NULL)
409 CAdminServiceProxy as(_AdminService);
410 as.commandResult(this, first->first, pwc.ServiceAlias, "ERROR : AES : no reponse from service");
413 _PendingWebCommands.erase(first);
415 // check other pending commands at next update
416 break;
421 void onModuleDown(IModuleProxy *proxy)
423 if (proxy == _AdminService)
425 nldebug("CAdminExecutorService : admin service '%s' is down", proxy->getModuleName().c_str());
427 _AdminService = NULL;
// Periodic update of the module. The work visible here is:
// - supervision of the registered services: running state read from the file
//   written by the runner script, running tags, start/stop orders, crash counter
// - update of the service states sent to the admin service (when one is known)
// - polling of the shutdown request of the patch manager
// - handling of the shards being stopped
// - output of the nagios report file
// - roll of the runner loop (crash) counters
// - save of the persistent state file when _NeedToWriteStateFile is set
// 'now' is the current date in seconds since 1970.
431 void onModuleUpdate()
433 H_AUTO(CAdminExecutorService_onModuleUpdate);
435 uint32 now = CTime::getSecondsSince1970();
437 if (_LastStateReport < now)
439 // every second
441 // check services every second
442 TServiceStates::iterator first(_ServiceStates.begin()), last(_ServiceStates.end());
443 for (; first != last; ++first)
445 string aliasName = first->first;
446 TServiceState &ss = first->second;
447 if (_RegisteredServices.find(aliasName) != _RegisteredServices.end())
449 // this is a registered service, we need to control its running state
451 // read the actual running state from the runner script written file
452 if (getOfflineServiceState(aliasName) == "RUNNING")
454 // the service is running
455 ss.RunningTags.erase(TRunningTag::rt_locally_stopped);
456 ss.RunningTags.erase(TRunningTag::rt_globally_stopped);
457 ss.RunningTags.erase(TRunningTag::rt_stopped_for_patch);
459 if (ss.StopRequestDate != 0)
461 // still not stopped, check for slow to stop service
462 if (now - ss.StopRequestDate > SLOW_TO_STOP_THRESHOLD)
464 // add a running tag
465 ss.RunningTags.insert(TRunningTag::rt_slow_to_stop);
469 if (ss.RunningState != TRunningState::rs_online)
471 // tag slow to start service
472 if (now - ss.StartRequestDate > SLOW_TO_START_THRESHOLD)
474 // add a running tag
475 ss.RunningTags.insert(TRunningTag::rt_slow_to_start);
477 else
479 ss.RunningState = TRunningState::rs_running;
483 else
485 // the service is stopped
486 ss.RunningState = TRunningState::rs_stopped;
487 ss.RunningTags.erase(TRunningTag::rt_locally_started);
488 ss.RunningTags.erase(TRunningTag::rt_externaly_started);
489 ss.RunningTags.erase(TRunningTag::rt_slow_to_stop);
491 // clean the stop request date
492 ss.StopRequestDate = 0;
495 // try to obtain the service orders from its shard (autostart on when the shard has no orders)
496 TShardOrders shardOrders(TShardOrders::so_autostart_on);
497 if (_ShardOrders.find(ss.ShardName) != _ShardOrders.end())
499 shardOrders = _ShardOrders[ss.ShardName];
501 // little check, the service must have an entry in the service orders container.
502 nlassert(_PersistentServiceOrders.find(aliasName) != _PersistentServiceOrders.end());
504 TRunningOrders serviceOrders = _PersistentServiceOrders[aliasName];
506 // look if service need to be started
507 if (ss.RunningState == TRunningState::rs_stopped // its stopped
508 && serviceOrders == TRunningOrders::ro_activated // and activated
509 && shardOrders == TShardOrders::so_autostart_on // and shard is autostart on
510 && !ss.DontUseShardOrders // and this service follow its shard orders
511 && !_ShutdownForPatch // and no patch pending
514 // we must start this service !
515 startService(aliasName);
518 // look for service that need to be stopped
519 if (ss.RunningState != TRunningState::rs_stopped // its not stopped
520 && (serviceOrders == TRunningOrders::ro_deactivated // and deactivated
521 || _ShutdownForPatch) // or patch pending
522 && ss.StopRequestDate == 0 // no stop request send
525 // we must stop this service
526 stopService(aliasName);
528 // store the stop request date
529 ss.StopRequestDate = now;
531 // shut down for patch ?
532 if (_ShutdownForPatch)
533 ss.RunningTags.insert(TRunningTag::rt_stopped_for_patch);
534 else
535 ss.RunningTags.erase(TRunningTag::rt_stopped_for_patch);
537 // chain crashing ? (more starts than the threshold over the sampling period)
538 if (ss.RunnerLoopCounter.getSum() > CRASH_COUNTER_CHAIN_THRESHOLD)
539 ss.RunningTags.insert(TRunningTag::rt_chain_crashing);
540 else
541 ss.RunningTags.erase(TRunningTag::rt_chain_crashing);
543 // update the crash counter with the start counter written by the runner script
544 ss.RunnerLoopCounter.updateCounter(getServiceStartLoopCounter(aliasName));
551 // if we have an admin service connected, send it an update of service state
552 if (_AdminService != NULL)
553 sendUpServiceUpdate();
// NOTE(review): '(now & 0xf) == 0' holds for one value of 'now' out of 16,
// which does not match the '8 seconds' of the original comment - confirm the intended period.
556 if ((now & 0xf) == 0)
558 // low frequency work (one value of 'now' out of 16 with the 0xf mask)
560 // check for shutdown request from patchman
561 checkShutdownRequest();
564 // check for shard to stop (and warning to send)
565 checkServiceToStop();
567 // time to output the nagios report ?
568 if (now > _LastNagiosReport+_NagiosReportDelay)
570 // write the nagios report
571 FILE *fp = nlfopen("aes_nagios_report.txt", "wt");
572 if (fp != NULL)
574 // output the current date (the text returned by ctime supplies the end of line)
575 time_t t = now;
576 fprintf(fp, "AESReportDate=%s", ::ctime(&t));
578 fprintf(fp, "NBService=%u\n", (uint)_ServiceStates.size());
579 // output state of each service
580 TServiceStates::iterator first(_ServiceStates.begin()), last(_ServiceStates.end());
581 for (; first != last; ++first)
583 CSString serviceLine;
584 TServiceState &ss = first->second;
585 const string &aliasName = first->first;
// concatenate the running tags as '<tag1><tag2>...'
587 CSString runningTags;
588 set<TRunningTag>::iterator rtf(ss.RunningTags.begin()), rte(ss.RunningTags.end());
589 for (; rtf != rte; ++rtf)
591 runningTags<<"<"<<rtf->toString()<<">";
// NOTE(review): 'registered' is computed but not used in the lines visible here
594 bool registered = _RegisteredServices.find(aliasName) != _RegisteredServices.end();
596 serviceLine << "ServiceAlias='"<<aliasName<<"' RunningState='"<<ss.RunningState.toString()<<"' RunningTag='"<<runningTags<<"'";
597 serviceLine << " NoReportSince="<<now-ss.LastStateDate;
598 serviceLine << " State='"<<ss.State<<"'";
600 fprintf(fp, "%s\n", serviceLine.c_str());
604 fclose(fp);
606 else
608 nlwarning("Can't open the nagios report file !");
611 _LastNagiosReport = now;
615 // update the last report date
616 _LastStateReport = now;
619 // check runner loop counter roll timer
620 if (_LastRunnerLoopCounterRoll+CRASH_COUNTER_ROLL_DELAY < now)
622 // it's time to roll the crash counter
623 TServiceStates::iterator first(_ServiceStates.begin()), last(_ServiceStates.end());
624 for (; first != last; ++first)
626 first->second.RunnerLoopCounter.rollCounter();
629 _LastRunnerLoopCounterRoll = now;
632 if (_NeedToWriteStateFile)
634 /// The persistent service orders need to be saved
635 string filename = CPath::standardizePath(IService::getInstance()->SaveFilesDirectory.toString(), true)+AESPersistentStateFilename;
636 FILE *fp = nlfopen(filename, "wt");
637 if (fp != NULL)
// first the shard orders, one 'ShardOrders <shard> <orders>' line per shard
640 CSString line;
641 TShardsOrders::iterator first(_ShardOrders.begin()), last(_ShardOrders.end());
642 for (; first != last; ++first)
644 line << "ShardOrders "<<first->first<<" "<<first->second.toString()<<"\n";
647 fputs(line.c_str(), fp);
// then the service orders, one 'ServiceState <alias> <orders>' line per service
651 TPersistentServiceOrders::iterator first(_PersistentServiceOrders.begin()), last(_PersistentServiceOrders.end());
652 for (; first != last; ++first)
654 CSString line;
655 line << "ServiceState "<<first->first<<" "<<first->second.toString()<<"\n";
656 fputs(line.c_str(), fp);
659 // the state has been written, clear the pending save flag
660 _NeedToWriteStateFile = false;
662 fclose(fp);
667 void sendUpServiceUpdate()
669 if (_AdminService != NULL)
671 vector<TServiceStatus> serviceStatus;
672 set<TAliasName> missingServices = _RegisteredServices;
673 // send an updated list to AES
674 TServiceStates::iterator first(_ServiceStates.begin()), last(_ServiceStates.end());
675 for (; first != last; ++first)
677 const string &aliasName = first->first;
678 bool registered = _RegisteredServices.find(aliasName) != _RegisteredServices.end();
679 TServiceState &ss = first->second;
680 serviceStatus.push_back(TServiceStatus());
681 TServiceStatus &ts = serviceStatus.back();
682 ts.setShardName(ss.ShardName);
683 ts.setServiceLongName(ss.LongName);
684 ts.setServiceShortName(ss.ShortName);
685 ts.setServiceAliasName(aliasName);
686 ts.setRunningState(ss.RunningState);
687 if (registered)
688 ts.setRunningOrders(_PersistentServiceOrders[aliasName]);
689 else
690 ts.setRunningOrders(TRunningOrders::invalid_val);
691 ts.setRunningTags(ss.RunningTags);
692 CSString state;
693 state << ss.State << "\tNoReportSince=" << (NLMISC::CTime::getSecondsSince1970()-ss.LastStateDate);
695 // add the host name
696 state << "\tHostname=" << IService::getInstance()->getHostName();
698 if (registered)
700 // this is a registered service, send the start counter
701 uint32 oneSlot, treeSlots, allSlots;
702 ss.RunnerLoopCounter.getCounters(oneSlot, treeSlots, allSlots);
703 state << "\tStartCounter="<<oneSlot<<" "<<treeSlots<<" "<<allSlots;
705 ts.setStatus(state);
707 missingServices.erase(aliasName);
710 CAdminServiceProxy as(_AdminService);
711 as.upServiceUpdate(this, serviceStatus);
715 IModuleProxy *findOnlineService(const std::string &serviceAlias)
717 TConnectedServiceIndex::iterator first(_ConnectedServiceIndex.begin()), last(_ConnectedServiceIndex.end());
718 for (; first != last; ++first)
720 if (first->second == serviceAlias)
722 // ok, we found it
723 return first->first;
727 // not found
728 return NULL;
731 void checkShutdownRequest()
733 // if there's no ctrl file to be found then giveup
734 if (!NLMISC::CFile::fileExists(ShutdownRequestFileName)) return;
736 // if a shutdown ctrl file exists then read it's contents (if the file doesn't exist this returns an empty string)
737 CSString fileContents;
738 fileContents.readFromFile(ShutdownRequestFileName.c_str());
740 // see if the file exists
741 if (!fileContents.empty())
743 NLMISC::CFile::deleteFile(ShutdownRequestFileName);
744 fileContents= fileContents.strip().splitToOneOfSeparators(" \t\n\r\x1a");
746 NLMISC::fromString(fileContents, _ShutdownForPatch);
747 _ShutdownForPatch = !_ShutdownForPatch;
751 void checkServiceToStop()
753 uint32 now = CTime::getSecondsSince1970();
754 // for each shard to stop
755 for (uint i=0; i<_StopingShards.size(); ++i)
757 const TStopingShardInfo &stopShardInfo = _StopingShards[i];
759 bool timeToStop = stopShardInfo.BeginDate + stopShardInfo.Delay <= now;
760 uint32 timeLeft = (stopShardInfo.BeginDate + stopShardInfo.Delay) - now;
761 // check every service
762 TServiceStates::iterator first(_ServiceStates.begin()), last(_ServiceStates.end());
763 for (; first != last; ++first)
765 TServiceState &serviceState = first->second;
767 if (serviceState.ServiceModule != NULL && serviceState.ShardName == stopShardInfo.ShardName)
769 // this one belong to the shard to stop
770 if (!timeToStop)
772 // send a warning every 30 s
773 if (((now - stopShardInfo.BeginDate) % 30) == 0)
775 CAdminExecutorServiceClientProxy aec(serviceState.ServiceModule);
776 nlinfo("Sending command 'quitDelay' to service '%s'", first->first.c_str());
777 aec.serviceCmdNoReturn(this, toString("quitDelay %u", timeLeft));
780 else
782 // stop the service
783 stopService(first->first);
788 if (timeToStop)
790 nlinfo("All local service for shard %s are stopped", stopShardInfo.ShardName.c_str());
791 // shard stopped, erase this entry
792 _StopingShards.erase(_StopingShards.begin()+i);
793 --i;
799 // the following routine reads the text string contained in the ".state" file for this service
800 // it's used to provide a 'state' value for services that are registered but are not connected
801 // to give info on whether they've been launched, whether their launcher is online, etc
802 std::string getOfflineServiceState(const std::string& serviceAlias)
804 // open the file for reading
805 FILE* f= nlfopen(getServiceStateFileName(serviceAlias), "rt");
806 if (f==NULL) return "STOPPED";
808 // setup a buffer to hold the text read from the file
809 uint32 fileSize= NLMISC::CFile::getFileSize(f);
810 std::string txt;
811 txt.resize(fileSize);
813 // read the text from the file - note that the number of bytes read may be less than the
814 // number of bytes requested because we've opened the file in text mode and not binary mode
815 uint32 bytesRead= (uint32)fread(&txt[0],1,fileSize,f);
816 txt.resize(bytesRead);
817 fclose(f);
819 // return the text read from the file
820 return txt;
824 // the following routine reads the text string contained in the "pid.state" file for this service
825 // it's used to provide a early pid information to the AES (before the service is connected)
826 uint32 getOfflineServicePID(const std::string& serviceAlias)
828 // open the file for reading
829 FILE* f = nlfopen(getServicePIDFileName(serviceAlias), "rt");
830 if (f==NULL) return 0;
832 // setup a buffer to hold the text read from the file
833 uint32 fileSize= NLMISC::CFile::getFileSize(f);
834 std::string txt;
835 txt.resize(fileSize);
837 // read the text from the file - note that the number of bytes read may be less than the
838 // number of bytes requested because we've opened the file in text mode and not binary mode
839 uint32 bytesRead= (uint32)fread(&txt[0],1,fileSize,f);
840 txt.resize(bytesRead);
841 fclose(f);
843 // return the pid read from the file
844 uint32 pid;
845 NLMISC::fromString(txt, pid);
847 return pid;
851 // the following routine reads the text string contained in the ".start_counter" file for this service
852 // it's used to provide the number of start done by the runner loop on the service
853 // This is used for the chain crash detector system.
854 uint32 getServiceStartLoopCounter(const std::string& serviceAlias)
856 // open the file for reading
857 FILE* f= nlfopen(getServiceLoopCounterFileName(serviceAlias), "rt");
858 if (f==NULL)
859 return 0;
861 // setup a buffer to hold the text read from the file
862 uint32 fileSize= NLMISC::CFile::getFileSize(f);
863 std::string txt;
864 txt.resize(fileSize);
866 // read the text from the file - note that the number of bytes read may be less than the
867 // number of bytes requested because we've opened the file in text mode and not binary mode
868 uint32 bytesRead= (uint32)fread(&txt[0],1,fileSize,f);
869 txt.resize(bytesRead);
870 fclose(f);
872 // parse the text in the buffer
873 uint32 counter;
874 NLMISC::fromString(txt, counter);
876 return counter;
879 // retrieve service launch info in the config file
880 bool getServiceLaunchInfo(const string& serviceAlias, string& path)
882 string basePath;
883 CConfigFile::CVar *launchDir = IService::getInstance()->ConfigFile.getVarPtr("AESLauncherDir");
884 if (launchDir != NULL)
886 basePath = launchDir->asString()+"/";
889 if (_RegisteredServices.find(serviceAlias) == _RegisteredServices.end())
890 return false;
891 path = basePath + serviceAlias+"/";
893 return true;
897 std::string getServiceStateFileName(const std::string& serviceAlias)
899 string servicePath;
900 if (getServiceLaunchInfo(serviceAlias, servicePath))
901 return NLMISC::CPath::standardizePath(servicePath)+serviceAlias+".state";
902 else
903 return string();
906 std::string getServicePIDFileName(const std::string& serviceAlias)
908 string servicePath;
909 if (getServiceLaunchInfo(serviceAlias, servicePath))
910 return NLMISC::CPath::standardizePath(servicePath)+"pid.state";
911 else
912 return string();
915 std::string getServiceLoopCounterFileName(const std::string& serviceAlias)
917 string servicePath;
918 if (getServiceLaunchInfo(serviceAlias, servicePath))
919 return NLMISC::CPath::standardizePath(servicePath)+serviceAlias+".start_count";
920 else
921 return string();
924 std::string getServiceLaunchCtrlFileName(const std::string& serviceAlias,const std::string& serviceExecutionPath, bool deferred)
926 return NLMISC::CPath::standardizePath(serviceExecutionPath)+serviceAlias+(deferred?".deferred_":".")+"launch_ctrl";
930 bool writeServiceLaunchCtrl(const std::string& serviceAlias, bool deferred, const std::string& txt)
932 string path;
933 if (!getServiceLaunchInfo(serviceAlias, path))
934 return false;
936 // make sure the path exists
937 NLMISC::CFile::createDirectoryTree(path);
939 // open the file for writing
940 FILE* f = nlfopen(getServiceLaunchCtrlFileName(serviceAlias, path, deferred).c_str(),"wt");
941 if (f==NULL) return false;
943 // write the text to the file
944 fprintf(f,"%s",txt.c_str());
945 fclose(f);
947 return true;
950 bool startService(const std::string &serviceAlias)
952 if (_ServiceStates.find(serviceAlias) != _ServiceStates.end())
954 TServiceState &ss = _ServiceStates[serviceAlias];
955 if (ss.RunningState != TRunningState::rs_stopped)
957 nlwarning("startService '%s' : the service is already running", serviceAlias.c_str());
958 return false;
961 // store the start date
962 ss.StartRequestDate = CTime::getSecondsSince1970();
965 if (_RegisteredServices.find(serviceAlias) == _RegisteredServices.end())
967 nlwarning("startService '%s' : the service in not registered, can't start it", serviceAlias.c_str());
968 return false;
971 // write the start command
972 bool ret = writeServiceLaunchCtrl(serviceAlias, false, LAUNCH_CTRL_START);
974 return ret;
977 bool stopService(const std::string &serviceAlias)
979 // check that the service is running
980 TServiceStates::iterator it(_ServiceStates.find(serviceAlias));
981 if (it == _ServiceStates.end())
983 nlwarning("stopService : Failed to found service '%s' in the list of services", serviceAlias.c_str());
984 return false;
987 TServiceState &ss = it->second;
988 // write the launch control to stop
989 if (_RegisteredServices.find(serviceAlias) != _RegisteredServices.end())
991 if (!writeServiceLaunchCtrl(serviceAlias, false, LAUNCH_CTRL_STOP))
993 nlwarning("Failed to write launch control file for service '%s'", serviceAlias.c_str());
994 return false;
996 else
997 nlinfo("Service '%s' launch control file updated", serviceAlias.c_str());
1000 // set the stopre request date if needed
1001 if (ss.StopRequestDate != 0)
1003 ss.StopRequestDate = CTime::getSecondsSince1970();
1006 if (ss.ServiceModule == NULL)
1008 nlwarning("stopService : The service '%s' is not connected, can't ask him to stop", serviceAlias.c_str());
1009 return false;
1012 // send the "quit" command to the service
1013 CAdminExecutorServiceClientProxy aec(ss.ServiceModule);
1014 nlinfo("Sending command 'quit' to service '%s'", serviceAlias.c_str());
1015 aec.serviceCmdNoReturn(this, "quit");
1017 return true;
1021 ///////////////////////////////////////////////////////////////////////
1022 //// Virtuals from IModuleTrackerCb
1023 ///////////////////////////////////////////////////////////////////////
1025 virtual void onTrackedModuleUp(IModuleProxy *moduleProxy)
1027 nldebug("Service module '%s' UP", moduleProxy->getModuleName().c_str());
1030 TParsedCommandLine pcl;
1031 if (!pcl.parseParamList(moduleProxy->getModuleManifest()))
1033 nlwarning("CAdminExecutorService:onTrackedModuleUp : failed to parse manifest");
1036 const TParsedCommandLine *pclLongName = pcl.getParam("LongName");
1037 const TParsedCommandLine *pclShortName = pcl.getParam("ShortName");
1038 const TParsedCommandLine *pclAliasName = pcl.getParam("AliasName");
1039 const TParsedCommandLine *pclPID = pcl.getParam("PID");
1040 const TParsedCommandLine *pclDontUseShardOrders = pcl.getParam("DontUseShardOrders");
1042 string aliasName = pclAliasName != NULL ? pclAliasName->ParamValue : moduleProxy->getModuleName();
1044 // remove the temporary state and update connected service index
1045 _ServiceStates.erase(moduleProxy->getModuleName());
1046 _ConnectedServiceIndex[moduleProxy] = aliasName;
1048 nlinfo("AES client module %s for service %s is up",
1049 moduleProxy->getModuleName().c_str(),
1050 aliasName.c_str());
1052 // create a new entry or get an existing one
1053 TServiceState &ss = _ServiceStates[aliasName];
1054 // update the service state
1055 ss.RunningState = TRunningState::rs_online;
1056 if (pclDontUseShardOrders)
1057 NLMISC::fromString(pclDontUseShardOrders->ParamValue, ss.DontUseShardOrders);
1058 else
1059 ss.DontUseShardOrders = false;
1060 ss.LongName = pclLongName != NULL ? pclLongName->ParamValue : "unknown";
1061 ss.ShortName = pclShortName != NULL ? pclShortName->ParamValue : "unknown";
1063 if (pclPID!= NULL)
1065 NLMISC::fromString(pclPID->ParamValue, ss.PID);
1067 else
1069 ss.PID = 0;
1072 ss.State = "";
1073 ss.LastStateDate = NLMISC::CTime::getSecondsSince1970();
1074 ss.ServiceModule = moduleProxy;
1075 ss.StartRequestDate = 0;
1076 ss.RunningTags.erase(TRunningTag::rt_slow_to_start);
1077 if (_RegisteredServices.find(aliasName) == _RegisteredServices.end())
1079 ss.RunningTags.insert(TRunningTag::rt_externaly_started);
1081 // else
1082 // {
1083 // // if this service is locally stopped or if the shard it belong to is stopped,
1084 // // then flag it as 'localy started'
1085 // if (_PersistentServiceOrders.find(aliasName) != _PersistentServiceOrders.end()
1086 // && _PersistentServiceOrders[aliasName] == TRunningOrders::ro_stopped)
1087 // {
1088 // // flag it as started
1089 // _PersistentServiceOrders[aliasName] = TRunningOrders::ro_running;
1090 // ss.RunningTags.insert(TRunningTag::rt_locally_started);
1091 // _NeedToWriteStateFile = true;
1092 // }
1093 // else if (_ShardOrders.find(ss.ShardName) != _ShardOrders.end()
1094 // && _ShardOrders[ss.ShardName] == TRunningOrders::ro_stopped)
1095 // {
1096 // // the shard is stopped, flag the service as started
1097 // _PersistentServiceOrders[aliasName] = TRunningOrders::ro_running;
1098 // ss.RunningTags.insert(TRunningTag::rt_locally_started);
1099 // _NeedToWriteStateFile = true;
1100 // }
1101 // }
1103 sendUpServiceUpdate();
// Callback: a tracked AES client module went down.
// Looks up the service alias indexed for this proxy, drops (unregistered
// service) or updates (registered service) the state record, kills the
// process when a PID above 1 is known, answers pending web commands for
// that alias with an error, removes the index record and finally pushes a
// service update.
// NOTE(review): lines holding only braces are not visible in this listing,
// so the exact nesting (in particular whether the pending command loop is
// inside the 'else' or after the if/else) must be confirmed against the file.
1105 virtual void onTrackedModuleDown(IModuleProxy *moduleProxy)
1107 nldebug("Service module '%s' DOWN", moduleProxy->getModuleName().c_str());
// find the alias that onTrackedModuleUp indexed for this proxy
1109 TConnectedServiceIndex::iterator it(_ConnectedServiceIndex.find(moduleProxy));
1110 if (it != _ConnectedServiceIndex.end())
// reference into the index record: stays valid until the erase(it) below
1112 string &aliasName = it->second;
1113 nlinfo("AES client module %s of service %s is down",
1114 moduleProxy->getModuleName().c_str(),
1115 aliasName.c_str());
// an indexed alias without a state record is reported through BOMB_IF; the
// last macro argument erases the index record and returns
// (presumably run when the condition holds; in that case the
// sendUpServiceUpdate() at the end is not reached)
1116 BOMB_IF(_ServiceStates.find(aliasName) == _ServiceStates.end(), "Service down from "<<moduleProxy->getModuleName()<<" with alias "<<aliasName<<" not found in _ServiceStates table", _ConnectedServiceIndex.erase(it); return);
1117 if (_RegisteredServices.find(aliasName) == _RegisteredServices.end())
1119 // this is not a registered service, remove the status record
1120 _ServiceStates.erase(aliasName);
1122 else
1124 TServiceState &ss = _ServiceStates[aliasName];
1125 // update the running state
// NOTE(review): the state becomes rs_running (not rs_stopped) although the
// module is gone; presumably the process is still considered alive until
// its end is observed elsewhere. Confirm this is intended.
1126 ss.RunningState = TRunningState::rs_running;
1128 ss.ServiceModule = NULL;
1130 // kill the service to be sure that it is really dead !
// PID 0 (the "no PID" value set by onTrackedModuleUp) and PID 1 are never killed
1131 if (ss.PID > 1)
1133 nlinfo("Killing process %u (%s) because aes client module '%s' is down",
1134 ss.PID,
1135 aliasName.c_str(),
1136 moduleProxy->getModuleName().c_str());
1137 killProgram(ss.PID);
// the scan restarts from this label after each erase, because the erased
// iterator can no longer be incremented
1141 retry_pending_command_loop:
1142 // check for pending command
1143 TPendingWebCommands::iterator first(_PendingWebCommands.begin()), last(_PendingWebCommands.end());
1144 for (; first != last; ++first)
1146 TPendingWebCommand &pwc = first->second;
1147 if (pwc.ServiceAlias == aliasName)
// the error is only sent when an admin service proxy is currently set;
// the pending entry is erased either way
1149 if (_AdminService != NULL)
1151 CAdminServiceProxy as(_AdminService);
1152 as.commandResult(this, first->first, pwc.ServiceAlias, "ERROR : AES : service connection lost during command");
1155 _PendingWebCommands.erase(first);
1156 // goto to avoid iterator dodging
1157 goto retry_pending_command_loop;
1162 // remove the index record
// must stay last: aliasName refers to the record erased here
1163 _ConnectedServiceIndex.erase(it);
1165 else
1167 nlinfo("AES client module %s is not associated with a service",
1168 moduleProxy->getModuleName().c_str());
1172 sendUpServiceUpdate();
1175 ///////////////////////////////////////////////////////////////////////
1176 //// Virtuals from CAdminExecutorServiceSkel
1177 ///////////////////////////////////////////////////////////////////////
1179 // AS send orders for a shard
1180 virtual void setShardOrders(NLNET::IModuleProxy *sender, const std::string &shardName, const TShardOrders &shardOrders)
1182 nlinfo("AS setShardOrders for shard '%s' to '%s'", shardName.c_str(), shardOrders.toString().c_str());
1184 if (_ShardOrders[shardName] == shardOrders)
1186 // nothing to do
1187 return;
1189 _ShardOrders[shardName] = shardOrders;
1190 _NeedToWriteStateFile = true;
1192 // nothing more to do, if service need to be started, they are started
1193 // by the module update function
1197 // AS send a command to shutdown a shard with a delay
1198 virtual void shutdownShard(NLNET::IModuleProxy *sender, const std::string &shardName, uint32 delay)
1200 TStopingShardInfo ssi;
1201 ssi.ShardName = shardName;
1202 ssi.Delay = delay;
1203 ssi.BeginDate = CTime::getSecondsSince1970();
1205 _StopingShards.push_back(ssi);
1207 nlinfo("Received command to stop all service of shard %s in %us", ssi.ShardName.c_str(), ssi.Delay);
1209 // force a first update (to send the first warning message or stop immediately)
1210 checkServiceToStop();
1214 // AS send a control command to this AES
1215 virtual void controlCmd(NLNET::IModuleProxy *sender, uint32 commandId, const std::string &serviceAlias, const std::string &command)
1217 // create a displayer to gather the output of the command
1218 class CStringDisplayer: public IDisplayer
1220 public:
1221 virtual void doDisplay( const CLog::TDisplayInfo& args, const char *message)
1223 _Data += message;
1226 std::string _Data;
1229 nldebug("Control command from '%s' : '%s' '%s'",
1230 sender->getModuleName().c_str(),
1231 serviceAlias.c_str(),
1232 command.c_str());
1234 // look in the list of service for a matching one
1235 IModuleProxy *service = findOnlineService(serviceAlias);
1236 if (service == NULL && _RegisteredServices.find(serviceAlias) == _RegisteredServices.end())
1238 CAdminServiceProxy as(sender);
1239 as.commandResult(this, commandId, serviceAlias, "ERROR : AES : service not found will dispatching the control command");
1240 return;
1243 // ok, we can execute the command concerning the service.
1244 CStringDisplayer stringDisplayer;
1245 IService::getInstance()->CommandLog.addDisplayer(&stringDisplayer);
1247 // build the command line
1248 CSString args(command);
1249 CSString cmdName = args.firstWord(true);
1250 CSString cmdLine;
1251 cmdLine << getCommandHandlerName() << "." << cmdName << " " << serviceAlias << " " << args;
1252 // retrieve the command from the input message and execute it
1253 nlinfo ("ADMIN: Executing control command : '%s' for service '%s'", cmdLine.c_str(), serviceAlias.c_str());
1254 ICommand::execute (cmdLine, IService::getInstance()->CommandLog);
1256 // unhook our displayer as it's work is now done
1257 IService::getInstance()->CommandLog.removeDisplayer(&stringDisplayer);
1259 // send the result back to AS
1260 CAdminServiceProxy as(sender);
1261 as.commandResult(this, commandId, serviceAlias, stringDisplayer._Data);
1264 //The return is sent back by another message
1265 virtual void serviceCmd(NLNET::IModuleProxy *sender, uint32 commandId, const std::string &serviceAlias, const std::string &command)
1267 // look in the list of service for a matching one
1268 IModuleProxy *proxy = findOnlineService(serviceAlias);
1269 if (proxy == NULL)
1271 CAdminServiceProxy as(sender);
1272 as.commandResult(this, commandId, serviceAlias, "ERROR AES : unknown service");
1273 return;
1276 // ok, we found it !
1277 TPendingWebCommand pwc;
1278 pwc.Command = command;
1279 pwc.ReceptionDate = NLMISC::CTime::getSecondsSince1970();
1280 pwc.ServiceAlias = serviceAlias;
1282 _PendingWebCommands.insert(make_pair(commandId, pwc));
1284 CAdminExecutorServiceClientProxy service(proxy);
1285 service.serviceCmd(this, commandId, command);
1288 // AES client send back the result of execution of a command
1289 virtual void commandResult(NLNET::IModuleProxy *sender, uint32 commandId, const std::string &serviceAlias, const std::string &result)
1291 // check for waiting commands
1292 TPendingWebCommands::iterator it(_PendingWebCommands.find(commandId));
1294 if (it == _PendingWebCommands.end())
1296 if (commandId != 0)
1297 nlwarning("CAdminExecutor::commandResult : service '%s' sent result for command ID %u but not in pending command table",
1298 sender->getModuleName().c_str(),
1299 commandId);
1300 return;
1303 // send the result back to AS
1304 if (_AdminService != NULL)
1306 CAdminServiceProxy as(_AdminService);
1308 as.commandResult(this, commandId, serviceAlias, result);
1311 _PendingWebCommands.erase(commandId);
1315 // An AES send graph data update
1316 virtual void graphUpdate(NLNET::IModuleProxy *sender, const TGraphDatas &graphDatas)
1318 if (_AdminService != NULL)
1320 CAdminServiceProxy as(_AdminService);
1321 as.graphUpdate(this, graphDatas);
1325 // A service high rez graph data update
1326 virtual void highRezGraphUpdate(NLNET::IModuleProxy *sender, const THighRezDatas &graphDatas)
1328 if (_AdminService != NULL)
1330 CAdminServiceProxy as(_AdminService);
1331 as.highRezGraphUpdate(this, graphDatas);
1335 // A service send an update of of it's status string
1336 virtual void serviceStatusUpdate(NLNET::IModuleProxy *sender, const std::string &status)
1338 TConnectedServiceIndex::iterator it(_ConnectedServiceIndex.find(sender));
1339 if (it == _ConnectedServiceIndex.end())
1341 nlwarning("serviceStatusUpdate : service '%s' send status but is unknown !", sender->getModuleName().c_str());
1342 return;
1345 string &aliasName = it->second;
1346 TServiceStates::iterator it2(_ServiceStates.find(aliasName));
1347 BOMB_IF(it2 == _ServiceStates.end(), "serviceStateUpdate : service '"
1348 <<sender->getModuleName()
1349 <<"' send an update, but alias '"<<aliasName<<"' is not found in service status", return);
1351 TServiceState &ss = it2->second;
1352 ss.State = status;
1353 ss.LastStateDate = NLMISC::CTime::getSecondsSince1970();
1357 ///////////////////////////////////////////////////////////////////////
1358 //// commands handlers
1359 ///////////////////////////////////////////////////////////////////////
// Command table of CAdminExecutorService, extending the one of CModuleBase.
// Each entry gives the class, the command name, a help text and the
// argument synopsis; every command listed here has a matching
// NLMISC_CLASS_COMMAND_DECL(<name>) handler below.
// The <serviceAlias> commands are also the ones controlCmd() executes,
// with the service alias inserted as first argument.
1360 NLMISC_COMMAND_HANDLER_TABLE_EXTEND_BEGIN(CAdminExecutorService, CModuleBase)
1361 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService, dump, "Dump a status report to appropriate output logger", "no args")
1362 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService, addRegisteredService, "add a registered service", "<serviceAlias> <shardName>")
1363 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService, removeRegisteredService, "remove a registered service", "<serviceAlias>")
1364 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService, startService, "start a registered service", "<serviceAlias>")
1365 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService, restartService, "stop then start a registered service", "<serviceAlias>")
1366 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService, stopService, "stop a service (registered or not)", "<serviceAlias>")
1367 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService, killService, "kill a (possibly not responding) service (registered or not)", "<serviceAlias>")
1368 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService, abortService, "abort a (possibly not responding) service with SIGABORT (registered or not)", "<serviceAlias>")
1369 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService, activateService, "activate a service, i.e make it startable either manually or from a shard orders", "<serviceAlias>")
1370 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService, deactivateService, "deactivate a service, i.e make it unstartable (either manually or from a shard orders) and stop it if needed", "<serviceAlias>")
1371 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService, execScript, "execute the predefined bash script '/home/nevrax/patchman/aes_runnable_script.sh' and give it the passed parameters", "<any parameter>")
1372 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService, resetStartCounter, "reset the start counter to 0", "no params")
1373 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService, stopShard, "Stop all service of a given shard aftert the provided delay", "<shardName> <delay (in s)>")
1374 NLMISC_COMMAND_HANDLER_TABLE_END
1377 NLMISC_CLASS_COMMAND_DECL(stopShard)
1379 if (args.size() != 2)
1380 return false;
1382 string shardName = args[0];
1383 uint32 delay;
1384 NLMISC::fromString(args[1], delay);
1386 log.displayNL("Received command to stop all service of shard %s in %us", shardName.c_str(), delay);
1388 shutdownShard(NULL, shardName, delay);
1390 return true;
1394 NLMISC_CLASS_COMMAND_DECL(resetStartCounter)
1396 if (args.size() != 0)
1397 return false;
1400 TServiceStates::iterator first(_ServiceStates.begin()), last(_ServiceStates.end());
1401 for (; first != last; ++first)
1403 TServiceState &ss = first->second;
1405 ss.RunnerLoopCounter.resetCounter();
1408 return true;
1412 NLMISC_CLASS_COMMAND_DECL(execScript)
1414 string cmdLine("/home/nevrax/patchman/aes_runnable_script.sh");
1416 // add parameters
1417 for (uint i=0; i<args.size(); ++i)
1419 cmdLine += " "+args[i];
1422 // add redirection
1423 string logFile = CPath::getTemporaryDirectory() + "aes_command_output.log";
1425 cmdLine += " > "+logFile;
1427 log.displayNL("Executing '%s'", cmdLine.c_str());
1428 // execute the command
1429 int ret = system(cmdLine.c_str());
1431 // echo the output to the requester
1432 CSString output;
1433 output.readFromFile(logFile);
1435 vector<CSString> lines;
1436 output.splitLines(lines);
1438 log.displayNL("Command returned value %d", ret);
1439 log.displayNL("-------------------- Command output begin -----------------------");
1440 for (uint i=0; i<lines.size(); ++i)
1442 log.displayNL("%s", lines[i].c_str());
1444 log.displayNL("-------------------- Command output end -----------------------");
1445 return true;
1448 NLMISC_CLASS_COMMAND_DECL(deactivateService)
1450 if (args.size() != 1)
1451 return false;
1453 string serviceAlias = args[0];
1455 if (_PersistentServiceOrders.find(serviceAlias) == _PersistentServiceOrders.end())
1457 log.displayNL("Unregistered service '%s', could not deactivate it", serviceAlias.c_str());
1458 return true;
1461 _PersistentServiceOrders[serviceAlias] = TRunningOrders::ro_deactivated;
1463 _NeedToWriteStateFile = true;
1465 log.displayNL("Service '%s' deactivated", serviceAlias.c_str());
1467 return true;
1470 NLMISC_CLASS_COMMAND_DECL(activateService)
1472 if (args.size() != 1)
1473 return false;
1475 string serviceAlias = args[0];
1477 if (_PersistentServiceOrders.find(serviceAlias) == _PersistentServiceOrders.end())
1479 log.displayNL("Unregistered service '%s', could not activate it", serviceAlias.c_str());
1480 return true;
1483 _PersistentServiceOrders[serviceAlias] = TRunningOrders::ro_activated;
1485 _NeedToWriteStateFile = true;
1487 log.displayNL("Service '%s' activated", serviceAlias.c_str());
1489 return true;
1492 NLMISC_CLASS_COMMAND_DECL(abortService)
1494 if (args.size() != 1)
1495 return false;
1497 string serviceAlias = args[0];
1499 // check that the service is running
1500 TServiceStates::iterator it(_ServiceStates.find(serviceAlias));
1501 if (it == _ServiceStates.end())
1503 log.displayNL("Failed to found service '%s' in the list of running services", serviceAlias.c_str());
1504 return true;
1507 TServiceState &ss = it->second;
1508 if (ss.RunningState == TRunningState::rs_stopped)
1510 log.displayNL("The service to abort '%s' is currently stopped", serviceAlias.c_str());
1511 return true;
1513 if (ss.PID < 2)
1515 log.displayNL("AES have no valid PID to abort the service '%s'", serviceAlias.c_str());
1516 return true;
1519 // abort it
1520 log.displayNL("Aborting service '%s' with pid %u", serviceAlias.c_str(), ss.PID);
1521 abortProgram(ss.PID);
1523 return true;
1526 NLMISC_CLASS_COMMAND_DECL(killService)
1528 if (args.size() != 1)
1529 return false;
1531 string serviceAlias = args[0];
1533 // check that the service is running
1534 TServiceStates::iterator it(_ServiceStates.find(serviceAlias));
1535 if (it == _ServiceStates.end())
1537 log.displayNL("Failed to found service '%s' in the list of running services", serviceAlias.c_str());
1538 return true;
1541 TServiceState &ss = it->second;
1542 if (ss.RunningState == TRunningState::rs_stopped)
1544 log.displayNL("The service to kill '%s' is currently stopped", serviceAlias.c_str());
1545 return true;
1547 if (ss.PID < 2)
1549 log.displayNL("AES have no valid PID to kill the service '%s'", serviceAlias.c_str());
1550 return true;
1552 // kill it
1553 log.displayNL("Killing service '%s' with pid %u", serviceAlias.c_str(), ss.PID);
1554 killProgram(ss.PID);
1556 return true;
1559 NLMISC_CLASS_COMMAND_DECL(stopService)
1561 if (args.size() != 1)
1562 return false;
1564 string serviceAlias = args[0];
1566 if (_ServiceStates.find(serviceAlias) == _ServiceStates.end())
1568 log.displayNL("Unknown service '%s', could not stop it", serviceAlias.c_str());
1569 return true;
1572 TServiceState &ss = _ServiceStates[serviceAlias];
1573 // look for a shard orders for this service
1574 TShardsOrders::iterator it(_ShardOrders.find(ss.ShardName));
1575 if (it != _ShardOrders.end())
1577 TShardOrders &so = it->second;
1578 if (so == TShardOrders::so_autostart_on)
1580 log.displayNL("Can't stop service '%s' because shard '%s' is autostarting, considers either to deactivate the service or just restart it",
1581 serviceAlias.c_str(),
1582 ss.ShardName.c_str());
1583 return true;
1587 if (stopService(serviceAlias))
1588 log.displayNL("Failed to stop the service '%s'", serviceAlias.c_str());
1589 else
1590 log.displayNL("Service '%s' stop request done", serviceAlias.c_str());
1592 return true;
1595 NLMISC_CLASS_COMMAND_DECL(restartService)
1597 if (args.size() != 1)
1598 return false;
1600 string serviceAlias = args[0];
1602 if (_RegisteredServices.find(serviceAlias) == _RegisteredServices.end())
1604 log.displayNL("startService %s : the service in not registered, can't restart it", serviceAlias.c_str());
1605 return true;
1608 // look for service orders for this service
1609 if (_PersistentServiceOrders.find(serviceAlias) != _PersistentServiceOrders.end())
1611 if (_PersistentServiceOrders[serviceAlias] == TRunningOrders::ro_deactivated)
1613 log.displayNL("Can't restart service '%s' because it is currently deactivated", serviceAlias.c_str());
1614 return true;
1620 // check that the service is running
1621 TServiceStates::iterator it(_ServiceStates.find(serviceAlias));
1622 if (it == _ServiceStates.end())
1624 log.displayNL("Failed to found service '%s' in the list of running services", serviceAlias.c_str());
1625 return true;
1628 // write the deferred start command
1629 if (!writeServiceLaunchCtrl(serviceAlias, true, LAUNCH_CTRL_START))
1631 log.displayNL("Failed to write deferred start control file to restart service '%s'", serviceAlias.c_str());
1632 return true;
1634 else
1635 log.displayNL("Service '%s' start command written", serviceAlias.c_str());
1637 if (it->second.ServiceModule == NULL)
1639 log.displayNL("The AES client module proxy is null ! can't send 'quit' command");
1642 // send the "quit" command to the service
1643 CAdminExecutorServiceClientProxy aec(it->second.ServiceModule);
1644 aec.serviceCmd(this, 0, "quit");
1645 log.displayNL("Service '%s' command 'quit' sent", serviceAlias.c_str());
1647 return true;
1650 NLMISC_CLASS_COMMAND_DECL(startService)
1652 if (args.size() != 1)
1653 return false;
1655 string serviceAlias = args[0];
1657 if (_ServiceStates.find(serviceAlias) == _ServiceStates.end())
1659 log.displayNL("Unknown service '%s', could not start it", serviceAlias.c_str());
1660 return true;
1663 TServiceState &ss = _ServiceStates[serviceAlias];
1665 // look for service orders for this service
1666 if (_PersistentServiceOrders.find(serviceAlias) != _PersistentServiceOrders.end())
1668 if (_PersistentServiceOrders[serviceAlias] == TRunningOrders::ro_deactivated)
1670 log.displayNL("Can't start service '%s' because it is curently deactivated", serviceAlias.c_str());
1671 return true;
1675 // look for a shard orders for this service
1676 TShardsOrders::iterator it(_ShardOrders.find(ss.ShardName));
1677 if (it != _ShardOrders.end())
1679 TShardOrders &so = it->second;
1680 if (so == TShardOrders::so_autostart_on)
1682 log.displayNL("Can't start service '%s' because shard '%s' is autostarting, consider to restart it",
1683 serviceAlias.c_str(),
1684 ss.ShardName.c_str());
1685 return true;
1689 if (!startService(serviceAlias))
1690 log.displayNL("Failed to start service '%s'", serviceAlias.c_str());
1691 else
1692 log.displayNL("Service '%s' start command written", serviceAlias.c_str());
1694 return true;
1697 NLMISC_CLASS_COMMAND_DECL(removeRegisteredService)
1699 if (args.size() != 1)
1700 return false;
1702 string serviceAlias = args[0];
1704 if (_ServiceStates.find(serviceAlias) == _ServiceStates.end())
1706 log.displayNL("Unknown service '%s', could not start it", serviceAlias.c_str());
1707 return true;
1710 TServiceState &ss = _ServiceStates[serviceAlias];
1712 _RegisteredServices.erase(serviceAlias);
1714 if (ss.RunningState == TRunningState::rs_stopped)
1716 // remove the record
1717 _ServiceStates.erase(serviceAlias);
1719 else
1721 // just update some data related the registered service
1722 ss.ShardName = "";
1723 ss.RunningTags.erase(TRunningTag::rt_locally_started);
1724 ss.RunningTags.erase(TRunningTag::rt_chain_crashing);
1725 ss.RunningTags.insert(TRunningTag::rt_externaly_started);
1729 _PersistentServiceOrders.erase(serviceAlias);
1730 _NeedToWriteStateFile = true;
1732 // update the state of services to the AS
1733 sendUpServiceUpdate();
1735 return true;
1738 NLMISC_CLASS_COMMAND_DECL(addRegisteredService)
1740 if (args.size() != 2)
1741 return false;
1743 string serviceAlias = args[0];
1744 string shardName = args[1];
1746 _RegisteredServices.insert(serviceAlias);
1747 _ServiceStates.insert(make_pair(serviceAlias, TServiceState()));
1748 _ServiceStates[serviceAlias].ShardName = shardName;
1749 // _ServiceRunnerLoopCounters.insert(make_pair(serviceAlias, TRunnerLoopCounter()));
1751 if (_PersistentServiceOrders.find(serviceAlias) == _PersistentServiceOrders.end())
1753 _PersistentServiceOrders[serviceAlias] = TRunningOrders::ro_activated;
1754 _NeedToWriteStateFile = true;
1757 // update the state of services to the AS
1758 sendUpServiceUpdate();
1760 return true;
// Command handler: 'dump' (no argument).
// Calls the CModuleBase dump first, then writes to 'log' the orders of every
// known shard followed by one entry per known service (alias, registration,
// shard, running state and tags, PID when not stopped, names, status string
// and, for registered services, the runner start counters).
// NOTE(review): lines holding only braces are not visible in this listing, so
// whether the last 'if (registered)' is inside the 'else' branch or after the
// if/else must be confirmed against the file.
1763 NLMISC_CLASS_COMMAND_DECL(dump)
1765 NLMISC_CLASS_COMMAND_CALL_BASE(CModuleBase, dump);
1767 log.displayNL("===============================");
1768 log.displayNL(" Dumping Admin executor states");
1769 log.displayNL("===============================");
// NOTE(review): size() is passed to a %u conversion here and for the services
// below; if displayNL forwards to a printf-style formatter this mismatches
// where size_t is wider than unsigned int. Confirm, a cast may be needed.
1772 log.displayNL(" There are %u known shard :", _ShardOrders.size());
1774 TShardsOrders::iterator first(_ShardOrders.begin()), last(_ShardOrders.end());
1775 for (; first != last; ++first)
1777 log.displayNL(" + Shard '%s' orders is '%s'", first->first.c_str(), first->second.toString().c_str());
1780 if (_ShutdownForPatch)
1781 log.displayNL(" All service are shuting down for patch !");
1782 log.displayNL(" There are %u known services :", _ServiceStates.size());
// 'first'/'last' are declared a second time with another type, so the shard
// loop above has to live in its own scope
1783 TServiceStates::iterator first(_ServiceStates.begin()), last(_ServiceStates.end());
1784 for (; first != last; ++first)
1786 TServiceState &ss = first->second;
1787 const string &aliasName = first->first;
// running tags are rendered as a sequence of <tag>
1789 CSString runningTags;
1790 set<TRunningTag>::iterator rtf(ss.RunningTags.begin()), rte(ss.RunningTags.end());
1791 for (; rtf != rte; ++rtf)
1793 runningTags<<"<"<<rtf->toString()<<">";
1796 bool registered = _RegisteredServices.find(aliasName) != _RegisteredServices.end();
1798 log.displayNL(" + Service alias='%s' (%s) ShardName = '%s' RunningState='%s' RunningTag='%s'",
1799 aliasName.c_str(),
1800 registered ? "REGISTERED" : "NOT REGISTERED",
1801 ss.ShardName.c_str(),
1802 ss.RunningState.toString().c_str(),
1803 runningTags.c_str());
// display() (without NL) is used for the next pieces; the displayNL("")
// below ends them
1805 log.display(" | %s", ss.DontUseShardOrders ? "DontUseShardOders" : "UseShardOrders");
1807 if (ss.RunningState != TRunningState::rs_stopped)
1809 // the pid should be valid
1810 log.display(" PID=%u", ss.PID);
1812 if (registered)
// operator[] is used for the lookup: it would create a default entry for an
// alias that has no persistent orders
1814 log.display(" ServiceOrders=%s", _PersistentServiceOrders[aliasName].toString().c_str());
1816 log.displayNL("");
1819 if (ss.ServiceModule != NULL)
1821 // dump a connected service
1822 log.displayNL(" | longName='%s' shortName='%s' moduleName='%s'",
1823 ss.LongName.c_str(),
1824 ss.ShortName.c_str(),
1825 ss.ServiceModule->getModuleName().c_str());
1826 log.displayNL(" | State '%s' (last received %sago)", ss.State.c_str(), NLMISC::CTime::getHumanRelativeTime(NLMISC::CTime::getSecondsSince1970() - ss.LastStateDate).c_str());
1828 else
1830 // dump a offline registered service
// no module proxy here, so the module name is not printed
1831 // dump a connected service
1832 log.displayNL(" | longName='%s' shortName='%s' ",
1833 ss.LongName.c_str(),
1834 ss.ShortName.c_str());
1835 log.displayNL(" | State '%s' (last received %sago)", ss.State.c_str(), NLMISC::CTime::getHumanRelativeTime(NLMISC::CTime::getSecondsSince1970() - ss.LastStateDate).c_str());
1837 if (registered)
// the three counters are labelled with 1, 3 and CRASH_COUNTER_SLOT roll delays, in minutes
1839 uint32 c1, c2, c3;
1840 ss.RunnerLoopCounter.getCounters(c1, c2, c3);
1841 log.displayNL(" | Service Runner Start counter (%u mn:%u run, %u mn:%u run, %u mn:%u run)",
1842 CRASH_COUNTER_ROLL_DELAY/60, c1,
1843 (CRASH_COUNTER_ROLL_DELAY*3)/60, c2,
1844 (CRASH_COUNTER_ROLL_DELAY*CRASH_COUNTER_SLOT)/60, c3);
1849 return true;
// Registers the CAdminExecutorService module class under the name "AdminExecutorService".
1855 NLNET_REGISTER_MODULE_FACTORY(CAdminExecutorService, "AdminExecutorService");
1857 } // namespace ADMIN