1 // Ryzom - MMORPG Framework <http://dev.ryzom.com/projects/ryzom/>
2 // Copyright (C) 2010 Winch Gate Property Limited
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU Affero General Public License as
6 // published by the Free Software Foundation, either version 3 of the
7 // License, or (at your option) any later version.
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU Affero General Public License for more details.
14 // You should have received a copy of the GNU Affero General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
19 #include "nel/misc/singleton.h"
21 #include "nel/misc/path.h"
22 #include "nel/misc/common.h"
23 #include "nel/net/module.h"
24 #include "nel/net/module_builder_parts.h"
25 #include "nel/net/unified_network.h"
26 #include "nel/net/service.h"
28 #include "game_share/utils.h"
30 #include "admin_modules_itf.h"
33 using namespace NLMISC
;
34 using namespace NLNET
;
36 void aes_forceLink() {}
40 const char* LAUNCH_CTRL_START
= "LAUNCH";
41 const char* LAUNCH_CTRL_STOP
= "STOP";
43 const char *AESPersistentStateFilename
= "aes_state.txt";
45 /// We want 10 slot (you can change this, but need at least 3 slots)
46 const uint32 CRASH_COUNTER_SLOT
= 10;
47 /// The delay (in second) between slots roll. This value * CRASH_COUNTER_SLOT give the total sampling period
48 const uint32 CRASH_COUNTER_ROLL_DELAY
= 10*60; // 10 mn
49 /// If we have more than 5 start of a service in the sampling period, we tag the service as 'chain crashing'
50 const uint32 CRASH_COUNTER_CHAIN_THRESHOLD
= 5;
52 /** the name of the file written by the patch man to request a global shutdown
53 * of all the registered services before switching to a new version.
55 CVariable
<string
> ShutdownRequestFileName("aes","ShutdownRequestFileName", "name of the file to use for shutdown requests", "./global.launch_ctrl", 0, true);
57 /** A kind of rolling buffer used to count service starts from the runner
60 class CRunnerLoopCounter
62 /// The slot table. Each slot accumulate the service start for a time frame
63 uint32 _Slots
[CRASH_COUNTER_SLOT
];
64 /** The last value read from the runner script. This is used to compute
65 * the delta value to add to the first slot
67 uint32 _LastValueRead
;
68 /// The total sum of all slot (could be recomputed on demand, but a little more efficient)
74 // we need at least 3 slots
75 nlctassert(CRASH_COUNTER_SLOT
>= 3);
77 // init all slots with 0
78 for (uint i
=0; i
<CRASH_COUNTER_SLOT
; ++i
)
83 // init the last value with a magic value so that the first
84 // update will not compute a delta but only take
85 // the first value as initial reference
86 _LastValueRead
= 0xffffffff;
90 /** Update the counter by submitting the current start counter
91 * written by the runner script.
92 * Note that the runner script only increment the counter
93 * so we need to compute the delta from _LastValueRead
94 * before accumulating in the first slot.
96 void updateCounter(uint32 lastValue
)
98 if (_LastValueRead
== 0xffffffff || lastValue
< _LastValueRead
)
100 // this is the first sample, just init the last value read
101 // or the counter have been reset to a smaller value
102 _LastValueRead
= lastValue
;
106 // not the first sample, compute the delta and accumulate
107 uint32 delta
= lastValue
- _LastValueRead
;
109 _LastValueRead
= lastValue
;
111 _CounterSum
+= delta
;
115 /// Roll the slots. The last slot is ejected and
116 /// each slot is copied into the next one (in
117 /// inverse order obviously).
118 /// The first slot is then set to 0.
122 _CounterSum
-= _Slots
[CRASH_COUNTER_SLOT
-1];
124 for (uint i
=CRASH_COUNTER_SLOT
-1; i
>0; --i
)
126 _Slots
[i
] = _Slots
[i
-1];
131 /// Return the sum of all the slots
137 /// Return the sum of the first slot, of the three first slots and
138 /// the total of all slots.
139 /// This is useful to understand the behavior of a crashing
140 /// service over the sampling period.
141 void getCounters(uint32
&oneSlot
, uint32
&treeSlots
, uint32
&allSlots
)
144 treeSlots
= _Slots
[0]+_Slots
[1]+_Slots
[2];
145 allSlots
= _CounterSum
;
149 /// Reset all counter to zero
152 for (uint i
=0; i
<CRASH_COUNTER_SLOT
; ++i
)
162 class CAdminExecutorService
163 : /*public CManualSingleton<CAdminExecutorService>,*/
164 public CEmptyModuleServiceBehav
<CEmptyModuleCommBehav
<CEmptySocketBehav
<CModuleBase
> > >,
165 public CAdminExecutorServiceSkel
,
166 public IModuleTrackerCb
171 SLOW_TO_START_THRESHOLD
= 60, // 1 mn
172 SLOW_TO_STOP_THRESHOLD
= 60, // 1 mn
173 _NagiosReportDelay
= 60, // 1 mn
178 typedef CModuleTracker
<TModuleClassPred
> TServiceTracker
;
179 // tracker for admin executor client modules
180 TServiceTracker _ServiceTracker
;
182 /// Admin service module
183 TModuleProxyPtr _AdminService
;
185 /// Date of last state reporting to AS
186 uint32 _LastStateReport
;
188 /// Date of last nagios report output
189 uint32 _LastNagiosReport
;
191 typedef string TAliasName
;
192 typedef string TShardName
;
193 typedef set
<TAliasName
> TRegisteredServices
;
194 /// List of 'registered service', ie. those that are configured in aes cfg.
195 TRegisteredServices _RegisteredServices
;
197 /// A set of data for each registered or connected service
201 bool DontUseShardOrders
;
202 TRunningState RunningState
;
203 set
<TRunningTag
> RunningTags
;
208 uint32 LastStateDate
;
209 uint32 StopRequestDate
;
210 uint32 StartRequestDate
;
211 TModuleProxyPtr ServiceModule
;
212 CRunnerLoopCounter RunnerLoopCounter
;
215 : DontUseShardOrders(false),
216 RunningState(TRunningState::rs_stopped
),
224 typedef map
<TAliasName
, TServiceState
> TServiceStates
;
225 /// States for each connected or registered service
226 TServiceStates _ServiceStates
;
228 typedef map
<TModuleProxyPtr
, TAliasName
> TConnectedServiceIndex
;
229 /// Index of connected service proxy to alias name
230 TConnectedServiceIndex _ConnectedServiceIndex
;
232 typedef map
<TAliasName
, TRunningOrders
> TPersistentServiceOrders
;
233 /// Persistent service state, i.e state that are restored after a stop/start of the aes
234 TPersistentServiceOrders _PersistentServiceOrders
;
236 typedef map
<TShardName
, TShardOrders
> TShardsOrders
;
237 /// Shard orders (set by AS)
238 TShardsOrders _ShardOrders
;
240 /// flag for shutdown request from patch manager.
241 bool _ShutdownForPatch
;
243 /// A flag that mean we need to save the persistent state file
244 bool _NeedToWriteStateFile
;
246 /// Date of last roll of the runner loop counters
247 uint32 _LastRunnerLoopCounterRoll
;
249 /// Data for each command pending result from a service
250 struct TPendingWebCommand
252 /// Date of reception of the command for timeout
253 uint32 ReceptionDate
;
254 /// Name of the target service
259 typedef uint32 TCommandId
;
260 typedef map
<TCommandId
, TPendingWebCommand
> TPendingWebCommands
;
261 /// A list of pending command sent to service and waiting result
262 TPendingWebCommands _PendingWebCommands
;
264 /// information about shard being stopped
265 struct TStopingShardInfo
267 /// Name of the shard to stop
269 /// Delay before stop
271 /// Begin date of countdown
275 typedef vector
<TStopingShardInfo
> TStopingShardInfos
;
277 /// The vector of shard to stop.
278 TStopingShardInfos _StopingShards
;
282 CAdminExecutorService()
283 : _ServiceTracker(TModuleClassPred("AdminExecutorServiceClient")),
285 _LastNagiosReport(0),
286 _ShutdownForPatch(false),
287 _NeedToWriteStateFile(false),
288 _LastRunnerLoopCounterRoll(0)
290 CAdminExecutorServiceSkel::init(this);
291 _ServiceTracker
.init(this, this);
295 bool initModule(const TParsedCommandLine
&pcl
)
297 CModuleBase::initModule(pcl
);
299 // read the persistent state file if any
300 string filename
= CPath::standardizePath(IService::getInstance()->SaveFilesDirectory
.toString(), true)+AESPersistentStateFilename
;
301 FILE *fp
= nlfopen(filename
, "rt");
306 while ((ret
=fgets(buffer
, 1024, fp
)) != NULL
)
308 CSString
line(buffer
);
309 CSString
cmd(line
.firstWord(true));
311 if (cmd
== "ServiceState")
313 CSString serviceAlias
= line
.firstWord(true);
314 CSString serviceOrders
= line
.firstWord(true);
316 TRunningOrders
runningOrders(serviceOrders
);
317 if (!serviceAlias
.empty() && runningOrders
!= TRunningOrders::invalid_val
)
319 // add this one in the list of persistent state
320 _PersistentServiceOrders
[serviceAlias
] = runningOrders
;
323 else if (cmd
== "ShardOrders")
325 string
shardName(line
.firstWord(true));
326 TShardOrders
shardOrders(line
.firstWord(true));
327 if (shardOrders
!= TShardOrders::invalid_val
)
328 _ShardOrders
[shardName
] = shardOrders
;
331 // clear the flag because 'setGlobalState' has set it
332 _NeedToWriteStateFile
= false;
340 void onModuleUp(IModuleProxy
*proxy
)
342 if (proxy
->getModuleClassName() == "AdminService")
344 nldebug("CAdminExecutorService : admin service up as '%s'", proxy
->getModuleName().c_str());
345 // we found the manager of AES
346 if (_AdminService
!= NULL
)
348 nlwarning("CAdminExecutorService : admin service already known as '%s', replacing with new one", _AdminService
->getModuleName().c_str());
350 _AdminService
= proxy
;
352 // cleanup the persistent service state by removing any state not in registered or connected service
354 set
<string
> removeList
;
356 // first, fill the list with all the persistent state service name
358 TPersistentServiceOrders::iterator
first(_PersistentServiceOrders
.begin()), last(_PersistentServiceOrders
.end());
359 for (; first
!= last
; ++first
)
361 removeList
.insert(first
->first
);
365 // remove the registered service from the removelist
367 TRegisteredServices::iterator
first(_RegisteredServices
.begin()), last(_RegisteredServices
.end());
368 for (; first
!= last
; ++first
)
370 removeList
.erase(*first
);
373 // remove any connected service (even unregistered one)
375 TServiceStates::iterator
first(_ServiceStates
.begin()), last(_ServiceStates
.end());
376 for (; first
!= last
; ++first
)
378 removeList
.erase(first
->first
);
382 // now remove the persistent states that are left in the remove list
383 while (!removeList
.empty())
385 _PersistentServiceOrders
.erase(*(removeList
.begin()));
387 _NeedToWriteStateFile
= true;
389 removeList
.erase(removeList
.begin());
393 // send the current status
394 sendUpServiceUpdate();
397 uint32 now
= NLMISC::CTime::getSecondsSince1970();
398 // check pending command timeout
399 TPendingWebCommands::iterator
first(_PendingWebCommands
.begin()), last(_PendingWebCommands
.end());
400 for (; first
!= last
; ++first
)
402 TPendingWebCommand
&pwc
= first
->second
;
404 if (now
- pwc
.ReceptionDate
> 10)
407 if (_AdminService
!= NULL
)
409 CAdminServiceProxy
as(_AdminService
);
410 as
.commandResult(this, first
->first
, pwc
.ServiceAlias
, "ERROR : AES : no reponse from service");
413 _PendingWebCommands
.erase(first
);
415 // check other pending commands at next update
421 void onModuleDown(IModuleProxy
*proxy
)
423 if (proxy
== _AdminService
)
425 nldebug("CAdminExecutorService : admin service '%s' is down", proxy
->getModuleName().c_str());
427 _AdminService
= NULL
;
431 void onModuleUpdate()
433 H_AUTO(CAdminExecutorService_onModuleUpdate
);
435 uint32 now
= CTime::getSecondsSince1970();
437 if (_LastStateReport
< now
)
441 // check services every second
442 TServiceStates::iterator
first(_ServiceStates
.begin()), last(_ServiceStates
.end());
443 for (; first
!= last
; ++first
)
445 string aliasName
= first
->first
;
446 TServiceState
&ss
= first
->second
;
447 if (_RegisteredServices
.find(aliasName
) != _RegisteredServices
.end())
449 // this is a registered service, we need to control is running state
451 // read the actual running state from the runner script written file
452 if (getOfflineServiceState(aliasName
) == "RUNNING")
454 // the service is running
455 ss
.RunningTags
.erase(TRunningTag::rt_locally_stopped
);
456 ss
.RunningTags
.erase(TRunningTag::rt_globally_stopped
);
457 ss
.RunningTags
.erase(TRunningTag::rt_stopped_for_patch
);
459 if (ss
.StopRequestDate
!= 0)
461 // still not stopped, check for slow to stop service
462 if (now
- ss
.StopRequestDate
> SLOW_TO_STOP_THRESHOLD
)
465 ss
.RunningTags
.insert(TRunningTag::rt_slow_to_stop
);
469 if (ss
.RunningState
!= TRunningState::rs_online
)
471 // tag slow to start service
472 if (now
- ss
.StartRequestDate
> SLOW_TO_START_THRESHOLD
)
475 ss
.RunningTags
.insert(TRunningTag::rt_slow_to_start
);
479 ss
.RunningState
= TRunningState::rs_running
;
485 // the service is stopped
486 ss
.RunningState
= TRunningState::rs_stopped
;
487 ss
.RunningTags
.erase(TRunningTag::rt_locally_started
);
488 ss
.RunningTags
.erase(TRunningTag::rt_externaly_started
);
489 ss
.RunningTags
.erase(TRunningTag::rt_slow_to_stop
);
491 // clean the stop request date
492 ss
.StopRequestDate
= 0;
495 // try to obtains service orders from its shard
496 TShardOrders
shardOrders(TShardOrders::so_autostart_on
);
497 if (_ShardOrders
.find(ss
.ShardName
) != _ShardOrders
.end())
499 shardOrders
= _ShardOrders
[ss
.ShardName
];
501 // little check, the service must have a entry in the service orders container.
502 nlassert(_PersistentServiceOrders
.find(aliasName
) != _PersistentServiceOrders
.end());
504 TRunningOrders serviceOrders
= _PersistentServiceOrders
[aliasName
];
506 // look if service need to be started
507 if (ss
.RunningState
== TRunningState::rs_stopped
// its stopped
508 && serviceOrders
== TRunningOrders::ro_activated
// and activated
509 && shardOrders
== TShardOrders::so_autostart_on
// and shard is autostart on
510 && !ss
.DontUseShardOrders
// and this service follow its shard orders
511 && !_ShutdownForPatch
// and no patch pending
514 // we must start this service !
515 startService(aliasName
);
518 // look for service that need to be stopped
519 if (ss
.RunningState
!= TRunningState::rs_stopped
// its not stopped
520 && (serviceOrders
== TRunningOrders::ro_deactivated
// and deactivated
521 || _ShutdownForPatch
) // or patch pending
522 && ss
.StopRequestDate
== 0 // no stop request send
525 // we must stop this service
526 stopService(aliasName
);
529 ss
.StopRequestDate
= now
;
531 // shuted down for patch ?
532 if (_ShutdownForPatch
)
533 ss
.RunningTags
.insert(TRunningTag::rt_stopped_for_patch
);
535 ss
.RunningTags
.erase(TRunningTag::rt_stopped_for_patch
);
538 if (ss
.RunnerLoopCounter
.getSum() > CRASH_COUNTER_CHAIN_THRESHOLD
)
539 ss
.RunningTags
.insert(TRunningTag::rt_chain_crashing
);
541 ss
.RunningTags
.erase(TRunningTag::rt_chain_crashing
);
543 // update the crash counter
544 ss
.RunnerLoopCounter
.updateCounter(getServiceStartLoopCounter(aliasName
));
551 // if we have an admin service connected, send it an update of service state
552 if (_AdminService
!= NULL
)
553 sendUpServiceUpdate();
556 if ((now
& 0xf) == 0)
558 // every 16 seconds (the 0xf mask matches once per 16 s) for low frequency work
560 // check for shutdown request from patchman
561 checkShutdownRequest();
564 // check for shard to stop (and warning to send)
565 checkServiceToStop();
567 // time to output the nagios report ?
568 if (now
> _LastNagiosReport
+_NagiosReportDelay
)
570 // write the nagios report
571 FILE *fp
= nlfopen("aes_nagios_report.txt", "wt");
574 // output the current date
576 fprintf(fp
, "AESReportDate=%s", ::ctime(&t
));
578 fprintf(fp
, "NBService=%u\n", (uint
)_ServiceStates
.size());
579 // output state of each service
580 TServiceStates::iterator
first(_ServiceStates
.begin()), last(_ServiceStates
.end());
581 for (; first
!= last
; ++first
)
583 CSString serviceLine
;
584 TServiceState
&ss
= first
->second
;
585 const string
&aliasName
= first
->first
;
587 CSString runningTags
;
588 set
<TRunningTag
>::iterator
rtf(ss
.RunningTags
.begin()), rte(ss
.RunningTags
.end());
589 for (; rtf
!= rte
; ++rtf
)
591 runningTags
<<"<"<<rtf
->toString()<<">";
594 bool registered
= _RegisteredServices
.find(aliasName
) != _RegisteredServices
.end();
596 serviceLine
<< "ServiceAlias='"<<aliasName
<<"' RunningState='"<<ss
.RunningState
.toString()<<"' RunningTag='"<<runningTags
<<"'";
597 serviceLine
<< " NoReportSince="<<now
-ss
.LastStateDate
;
598 serviceLine
<< " State='"<<ss
.State
<<"'";
600 fprintf(fp
, "%s\n", serviceLine
.c_str());
608 nlwarning("Can't open the nagios report file !");
611 _LastNagiosReport
= now
;
615 // update the last report date
616 _LastStateReport
= now
;
619 // check runner loop counter roll timer
620 if (_LastRunnerLoopCounterRoll
+CRASH_COUNTER_ROLL_DELAY
< now
)
622 // it's time to roll the crash counter
623 TServiceStates::iterator
first(_ServiceStates
.begin()), last(_ServiceStates
.end());
624 for (; first
!= last
; ++first
)
626 first
->second
.RunnerLoopCounter
.rollCounter();
629 _LastRunnerLoopCounterRoll
= now
;
632 if (_NeedToWriteStateFile
)
634 /// The persistent service orders need to be saved
635 string filename
= CPath::standardizePath(IService::getInstance()->SaveFilesDirectory
.toString(), true)+AESPersistentStateFilename
;
636 FILE *fp
= nlfopen(filename
, "wt");
641 TShardsOrders::iterator
first(_ShardOrders
.begin()), last(_ShardOrders
.end());
642 for (; first
!= last
; ++first
)
644 line
<< "ShardOrders "<<first
->first
<<" "<<first
->second
.toString()<<"\n";
647 fputs(line
.c_str(), fp
);
651 TPersistentServiceOrders::iterator
first(_PersistentServiceOrders
.begin()), last(_PersistentServiceOrders
.end());
652 for (; first
!= last
; ++first
)
655 line
<< "ServiceState "<<first
->first
<<" "<<first
->second
.toString()<<"\n";
656 fputs(line
.c_str(), fp
);
659 // clear the flag because 'setGlobalState' has set it
660 _NeedToWriteStateFile
= false;
667 void sendUpServiceUpdate()
669 if (_AdminService
!= NULL
)
671 vector
<TServiceStatus
> serviceStatus
;
672 set
<TAliasName
> missingServices
= _RegisteredServices
;
673 // send an updated list to AES
674 TServiceStates::iterator
first(_ServiceStates
.begin()), last(_ServiceStates
.end());
675 for (; first
!= last
; ++first
)
677 const string
&aliasName
= first
->first
;
678 bool registered
= _RegisteredServices
.find(aliasName
) != _RegisteredServices
.end();
679 TServiceState
&ss
= first
->second
;
680 serviceStatus
.push_back(TServiceStatus());
681 TServiceStatus
&ts
= serviceStatus
.back();
682 ts
.setShardName(ss
.ShardName
);
683 ts
.setServiceLongName(ss
.LongName
);
684 ts
.setServiceShortName(ss
.ShortName
);
685 ts
.setServiceAliasName(aliasName
);
686 ts
.setRunningState(ss
.RunningState
);
688 ts
.setRunningOrders(_PersistentServiceOrders
[aliasName
]);
690 ts
.setRunningOrders(TRunningOrders::invalid_val
);
691 ts
.setRunningTags(ss
.RunningTags
);
693 state
<< ss
.State
<< "\tNoReportSince=" << (NLMISC::CTime::getSecondsSince1970()-ss
.LastStateDate
);
696 state
<< "\tHostname=" << IService::getInstance()->getHostName();
700 // this is a registered service, send the start counter
701 uint32 oneSlot
, treeSlots
, allSlots
;
702 ss
.RunnerLoopCounter
.getCounters(oneSlot
, treeSlots
, allSlots
);
703 state
<< "\tStartCounter="<<oneSlot
<<" "<<treeSlots
<<" "<<allSlots
;
707 missingServices
.erase(aliasName
);
710 CAdminServiceProxy
as(_AdminService
);
711 as
.upServiceUpdate(this, serviceStatus
);
715 IModuleProxy
*findOnlineService(const std::string
&serviceAlias
)
717 TConnectedServiceIndex::iterator
first(_ConnectedServiceIndex
.begin()), last(_ConnectedServiceIndex
.end());
718 for (; first
!= last
; ++first
)
720 if (first
->second
== serviceAlias
)
731 void checkShutdownRequest()
733 // if there's no ctrl file to be found then giveup
734 if (!NLMISC::CFile::fileExists(ShutdownRequestFileName
)) return;
736 // if a shutdown ctrl file exists then read it's contents (if the file doesn't exist this returns an empty string)
737 CSString fileContents
;
738 fileContents
.readFromFile(ShutdownRequestFileName
.c_str());
740 // see if the file exists
741 if (!fileContents
.empty())
743 NLMISC::CFile::deleteFile(ShutdownRequestFileName
);
744 fileContents
= fileContents
.strip().splitToOneOfSeparators(" \t\n\r\x1a");
746 NLMISC::fromString(fileContents
, _ShutdownForPatch
);
747 _ShutdownForPatch
= !_ShutdownForPatch
;
751 void checkServiceToStop()
753 uint32 now
= CTime::getSecondsSince1970();
754 // for each shard to stop
755 for (uint i
=0; i
<_StopingShards
.size(); ++i
)
757 const TStopingShardInfo
&stopShardInfo
= _StopingShards
[i
];
759 bool timeToStop
= stopShardInfo
.BeginDate
+ stopShardInfo
.Delay
<= now
;
760 uint32 timeLeft
= (stopShardInfo
.BeginDate
+ stopShardInfo
.Delay
) - now
;
761 // check every service
762 TServiceStates::iterator
first(_ServiceStates
.begin()), last(_ServiceStates
.end());
763 for (; first
!= last
; ++first
)
765 TServiceState
&serviceState
= first
->second
;
767 if (serviceState
.ServiceModule
!= NULL
&& serviceState
.ShardName
== stopShardInfo
.ShardName
)
769 // this one belong to the shard to stop
772 // send a warning every 30 s
773 if (((now
- stopShardInfo
.BeginDate
) % 30) == 0)
775 CAdminExecutorServiceClientProxy
aec(serviceState
.ServiceModule
);
776 nlinfo("Sending command 'quitDelay' to service '%s'", first
->first
.c_str());
777 aec
.serviceCmdNoReturn(this, toString("quitDelay %u", timeLeft
));
783 stopService(first
->first
);
790 nlinfo("All local service for shard %s are stopped", stopShardInfo
.ShardName
.c_str());
791 // shard stopped, erase this entry
792 _StopingShards
.erase(_StopingShards
.begin()+i
);
799 // the following routine reads the text string contained in the ".state" file for this service
800 // it's used to provide a 'state' value for services that are registered but are not connected
801 // to give info on whether they've been launched, whether their launcher is online, etc
802 std::string
getOfflineServiceState(const std::string
& serviceAlias
)
804 // open the file for reading
805 FILE* f
= nlfopen(getServiceStateFileName(serviceAlias
), "rt");
806 if (f
==NULL
) return "STOPPED";
808 // setup a buffer to hold the text read from the file
809 uint32 fileSize
= NLMISC::CFile::getFileSize(f
);
811 txt
.resize(fileSize
);
813 // read the text from the file - note that the number of bytes read may be less than the
814 // number of bytes requested because we've opened the file in text mode and not binary mode
815 uint32 bytesRead
= (uint32
)fread(&txt
[0],1,fileSize
,f
);
816 txt
.resize(bytesRead
);
819 // return the text read from the file
824 // the following routine reads the text string contained in the "pid.state" file for this service
825 // it's used to provide a early pid information to the AES (before the service is connected)
826 uint32
getOfflineServicePID(const std::string
& serviceAlias
)
828 // open the file for reading
829 FILE* f
= nlfopen(getServicePIDFileName(serviceAlias
), "rt");
830 if (f
==NULL
) return 0;
832 // setup a buffer to hold the text read from the file
833 uint32 fileSize
= NLMISC::CFile::getFileSize(f
);
835 txt
.resize(fileSize
);
837 // read the text from the file - note that the number of bytes read may be less than the
838 // number of bytes requested because we've opened the file in text mode and not binary mode
839 uint32 bytesRead
= (uint32
)fread(&txt
[0],1,fileSize
,f
);
840 txt
.resize(bytesRead
);
843 // return the pid read from the file
845 NLMISC::fromString(txt
, pid
);
851 // the following routine reads the text string contained in the ".start_counter" file for this service
852 // it's used to provide the number of start done by the runner loop on the service
853 // This is used for the chain crash detector system.
854 uint32
getServiceStartLoopCounter(const std::string
& serviceAlias
)
856 // open the file for reading
857 FILE* f
= nlfopen(getServiceLoopCounterFileName(serviceAlias
), "rt");
861 // setup a buffer to hold the text read from the file
862 uint32 fileSize
= NLMISC::CFile::getFileSize(f
);
864 txt
.resize(fileSize
);
866 // read the text from the file - note that the number of bytes read may be less than the
867 // number of bytes requested because we've opened the file in text mode and not binary mode
868 uint32 bytesRead
= (uint32
)fread(&txt
[0],1,fileSize
,f
);
869 txt
.resize(bytesRead
);
872 // parse the text in the buffer
874 NLMISC::fromString(txt
, counter
);
879 // retrieve service launch info in the config file
880 bool getServiceLaunchInfo(const string
& serviceAlias
, string
& path
)
883 CConfigFile::CVar
*launchDir
= IService::getInstance()->ConfigFile
.getVarPtr("AESLauncherDir");
884 if (launchDir
!= NULL
)
886 basePath
= launchDir
->asString()+"/";
889 if (_RegisteredServices
.find(serviceAlias
) == _RegisteredServices
.end())
891 path
= basePath
+ serviceAlias
+"/";
897 std::string
getServiceStateFileName(const std::string
& serviceAlias
)
900 if (getServiceLaunchInfo(serviceAlias
, servicePath
))
901 return NLMISC::CPath::standardizePath(servicePath
)+serviceAlias
+".state";
906 std::string
getServicePIDFileName(const std::string
& serviceAlias
)
909 if (getServiceLaunchInfo(serviceAlias
, servicePath
))
910 return NLMISC::CPath::standardizePath(servicePath
)+"pid.state";
915 std::string
getServiceLoopCounterFileName(const std::string
& serviceAlias
)
918 if (getServiceLaunchInfo(serviceAlias
, servicePath
))
919 return NLMISC::CPath::standardizePath(servicePath
)+serviceAlias
+".start_count";
924 std::string
getServiceLaunchCtrlFileName(const std::string
& serviceAlias
,const std::string
& serviceExecutionPath
, bool deferred
)
926 return NLMISC::CPath::standardizePath(serviceExecutionPath
)+serviceAlias
+(deferred
?".deferred_":".")+"launch_ctrl";
930 bool writeServiceLaunchCtrl(const std::string
& serviceAlias
, bool deferred
, const std::string
& txt
)
933 if (!getServiceLaunchInfo(serviceAlias
, path
))
936 // make sure the path exists
937 NLMISC::CFile::createDirectoryTree(path
);
939 // open the file for writing
940 FILE* f
= nlfopen(getServiceLaunchCtrlFileName(serviceAlias
, path
, deferred
).c_str(),"wt");
941 if (f
==NULL
) return false;
943 // write the text to the file
944 fprintf(f
,"%s",txt
.c_str());
950 bool startService(const std::string
&serviceAlias
)
952 if (_ServiceStates
.find(serviceAlias
) != _ServiceStates
.end())
954 TServiceState
&ss
= _ServiceStates
[serviceAlias
];
955 if (ss
.RunningState
!= TRunningState::rs_stopped
)
957 nlwarning("startService '%s' : the service is already running", serviceAlias
.c_str());
961 // store the start date
962 ss
.StartRequestDate
= CTime::getSecondsSince1970();
965 if (_RegisteredServices
.find(serviceAlias
) == _RegisteredServices
.end())
967 nlwarning("startService '%s' : the service in not registered, can't start it", serviceAlias
.c_str());
971 // write the start command
972 bool ret
= writeServiceLaunchCtrl(serviceAlias
, false, LAUNCH_CTRL_START
);
977 bool stopService(const std::string
&serviceAlias
)
979 // check that the service is running
980 TServiceStates::iterator
it(_ServiceStates
.find(serviceAlias
));
981 if (it
== _ServiceStates
.end())
983 nlwarning("stopService : Failed to found service '%s' in the list of services", serviceAlias
.c_str());
987 TServiceState
&ss
= it
->second
;
988 // write the launch control to stop
989 if (_RegisteredServices
.find(serviceAlias
) != _RegisteredServices
.end())
991 if (!writeServiceLaunchCtrl(serviceAlias
, false, LAUNCH_CTRL_STOP
))
993 nlwarning("Failed to write launch control file for service '%s'", serviceAlias
.c_str());
997 nlinfo("Service '%s' launch control file updated", serviceAlias
.c_str());
1000 // set the stopre request date if needed
1001 if (ss
.StopRequestDate
!= 0)
1003 ss
.StopRequestDate
= CTime::getSecondsSince1970();
1006 if (ss
.ServiceModule
== NULL
)
1008 nlwarning("stopService : The service '%s' is not connected, can't ask him to stop", serviceAlias
.c_str());
1012 // send the "quit" command to the service
1013 CAdminExecutorServiceClientProxy
aec(ss
.ServiceModule
);
1014 nlinfo("Sending command 'quit' to service '%s'", serviceAlias
.c_str());
1015 aec
.serviceCmdNoReturn(this, "quit");
1021 ///////////////////////////////////////////////////////////////////////
1022 //// Virtuals from IModuleTrackerCb
1023 ///////////////////////////////////////////////////////////////////////
1025 virtual void onTrackedModuleUp(IModuleProxy
*moduleProxy
)
1027 nldebug("Service module '%s' UP", moduleProxy
->getModuleName().c_str());
1030 TParsedCommandLine pcl
;
1031 if (!pcl
.parseParamList(moduleProxy
->getModuleManifest()))
1033 nlwarning("CAdminExecutorService:onTrackedModuleUp : failed to parse manifest");
1036 const TParsedCommandLine
*pclLongName
= pcl
.getParam("LongName");
1037 const TParsedCommandLine
*pclShortName
= pcl
.getParam("ShortName");
1038 const TParsedCommandLine
*pclAliasName
= pcl
.getParam("AliasName");
1039 const TParsedCommandLine
*pclPID
= pcl
.getParam("PID");
1040 const TParsedCommandLine
*pclDontUseShardOrders
= pcl
.getParam("DontUseShardOrders");
1042 string aliasName
= pclAliasName
!= NULL
? pclAliasName
->ParamValue
: moduleProxy
->getModuleName();
1044 // remove the temporary state and update connected service index
1045 _ServiceStates
.erase(moduleProxy
->getModuleName());
1046 _ConnectedServiceIndex
[moduleProxy
] = aliasName
;
1048 nlinfo("AES client module %s for service %s is up",
1049 moduleProxy
->getModuleName().c_str(),
1052 // create a new entry or get an existing one
1053 TServiceState
&ss
= _ServiceStates
[aliasName
];
1054 // update the service state
1055 ss
.RunningState
= TRunningState::rs_online
;
1056 if (pclDontUseShardOrders
)
1057 NLMISC::fromString(pclDontUseShardOrders
->ParamValue
, ss
.DontUseShardOrders
);
1059 ss
.DontUseShardOrders
= false;
1060 ss
.LongName
= pclLongName
!= NULL
? pclLongName
->ParamValue
: "unknown";
1061 ss
.ShortName
= pclShortName
!= NULL
? pclShortName
->ParamValue
: "unknown";
1065 NLMISC::fromString(pclPID
->ParamValue
, ss
.PID
);
1073 ss
.LastStateDate
= NLMISC::CTime::getSecondsSince1970();
1074 ss
.ServiceModule
= moduleProxy
;
1075 ss
.StartRequestDate
= 0;
1076 ss
.RunningTags
.erase(TRunningTag::rt_slow_to_start
);
1077 if (_RegisteredServices
.find(aliasName
) == _RegisteredServices
.end())
1079 ss
.RunningTags
.insert(TRunningTag::rt_externaly_started
);
1083 // // if this service is locally stopped or if the shard it belong to is stopped,
1084 // // then flag it as 'localy started'
1085 // if (_PersistentServiceOrders.find(aliasName) != _PersistentServiceOrders.end()
1086 // && _PersistentServiceOrders[aliasName] == TRunningOrders::ro_stopped)
1088 // // flag it as started
1089 // _PersistentServiceOrders[aliasName] = TRunningOrders::ro_running;
1090 // ss.RunningTags.insert(TRunningTag::rt_locally_started);
1091 // _NeedToWriteStateFile = true;
1093 // else if (_ShardOrders.find(ss.ShardName) != _ShardOrders.end()
1094 // && _ShardOrders[ss.ShardName] == TRunningOrders::ro_stopped)
1096 // // the shard is stopped, flag the service as started
1097 // _PersistentServiceOrders[aliasName] = TRunningOrders::ro_running;
1098 // ss.RunningTags.insert(TRunningTag::rt_locally_started);
1099 // _NeedToWriteStateFile = true;
1103 sendUpServiceUpdate();
// Called when a tracked module proxy (the AES client module of a service) goes down.
//
// When the proxy is indexed in _ConnectedServiceIndex:
//  - a service that is not in _RegisteredServices loses its _ServiceStates record;
//  - otherwise its record is kept, ServiceModule is cleared and killProgram() is
//    called on its PID (the guarding condition is not visible in this view);
//  - every pending web command addressed to that alias is answered with an error
//    (only when _AdminService is set) and removed from _PendingWebCommands;
//  - the index entry is erased.
// When the proxy is not indexed, only an info message is logged.
// The last visible statement is sendUpServiceUpdate().
//
// NOTE(review): this view omits several lines of the function (braces, an else,
// some call arguments); confirm the exact control flow against the full file.
1105 virtual void onTrackedModuleDown(IModuleProxy
*moduleProxy
)
1107 nldebug("Service module '%s' DOWN", moduleProxy
->getModuleName().c_str());
1109 TConnectedServiceIndex::iterator
it(_ConnectedServiceIndex
.find(moduleProxy
));
1110 if (it
!= _ConnectedServiceIndex
.end())
// aliasName refers to the string stored in the index entry; it is used until
// that entry is erased near the end of this branch.
1112 string
&aliasName
= it
->second
;
1113 nlinfo("AES client module %s of service %s is down",
1114 moduleProxy
->getModuleName().c_str(),
// Inconsistent tables: drop the index entry and leave (see the macro's last argument).
1116 BOMB_IF(_ServiceStates
.find(aliasName
) == _ServiceStates
.end(), "Service down from "<<moduleProxy
->getModuleName()<<" with alias "<<aliasName
<<" not found in _ServiceStates table", _ConnectedServiceIndex
.erase(it
); return);
1117 if (_RegisteredServices
.find(aliasName
) == _RegisteredServices
.end())
1119 // this is not a registered service, remove the status record
1120 _ServiceStates
.erase(aliasName
);
1124 TServiceState
&ss
= _ServiceStates
[aliasName
];
1125 // update the running state
// NOTE(review): the state becomes rs_running (not rs_stopped) although the module
// is down; presumably the process is considered alive until it is killed - confirm.
1126 ss
.RunningState
= TRunningState::rs_running
;
1128 ss
.ServiceModule
= NULL
;
1130 // kill the service to be sure that it is really dead !
1133 nlinfo("Killing process %u (%s) because aes client module '%s' is down",
1136 moduleProxy
->getModuleName().c_str());
1137 killProgram(ss
.PID
);
// The scan below restarts from this label after each erase, so an erased
// iterator is never advanced.
1141 retry_pending_command_loop
:
1142 // check for pending command
1143 TPendingWebCommands::iterator
first(_PendingWebCommands
.begin()), last(_PendingWebCommands
.end());
1144 for (; first
!= last
; ++first
)
1146 TPendingWebCommand
&pwc
= first
->second
;
1147 if (pwc
.ServiceAlias
== aliasName
)
// The error is only reported when an admin service is known; the pending
// entry is removed in both cases.
1149 if (_AdminService
!= NULL
)
1151 CAdminServiceProxy
as(_AdminService
);
1152 as
.commandResult(this, first
->first
, pwc
.ServiceAlias
, "ERROR : AES : service connection lost during command");
1155 _PendingWebCommands
.erase(first
);
1156 // goto to avoid iterator dodging
1157 goto retry_pending_command_loop
;
1162 // remove the index record
1163 _ConnectedServiceIndex
.erase(it
);
1167 nlinfo("AES client module %s is not associated with a service",
1168 moduleProxy
->getModuleName().c_str());
1172 sendUpServiceUpdate();
1175 ///////////////////////////////////////////////////////////////////////
1176 //// Virtuals from CAdminExecutorServiceSkel
1177 ///////////////////////////////////////////////////////////////////////
1179 // AS send orders for a shard
// Logs the request, then stores 'shardOrders' for 'shardName' in _ShardOrders and
// sets _NeedToWriteStateFile. 'sender' is not used in the visible code.
// NOTE(review): the body of the equality check below is not visible in this view;
// presumably it returns early when the orders are unchanged - confirm.
// NOTE(review): if _ShardOrders is a std::map, the operator[] in the comparison
// inserts a default-constructed entry for a shard that was unknown so far.
1180 virtual void setShardOrders(NLNET::IModuleProxy
*sender
, const std::string
&shardName
, const TShardOrders
&shardOrders
)
1182 nlinfo("AS setShardOrders for shard '%s' to '%s'", shardName
.c_str(), shardOrders
.toString().c_str());
1184 if (_ShardOrders
[shardName
] == shardOrders
)
1189 _ShardOrders
[shardName
] = shardOrders
;
1190 _NeedToWriteStateFile
= true;
1192 // nothing more to do, if service need to be started, they are started
1193 // by the module update function
1197 // AS send a command to shutdown a shard with a delay
// Builds a TStopingShardInfo for 'shardName' stamped with the current time
// (seconds since 1970), appends it to _StopingShards, logs the request and
// calls checkServiceToStop() right away.
// 'sender' is not used in the visible code.
// NOTE(review): no assignment of ssi.Delay from 'delay' is visible in this view,
// yet ssi.Delay is logged below; confirm it is set in the full file.
1198 virtual void shutdownShard(NLNET::IModuleProxy
*sender
, const std::string
&shardName
, uint32 delay
)
1200 TStopingShardInfo ssi
;
1201 ssi
.ShardName
= shardName
;
1203 ssi
.BeginDate
= CTime::getSecondsSince1970();
1205 _StopingShards
.push_back(ssi
);
1207 nlinfo("Received command to stop all service of shard %s in %us", ssi
.ShardName
.c_str(), ssi
.Delay
);
1209 // force a first update (to send the first warning message or stop immediately)
1210 checkServiceToStop();
1214 // AS send a control command to this AES
// Executes 'command' locally through ICommand::execute on behalf of
// 'serviceAlias' and sends the text gathered by a temporary displayer back to
// 'sender' with commandResult().
// If the alias matches neither an online service nor a registered one, an error
// result is sent to 'sender' instead.
// NOTE(review): parts of this function are not visible in this view (the body of
// the displayer class, the declaration of cmdLine, and whatever follows the
// error report - presumably an early return); confirm against the full file.
1215 virtual void controlCmd(NLNET::IModuleProxy
*sender
, uint32 commandId
, const std::string
&serviceAlias
, const std::string
&command
)
1217 // create a displayer to gather the output of the command
1218 class CStringDisplayer
: public IDisplayer
1221 virtual void doDisplay( const CLog::TDisplayInfo
& args
, const char *message
)
1229 nldebug("Control command from '%s' : '%s' '%s'",
1230 sender
->getModuleName().c_str(),
1231 serviceAlias
.c_str(),
1234 // look in the list of service for a matching one
1235 IModuleProxy
*service
= findOnlineService(serviceAlias
);
// Rejected only when the alias is both offline and unregistered; a registered
// but offline service still goes through the command execution below.
1236 if (service
== NULL
&& _RegisteredServices
.find(serviceAlias
) == _RegisteredServices
.end())
1238 CAdminServiceProxy
as(sender
);
1239 as
.commandResult(this, commandId
, serviceAlias
, "ERROR : AES : service not found will dispatching the control command");
1243 // ok, we can execute the command concerning the service.
1244 CStringDisplayer stringDisplayer
;
1245 IService::getInstance()->CommandLog
.addDisplayer(&stringDisplayer
);
1247 // build the command line
// Resulting form: "<command handler name>.<first word> <serviceAlias> <args>".
// NOTE(review): presumably firstWord(true) also removes that word from 'args' - confirm.
1248 CSString
args(command
);
1249 CSString cmdName
= args
.firstWord(true);
1251 cmdLine
<< getCommandHandlerName() << "." << cmdName
<< " " << serviceAlias
<< " " << args
;
1252 // retrieve the command from the input message and execute it
1253 nlinfo ("ADMIN: Executing control command : '%s' for service '%s'", cmdLine
.c_str(), serviceAlias
.c_str());
1254 ICommand::execute (cmdLine
, IService::getInstance()->CommandLog
);
1256 // unhook our displayer as its work is now done
1257 IService::getInstance()->CommandLog
.removeDisplayer(&stringDisplayer
);
1259 // send the result back to AS
1260 CAdminServiceProxy
as(sender
);
1261 as
.commandResult(this, commandId
, serviceAlias
, stringDisplayer
._Data
);
1264 // The result is sent back later by another message (handled by commandResult)
// Forwards 'command' to the online service 'serviceAlias' and records it in
// _PendingWebCommands under 'commandId', with its reception date and alias.
// When the service is not found, an "unknown service" error is sent to 'sender'.
// NOTE(review): the test on 'proxy' and what follows the error report are not
// visible in this view (presumably a NULL check and an early return) - confirm.
1265 virtual void serviceCmd(NLNET::IModuleProxy
*sender
, uint32 commandId
, const std::string
&serviceAlias
, const std::string
&command
)
1267 // look in the list of service for a matching one
1268 IModuleProxy
*proxy
= findOnlineService(serviceAlias
);
1271 CAdminServiceProxy
as(sender
);
1272 as
.commandResult(this, commandId
, serviceAlias
, "ERROR AES : unknown service");
1276 // ok, we found it !
1277 TPendingWebCommand pwc
;
1278 pwc
.Command
= command
;
1279 pwc
.ReceptionDate
= NLMISC::CTime::getSecondsSince1970();
1280 pwc
.ServiceAlias
= serviceAlias
;
// NOTE(review): if _PendingWebCommands is a std::map, insert() keeps an existing
// entry with the same commandId instead of replacing it - confirm ids are unique.
1282 _PendingWebCommands
.insert(make_pair(commandId
, pwc
));
1284 CAdminExecutorServiceClientProxy
service(proxy
);
1285 service
.serviceCmd(this, commandId
, command
);
1288 // AES client send back the result of execution of a command
// Looks 'commandId' up in _PendingWebCommands. For a known command the result
// is relayed to the admin service (only when _AdminService is set) and the
// pending entry is erased.
// A result for an unknown commandId produces a warning.
// NOTE(review): the end of the warning branch is not visible in this view
// (presumably the remaining nlwarning arguments and a return) - confirm.
1289 virtual void commandResult(NLNET::IModuleProxy
*sender
, uint32 commandId
, const std::string
&serviceAlias
, const std::string
&result
)
1291 // check for waiting commands
1292 TPendingWebCommands::iterator
it(_PendingWebCommands
.find(commandId
));
1294 if (it
== _PendingWebCommands
.end())
1297 nlwarning("CAdminExecutor::commandResult : service '%s' sent result for command ID %u but not in pending command table",
1298 sender
->getModuleName().c_str(),
1303 // send the result back to AS
1304 if (_AdminService
!= NULL
)
1306 CAdminServiceProxy
as(_AdminService
);
1308 as
.commandResult(this, commandId
, serviceAlias
, result
);
// The entry is removed by key; 'it' is not used for the erase.
1311 _PendingWebCommands
.erase(commandId
);
1315 // An AES send graph data update
1316 virtual void graphUpdate(NLNET::IModuleProxy
*sender
, const TGraphDatas
&graphDatas
)
1318 if (_AdminService
!= NULL
)
1320 CAdminServiceProxy
as(_AdminService
);
1321 as
.graphUpdate(this, graphDatas
);
1325 // A service high rez graph data update
1326 virtual void highRezGraphUpdate(NLNET::IModuleProxy
*sender
, const THighRezDatas
&graphDatas
)
1328 if (_AdminService
!= NULL
)
1330 CAdminServiceProxy
as(_AdminService
);
1331 as
.highRezGraphUpdate(this, graphDatas
);
1335 // A service sends an update of its status string
// Resolves the alias of 'sender' through _ConnectedServiceIndex, then its record
// in _ServiceStates, and refreshes LastStateDate with the current time.
// A sender that is not indexed only produces a warning; an alias missing from
// _ServiceStates trips the BOMB_IF, whose last argument is 'return'.
// NOTE(review): 'status' is not used in the lines visible here; the store of the
// status string is presumably on a line omitted from this view - confirm.
1336 virtual void serviceStatusUpdate(NLNET::IModuleProxy
*sender
, const std::string
&status
)
1338 TConnectedServiceIndex::iterator
it(_ConnectedServiceIndex
.find(sender
));
1339 if (it
== _ConnectedServiceIndex
.end())
1341 nlwarning("serviceStatusUpdate : service '%s' send status but is unknown !", sender
->getModuleName().c_str());
1345 string
&aliasName
= it
->second
;
1346 TServiceStates::iterator
it2(_ServiceStates
.find(aliasName
));
1347 BOMB_IF(it2
== _ServiceStates
.end(), "serviceStateUpdate : service '"
1348 <<sender
->getModuleName()
1349 <<"' send an update, but alias '"<<aliasName
<<"' is not found in service status", return);
1351 TServiceState
&ss
= it2
->second
;
1353 ss
.LastStateDate
= NLMISC::CTime::getSecondsSince1970();
1357 ///////////////////////////////////////////////////////////////////////
1358 //// commands handlers
1359 ///////////////////////////////////////////////////////////////////////
// Command handler table of CAdminExecutorService, declared as an extension of
// the CModuleBase table (per the macro used). Each entry names a command and
// gives two strings: a description and the expected arguments. The handlers
// themselves are the NLMISC_CLASS_COMMAND_DECL blocks that follow in this file.
1360 NLMISC_COMMAND_HANDLER_TABLE_EXTEND_BEGIN(CAdminExecutorService
, CModuleBase
)
1361 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService
, dump
, "Dump a status report to appropriate output logger", "no args")
1362 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService
, addRegisteredService
, "add a registered service", "<serviceAlias> <shardName>")
1363 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService
, removeRegisteredService
, "remove a registered service", "<serviceAlias>")
1364 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService
, startService
, "start a registered service", "<serviceAlias>")
1365 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService
, restartService
, "stop then start a registered service", "<serviceAlias>")
1366 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService
, stopService
, "stop a service (registered or not)", "<serviceAlias>")
1367 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService
, killService
, "kill a (possibly not responding) service (registered or not)", "<serviceAlias>")
1368 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService
, abortService
, "abort a (possibly not responding) service with SIGABORT (registered or not)", "<serviceAlias>")
1369 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService
, activateService
, "activate a service, i.e make it startable either manually or from a shard orders", "<serviceAlias>")
1370 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService
, deactivateService
, "deactivate a service, i.e make it unstartable (either manually or from a shard orders) and stop it if needed", "<serviceAlias>")
// The script path in this description duplicates the one hard-coded in the
// execScript handler; keep both in sync.
1371 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService
, execScript
, "execute the predefined bash script '/home/nevrax/patchman/aes_runnable_script.sh' and give it the passed parameters", "<any parameter>")
1372 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService
, resetStartCounter
, "reset the start counter to 0", "no params")
// NOTE(review): the description below contains the typo "aftert" (user-visible text).
1373 NLMISC_COMMAND_HANDLER_ADD(CAdminExecutorService
, stopShard
, "Stop all service of a given shard aftert the provided delay", "<shardName> <delay (in s)>")
1374 NLMISC_COMMAND_HANDLER_TABLE_END
// Command "stopShard <shardName> <delay (in s)>".
// Requires exactly two arguments, parses the delay from args[1], echoes the
// request on 'log' and delegates to shutdownShard() with a NULL sender.
// NOTE(review): the result of NLMISC::fromString is not checked in the visible
// code, so a non-numeric delay is not reported to the caller.
// NOTE(review): the declaration of 'delay' and the return statements are not
// visible in this view.
1377 NLMISC_CLASS_COMMAND_DECL(stopShard
)
1379 if (args
.size() != 2)
1382 string shardName
= args
[0];
1384 NLMISC::fromString(args
[1], delay
);
1386 log
.displayNL("Received command to stop all service of shard %s in %us", shardName
.c_str(), delay
);
1388 shutdownShard(NULL
, shardName
, delay
);
// Command "resetStartCounter" (no argument).
// Walks every record of _ServiceStates and calls resetCounter() on its
// RunnerLoopCounter.
// NOTE(review): the return statements are not visible in this view.
1394 NLMISC_CLASS_COMMAND_DECL(resetStartCounter
)
1396 if (args
.size() != 0)
1400 TServiceStates::iterator
first(_ServiceStates
.begin()), last(_ServiceStates
.end());
1401 for (; first
!= last
; ++first
)
1403 TServiceState
&ss
= first
->second
;
1405 ss
.RunnerLoopCounter
.resetCounter();
// Command "execScript <any parameter>".
// Builds a shell command from the fixed script path followed by every argument,
// redirects its standard output to "aes_command_output.log" in the temporary
// directory, runs it with system(), then echoes the return value and the
// content of that file, line by line, on 'log'.
// NOTE(review): arguments are appended unquoted to a string given to system(),
// so shell metacharacters in them are interpreted by the shell; confirm that
// only trusted operators can reach this command.
// NOTE(review): only stdout is redirected (" > "); the output file name is
// fixed, so two overlapping invocations would share it.
// NOTE(review): the declaration of 'output' and the return statement are not
// visible in this view.
1412 NLMISC_CLASS_COMMAND_DECL(execScript
)
1414 string
cmdLine("/home/nevrax/patchman/aes_runnable_script.sh");
1417 for (uint i
=0; i
<args
.size(); ++i
)
1419 cmdLine
+= " "+args
[i
];
1423 string logFile
= CPath::getTemporaryDirectory() + "aes_command_output.log";
1425 cmdLine
+= " > "+logFile
;
1427 log
.displayNL("Executing '%s'", cmdLine
.c_str());
1428 // execute the command
1429 int ret
= system(cmdLine
.c_str());
1431 // echo the output to the requester
1433 output
.readFromFile(logFile
);
1435 vector
<CSString
> lines
;
1436 output
.splitLines(lines
);
1438 log
.displayNL("Command returned value %d", ret
);
1439 log
.displayNL("-------------------- Command output begin -----------------------");
1440 for (uint i
=0; i
<lines
.size(); ++i
)
1442 log
.displayNL("%s", lines
[i
].c_str());
1444 log
.displayNL("-------------------- Command output end -----------------------");
// Command "deactivateService <serviceAlias>".
// An alias absent from _PersistentServiceOrders is refused with a message.
// Otherwise its persistent order becomes ro_deactivated and
// _NeedToWriteStateFile is set.
// NOTE(review): no stop request is issued in the visible lines although the
// command description says "stop it if needed"; presumably another part of the
// module acts on the new order - confirm.
// NOTE(review): the return statements are not visible in this view.
1448 NLMISC_CLASS_COMMAND_DECL(deactivateService
)
1450 if (args
.size() != 1)
1453 string serviceAlias
= args
[0];
1455 if (_PersistentServiceOrders
.find(serviceAlias
) == _PersistentServiceOrders
.end())
1457 log
.displayNL("Unregistered service '%s', could not deactivate it", serviceAlias
.c_str());
1461 _PersistentServiceOrders
[serviceAlias
] = TRunningOrders::ro_deactivated
;
1463 _NeedToWriteStateFile
= true;
1465 log
.displayNL("Service '%s' deactivated", serviceAlias
.c_str());
// Command "activateService <serviceAlias>".
// An alias absent from _PersistentServiceOrders is refused with a message.
// Otherwise its persistent order becomes ro_activated and
// _NeedToWriteStateFile is set. The service is not started by this handler.
// NOTE(review): the return statements are not visible in this view.
1470 NLMISC_CLASS_COMMAND_DECL(activateService
)
1472 if (args
.size() != 1)
1475 string serviceAlias
= args
[0];
1477 if (_PersistentServiceOrders
.find(serviceAlias
) == _PersistentServiceOrders
.end())
1479 log
.displayNL("Unregistered service '%s', could not activate it", serviceAlias
.c_str());
1483 _PersistentServiceOrders
[serviceAlias
] = TRunningOrders::ro_activated
;
1485 _NeedToWriteStateFile
= true;
1487 log
.displayNL("Service '%s' activated", serviceAlias
.c_str());
// Command "abortService <serviceAlias>".
// Reports an alias that has no record in _ServiceStates and a service whose
// state is rs_stopped; otherwise calls abortProgram() on the recorded PID.
// NOTE(review): the condition guarding the "no valid PID" message and the
// return statements are not visible in this view - confirm against the full file.
1492 NLMISC_CLASS_COMMAND_DECL(abortService
)
1494 if (args
.size() != 1)
1497 string serviceAlias
= args
[0];
1499 // check that the service is running
1500 TServiceStates::iterator
it(_ServiceStates
.find(serviceAlias
));
1501 if (it
== _ServiceStates
.end())
1503 log
.displayNL("Failed to found service '%s' in the list of running services", serviceAlias
.c_str());
1507 TServiceState
&ss
= it
->second
;
1508 if (ss
.RunningState
== TRunningState::rs_stopped
)
1510 log
.displayNL("The service to abort '%s' is currently stopped", serviceAlias
.c_str());
1515 log
.displayNL("AES have no valid PID to abort the service '%s'", serviceAlias
.c_str());
1520 log
.displayNL("Aborting service '%s' with pid %u", serviceAlias
.c_str(), ss
.PID
);
1521 abortProgram(ss
.PID
);
// Command "killService <serviceAlias>".
// Reports an alias that has no record in _ServiceStates and a service whose
// state is rs_stopped; otherwise calls killProgram() on the recorded PID.
// NOTE(review): the condition guarding the "no valid PID" message and the
// return statements are not visible in this view - confirm against the full file.
1526 NLMISC_CLASS_COMMAND_DECL(killService
)
1528 if (args
.size() != 1)
1531 string serviceAlias
= args
[0];
1533 // check that the service is running
1534 TServiceStates::iterator
it(_ServiceStates
.find(serviceAlias
));
1535 if (it
== _ServiceStates
.end())
1537 log
.displayNL("Failed to found service '%s' in the list of running services", serviceAlias
.c_str());
1541 TServiceState
&ss
= it
->second
;
1542 if (ss
.RunningState
== TRunningState::rs_stopped
)
1544 log
.displayNL("The service to kill '%s' is currently stopped", serviceAlias
.c_str());
1549 log
.displayNL("AES have no valid PID to kill the service '%s'", serviceAlias
.c_str());
1553 log
.displayNL("Killing service '%s' with pid %u", serviceAlias
.c_str(), ss
.PID
);
1554 killProgram(ss
.PID
);
// Command "stopService <serviceAlias>".
// Refuses an alias without a record in _ServiceStates, and refuses to stop a
// service whose shard orders are so_autostart_on. Otherwise the stop is
// delegated to the stopService(alias) member and the outcome is reported.
// NOTE(review): the return statements are not visible in this view.
1559 NLMISC_CLASS_COMMAND_DECL(stopService
)
1561 if (args
.size() != 1)
1564 string serviceAlias
= args
[0];
1566 if (_ServiceStates
.find(serviceAlias
) == _ServiceStates
.end())
1568 log
.displayNL("Unknown service '%s', could not stop it", serviceAlias
.c_str());
1572 TServiceState
&ss
= _ServiceStates
[serviceAlias
];
1573 // look for a shard orders for this service
1574 TShardsOrders::iterator
it(_ShardOrders
.find(ss
.ShardName
));
1575 if (it
!= _ShardOrders
.end())
1577 TShardOrders
&so
= it
->second
;
1578 if (so
== TShardOrders::so_autostart_on
)
1580 log
.displayNL("Can't stop service '%s' because shard '%s' is autostarting, considers either to deactivate the service or just restart it",
1581 serviceAlias
.c_str(),
1582 ss
.ShardName
.c_str());
// NOTE(review): "Failed" is reported when stopService() returns true, whereas the
// startService command reports failure on '!startService(...)'. Check the return
// convention of stopService(); this test may be inverted.
1587 if (stopService(serviceAlias
))
1588 log
.displayNL("Failed to stop the service '%s'", serviceAlias
.c_str());
1590 log
.displayNL("Service '%s' stop request done", serviceAlias
.c_str());
// Command "restartService <serviceAlias>".
// Refused when the alias is not in _RegisteredServices, when its persistent
// order is ro_deactivated, or when it has no record in _ServiceStates.
// Otherwise a deferred start is written with writeServiceLaunchCtrl(alias, true,
// LAUNCH_CTRL_START) and a "quit" command is sent to the service through its
// AES client module proxy.
// NOTE(review): the first message below starts with "startService" although it
// is emitted by restartService (user-visible text, left unchanged).
// NOTE(review): the return statements are not visible in this view.
1595 NLMISC_CLASS_COMMAND_DECL(restartService
)
1597 if (args
.size() != 1)
1600 string serviceAlias
= args
[0];
1602 if (_RegisteredServices
.find(serviceAlias
) == _RegisteredServices
.end())
1604 log
.displayNL("startService %s : the service in not registered, can't restart it", serviceAlias
.c_str());
1608 // look for service orders for this service
1609 if (_PersistentServiceOrders
.find(serviceAlias
) != _PersistentServiceOrders
.end())
1611 if (_PersistentServiceOrders
[serviceAlias
] == TRunningOrders::ro_deactivated
)
1613 log
.displayNL("Can't restart service '%s' because it is currently deactivated", serviceAlias
.c_str());
1620 // check that the service is running
1621 TServiceStates::iterator
it(_ServiceStates
.find(serviceAlias
));
1622 if (it
== _ServiceStates
.end())
1624 log
.displayNL("Failed to found service '%s' in the list of running services", serviceAlias
.c_str());
1628 // write the deferred start command
// The start order is written before the quit request is sent.
1629 if (!writeServiceLaunchCtrl(serviceAlias
, true, LAUNCH_CTRL_START
))
1631 log
.displayNL("Failed to write deferred start control file to restart service '%s'", serviceAlias
.c_str());
1635 log
.displayNL("Service '%s' start command written", serviceAlias
.c_str());
1637 if (it
->second
.ServiceModule
== NULL
)
1639 log
.displayNL("The AES client module proxy is null ! can't send 'quit' command");
1642 // send the "quit" command to the service
// Command id 0 is used and nothing is added to _PendingWebCommands here.
1643 CAdminExecutorServiceClientProxy
aec(it
->second
.ServiceModule
);
1644 aec
.serviceCmd(this, 0, "quit");
1645 log
.displayNL("Service '%s' command 'quit' sent", serviceAlias
.c_str());
// Command "startService <serviceAlias>".
// Refused when the alias has no record in _ServiceStates, when its persistent
// order is ro_deactivated, or when the orders of its shard are so_autostart_on.
// Otherwise the start is delegated to the startService(alias) member, whose
// false result is reported as a failure.
// NOTE(review): the return statements are not visible in this view.
1650 NLMISC_CLASS_COMMAND_DECL(startService
)
1652 if (args
.size() != 1)
1655 string serviceAlias
= args
[0];
1657 if (_ServiceStates
.find(serviceAlias
) == _ServiceStates
.end())
1659 log
.displayNL("Unknown service '%s', could not start it", serviceAlias
.c_str());
1663 TServiceState
&ss
= _ServiceStates
[serviceAlias
];
1665 // look for service orders for this service
1666 if (_PersistentServiceOrders
.find(serviceAlias
) != _PersistentServiceOrders
.end())
1668 if (_PersistentServiceOrders
[serviceAlias
] == TRunningOrders::ro_deactivated
)
1670 log
.displayNL("Can't start service '%s' because it is curently deactivated", serviceAlias
.c_str());
1675 // look for a shard orders for this service
1676 TShardsOrders::iterator
it(_ShardOrders
.find(ss
.ShardName
));
1677 if (it
!= _ShardOrders
.end())
1679 TShardOrders
&so
= it
->second
;
1680 if (so
== TShardOrders::so_autostart_on
)
1682 log
.displayNL("Can't start service '%s' because shard '%s' is autostarting, consider to restart it",
1683 serviceAlias
.c_str(),
1684 ss
.ShardName
.c_str());
1689 if (!startService(serviceAlias
))
1690 log
.displayNL("Failed to start service '%s'", serviceAlias
.c_str());
1692 log
.displayNL("Service '%s' start command written", serviceAlias
.c_str());
// Command "removeRegisteredService <serviceAlias>".
// Removes the alias from _RegisteredServices. A stopped service also loses its
// _ServiceStates record; otherwise the record is kept and its running tags are
// switched to rt_externaly_started. The persistent order of the alias is
// erased, _NeedToWriteStateFile is set and sendUpServiceUpdate() is called.
// NOTE(review): the "Unknown service" message below says "could not start it"
// although this command removes a service (user-visible text, left unchanged).
// NOTE(review): an else and at least one statement are not visible in this
// view; the tag updates must not run after the record was erased, because 'ss'
// refers to that record.
1697 NLMISC_CLASS_COMMAND_DECL(removeRegisteredService
)
1699 if (args
.size() != 1)
1702 string serviceAlias
= args
[0];
1704 if (_ServiceStates
.find(serviceAlias
) == _ServiceStates
.end())
1706 log
.displayNL("Unknown service '%s', could not start it", serviceAlias
.c_str());
1710 TServiceState
&ss
= _ServiceStates
[serviceAlias
];
1712 _RegisteredServices
.erase(serviceAlias
);
1714 if (ss
.RunningState
== TRunningState::rs_stopped
)
1716 // remove the record
1717 _ServiceStates
.erase(serviceAlias
);
1721 // just update some data related the registered service
1723 ss
.RunningTags
.erase(TRunningTag::rt_locally_started
);
1724 ss
.RunningTags
.erase(TRunningTag::rt_chain_crashing
);
1725 ss
.RunningTags
.insert(TRunningTag::rt_externaly_started
);
1729 _PersistentServiceOrders
.erase(serviceAlias
);
1730 _NeedToWriteStateFile
= true;
1732 // update the state of services to the AS
1733 sendUpServiceUpdate();
// Command "addRegisteredService <serviceAlias> <shardName>".
// Adds the alias to _RegisteredServices, makes sure it has a record in
// _ServiceStates and sets the ShardName of that record. An alias without a
// persistent order gets ro_activated, which also sets _NeedToWriteStateFile.
// Ends by calling sendUpServiceUpdate().
// NOTE(review): the return statements are not visible in this view.
1738 NLMISC_CLASS_COMMAND_DECL(addRegisteredService
)
1740 if (args
.size() != 2)
1743 string serviceAlias
= args
[0];
1744 string shardName
= args
[1];
1746 _RegisteredServices
.insert(serviceAlias
);
// NOTE(review): if _ServiceStates is a std::map, insert() keeps an existing
// record for this alias; only its ShardName is overwritten on the next line.
1747 _ServiceStates
.insert(make_pair(serviceAlias
, TServiceState()));
1748 _ServiceStates
[serviceAlias
].ShardName
= shardName
;
1749 // _ServiceRunnerLoopCounters.insert(make_pair(serviceAlias, TRunnerLoopCounter()));
// An existing persistent order (for instance a deactivation) is left untouched.
1751 if (_PersistentServiceOrders
.find(serviceAlias
) == _PersistentServiceOrders
.end())
1753 _PersistentServiceOrders
[serviceAlias
] = TRunningOrders::ro_activated
;
1754 _NeedToWriteStateFile
= true;
1757 // update the state of services to the AS
1758 sendUpServiceUpdate();
// Command "dump" (no argument).
// Calls the CModuleBase dump first, then writes on 'log':
//  - every known shard with its orders;
//  - a notice when _ShutdownForPatch is set;
//  - every record of _ServiceStates: alias, registration status, shard name,
//    running state and tags, shard-orders usage, PID (when not stopped),
//    service orders, names, state string with its age, and three start counters.
// NOTE(review): several lines of this function are not visible in this view
// (braces, an else, some call arguments, the declaration of c1/c2/c3, return).
1763 NLMISC_CLASS_COMMAND_DECL(dump
)
1765 NLMISC_CLASS_COMMAND_CALL_BASE(CModuleBase
, dump
);
1767 log
.displayNL("===============================");
1768 log
.displayNL(" Dumping Admin executor states");
1769 log
.displayNL("===============================");
// NOTE(review): size() is passed to a %u conversion here and for the services
// below; if it returns a 64-bit size_t the argument does not match the format.
1772 log
.displayNL(" There are %u known shard :", _ShardOrders
.size());
1774 TShardsOrders::iterator
first(_ShardOrders
.begin()), last(_ShardOrders
.end());
1775 for (; first
!= last
; ++first
)
1777 log
.displayNL(" + Shard '%s' orders is '%s'", first
->first
.c_str(), first
->second
.toString().c_str());
1780 if (_ShutdownForPatch
)
1781 log
.displayNL(" All service are shuting down for patch !");
1782 log
.displayNL(" There are %u known services :", _ServiceStates
.size());
1783 TServiceStates::iterator
first(_ServiceStates
.begin()), last(_ServiceStates
.end());
1784 for (; first
!= last
; ++first
)
1786 TServiceState
&ss
= first
->second
;
1787 const string
&aliasName
= first
->first
;
// Running tags are rendered as a sequence of "<tag>" items.
1789 CSString runningTags
;
1790 set
<TRunningTag
>::iterator
rtf(ss
.RunningTags
.begin()), rte(ss
.RunningTags
.end());
1791 for (; rtf
!= rte
; ++rtf
)
1793 runningTags
<<"<"<<rtf
->toString()<<">";
1796 bool registered
= _RegisteredServices
.find(aliasName
) != _RegisteredServices
.end();
1798 log
.displayNL(" + Service alias='%s' (%s) ShardName = '%s' RunningState='%s' RunningTag='%s'",
1800 registered
? "REGISTERED" : "NOT REGISTERED",
1801 ss
.ShardName
.c_str(),
1802 ss
.RunningState
.toString().c_str(),
1803 runningTags
.c_str());
1805 log
.display(" | %s", ss
.DontUseShardOrders
? "DontUseShardOders" : "UseShardOrders");
1807 if (ss
.RunningState
!= TRunningState::rs_stopped
)
1809 // the pid should be valid
1810 log
.display(" PID=%u", ss
.PID
);
// NOTE(review): if _PersistentServiceOrders is a std::map, this operator[] inserts
// a default order for an alias that has none, so dumping would modify the table.
1814 log
.display(" ServiceOrders=%s", _PersistentServiceOrders
[aliasName
].toString().c_str());
1819 if (ss
.ServiceModule
!= NULL
)
1821 // dump a connected service
1822 log
.displayNL(" | longName='%s' shortName='%s' moduleName='%s'",
1823 ss
.LongName
.c_str(),
1824 ss
.ShortName
.c_str(),
1825 ss
.ServiceModule
->getModuleName().c_str());
1826 log
.displayNL(" | State '%s' (last received %sago)", ss
.State
.c_str(), NLMISC::CTime::getHumanRelativeTime(NLMISC::CTime::getSecondsSince1970() - ss
.LastStateDate
).c_str());
1830 // dump a offline registered service
1831 // (same output as for a connected service, without the module name)
1832 log
.displayNL(" | longName='%s' shortName='%s' ",
1833 ss
.LongName
.c_str(),
1834 ss
.ShortName
.c_str());
1835 log
.displayNL(" | State '%s' (last received %sago)", ss
.State
.c_str(), NLMISC::CTime::getHumanRelativeTime(NLMISC::CTime::getSecondsSince1970() - ss
.LastStateDate
).c_str());
// The three counters are printed against periods of 1, 3 and CRASH_COUNTER_SLOT
// roll delays, expressed in minutes.
1840 ss
.RunnerLoopCounter
.getCounters(c1
, c2
, c3
);
1841 log
.displayNL(" | Service Runner Start counter (%u mn:%u run, %u mn:%u run, %u mn:%u run)",
1842 CRASH_COUNTER_ROLL_DELAY
/60, c1
,
1843 (CRASH_COUNTER_ROLL_DELAY
*3)/60, c2
,
1844 (CRASH_COUNTER_ROLL_DELAY
*CRASH_COUNTER_SLOT
)/60, c3
);
1855 NLNET_REGISTER_MODULE_FACTORY(CAdminExecutorService
, "AdminExecutorService");
1857 } // namespace ADMIN