Updated EllieBot.Coordinator
This commit is contained in:
parent
960f6f093c
commit
c69f7951a7
18 changed files with 993 additions and 2 deletions
16
EllieBot.sln
16
EllieBot.sln
|
@ -17,7 +17,11 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "EllieBot", "src\EllieBot\El
|
|||
EndProject
|
||||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ayu", "ayu", "{872A4C63-833C-4AE0-91AB-3CE348D3E6F8}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Ayu.Discord.Voice", "src\ayu\Ayu.Discord.Voice\Ayu.Discord.Voice.csproj", "{5AD2EFFB-7774-49B2-A791-3BAC4DAEE067}"
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Ayu.Discord.Voice", "src\ayu\Ayu.Discord.Voice\Ayu.Discord.Voice.csproj", "{5AD2EFFB-7774-49B2-A791-3BAC4DAEE067}"
|
||||
EndProject
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "EllieBot.Tests", "src\EllieBot.Tests\EllieBot.Tests.csproj", "{179DF3B3-AD32-4335-8231-9818338DF3A2}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "EllieBot.Coordinator", "src\EllieBot.Coordinator\EllieBot.Coordinator.csproj", "{A631DDF0-3AD1-4CB9-8458-314B1320868A}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
|
@ -33,6 +37,14 @@ Global
|
|||
{5AD2EFFB-7774-49B2-A791-3BAC4DAEE067}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{5AD2EFFB-7774-49B2-A791-3BAC4DAEE067}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{5AD2EFFB-7774-49B2-A791-3BAC4DAEE067}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{179DF3B3-AD32-4335-8231-9818338DF3A2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{179DF3B3-AD32-4335-8231-9818338DF3A2}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{179DF3B3-AD32-4335-8231-9818338DF3A2}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{179DF3B3-AD32-4335-8231-9818338DF3A2}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{A631DDF0-3AD1-4CB9-8458-314B1320868A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{A631DDF0-3AD1-4CB9-8458-314B1320868A}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{A631DDF0-3AD1-4CB9-8458-314B1320868A}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{A631DDF0-3AD1-4CB9-8458-314B1320868A}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
@ -41,6 +53,8 @@ Global
|
|||
{BCB21472-84D2-4B63-B5DD-31E6A3EC9791} = {B28FB883-9688-41EB-BF5A-945F4A4EB628}
|
||||
{872A4C63-833C-4AE0-91AB-3CE348D3E6F8} = {B28FB883-9688-41EB-BF5A-945F4A4EB628}
|
||||
{5AD2EFFB-7774-49B2-A791-3BAC4DAEE067} = {872A4C63-833C-4AE0-91AB-3CE348D3E6F8}
|
||||
{179DF3B3-AD32-4335-8231-9818338DF3A2} = {B28FB883-9688-41EB-BF5A-945F4A4EB628}
|
||||
{A631DDF0-3AD1-4CB9-8458-314B1320868A} = {B28FB883-9688-41EB-BF5A-945F4A4EB628}
|
||||
EndGlobalSection
|
||||
GlobalSection(ExtensibilityGlobals) = postSolution
|
||||
SolutionGuid = {79F61C2C-CDBB-4361-A234-91A0B334CFE4}
|
||||
|
|
3
TODO.md
3
TODO.md
|
@ -1,3 +1,4 @@
|
|||
# List of things to do
|
||||
|
||||
- Finish the full system rewrite
|
||||
- Finish the full system rewrite
|
||||
- Finish the EllieBot.Tests project
|
47
src/EllieBot.Coordinator/CoordStartup.cs
Normal file
47
src/EllieBot.Coordinator/CoordStartup.cs
Normal file
|
@ -0,0 +1,47 @@
|
|||
using Microsoft.AspNetCore.Builder;
|
||||
using Microsoft.AspNetCore.Hosting;
|
||||
using Microsoft.AspNetCore.Http;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
|
||||
namespace EllieBot.Coordinator
|
||||
{
|
||||
/// <summary>
/// ASP.NET Core startup type for the coordinator host. Registers gRPC and the
/// <see cref="CoordinatorRunner"/>, then maps the gRPC service plus a plain-text root route.
/// </summary>
public class CoordStartup
{
    /// <summary>Configuration object supplied by the host when this type is constructed.</summary>
    public IConfiguration Configuration { get; }

    /// <summary>Stores the supplied configuration.</summary>
    /// <param name="config">Configuration provided by the host.</param>
    public CoordStartup(IConfiguration config)
    {
        Configuration = config;
    }

    /// <summary>Registers gRPC and the coordinator runner with the container.</summary>
    /// <param name="services">Service collection to add registrations to.</param>
    public void ConfigureServices(IServiceCollection services)
    {
        services.AddGrpc();

        // The runner is registered once as itself and once as a hosted service that resolves
        // to that same singleton, so the instance the gRPC service talks to is the one the
        // host runs in the background.
        services.AddSingleton<CoordinatorRunner>();
        services.AddSingleton<IHostedService, CoordinatorRunner>(
            provider => provider.GetRequiredService<CoordinatorRunner>());
    }

    /// <summary>Builds the request pipeline: routing, the gRPC service and a fallback root route.</summary>
    /// <param name="app">Application builder for the pipeline.</param>
    /// <param name="env">Hosting environment; the developer exception page is only enabled in Development.</param>
    public void Configure(IApplicationBuilder app, IWebHostEnvironment env)
    {
        const string grpcOnlyNotice =
            "Communication with gRPC endpoints must be made through a gRPC client. To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909";

        if (env.IsDevelopment())
            app.UseDeveloperExceptionPage();

        app.UseRouting();

        app.UseEndpoints(routes =>
        {
            routes.MapGrpcService<CoordinatorService>();

            // Plain HTTP GET on the root gets a short notice instead of a gRPC response.
            routes.MapGet("/", async httpContext => await httpContext.Response.WriteAsync(grpcOnlyNotice));
        });
    }
}
|
||||
}
|
19
src/EllieBot.Coordinator/EllieBot.Coordinator.csproj
Normal file
19
src/EllieBot.Coordinator/EllieBot.Coordinator.csproj
Normal file
|
@ -0,0 +1,19 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk.Web">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<Protobuf Include="Protos\coordinator.proto" GrpcServices="Server" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Grpc.AspNetCore" Version="2.47.0" />
|
||||
<PackageReference Include="Serilog" Version="2.11.0" />
|
||||
<PackageReference Include="Serilog.Sinks.Console" Version="4.0.1" />
|
||||
<PackageReference Include="Serilog.Sinks.File" Version="5.0.0" />
|
||||
<PackageReference Include="YamlDotNet" Version="11.2.1" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
43
src/EllieBot.Coordinator/LogSetup.cs
Normal file
43
src/EllieBot.Coordinator/LogSetup.cs
Normal file
|
@ -0,0 +1,43 @@
|
|||
using System;
|
||||
using System.Text;
|
||||
using Serilog;
|
||||
using Serilog.Events;
|
||||
using Serilog.Sinks.SystemConsole.Themes;
|
||||
|
||||
namespace EllieBot.Services
|
||||
{
|
||||
/// <summary>Configures the process-wide Serilog logger.</summary>
public static class LogSetup
{
    /// <summary>
    /// Builds the global <see cref="Log.Logger"/> with a size-rolled file sink ("coord.log")
    /// and a console sink, then switches the console output encoding to UTF-8.
    /// </summary>
    /// <param name="source">Value attached to every event as the <c>LogSource</c> property.</param>
    public static void SetupLogger(object source)
    {
        const string consoleTemplate =
            "[{Timestamp:HH:mm:ss} {Level:u3}] | #{LogSource} | {Message:lj}{NewLine}{Exception}";
        const long maxLogFileBytes = 10_000_000;

        var configuration = new LoggerConfiguration()
            .MinimumLevel.Override("Microsoft", LogEventLevel.Information)
            .MinimumLevel.Override("System", LogEventLevel.Information)
            .MinimumLevel.Override("Microsoft.AspNetCore", LogEventLevel.Warning)
            .Enrich.FromLogContext()
            .WriteTo.File(
                "coord.log",
                LogEventLevel.Information,
                rollOnFileSizeLimit: true,
                fileSizeLimitBytes: maxLogFileBytes)
            .WriteTo.Console(
                LogEventLevel.Information,
                theme: GetTheme(),
                outputTemplate: consoleTemplate)
            .Enrich.WithProperty("LogSource", source);

        Log.Logger = configuration.CreateLogger();

        Console.OutputEncoding = Encoding.UTF8;
    }

    /// <summary>
    /// Picks the console theme: ANSI colours on Unix and in DEBUG builds,
    /// no theme for non-Unix release builds.
    /// </summary>
    private static ConsoleTheme GetTheme()
    {
        if (Environment.OSVersion.Platform != PlatformID.Unix)
        {
#if DEBUG
            return AnsiConsoleTheme.Code;
#else
            return ConsoleTheme.None;
#endif
        }

        return AnsiConsoleTheme.Code;
    }
}
|
||||
}
|
20
src/EllieBot.Coordinator/Program.cs
Normal file
20
src/EllieBot.Coordinator/Program.cs
Normal file
|
@ -0,0 +1,20 @@
|
|||
using System;
|
||||
using Microsoft.AspNetCore.Hosting;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using EllieBot.Coordinator;
|
||||
using EllieBot.Services;
|
||||
using Serilog;
|
||||
|
||||
// Entry point: set up logging first so the startup message is captured, then build and run the host.
LogSetup.SetupLogger("coord");
Log.Information("Starting coordinator... Pid: {ProcessId}", Environment.ProcessId);

var host = CreateHostBuilder(args).Build();
host.Run();

// Additional configuration is required to successfully run gRPC on macOS.
// For instructions on how to configure Kestrel and gRPC clients on macOS, visit https://go.microsoft.com/fwlink/?linkid=2099682
static IHostBuilder CreateHostBuilder(string[] args)
{
    // Default host plus web-host defaults, with CoordStartup wiring services and the pipeline.
    var builder = Host.CreateDefaultBuilder(args);
    return builder.ConfigureWebHostDefaults(web => web.UseStartup<CoordStartup>());
}
|
13
src/EllieBot.Coordinator/Properties/launchSettings.json
Normal file
13
src/EllieBot.Coordinator/Properties/launchSettings.json
Normal file
|
@ -0,0 +1,13 @@
|
|||
{
|
||||
"profiles": {
|
||||
"EllieBot.Coordinator": {
|
||||
"commandName": "Project",
|
||||
"dotnetRunMessages": "true",
|
||||
"launchBrowser": false,
|
||||
"applicationUrl": "http://localhost:3442;https://localhost:3443",
|
||||
"environmentVariables": {
|
||||
"ASPNETCORE_ENVIRONMENT": "Development"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
127
src/EllieBot.Coordinator/Protos/coordinator.proto
Normal file
127
src/EllieBot.Coordinator/Protos/coordinator.proto
Normal file
|
@ -0,0 +1,127 @@
|
|||
syntax = "proto3";

import "google/protobuf/timestamp.proto";

option csharp_namespace = "EllieBot.Coordinator";

package elliebot;

// RPC surface of the shard coordinator.
service Coordinator {
  // Sent by a shard to let the coordinator know it is alive.
  rpc Heartbeat(HeartbeatRequest) returns (HeartbeatReply);

  // Restarts the shard with the given id.
  rpc RestartShard(RestartShardRequest) returns (RestartShardReply);

  // Reshards to the given new number of shards.
  rpc Reshard(ReshardRequest) returns (ReshardReply);

  // Reloads the coordinator config.
  rpc Reload(ReloadRequest) returns (ReloadReply);

  // Gets the status of a single shard.
  rpc GetStatus(GetStatusRequest) returns (GetStatusReply);

  // Gets the status of all shards.
  rpc GetAllStatuses(GetAllStatusesRequest) returns (GetAllStatusesReply);

  // Restarts all shards by queueing them to be restarted at a normal rate.
  // Setting nuke to true kills all shards right away.
  rpc RestartAllShards(RestartAllRequest) returns (RestartAllReply);

  // Kills the coordinator (and all shards as a consequence).
  rpc Die(DieRequest) returns (DieReply);

  // Replaces the coordinator config with the supplied YAML text.
  rpc SetConfigText(SetConfigTextRequest) returns (SetConfigTextReply);

  // Returns the coordinator config as YAML text.
  rpc GetConfigText(GetConfigTextRequest) returns (GetConfigTextReply);
}

// Connection state a shard reports about itself.
enum ConnState {
  Disconnected = 0;
  Connecting = 1;
  Connected = 2;
}

message HeartbeatRequest {
  // id of the reporting shard
  int32 shardId = 1;
  // guild count reported by the shard
  int32 guildCount = 2;
  // connection state reported by the shard
  ConnState state = 3;
}

message HeartbeatReply {
  // true once a graceful coordinator shutdown has been requested
  bool gracefulImminent = 1;
}

message RestartShardRequest {
  int32 shardId = 1;
  // should it be queued for restart, set false to kill it and restart immediately with priority
  bool queue = 2;
}

message RestartShardReply {

}

message ReshardRequest {
  // new total number of shards
  int32 shards = 1;
}

message ReshardReply {

}

message ReloadRequest {

}

message ReloadReply {

}

message GetStatusRequest {
  int32 shardId = 1;
}

// Status snapshot of one shard.
message GetStatusReply {
  int32 shardId = 1;
  ConnState state = 2;
  int32 guildCount = 3;
  // time of the last status update recorded for the shard
  google.protobuf.Timestamp lastUpdate = 4;
  // whether the shard is flagged to be restarted
  bool scheduledForRestart = 5;
  // start time of the shard's process
  google.protobuf.Timestamp startedAt = 6;
}

message GetAllStatusesRequest {

}

message GetAllStatusesReply {
  repeated GetStatusReply Statuses = 1;
}

message RestartAllRequest {
  // kill all shards right away instead of only queueing restarts
  bool nuke = 1;
}

message RestartAllReply {

}

message DieRequest {
  // request a graceful shutdown
  bool graceful = 1;
}

message DieReply {

}

message GetConfigTextRequest {

}

message GetConfigTextReply {
  // config file contents (YAML)
  string configYml = 1;
}

message SetConfigTextRequest {
  // new config file contents (YAML)
  string configYml = 1;
}

message SetConfigTextReply {
  bool success = 1;
  // error message when success is false
  string error = 2;
}
|
11
src/EllieBot.Coordinator/README.md
Normal file
11
src/EllieBot.Coordinator/README.md
Normal file
|
@ -0,0 +1,11 @@
|
|||
# Coordinator project
|
||||
|
||||
gRPC-based coordinator for running a sharded EllieBot. Its purpose is to control the lifetime and check the status of the shards it creates.
|
||||
|
||||
### Supports
|
||||
|
||||
- Checking status
|
||||
- Individual shard restarts
|
||||
- Full shard restarts
|
||||
- Graceful coordinator restarts (restart/update coordinator without killing shards)
|
||||
- Kill/Stop
|
457
src/EllieBot.Coordinator/Services/CoordinatorRunner.cs
Normal file
457
src/EllieBot.Coordinator/Services/CoordinatorRunner.cs
Normal file
|
@ -0,0 +1,457 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Text.Json;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Serilog;
|
||||
using YamlDotNet.Serialization;
|
||||
|
||||
namespace EllieBot.Coordinator
|
||||
{
|
||||
/// <summary>
/// Background service that owns the shard processes: it starts them, restarts the ones that
/// are scheduled for restart, unresponsive, stuck or dead, and records the state shards
/// report through heartbeats. All access to the shard table and config goes through
/// <c>locker</c>.
/// </summary>
public sealed class CoordinatorRunner : BackgroundService
{
    private const string CONFIG_PATH = "coord.yml";

    private const string GRACEFUL_STATE_PATH = "graceful.json";
    private const string GRACEFUL_STATE_BACKUP_PATH = "graceful_old.json";

    // Id of the ellie server. The shard that handles it, (id >> 22) % TotalShards,
    // is started right after shard 0.
    private const long PRIORITY_GUILD_ID = 1173494918812024863;

    // A shard reporting the same non-connected state more than this many times in a row is restarted.
    private const int MAX_STUCK_HEARTBEATS = 8;

    private readonly Serializer _serializer;
    private readonly Deserializer _deserializer;

    private Config _config;
    private ShardStatus[] _shardStatuses;

    private readonly object locker = new object();
    private readonly Random _rng;
    private bool _gracefulImminent;

    /// <summary>
    /// Loads the config from disk and either restores the state saved by a previous
    /// coordinator or initializes a fresh status entry for every shard.
    /// </summary>
    public CoordinatorRunner()
    {
        _serializer = new();
        _deserializer = new();
        _config = LoadConfig();
        _rng = new Random();

        if (!TryRestoreOldState())
            InitAll();
    }

    /// <summary>Reads and deserializes the config file.</summary>
    private Config LoadConfig()
    {
        lock (locker)
        {
            return _deserializer.Deserialize<Config>(File.ReadAllText(CONFIG_PATH));
        }
    }

    /// <summary>Serializes <paramref name="config"/> and overwrites the config file with it.</summary>
    private void SaveConfig(in Config config)
    {
        lock (locker)
        {
            var output = _serializer.Serialize(config);
            File.WriteAllText(CONFIG_PATH, output);
        }
    }

    /// <summary>
    /// Re-reads the config file. When the total shard count changed, every shard process is
    /// killed and the shard table is rebuilt for the new count.
    /// </summary>
    public void ReloadConfig()
    {
        lock (locker)
        {
            var newConfig = LoadConfig();
            var shardCountChanged = _config.TotalShards != newConfig.TotalShards;

            // Kill with the old table before swapping the config, then rebuild with the new count.
            if (shardCountChanged)
                KillAll();

            _config = newConfig;

            if (shardCountChanged)
                InitAll();
        }
    }

    /// <summary>
    /// Main loop. Each pass (re)starts at most one shard, waits the configured recheck
    /// interval if it did, and then always waits another five seconds.
    /// </summary>
    /// <param name="stoppingToken">Signalled by the host when the service should stop.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                bool hadAction;
                lock (locker)
                {
                    hadAction = TryRestartOneShard(stoppingToken);
                }

                if (hadAction)
                {
                    await Task.Delay(_config.RecheckIntervalMs, stoppingToken).ConfigureAwait(false);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                // Shutdown was requested while waiting; that is not a coordinator error.
                break;
            }
            catch (Exception ex)
            {
                Log.Error(ex, "Error in coordinator: {Message}", ex.Message);
            }

            await Task.Delay(5000, stoppingToken).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Walks the shards in startup order and (re)starts the first one that needs it.
    /// Must be called while holding <c>locker</c>.
    /// </summary>
    /// <returns><c>true</c> when a shard was started.</returns>
    private bool TryRestartOneShard(CancellationToken stoppingToken)
    {
        foreach (var shardId in GetStartupOrder())
        {
            if (stoppingToken.IsCancellationRequested)
                return false;

            var reasonTemplate = GetRestartReason(_shardStatuses[shardId]);
            if (reasonTemplate is null)
                continue;

            Log.Warning(reasonTemplate, shardId);
            StartShard(shardId);
            return true;
        }

        return false;
    }

    /// <summary>
    /// Shard 0 first, then the ellie server shard, then all remaining shards in random order.
    /// </summary>
    private List<int> GetStartupOrder()
    {
        var totalShards = _config.TotalShards;

        return Enumerable.Range(0, 1)
            .Append((int)((PRIORITY_GUILD_ID >> 22) % totalShards))
            .Concat(Enumerable.Range(1, totalShards - 1)
                .OrderBy(_ => _rng.Next()))
            .Distinct()
            .ToList();
    }

    /// <summary>
    /// Decides whether a shard has to be (re)started.
    /// </summary>
    /// <returns>The log message template describing the reason, or <c>null</c> when the shard is fine.</returns>
    private string GetRestartReason(ShardStatus status)
    {
        if (status.ShouldRestart)
            return "Shard {ShardId} is restarting (scheduled)...";

        if (DateTime.UtcNow - status.LastUpdate > TimeSpan.FromSeconds(_config.UnresponsiveSec))
            return "Shard {ShardId} is restarting (unresponsive)...";

        if (status.StateCounter > MAX_STUCK_HEARTBEATS && status.State != ConnState.Connected)
            return "Shard {ShardId} is restarting (stuck)...";

        try
        {
            if (status.Process is null or { HasExited: true })
                return "Shard {ShardId} is starting (process)...";
        }
        catch (InvalidOperationException)
        {
            // HasExited throws when the Process object has no usable process behind it.
            return "Process for shard {ShardId} is bugged... ";
        }

        return null;
    }

    /// <summary>
    /// Kills and disposes the shard's current process (best effort), starts a new one and
    /// resets the shard's status entry. Must be called while holding <c>locker</c>.
    /// </summary>
    private void StartShard(int shardId)
    {
        var status = _shardStatuses[shardId];

        // Cleanup is best effort: a failure to kill or dispose the old process
        // must not prevent the replacement from being started.
        try
        {
            status.Process?.Kill(true);
        }
        catch (Exception ex)
        {
            Log.Debug(ex, "Unable to kill the old process of shard {ShardId}", shardId);
        }

        try
        {
            status.Process?.Dispose();
        }
        catch (Exception ex)
        {
            Log.Debug(ex, "Unable to dispose the old process of shard {ShardId}", shardId);
        }

        var proc = StartShardProcess(shardId);
        _shardStatuses[shardId] = status with
        {
            Process = proc,
            LastUpdate = DateTime.UtcNow,
            State = ConnState.Disconnected,
            ShouldRestart = false,
            StateCounter = 0,
        };
    }

    /// <summary>
    /// Launches the shard process using the configured command. The argument template
    /// receives the shard id as {0} and the total shard count as {1}.
    /// </summary>
    private Process StartShardProcess(int shardId)
    {
        var startInfo = new ProcessStartInfo()
        {
            FileName = _config.ShardStartCommand,
            // Invariant culture keeps the numeric arguments independent of the host locale.
            Arguments = string.Format(System.Globalization.CultureInfo.InvariantCulture,
                _config.ShardStartArgs,
                shardId,
                _config.TotalShards),
        };

        // Indexer instead of Add: Add throws if the variable is already present in the
        // environment inherited from the coordinator itself.
        startInfo.EnvironmentVariables["ELLIEBOT_IS_COORDINATED"] = "1";

        return Process.Start(startInfo);
    }

    /// <summary>
    /// Records a heartbeat from a shard and counts how many times in a row it reported
    /// the same state.
    /// </summary>
    /// <param name="shardId">Id of the reporting shard.</param>
    /// <param name="guildCount">Guild count reported by the shard.</param>
    /// <param name="state">Connection state reported by the shard.</param>
    /// <returns><c>true</c> once a graceful shutdown has been announced.</returns>
    /// <exception cref="ArgumentOutOfRangeException">The shard id is negative or not below the shard count.</exception>
    public bool Heartbeat(int shardId, int guildCount, ConnState state)
    {
        lock (locker)
        {
            EnsureValidShardId(shardId);

            var status = _shardStatuses[shardId];
            status = _shardStatuses[shardId] = status with
            {
                GuildCount = guildCount,
                State = state,
                LastUpdate = DateTime.UtcNow,
                StateCounter = status.State == state
                    ? status.StateCounter + 1
                    : 1
            };

            if (status.StateCounter > 1 && status.State == ConnState.Disconnected)
            {
                Log.Warning("Shard {ShardId} is in DISCONNECTED state! ({StateCounter})",
                    status.ShardId,
                    status.StateCounter);
            }

            return _gracefulImminent;
        }
    }

    /// <summary>
    /// Writes a new total shard count to the config file. The running coordinator keeps
    /// its current config until <see cref="ReloadConfig"/> is called.
    /// </summary>
    /// <param name="totalShards">New total number of shards; must be at least 1.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="totalShards"/> is less than 1.</exception>
    public void SetShardCount(int totalShards)
    {
        if (totalShards < 1)
            throw new ArgumentOutOfRangeException(nameof(totalShards), "There must be at least 1 shard");

        lock (locker)
        {
            SaveConfig(new Config(
                totalShards,
                _config.RecheckIntervalMs,
                _config.ShardStartCommand,
                _config.ShardStartArgs,
                _config.UnresponsiveSec));
        }
    }

    /// <summary>Flags a shard so the main loop restarts it.</summary>
    /// <param name="shardId">Id of the shard to restart.</param>
    /// <param name="queue">
    /// NOTE(review): currently ignored - the shard is always queued and picked up by the
    /// main loop, it is never killed immediately.
    /// </param>
    /// <exception cref="ArgumentOutOfRangeException">The shard id is negative or not below the shard count.</exception>
    public void RestartShard(int shardId, bool queue)
    {
        lock (locker)
        {
            EnsureValidShardId(shardId);

            _shardStatuses[shardId] = _shardStatuses[shardId] with
            {
                ShouldRestart = true,
                StateCounter = 0,
            };
        }
    }

    /// <summary>Flags every shard for restart.</summary>
    /// <param name="nuke">When <c>true</c>, all shard processes are killed first.</param>
    public void RestartAll(bool nuke)
    {
        lock (locker)
        {
            if (nuke)
            {
                KillAll();
            }

            QueueAll();
        }
    }

    /// <summary>
    /// Kills and disposes every shard process (best effort) and flags those shards for restart.
    /// </summary>
    private void KillAll()
    {
        lock (locker)
        {
            for (var shardId = 0; shardId < _shardStatuses.Length; shardId++)
            {
                var status = _shardStatuses[shardId];
                if (status.Process is Process p)
                {
                    // Kill the whole process tree, same as StartShard does; otherwise children
                    // of a launcher command such as `dotnet run` would be left running.
                    try { p.Kill(true); } catch { }
                    try { p.Dispose(); } catch { }
                    _shardStatuses[shardId] = status with
                    {
                        Process = null,
                        ShouldRestart = true,
                        LastUpdate = DateTime.UtcNow,
                        State = ConnState.Disconnected,
                        StateCounter = 0,
                    };
                }
            }
        }
    }

    /// <summary>
    /// Writes the process id, connection state and guild count of every shard to the
    /// graceful state file so the next coordinator instance can restore them.
    /// </summary>
    public void SaveState()
    {
        // The shard table is mutated under the lock elsewhere; snapshot it under the same lock.
        lock (locker)
        {
            var coordState = new CoordState()
            {
                StatusObjects = _shardStatuses
                    .Select(x => new JsonStatusObject()
                    {
                        Pid = x.Process?.Id,
                        ConnectionState = x.State,
                        GuildCount = x.GuildCount,
                    })
                    .ToList()
            };
            var jsonState = JsonSerializer.Serialize(coordState, new JsonSerializerOptions()
            {
                WriteIndented = true,
            });
            File.WriteAllText(GRACEFUL_STATE_PATH, jsonState);
        }
    }

    /// <summary>
    /// Tries to rebuild the shard table from the graceful state file. The file is moved to
    /// the backup path whether or not the restore succeeds.
    /// </summary>
    /// <returns><c>true</c> when the old state was restored.</returns>
    private bool TryRestoreOldState()
    {
        lock (locker)
        {
            if (!File.Exists(GRACEFUL_STATE_PATH))
                return false;

            Log.Information("Restoring old coordinator state...");

            CoordState savedState;
            try
            {
                savedState = JsonSerializer.Deserialize<CoordState>(File.ReadAllText(GRACEFUL_STATE_PATH));

                // A missing list or a null entry would otherwise crash the restore loop below.
                if (savedState?.StatusObjects is null || savedState.StatusObjects.Contains(null))
                    throw new InvalidDataException("Old state is null?!");
            }
            catch (Exception ex)
            {
                Log.Error(ex, "Error deserializing old state: {Message}", ex.Message);
                ArchiveSavedState();
                return false;
            }

            if (savedState.StatusObjects.Count != _config.TotalShards)
            {
                Log.Error("Unable to restore old state because shard count doesn't match");
                ArchiveSavedState();
                return false;
            }

            _shardStatuses = new ShardStatus[_config.TotalShards];

            for (int shardId = 0; shardId < _shardStatuses.Length; shardId++)
            {
                var statusObj = savedState.StatusObjects[shardId];
                Process p = null;
                if (statusObj.Pid is { } pid)
                {
                    try
                    {
                        // NOTE(review): only the pid is matched; if the OS reused the pid this
                        // may attach to an unrelated process - confirm whether that matters.
                        p = Process.GetProcessById(pid);
                    }
                    catch (Exception ex)
                    {
                        Log.Warning(ex, "Process for shard {ShardId} is not runnning", shardId);
                    }
                }

                // A shard whose process could not be found is flagged for restart right away.
                _shardStatuses[shardId] = new ShardStatus(
                    shardId,
                    DateTime.UtcNow,
                    statusObj.GuildCount,
                    statusObj.ConnectionState,
                    ShouldRestart: p is null,
                    Process: p);
            }

            ArchiveSavedState();
            Log.Information("Old state restored!");
            return true;
        }
    }

    /// <summary>Moves the graceful state file to the backup path, replacing any older backup.</summary>
    private static void ArchiveSavedState()
        => File.Move(GRACEFUL_STATE_PATH, GRACEFUL_STATE_BACKUP_PATH, overwrite: true);

    /// <summary>Creates a fresh status entry, with no process, for every configured shard.</summary>
    private void InitAll()
    {
        lock (locker)
        {
            _shardStatuses = new ShardStatus[_config.TotalShards];
            for (var shardId = 0; shardId < _shardStatuses.Length; shardId++)
            {
                _shardStatuses[shardId] = new ShardStatus(shardId, DateTime.UtcNow);
            }
        }
    }

    /// <summary>Flags every shard for restart without touching its process.</summary>
    private void QueueAll()
    {
        lock (locker)
        {
            for (var shardId = 0; shardId < _shardStatuses.Length; shardId++)
            {
                _shardStatuses[shardId] = _shardStatuses[shardId] with
                {
                    ShouldRestart = true
                };
            }
        }
    }

    /// <summary>Returns the current status entry of a shard.</summary>
    /// <exception cref="ArgumentOutOfRangeException">The shard id is negative or not below the shard count.</exception>
    public ShardStatus GetShardStatus(int shardId)
    {
        lock (locker)
        {
            EnsureValidShardId(shardId);

            return _shardStatuses[shardId];
        }
    }

    /// <summary>Returns a copy of the shard table.</summary>
    public List<ShardStatus> GetAllStatuses()
    {
        lock (locker)
        {
            return new List<ShardStatus>(_shardStatuses);
        }
    }

    /// <summary>Makes every following heartbeat reply report that a graceful shutdown is imminent.</summary>
    public void PrepareGracefulShutdown()
    {
        lock (locker)
        {
            _gracefulImminent = true;
        }
    }

    /// <summary>Returns the raw text of the config file.</summary>
    public string GetConfigText()
    {
        // Same lock as SaveConfig, so a file that is being rewritten is never read.
        lock (locker)
        {
            return File.ReadAllText(CONFIG_PATH);
        }
    }

    /// <summary>
    /// Parses <paramref name="text"/> as a config, writes it to the config file and reloads it.
    /// </summary>
    /// <param name="text">New YAML config text.</param>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null, empty or whitespace.</exception>
    /// <exception cref="ArgumentException">The config declares fewer than 1 shard.</exception>
    public void SetConfigText(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
            throw new ArgumentNullException(nameof(text), "coord.yml can't be empty");

        var config = _deserializer.Deserialize<Config>(text);
        if (config.TotalShards < 1)
            throw new ArgumentException("TotalShards must be at least 1", nameof(text));

        // Save and reload as one step so nothing else swaps the file in between.
        lock (locker)
        {
            SaveConfig(in config);
            ReloadConfig();
        }
    }

    /// <summary>
    /// Throws when <paramref name="shardId"/> is not a valid index into the shard table.
    /// Must be called while holding <c>locker</c>.
    /// </summary>
    private void EnsureValidShardId(int shardId)
    {
        if (shardId < 0 || shardId >= _shardStatuses.Length)
            throw new ArgumentOutOfRangeException(nameof(shardId));
    }
}
|
||||
}
|
144
src/EllieBot.Coordinator/Services/CoordinatorService.cs
Normal file
144
src/EllieBot.Coordinator/Services/CoordinatorService.cs
Normal file
|
@ -0,0 +1,144 @@
|
|||
using System;
|
||||
using System.Threading.Tasks;
|
||||
using Google.Protobuf.WellKnownTypes;
|
||||
using Grpc.Core;
|
||||
|
||||
namespace EllieBot.Coordinator
|
||||
{
|
||||
/// <summary>
/// gRPC front end of the coordinator. Every call is forwarded to the shared
/// <see cref="CoordinatorRunner"/>.
/// </summary>
public sealed class CoordinatorService : Coordinator.CoordinatorBase
{
    // Reported as the start time of a shard that has no running process. Fixed at
    // 0001-01-01T00:00:00Z so the value does not depend on the server's time zone.
    private static readonly DateTime NotStartedUtc = DateTime.SpecifyKind(DateTime.MinValue, DateTimeKind.Utc);

    private readonly CoordinatorRunner _runner;

    /// <summary>Creates the service on top of the shared runner.</summary>
    public CoordinatorService(CoordinatorRunner runner)
        => _runner = runner;

    /// <summary>Records a shard heartbeat and tells the shard whether a graceful shutdown is imminent.</summary>
    public override Task<HeartbeatReply> Heartbeat(HeartbeatRequest request, ServerCallContext context)
    {
        var gracefulImminent = _runner.Heartbeat(request.ShardId, request.GuildCount, request.State);
        return Task.FromResult(new HeartbeatReply()
        {
            GracefulImminent = gracefulImminent
        });
    }

    /// <summary>Passes the requested shard count to the runner.</summary>
    public override Task<ReshardReply> Reshard(ReshardRequest request, ServerCallContext context)
    {
        _runner.SetShardCount(request.Shards);
        return Task.FromResult(new ReshardReply());
    }

    /// <summary>Asks the runner to restart a single shard.</summary>
    public override Task<RestartShardReply> RestartShard(RestartShardRequest request, ServerCallContext context)
    {
        _runner.RestartShard(request.ShardId, request.Queue);
        return Task.FromResult(new RestartShardReply());
    }

    /// <summary>Asks the runner to reload its config.</summary>
    public override Task<ReloadReply> Reload(ReloadRequest request, ServerCallContext context)
    {
        _runner.ReloadConfig();
        return Task.FromResult(new ReloadReply());
    }

    /// <summary>Returns the status of a single shard.</summary>
    public override Task<GetStatusReply> GetStatus(GetStatusRequest request, ServerCallContext context)
    {
        var status = _runner.GetShardStatus(request.ShardId);

        return Task.FromResult(StatusToStatusReply(status));
    }

    /// <summary>Returns the status of every shard.</summary>
    public override Task<GetAllStatusesReply> GetAllStatuses(GetAllStatusesRequest request,
        ServerCallContext context)
    {
        var reply = new GetAllStatusesReply();
        foreach (var status in _runner.GetAllStatuses())
            reply.Statuses.Add(StatusToStatusReply(status));

        return Task.FromResult(reply);
    }

    /// <summary>
    /// Converts a runner status entry to its wire form. The start time is taken from the
    /// shard's process; it is <c>NotStartedUtc</c> when there is no running process or the
    /// start time cannot be read.
    /// </summary>
    private static GetStatusReply StatusToStatusReply(ShardStatus status)
    {
        DateTime startTime;
        try
        {
            startTime = status.Process is null or { HasExited: true }
                ? NotStartedUtc
                : status.Process.StartTime.ToUniversalTime();
        }
        catch (Exception)
        {
            // A status query must not fail just because the process information is unavailable.
            startTime = NotStartedUtc;
        }

        return new GetStatusReply()
        {
            State = status.State,
            GuildCount = status.GuildCount,
            ShardId = status.ShardId,
            LastUpdate = Timestamp.FromDateTime(status.LastUpdate),
            ScheduledForRestart = status.ShouldRestart,
            StartedAt = Timestamp.FromDateTime(startTime)
        };
    }

    /// <summary>Asks the runner to restart all shards, killing them first when <c>Nuke</c> is set.</summary>
    public override Task<RestartAllReply> RestartAllShards(RestartAllRequest request, ServerCallContext context)
    {
        _runner.RestartAll(request.Nuke);
        return Task.FromResult(new RestartAllReply());
    }

    /// <summary>
    /// Saves the shard state and terminates the coordinator process. For a graceful request
    /// the shutdown is first announced to the shards and the call waits ten seconds.
    /// </summary>
    public override async Task<DieReply> Die(DieRequest request, ServerCallContext context)
    {
        if (request.Graceful)
        {
            _runner.PrepareGracefulShutdown();
            // presumably gives shards time to see the flag in a heartbeat reply - TODO confirm
            await Task.Delay(10_000);
        }

        _runner.SaveState();

        // Exit from a separate task after a short delay so this call can still return its reply.
        _ = Task.Run(async () =>
        {
            await Task.Delay(250);
            Environment.Exit(0);
        });

        return new DieReply();
    }

    /// <summary>
    /// Replaces the coordinator config. A failure is reported in the reply
    /// (<c>Success = false</c> plus the exception message) instead of failing the call.
    /// </summary>
    public override Task<SetConfigTextReply> SetConfigText(SetConfigTextRequest request, ServerCallContext context)
    {
        var error = string.Empty;
        var success = true;
        try
        {
            _runner.SetConfigText(request.ConfigYml);
        }
        catch (Exception ex)
        {
            error = ex.Message;
            success = false;
        }

        return Task.FromResult(new SetConfigTextReply()
        {
            Success = success,
            Error = error
        });
    }

    /// <summary>Returns the current config file text.</summary>
    public override Task<GetConfigTextReply> GetConfigText(GetConfigTextRequest request, ServerCallContext context)
    {
        var text = _runner.GetConfigText();
        return Task.FromResult(new GetConfigTextReply()
        {
            ConfigYml = text,
        });
    }
}
|
||||
}
|
21
src/EllieBot.Coordinator/Shared/Config.cs
Normal file
21
src/EllieBot.Coordinator/Shared/Config.cs
Normal file
|
@ -0,0 +1,21 @@
|
|||
namespace EllieBot.Coordinator
|
||||
{
|
||||
/// <summary>Coordinator settings as stored in the coordinator's config file.</summary>
public readonly struct Config
{
    /// <summary>Total number of shards.</summary>
    public int TotalShards { get; init; }

    /// <summary>Recheck interval, in milliseconds.</summary>
    public int RecheckIntervalMs { get; init; }

    /// <summary>Command used to start a shard.</summary>
    public string ShardStartCommand { get; init; }

    /// <summary>Arguments passed to the shard start command.</summary>
    public string ShardStartArgs { get; init; }

    /// <summary>Seconds without a status update after which a shard counts as unresponsive.</summary>
    public double UnresponsiveSec { get; init; }

    /// <summary>Creates a config with every setting given explicitly.</summary>
    public Config(
        int totalShards,
        int recheckIntervalMs,
        string shardStartCommand,
        string shardStartArgs,
        double unresponsiveSec)
    {
        this.TotalShards = totalShards;
        this.RecheckIntervalMs = recheckIntervalMs;
        this.ShardStartCommand = shardStartCommand;
        this.ShardStartArgs = shardStartArgs;
        this.UnresponsiveSec = unresponsiveSec;
    }
}
|
||||
}
|
9
src/EllieBot.Coordinator/Shared/CoordState.cs
Normal file
9
src/EllieBot.Coordinator/Shared/CoordState.cs
Normal file
|
@ -0,0 +1,9 @@
|
|||
using System.Collections.Generic;
|
||||
|
||||
namespace EllieBot.Coordinator
|
||||
{
|
||||
/// <summary>Coordinator state that is written to and read back from JSON.</summary>
public class CoordState
{
    /// <summary>
    /// One entry per shard. Defaults to an empty list so that a JSON document
    /// without this property yields an empty list rather than <c>null</c>.
    /// </summary>
    public List<JsonStatusObject> StatusObjects { get; init; } = new();
}
|
||||
}
|
9
src/EllieBot.Coordinator/Shared/JsonStatusObject.cs
Normal file
9
src/EllieBot.Coordinator/Shared/JsonStatusObject.cs
Normal file
|
@ -0,0 +1,9 @@
|
|||
namespace EllieBot.Coordinator
|
||||
{
|
||||
/// <summary>Per-shard entry of the JSON-serialized coordinator state.</summary>
public class JsonStatusObject
{
    /// <summary>Id of the shard's process, or <c>null</c> when there is none.</summary>
    public int? Pid { get; init; }

    /// <summary>Guild count stored for the shard.</summary>
    public int GuildCount { get; init; }

    /// <summary>Connection state stored for the shard.</summary>
    public ConnState ConnectionState { get; init; }
}
|
||||
}
|
15
src/EllieBot.Coordinator/Shared/ShardStatus.cs
Normal file
15
src/EllieBot.Coordinator/Shared/ShardStatus.cs
Normal file
|
@ -0,0 +1,15 @@
|
|||
using System;
|
||||
using System.Diagnostics;
|
||||
|
||||
namespace EllieBot.Coordinator
|
||||
{
|
||||
/// <summary>Immutable snapshot of what the coordinator knows about one shard.</summary>
/// <param name="ShardId">Id of the shard.</param>
/// <param name="LastUpdate">Time this status was last updated.</param>
/// <param name="GuildCount">Guild count of the shard.</param>
/// <param name="State">Connection state of the shard.</param>
/// <param name="ShouldRestart">Whether the shard is flagged for restart.</param>
/// <param name="Process">Process running the shard, or <c>null</c> when there is none.</param>
/// <param name="StateCounter">Counter associated with <paramref name="State"/>.</param>
public sealed record ShardStatus(
    int ShardId,
    DateTime LastUpdate,
    int GuildCount = 0,
    ConnState State = ConnState.Disconnected,
    bool ShouldRestart = false,
    Process Process = null,
    int StateCounter = 0);
|
||||
}
|
9
src/EllieBot.Coordinator/appsettings.Development.json
Normal file
9
src/EllieBot.Coordinator/appsettings.Development.json
Normal file
|
@ -0,0 +1,9 @@
|
|||
{
|
||||
"Logging": {
|
||||
"LogLevel": {
|
||||
"Default": "Information",
|
||||
"Microsoft": "Warning",
|
||||
"Microsoft.Hosting.Lifetime": "Information"
|
||||
}
|
||||
}
|
||||
}
|
20
src/EllieBot.Coordinator/appsettings.json
Normal file
20
src/EllieBot.Coordinator/appsettings.json
Normal file
|
@ -0,0 +1,20 @@
|
|||
{
|
||||
"Logging": {
|
||||
"LogLevel": {
|
||||
"Default": "Information",
|
||||
"Microsoft": "Warning",
|
||||
"Microsoft.Hosting.Lifetime": "Information"
|
||||
}
|
||||
},
|
||||
"AllowedHosts": "*",
|
||||
"Kestrel": {
|
||||
"EndpointDefaults": {
|
||||
"Protocols": "Http2"
|
||||
},
|
||||
"Endpoints": {
|
||||
"Http": {
|
||||
"Url": "http://localhost:3442"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
12
src/EllieBot.Coordinator/coord.yml
Normal file
12
src/EllieBot.Coordinator/coord.yml
Normal file
|
@ -0,0 +1,12 @@
|
|||
# total number of shards
|
||||
TotalShards: 3
|
||||
# How often (in milliseconds) do shards ping their state back to the coordinator
|
||||
RecheckIntervalMs: 5000
|
||||
# Command to run the shard
|
||||
ShardStartCommand: dotnet
|
||||
# Arguments to run the shard
|
||||
# {0} = shard id
|
||||
# {1} = total number of shards
|
||||
ShardStartArgs: run -p "..\EllieBot\EllieBot.csproj" --no-build -- {0} {1}
|
||||
# How long (in seconds) it takes for a shard to be forcefully restarted once it stops reporting its state
|
||||
UnresponsiveSec: 30
|
Loading…
Reference in a new issue