mirror of
https://github.com/moghtech/komodo.git
synced 2026-04-29 21:27:26 -05:00
progress on alerting
This commit is contained in:
@@ -1,6 +1,9 @@
|
||||
use anyhow::{anyhow, Context};
|
||||
use monitor_types::entities::{
|
||||
alert::Alert, alerter::*, deployment::DockerContainerState, server::stats::SystemProcess,
|
||||
alert::{Alert, AlertData},
|
||||
alerter::*,
|
||||
deployment::DockerContainerState,
|
||||
server::stats::SystemProcess,
|
||||
};
|
||||
use reqwest::StatusCode;
|
||||
use slack::types::Block;
|
||||
@@ -13,8 +16,9 @@ pub async fn send_alert(alerter: &Alerter, alert: &Alert) -> anyhow::Result<()>
|
||||
}
|
||||
|
||||
pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
|
||||
let (text, blocks): (_, Option<_>) = match alert {
|
||||
Alert::ServerUnreachable { name, region, .. } => {
|
||||
let level = alert.level;
|
||||
let (text, blocks): (_, Option<_>) = match &alert.data {
|
||||
AlertData::ServerUnreachable { name, region, .. } => {
|
||||
let region = fmt_region(region);
|
||||
let text = format!("CRITICAL 🚨 | *{name}*{region} is *unreachable* ❌");
|
||||
let blocks = vec![
|
||||
@@ -23,19 +27,18 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
|
||||
];
|
||||
(text, blocks.into())
|
||||
}
|
||||
Alert::ServerCpu {
|
||||
AlertData::ServerCpu {
|
||||
name,
|
||||
region,
|
||||
state,
|
||||
percentage,
|
||||
top_procs,
|
||||
..
|
||||
} => {
|
||||
let region = fmt_region(region);
|
||||
let text =
|
||||
format!("{state} 🚨 | *{name}*{region} cpu usage at *{percentage:.1}%* 📈 🚨");
|
||||
format!("{level} 🚨 | *{name}*{region} cpu usage at *{percentage:.1}%* 📈 🚨");
|
||||
let blocks = vec![
|
||||
Block::header(format!("{state} 🚨")),
|
||||
Block::header(format!("{level} 🚨")),
|
||||
Block::section(format!(
|
||||
"*{name}*{region} cpu usage at *{percentage:.1}%* 📈 🚨"
|
||||
)),
|
||||
@@ -43,10 +46,9 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
|
||||
];
|
||||
(text, blocks.into())
|
||||
}
|
||||
Alert::ServerMem {
|
||||
AlertData::ServerMem {
|
||||
name,
|
||||
region,
|
||||
state,
|
||||
used_gb,
|
||||
total_gb,
|
||||
top_procs,
|
||||
@@ -55,9 +57,9 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
|
||||
let region = fmt_region(region);
|
||||
let percentage = 100.0 * used_gb / total_gb;
|
||||
let text =
|
||||
format!("{state} 🚨 | *{name}*{region} memory usage at *{percentage:.1}%* 💾 🚨");
|
||||
format!("{level} 🚨 | *{name}*{region} memory usage at *{percentage:.1}%* 💾 🚨");
|
||||
let blocks = vec![
|
||||
Block::header(format!("{state} 🚨")),
|
||||
Block::header(format!("{level} 🚨")),
|
||||
Block::section(format!(
|
||||
"*{name}*{region} memory usage at *{percentage:.1}%* 💾 🚨"
|
||||
)),
|
||||
@@ -66,10 +68,9 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
|
||||
];
|
||||
(text, blocks.into())
|
||||
}
|
||||
Alert::ServerDisk {
|
||||
AlertData::ServerDisk {
|
||||
name,
|
||||
region,
|
||||
state,
|
||||
path,
|
||||
used_gb,
|
||||
total_gb,
|
||||
@@ -78,9 +79,9 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
|
||||
let region = fmt_region(region);
|
||||
let percentage = 100.0 * used_gb / total_gb;
|
||||
let text =
|
||||
format!("{state} 🚨 | *{name}*{region} disk usage at *{percentage:.1}%* | mount point: *{path}* 💿 🚨");
|
||||
format!("{level} 🚨 | *{name}*{region} disk usage at *{percentage:.1}%* | mount point: *{path}* 💿 🚨");
|
||||
let blocks = vec![
|
||||
Block::header(format!("{state} 🚨")),
|
||||
Block::header(format!("{level} 🚨")),
|
||||
Block::section(format!(
|
||||
"*{name}*{region} disk usage at *{percentage:.1}%* 💿 🚨"
|
||||
)),
|
||||
@@ -90,27 +91,26 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
|
||||
];
|
||||
(text, blocks.into())
|
||||
}
|
||||
Alert::ServerTemp {
|
||||
AlertData::ServerTemp {
|
||||
name,
|
||||
region,
|
||||
state,
|
||||
temp,
|
||||
max,
|
||||
..
|
||||
} => {
|
||||
let region = fmt_region(region);
|
||||
let text = format!(
|
||||
"{state} 🚨 | *{name}*{region} temp at {temp:.0} °C (max: {max:.0} °C) 🌡️ 🚨"
|
||||
"{level} 🚨 | *{name}*{region} temp at {temp:.0} °C (max: {max:.0} °C) 🌡️ 🚨"
|
||||
);
|
||||
let blocks = vec![
|
||||
Block::header(format!("{state} 🚨")),
|
||||
Block::header(format!("{level} 🚨")),
|
||||
Block::section(format!(
|
||||
"*{name}*{region} temp at {temp:.0} °C (max: {max:.0} °C) 🌡️ 🚨"
|
||||
)),
|
||||
];
|
||||
(text, blocks.into())
|
||||
}
|
||||
Alert::ContainerStateChange {
|
||||
AlertData::ContainerStateChange {
|
||||
name,
|
||||
server,
|
||||
from,
|
||||
@@ -125,7 +125,7 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
|
||||
];
|
||||
(text, blocks.into())
|
||||
}
|
||||
Alert::None {} => Default::default(),
|
||||
AlertData::None {} => Default::default(),
|
||||
};
|
||||
if !text.is_empty() {
|
||||
let slack = slack::Client::new(url);
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use anyhow::Context;
|
||||
use monitor_types::entities::{
|
||||
alert::Alert,
|
||||
alerter::Alerter,
|
||||
build::Build,
|
||||
builder::Builder,
|
||||
@@ -25,6 +26,7 @@ pub struct DbClient {
|
||||
pub tags: Collection<CustomTag>,
|
||||
pub alerters: Collection<Alerter>,
|
||||
pub updates: Collection<Update>,
|
||||
pub alerts: Collection<Alert>,
|
||||
}
|
||||
|
||||
impl DbClient {
|
||||
@@ -69,6 +71,7 @@ impl DbClient {
|
||||
users: User::collection(&mungos, db_name, true).await?,
|
||||
tags: CustomTag::collection(&mungos, db_name, true).await?,
|
||||
updates: Update::collection(&mungos, db_name, true).await?,
|
||||
alerts: Alert::collection(&mungos, db_name, true).await?,
|
||||
stats: SystemStatsRecord::collection(&mungos, db_name, true).await?,
|
||||
servers: resource_collection(&mungos, db_name, "Server").await?,
|
||||
deployments: resource_collection(&mungos, db_name, "Deployment").await?,
|
||||
|
||||
@@ -1,16 +1,13 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use monitor_types::entities::{
|
||||
alert::{Alert, AlertVariant},
|
||||
deployment::DockerContainerState,
|
||||
server::ServerStatus,
|
||||
};
|
||||
use tokio::sync::RwLock;
|
||||
use monitor_types::entities::{deployment::DockerContainerState, server::ServerStatus};
|
||||
|
||||
use crate::state::State;
|
||||
|
||||
impl State {
|
||||
// called after cache update
|
||||
pub async fn check_alerts(&self) {
|
||||
tokio::join!(self.alert_servers(), self.alert_deployments());
|
||||
}
|
||||
|
||||
pub async fn alert_servers(&self) {
|
||||
let server_status = self.server_status_cache.get_list().await;
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ use monitor_types::entities::{
|
||||
deployment::{Deployment, DockerContainerState},
|
||||
server::{
|
||||
stats::{
|
||||
AllSystemStats, BasicSystemStats, ServerHealth, SingleDiskUsage, StatsState,
|
||||
AllSystemStats, BasicSystemStats, ServerHealth, SeverityLevel, SingleDiskUsage,
|
||||
SystemComponent,
|
||||
},
|
||||
Server, ServerConfig, ServerStatus,
|
||||
@@ -83,23 +83,23 @@ fn get_server_health(server: &Server, stats: &AllSystemStats) -> ServerHealth {
|
||||
let mut health = ServerHealth::default();
|
||||
|
||||
if cpu_perc >= cpu_critical {
|
||||
health.cpu = StatsState::Critical
|
||||
health.cpu = SeverityLevel::Critical
|
||||
} else if cpu_perc >= cpu_warning {
|
||||
health.cpu = StatsState::Warning
|
||||
health.cpu = SeverityLevel::Warning
|
||||
}
|
||||
|
||||
let mem_perc = 100.0 * mem_used_gb / mem_total_gb;
|
||||
if mem_perc >= *mem_critical {
|
||||
health.mem = StatsState::Critical
|
||||
health.mem = SeverityLevel::Critical
|
||||
} else if mem_perc >= *mem_warning {
|
||||
health.mem = StatsState::Warning
|
||||
health.mem = SeverityLevel::Warning
|
||||
}
|
||||
|
||||
let disk_perc = 100.0 * disk_used_gb / disk_total_gb;
|
||||
if disk_perc >= *disk_critical {
|
||||
health.disk = StatsState::Critical
|
||||
health.disk = SeverityLevel::Critical
|
||||
} else if disk_perc >= *disk_warning {
|
||||
health.disk = StatsState::Warning
|
||||
health.disk = SeverityLevel::Warning
|
||||
}
|
||||
|
||||
for SingleDiskUsage {
|
||||
@@ -110,11 +110,11 @@ fn get_server_health(server: &Server, stats: &AllSystemStats) -> ServerHealth {
|
||||
{
|
||||
let perc = 100.0 * used_gb / total_gb;
|
||||
let stats_state = if perc >= *disk_critical {
|
||||
StatsState::Critical
|
||||
SeverityLevel::Critical
|
||||
} else if perc >= *disk_warning {
|
||||
StatsState::Warning
|
||||
SeverityLevel::Warning
|
||||
} else {
|
||||
StatsState::Ok
|
||||
SeverityLevel::Ok
|
||||
};
|
||||
health.disks.insert(mount.clone(), stats_state);
|
||||
}
|
||||
@@ -129,14 +129,14 @@ fn get_server_health(server: &Server, stats: &AllSystemStats) -> ServerHealth {
|
||||
let stats_state = if let Some(critical) = critical {
|
||||
let perc = temp / critical;
|
||||
if perc >= 0.95 {
|
||||
StatsState::Critical
|
||||
SeverityLevel::Critical
|
||||
} else if perc >= 0.85 {
|
||||
StatsState::Warning
|
||||
SeverityLevel::Warning
|
||||
} else {
|
||||
StatsState::Ok
|
||||
SeverityLevel::Ok
|
||||
}
|
||||
} else {
|
||||
StatsState::Ok
|
||||
SeverityLevel::Ok
|
||||
};
|
||||
health.temps.insert(label.clone(), stats_state);
|
||||
}
|
||||
|
||||
@@ -51,7 +51,7 @@ impl State {
|
||||
self.update_cache_for_server(&server).await;
|
||||
});
|
||||
join_all(futures).await;
|
||||
self.record_server_stats(ts).await;
|
||||
tokio::join!(self.check_alerts(), self.record_server_stats(ts));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ use std::{net::SocketAddr, str::FromStr, sync::Arc};
|
||||
use anyhow::Context;
|
||||
use axum::Extension;
|
||||
use monitor_types::entities::{
|
||||
alert::{Alert, AlertVariant},
|
||||
build::BuildActionState,
|
||||
deployment::{DeploymentActionState, DockerContainerState},
|
||||
repo::RepoActionState,
|
||||
@@ -35,7 +34,6 @@ pub struct State {
|
||||
pub deployment_status_cache:
|
||||
Cache<String, Arc<History<CachedDeploymentStatus, DockerContainerState>>>,
|
||||
pub server_status_cache: Cache<String, Arc<CachedServerStatus>>,
|
||||
pub alerts: Cache<(String, AlertVariant), Arc<Alert>>,
|
||||
|
||||
// channels
|
||||
pub build_cancel: BroadcastChannel<String>, // build id to cancel
|
||||
@@ -64,7 +62,6 @@ impl State {
|
||||
action_states: Default::default(),
|
||||
deployment_status_cache: Default::default(),
|
||||
server_status_cache: Default::default(),
|
||||
alerts: Default::default(),
|
||||
update: BroadcastChannel::new(100),
|
||||
build_cancel: BroadcastChannel::new(10),
|
||||
config,
|
||||
|
||||
Reference in New Issue
Block a user