progress on alerting

This commit is contained in:
mbecker20
2023-08-16 23:31:23 -04:00
parent a5fdfc3b67
commit 10beeab4b5
8 changed files with 63 additions and 69 deletions

View File

@@ -1,6 +1,9 @@
use anyhow::{anyhow, Context};
use monitor_types::entities::{
alert::Alert, alerter::*, deployment::DockerContainerState, server::stats::SystemProcess,
alert::{Alert, AlertData},
alerter::*,
deployment::DockerContainerState,
server::stats::SystemProcess,
};
use reqwest::StatusCode;
use slack::types::Block;
@@ -13,8 +16,9 @@ pub async fn send_alert(alerter: &Alerter, alert: &Alert) -> anyhow::Result<()>
}
pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
let (text, blocks): (_, Option<_>) = match alert {
Alert::ServerUnreachable { name, region, .. } => {
let level = alert.level;
let (text, blocks): (_, Option<_>) = match &alert.data {
AlertData::ServerUnreachable { name, region, .. } => {
let region = fmt_region(region);
let text = format!("CRITICAL 🚨 | *{name}*{region} is *unreachable* ❌");
let blocks = vec![
@@ -23,19 +27,18 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
];
(text, blocks.into())
}
Alert::ServerCpu {
AlertData::ServerCpu {
name,
region,
state,
percentage,
top_procs,
..
} => {
let region = fmt_region(region);
let text =
format!("{state} 🚨 | *{name}*{region} cpu usage at *{percentage:.1}%* 📈 🚨");
format!("{level} 🚨 | *{name}*{region} cpu usage at *{percentage:.1}%* 📈 🚨");
let blocks = vec![
Block::header(format!("{state} 🚨")),
Block::header(format!("{level} 🚨")),
Block::section(format!(
"*{name}*{region} cpu usage at *{percentage:.1}%* 📈 🚨"
)),
@@ -43,10 +46,9 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
];
(text, blocks.into())
}
Alert::ServerMem {
AlertData::ServerMem {
name,
region,
state,
used_gb,
total_gb,
top_procs,
@@ -55,9 +57,9 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
let region = fmt_region(region);
let percentage = 100.0 * used_gb / total_gb;
let text =
format!("{state} 🚨 | *{name}*{region} memory usage at *{percentage:.1}%* 💾 🚨");
format!("{level} 🚨 | *{name}*{region} memory usage at *{percentage:.1}%* 💾 🚨");
let blocks = vec![
Block::header(format!("{state} 🚨")),
Block::header(format!("{level} 🚨")),
Block::section(format!(
"*{name}*{region} memory usage at *{percentage:.1}%* 💾 🚨"
)),
@@ -66,10 +68,9 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
];
(text, blocks.into())
}
Alert::ServerDisk {
AlertData::ServerDisk {
name,
region,
state,
path,
used_gb,
total_gb,
@@ -78,9 +79,9 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
let region = fmt_region(region);
let percentage = 100.0 * used_gb / total_gb;
let text =
format!("{state} 🚨 | *{name}*{region} disk usage at *{percentage:.1}%* | mount point: *{path}* 💿 🚨");
format!("{level} 🚨 | *{name}*{region} disk usage at *{percentage:.1}%* | mount point: *{path}* 💿 🚨");
let blocks = vec![
Block::header(format!("{state} 🚨")),
Block::header(format!("{level} 🚨")),
Block::section(format!(
"*{name}*{region} disk usage at *{percentage:.1}%* 💿 🚨"
)),
@@ -90,27 +91,26 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
];
(text, blocks.into())
}
Alert::ServerTemp {
AlertData::ServerTemp {
name,
region,
state,
temp,
max,
..
} => {
let region = fmt_region(region);
let text = format!(
"{state} 🚨 | *{name}*{region} temp at {temp:.0} °C (max: {max:.0} °C) 🌡️ 🚨"
"{level} 🚨 | *{name}*{region} temp at {temp:.0} °C (max: {max:.0} °C) 🌡️ 🚨"
);
let blocks = vec![
Block::header(format!("{state} 🚨")),
Block::header(format!("{level} 🚨")),
Block::section(format!(
"*{name}*{region} temp at {temp:.0} °C (max: {max:.0} °C) 🌡️ 🚨"
)),
];
(text, blocks.into())
}
Alert::ContainerStateChange {
AlertData::ContainerStateChange {
name,
server,
from,
@@ -125,7 +125,7 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
];
(text, blocks.into())
}
Alert::None {} => Default::default(),
AlertData::None {} => Default::default(),
};
if !text.is_empty() {
let slack = slack::Client::new(url);

View File

@@ -1,5 +1,6 @@
use anyhow::Context;
use monitor_types::entities::{
alert::Alert,
alerter::Alerter,
build::Build,
builder::Builder,
@@ -25,6 +26,7 @@ pub struct DbClient {
pub tags: Collection<CustomTag>,
pub alerters: Collection<Alerter>,
pub updates: Collection<Update>,
pub alerts: Collection<Alert>,
}
impl DbClient {
@@ -69,6 +71,7 @@ impl DbClient {
users: User::collection(&mungos, db_name, true).await?,
tags: CustomTag::collection(&mungos, db_name, true).await?,
updates: Update::collection(&mungos, db_name, true).await?,
alerts: Alert::collection(&mungos, db_name, true).await?,
stats: SystemStatsRecord::collection(&mungos, db_name, true).await?,
servers: resource_collection(&mungos, db_name, "Server").await?,
deployments: resource_collection(&mungos, db_name, "Deployment").await?,

View File

@@ -1,16 +1,13 @@
use std::collections::HashMap;
use monitor_types::entities::{
alert::{Alert, AlertVariant},
deployment::DockerContainerState,
server::ServerStatus,
};
use tokio::sync::RwLock;
use monitor_types::entities::{deployment::DockerContainerState, server::ServerStatus};
use crate::state::State;
impl State {
/// Runs the server alert check and the deployment alert check together.
///
/// Called after cache update.
pub async fn check_alerts(&self) {
    // Build both futures first, then drive them to completion side by side.
    let server_alerts = self.alert_servers();
    let deployment_alerts = self.alert_deployments();
    tokio::join!(server_alerts, deployment_alerts);
}
pub async fn alert_servers(&self) {
let server_status = self.server_status_cache.get_list().await;

View File

@@ -2,7 +2,7 @@ use monitor_types::entities::{
deployment::{Deployment, DockerContainerState},
server::{
stats::{
AllSystemStats, BasicSystemStats, ServerHealth, SingleDiskUsage, StatsState,
AllSystemStats, BasicSystemStats, ServerHealth, SeverityLevel, SingleDiskUsage,
SystemComponent,
},
Server, ServerConfig, ServerStatus,
@@ -83,23 +83,23 @@ fn get_server_health(server: &Server, stats: &AllSystemStats) -> ServerHealth {
let mut health = ServerHealth::default();
if cpu_perc >= cpu_critical {
health.cpu = StatsState::Critical
health.cpu = SeverityLevel::Critical
} else if cpu_perc >= cpu_warning {
health.cpu = StatsState::Warning
health.cpu = SeverityLevel::Warning
}
let mem_perc = 100.0 * mem_used_gb / mem_total_gb;
if mem_perc >= *mem_critical {
health.mem = StatsState::Critical
health.mem = SeverityLevel::Critical
} else if mem_perc >= *mem_warning {
health.mem = StatsState::Warning
health.mem = SeverityLevel::Warning
}
let disk_perc = 100.0 * disk_used_gb / disk_total_gb;
if disk_perc >= *disk_critical {
health.disk = StatsState::Critical
health.disk = SeverityLevel::Critical
} else if disk_perc >= *disk_warning {
health.disk = StatsState::Warning
health.disk = SeverityLevel::Warning
}
for SingleDiskUsage {
@@ -110,11 +110,11 @@ fn get_server_health(server: &Server, stats: &AllSystemStats) -> ServerHealth {
{
let perc = 100.0 * used_gb / total_gb;
let stats_state = if perc >= *disk_critical {
StatsState::Critical
SeverityLevel::Critical
} else if perc >= *disk_warning {
StatsState::Warning
SeverityLevel::Warning
} else {
StatsState::Ok
SeverityLevel::Ok
};
health.disks.insert(mount.clone(), stats_state);
}
@@ -129,14 +129,14 @@ fn get_server_health(server: &Server, stats: &AllSystemStats) -> ServerHealth {
let stats_state = if let Some(critical) = critical {
let perc = temp / critical;
if perc >= 0.95 {
StatsState::Critical
SeverityLevel::Critical
} else if perc >= 0.85 {
StatsState::Warning
SeverityLevel::Warning
} else {
StatsState::Ok
SeverityLevel::Ok
}
} else {
StatsState::Ok
SeverityLevel::Ok
};
health.temps.insert(label.clone(), stats_state);
}

View File

@@ -51,7 +51,7 @@ impl State {
self.update_cache_for_server(&server).await;
});
join_all(futures).await;
self.record_server_stats(ts).await;
tokio::join!(self.check_alerts(), self.record_server_stats(ts));
}
}

View File

@@ -3,7 +3,6 @@ use std::{net::SocketAddr, str::FromStr, sync::Arc};
use anyhow::Context;
use axum::Extension;
use monitor_types::entities::{
alert::{Alert, AlertVariant},
build::BuildActionState,
deployment::{DeploymentActionState, DockerContainerState},
repo::RepoActionState,
@@ -35,7 +34,6 @@ pub struct State {
pub deployment_status_cache:
Cache<String, Arc<History<CachedDeploymentStatus, DockerContainerState>>>,
pub server_status_cache: Cache<String, Arc<CachedServerStatus>>,
pub alerts: Cache<(String, AlertVariant), Arc<Alert>>,
// channels
pub build_cancel: BroadcastChannel<String>, // build id to cancel
@@ -64,7 +62,6 @@ impl State {
action_states: Default::default(),
deployment_status_cache: Default::default(),
server_status_cache: Default::default(),
alerts: Default::default(),
update: BroadcastChannel::new(100),
build_cancel: BroadcastChannel::new(10),
config,