alerting should work

This commit is contained in:
mbecker20
2023-08-21 02:40:52 -04:00
parent 5e033e1f76
commit 8a0321bf67
3 changed files with 147 additions and 14 deletions

View File

@@ -133,14 +133,14 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
let region = fmt_region(region);
let percentage = 100.0 * used_gb / total_gb;
let text =
format!("{level} | *{name}*{region} disk usage at *{percentage:.1}%* | mount point: *{path}* 💿 🚨");
format!("{level} | *{name}*{region} disk usage at *{percentage:.1}%* | mount point: *{path:?}* 💿 🚨");
let blocks = vec![
Block::header(level),
Block::section(format!(
"*{name}*{region} disk usage at *{percentage:.1}%* 💿 🚨"
)),
Block::section(format!(
"mount point: {path} | using *{used_gb:.1} GiB* / *{total_gb:.1} GiB*"
"mount point: {path:?} | using *{used_gb:.1} GiB* / *{total_gb:.1} GiB*"
)),
];
(text, blocks.into())
@@ -148,17 +148,18 @@ pub async fn send_slack_alert(url: &str, alert: &Alert) -> anyhow::Result<()> {
AlertData::ServerTemp {
name,
region,
component,
temp,
max,
..
} => {
let region = fmt_region(region);
let text =
format!("{level} | *{name}*{region} temp at {temp:.0} °C (max: {max:.0} °C) 🌡️ 🚨");
format!("{level} | *{name}*{region} | {component} | temp at {temp:.0} °C (max: {max:.0} °C) 🌡️ 🚨");
let blocks = vec![
Block::header(level),
Block::section(format!(
"*{name}*{region} temp at {temp:.0} °C (max: {max:.0} °C) 🌡️ 🚨"
"*{name}*{region} | {component} | temp at {temp:.0} °C (max: {max:.0} °C) 🌡️ 🚨"
)),
];
(text, blocks.into())

View File

@@ -1,4 +1,4 @@
use std::{cmp::Ordering, collections::HashMap, str::FromStr};
use std::{cmp::Ordering, collections::HashMap, path::PathBuf, str::FromStr};
use anyhow::Context;
use monitor_types::{
@@ -16,7 +16,9 @@ use mungos::{
use crate::{auth::InnerRequestUser, helpers::resource::StateResource, state::State};
type OpenAlertMap = HashMap<ResourceTarget, HashMap<AlertDataVariant, Alert>>;
type OpenAlertMap<T = AlertDataVariant> = HashMap<ResourceTarget, HashMap<T, Alert>>;
type OpenDiskAlertMap = OpenAlertMap<PathBuf>;
type OpenTempAlertMap = OpenAlertMap<String>;
impl State {
pub async fn alert_servers(&self) {
@@ -37,7 +39,7 @@ impl State {
return;
}
let alerts = alerts.unwrap();
let (alerts, disk_alerts, temp_alerts) = alerts.unwrap();
let mut alerts_to_open = Vec::<Alert>::new();
let mut alerts_to_update = Vec::<Alert>::new();
@@ -265,8 +267,119 @@ impl State {
// SERVER DISK
// ===================
// alerts possible on multiple disks make this complicated (multiple ServerDisk alerts possible on same server)
// should handle disk alerts seperately
let server_disk_alerts =
disk_alerts.get(&ResourceTarget::Server(server_status.id.clone()));
for (path, health) in &health.disks {
let disk_alert = server_disk_alerts
.as_ref()
.and_then(|alerts| alerts.get(path))
.cloned();
match (*health, disk_alert) {
(SeverityLevel::Warning | SeverityLevel::Critical, None) => {
let disk = server_status
.stats
.as_ref()
.and_then(|s| s.disk.disks.iter().find(|disk| disk.mount == *path));
let alert = Alert {
id: Default::default(),
ts: monitor_timestamp(),
resolved: false,
resolved_ts: None,
level: *health,
target: ResourceTarget::Server(server_status.id.clone()),
variant: AlertDataVariant::ServerDisk,
data: AlertData::ServerDisk {
id: server_status.id.clone(),
name: server.name.clone(),
region: optional_string(&server.info.region),
path: path.to_owned(),
total_gb: disk.map(|d| d.total_gb).unwrap_or_default(),
used_gb: disk.map(|d| d.used_gb).unwrap_or_default(),
},
};
alerts_to_open.push(alert);
}
(SeverityLevel::Warning | SeverityLevel::Critical, Some(mut alert)) => {
if *health != alert.level {
let disk = server_status
.stats
.as_ref()
.and_then(|s| s.disk.disks.iter().find(|disk| disk.mount == *path));
alert.level = *health;
alert.data = AlertData::ServerDisk {
id: server_status.id.clone(),
name: server.name.clone(),
region: optional_string(&server.info.region),
path: path.to_owned(),
total_gb: disk.map(|d| d.total_gb).unwrap_or_default(),
used_gb: disk.map(|d| d.used_gb).unwrap_or_default(),
};
alerts_to_update.push(alert);
}
}
(SeverityLevel::Ok, Some(alert)) => alert_ids_to_close.push(alert.id.clone()),
_ => {}
}
}
// ===================
// SERVER TEMP
// ===================
let server_temp_alerts =
temp_alerts.get(&ResourceTarget::Server(server_status.id.clone()));
for (component, health) in &health.temps {
let temp_alert = server_temp_alerts
.as_ref()
.and_then(|alerts| alerts.get(component))
.cloned();
match (*health, temp_alert) {
(SeverityLevel::Warning | SeverityLevel::Critical, None) => {
let comp = server_status.stats.as_ref().and_then(|s| {
s.components.iter().find(|comp| comp.label == *component)
});
let alert = Alert {
id: Default::default(),
ts: monitor_timestamp(),
resolved: false,
resolved_ts: None,
level: *health,
target: ResourceTarget::Server(server_status.id.clone()),
variant: AlertDataVariant::ServerTemp,
data: AlertData::ServerTemp {
id: server_status.id.clone(),
name: server.name.clone(),
region: optional_string(&server.info.region),
component: component.to_owned(),
temp: comp.map(|c| c.temp).unwrap_or_default() as f64,
max: comp.map(|c| c.max).unwrap_or_default() as f64,
},
};
alerts_to_open.push(alert);
}
(SeverityLevel::Warning | SeverityLevel::Critical, Some(mut alert)) => {
if *health != alert.level {
let comp = server_status.stats.as_ref().and_then(|s| {
s.components.iter().find(|comp| comp.label == *component)
});
alert.level = *health;
alert.data = AlertData::ServerTemp {
id: server_status.id.clone(),
name: server.name.clone(),
region: optional_string(&server.info.region),
component: component.to_owned(),
temp: comp.map(|c| c.temp).unwrap_or_default() as f64,
max: comp.map(|c| c.max).unwrap_or_default() as f64,
};
alerts_to_update.push(alert);
}
}
(SeverityLevel::Ok, Some(alert)) => alert_ids_to_close.push(alert.id.clone()),
_ => {}
}
}
}
tokio::join!(
@@ -353,7 +466,9 @@ impl State {
}
}
async fn get_open_alerts(&self) -> anyhow::Result<OpenAlertMap> {
async fn get_open_alerts(
&self,
) -> anyhow::Result<(OpenAlertMap, OpenDiskAlertMap, OpenTempAlertMap)> {
let alerts = self
.db
.alerts
@@ -362,13 +477,27 @@ impl State {
.context("failed to get open alerts from db")?;
let mut map = OpenAlertMap::new();
let mut disk_map = OpenDiskAlertMap::new();
let mut temp_map = OpenTempAlertMap::new();
for alert in alerts {
let inner = map.entry(alert.target.clone()).or_default();
inner.insert(alert.variant, alert);
match &alert.data {
AlertData::ServerDisk { path, .. } => {
let inner = disk_map.entry(alert.target.clone()).or_default();
inner.insert(path.to_owned(), alert);
}
AlertData::ServerTemp { component, .. } => {
let inner = temp_map.entry(alert.target.clone()).or_default();
inner.insert(component.to_owned(), alert);
}
_ => {
let inner = map.entry(alert.target.clone()).or_default();
inner.insert(alert.variant, alert);
}
}
}
Ok(map)
Ok((map, disk_map, temp_map))
}
async fn get_all_servers_map(&self) -> anyhow::Result<HashMap<String, ServerListItem>> {

View File

@@ -1,3 +1,5 @@
use std::path::PathBuf;
use derive_variants::EnumVariants;
use mungos::{
derive::{MungosIndexed, StringObjectId},
@@ -82,7 +84,7 @@ pub enum AlertData {
id: String,
name: String,
region: Option<String>,
path: String,
path: PathBuf,
used_gb: f64,
total_gb: f64,
},
@@ -90,6 +92,7 @@ pub enum AlertData {
id: String,
name: String,
region: Option<String>,
component: String,
temp: f64,
max: f64,
},