imporve slack alerting

This commit is contained in:
mbecker20
2024-06-22 01:10:13 -07:00
parent 6fe250244b
commit 8fee04607d
2 changed files with 185 additions and 92 deletions

View File

@@ -180,18 +180,36 @@ async fn send_slack_alert(
percentage,
} => {
let region = fmt_region(region);
let text = format!("{level} | *{name}*{region} cpu usage at *{percentage:.1}%* 📈 🚨");
let blocks = vec![
Block::header(format!("{level} 🚨")),
Block::section(format!(
"*{name}*{region} cpu usage at *{percentage:.1}%* 📈 🚨"
)),
Block::section(resource_link(
ResourceTargetVariant::Server,
id,
)),
];
(text, blocks.into())
match alert.level {
SeverityLevel::Ok => {
let text = format!("{level} | *{name}*{region} cpu usage at *{percentage:.1}%* ✅");
let blocks = vec![
Block::header(format!("{level}")),
Block::section(format!(
"*{name}*{region} cpu usage at *{percentage:.1}%*"
)),
Block::section(resource_link(
ResourceTargetVariant::Server,
id,
)),
];
(text, blocks.into())
}
_ => {
let text = format!("{level} | *{name}*{region} cpu usage at *{percentage:.1}%* 📈 🚨");
let blocks = vec![
Block::header(format!("{level} 🚨")),
Block::section(format!(
"*{name}*{region} cpu usage at *{percentage:.1}%* 📈"
)),
Block::section(resource_link(
ResourceTargetVariant::Server,
id,
)),
];
(text, blocks.into())
}
}
}
AlertData::ServerMem {
id,
@@ -202,22 +220,42 @@ async fn send_slack_alert(
} => {
let region = fmt_region(region);
let percentage = 100.0 * used_gb / total_gb;
let text =
format!("{level} | *{name}*{region} memory usage at *{percentage:.1}%* 💾 🚨");
let blocks = vec![
Block::header(level),
Block::section(format!(
"*{name}*{region} memory usage at *{percentage:.1}%* 💾 🚨"
)),
Block::section(format!(
"using *{used_gb:.1} GiB* / *{total_gb:.1} GiB*"
)),
Block::section(resource_link(
ResourceTargetVariant::Server,
id,
)),
];
(text, blocks.into())
match alert.level {
SeverityLevel::Ok => {
let text = format!("{level} | *{name}*{region} memory usage at *{percentage:.1}%* 💾 ✅");
let blocks = vec![
Block::header(format!("{level}")),
Block::section(format!(
"*{name}*{region} memory usage at *{percentage:.1}%* 💾"
)),
Block::section(format!(
"using *{used_gb:.1} GiB* / *{total_gb:.1} GiB*"
)),
Block::section(resource_link(
ResourceTargetVariant::Server,
id,
)),
];
(text, blocks.into())
}
_ => {
let text = format!("{level} | *{name}*{region} memory usage at *{percentage:.1}%* 💾 🚨");
let blocks = vec![
Block::header(format!("{level} 🚨")),
Block::section(format!(
"*{name}*{region} memory usage at *{percentage:.1}%* 💾"
)),
Block::section(format!(
"using *{used_gb:.1} GiB* / *{total_gb:.1} GiB*"
)),
Block::section(resource_link(
ResourceTargetVariant::Server,
id,
)),
];
(text, blocks.into())
}
}
}
AlertData::ServerDisk {
id,
@@ -229,18 +267,36 @@ async fn send_slack_alert(
} => {
let region = fmt_region(region);
let percentage = 100.0 * used_gb / total_gb;
let text = format!("{level} | *{name}*{region} disk usage at *{percentage:.1}%* | mount point: *{path:?}* 💿 🚨");
let blocks = vec![
Block::header(level),
Block::section(format!(
"*{name}*{region} disk usage at *{percentage:.1}%* 💿 🚨"
)),
Block::section(format!(
"mount point: {path:?} | using *{used_gb:.1} GiB* / *{total_gb:.1} GiB*"
)),
Block::section(resource_link(ResourceTargetVariant::Server, id)),
];
(text, blocks.into())
match alert.level {
SeverityLevel::Ok => {
let text = format!("{level} | *{name}*{region} disk usage at *{percentage:.1}%* | mount point: *{path:?}* 💿 ✅");
let blocks = vec![
Block::header(format!("{level}")),
Block::section(format!(
"*{name}*{region} disk usage at *{percentage:.1}%* 💿"
)),
Block::section(format!(
"mount point: {path:?} | using *{used_gb:.1} GiB* / *{total_gb:.1} GiB*"
)),
Block::section(resource_link(ResourceTargetVariant::Server, id)),
];
(text, blocks.into())
}
_ => {
let text = format!("{level} | *{name}*{region} disk usage at *{percentage:.1}%* | mount point: *{path:?}* 💿 🚨");
let blocks = vec![
Block::header(format!("{level} 🚨")),
Block::section(format!(
"*{name}*{region} disk usage at *{percentage:.1}%* 💿"
)),
Block::section(format!(
"mount point: {path:?} | using *{used_gb:.1} GiB* / *{total_gb:.1} GiB*"
)),
Block::section(resource_link(ResourceTargetVariant::Server, id)),
];
(text, blocks.into())
}
}
}
AlertData::ContainerStateChange {
name,
@@ -269,12 +325,12 @@ async fn send_slack_alert(
message,
} => {
let text = format!(
"{level} | Failed to terminated AWS builder instance"
"{level} | Failed to terminated AWS builder instance 🚨"
);
let blocks = vec![
Block::header(text.clone()),
Block::section(format!(
"instance id: **{instance_id}**\n{message}"
"instance id: *{instance_id}*\n{message}"
)),
];
(text, blocks.into())
@@ -285,7 +341,7 @@ async fn send_slack_alert(
let blocks = vec![
Block::header(text.clone()),
Block::section(format!(
"sync id: **{id}**\nsync name: **{name}**",
"sync id: *{id}*\nsync name: *{name}*",
)),
Block::section(resource_link(
ResourceTargetVariant::ResourceSync,
@@ -300,20 +356,23 @@ async fn send_slack_alert(
version,
err,
} => {
let text = format!("{level} | Build {name} has failed");
let text = format!("{level} | Build {name} has failed 🚨");
let err = err
.as_ref()
.map(|log| {
format!(
"\nfailed at stage: {}\nstdout: {}\nstderr: {}",
log.stage, log.stdout, log.stderr
)
let stdout = (!log.stdout.is_empty())
.then(|| format!("\nstdout: {}", log.stdout))
.unwrap_or_default();
let stderr = (!log.stderr.is_empty())
.then(|| format!("\nstderr: {}", log.stderr))
.unwrap_or_default();
format!("\nfailed at stage: {}{stdout}{stderr}", log.stage)
})
.unwrap_or_default();
let blocks = vec![
Block::header(text.clone()),
Block::section(format!(
"build id: **{id}**\nbuild name: **{name}**\nversion: v{version}{err}",
"build id: *{id}*\nbuild name: *{name}*\nversion: v{version}{err}",
)),
Block::section(resource_link(ResourceTargetVariant::Build, id))
];

View File

@@ -42,7 +42,7 @@ pub async fn alert_servers(
let mut alerts_to_open = Vec::<(Alert, SendAlerts)>::new();
let mut alerts_to_update = Vec::<(Alert, SendAlerts)>::new();
let mut alert_ids_to_close = Vec::<(String, SendAlerts)>::new();
let mut alert_ids_to_close = Vec::<(Alert, SendAlerts)>::new();
for server_status in server_statuses {
let Some(server) = servers.remove(&server_status.id) else {
@@ -101,13 +101,10 @@ pub async fn alert_servers(
}
// Close an open alert
(
ServerState::Ok | ServerState::Disabled,
Some(health_alert),
) => alert_ids_to_close.push((
health_alert.id.clone(),
server.info.send_unreachable_alerts,
)),
(ServerState::Ok | ServerState::Disabled, Some(alert)) => {
alert_ids_to_close
.push((alert.clone(), server.info.send_unreachable_alerts));
}
_ => {}
}
@@ -165,8 +162,20 @@ pub async fn alert_servers(
alerts_to_update.push((alert, server.info.send_cpu_alerts));
}
}
(SeverityLevel::Ok, Some(alert)) => alert_ids_to_close
.push((alert.id.clone(), server.info.send_cpu_alerts)),
(SeverityLevel::Ok, Some(alert)) => {
let mut alert = alert.clone();
alert.data = AlertData::ServerCpu {
id: server_status.id.clone(),
name: server.name.clone(),
region: optional_string(&server.info.region),
percentage: server_status
.stats
.as_ref()
.map(|s| s.cpu_perc as f64)
.unwrap_or(0.0),
};
alert_ids_to_close.push((alert, server.info.send_cpu_alerts))
}
_ => {}
}
@@ -229,8 +238,25 @@ pub async fn alert_servers(
alerts_to_update.push((alert, server.info.send_mem_alerts));
}
}
(SeverityLevel::Ok, Some(alert)) => alert_ids_to_close
.push((alert.id.clone(), server.info.send_mem_alerts)),
(SeverityLevel::Ok, Some(alert)) => {
let mut alert = alert.clone();
alert.data = AlertData::ServerMem {
id: server_status.id.clone(),
name: server.name.clone(),
region: optional_string(&server.info.region),
total_gb: server_status
.stats
.as_ref()
.map(|s| s.mem_total_gb)
.unwrap_or(0.0),
used_gb: server_status
.stats
.as_ref()
.map(|s| s.mem_used_gb)
.unwrap_or(0.0),
};
alert_ids_to_close.push((alert, server.info.send_mem_alerts))
}
_ => {}
}
@@ -291,8 +317,23 @@ pub async fn alert_servers(
.push((alert, server.info.send_disk_alerts));
}
}
(SeverityLevel::Ok, Some(alert)) => alert_ids_to_close
.push((alert.id.clone(), server.info.send_disk_alerts)),
(SeverityLevel::Ok, Some(alert)) => {
let mut alert = alert.clone();
let disk = server_status.stats.as_ref().and_then(|stats| {
stats.disks.iter().find(|disk| disk.mount == *path)
});
alert.level = *health;
alert.data = AlertData::ServerDisk {
id: server_status.id.clone(),
name: server.name.clone(),
region: optional_string(&server.info.region),
path: path.to_owned(),
total_gb: disk.map(|d| d.total_gb).unwrap_or_default(),
used_gb: disk.map(|d| d.used_gb).unwrap_or_default(),
};
alert_ids_to_close
.push((alert, server.info.send_disk_alerts))
}
_ => {}
}
}
@@ -402,22 +443,20 @@ async fn update_alerts(alerts: &[(Alert, SendAlerts)]) {
}
#[instrument(level = "debug")]
async fn resolve_alerts(alert_ids: &[(String, SendAlerts)]) {
if alert_ids.is_empty() {
async fn resolve_alerts(alerts: &[(Alert, SendAlerts)]) {
if alerts.is_empty() {
return;
}
let send_alerts_map =
alert_ids.iter().cloned().collect::<HashMap<_, _>>();
let close = || async {
let alert_ids = alert_ids
let close = || async move {
let alert_ids = alerts
.iter()
.map(|(id, _)| {
ObjectId::from_str(id)
.map(|(alert, _)| {
ObjectId::from_str(&alert.id)
.context("failed to convert alert id to ObjectId")
})
.collect::<anyhow::Result<Vec<_>>>()?;
db_client()
.await
.alerts
@@ -432,28 +471,23 @@ async fn resolve_alerts(alert_ids: &[(String, SendAlerts)]) {
None,
)
.await
.context("failed to resolve alerts on db")?;
let mut closed = find_collect(
&db_client().await.alerts,
doc! { "_id": { "$in": &alert_ids } },
None,
)
.await
.context("failed to get closed alerts from db")?;
.context("failed to resolve alerts on db")
.inspect_err(|e| warn!("{e:#}"))
.ok();
for closed in &mut closed {
closed.level = SeverityLevel::Ok;
}
let ts = monitor_timestamp();
let closed = closed
.into_iter()
.filter(|closed| {
if let ResourceTarget::Server(id) = &closed.target {
send_alerts_map.get(id).cloned().unwrap_or(true)
} else {
error!("got resource target other than server in resolve_server_alerts");
true
}
let closed = alerts
.iter()
.filter(|(_, send)| *send)
.map(|(alert, _)| {
let mut alert = alert.clone();
alert.resolved = true;
alert.resolved_ts = Some(ts);
alert.level = SeverityLevel::Ok;
alert
})
.collect::<Vec<_>>();