Compare commits

..

21 Commits
1.4.9 ... 1.6.0

Author SHA1 Message Date
Owen
527321a415 Update cicd 2025-10-27 21:26:14 -07:00
Owen
8d3ae5afd7 Add doc for SKIP_TLS_VERIFY 2025-10-27 21:25:12 -07:00
Owen Schwartz
f1e07272bd Merge pull request #166 from marcschaeferger/gh-action
Adding GHCR to CI/CD Release Workflow & further improvements
2025-10-20 17:21:05 -07:00
Marc Schäfer
a1a3d63fcf ci(actions): change runner from ubuntu-latest to amd64-runner for CI/CD workflows 2025-10-21 02:17:49 +02:00
Marc Schäfer
2a273dc435 ci(actions): add GHCR mirroring and cosign signing for Docker images
- mirror images from Docker Hub to GHCR using skopeo (preserves multi-arch manifests)
- login to GHCR via docker/login-action for signing/pushing
- install cosign and perform dual signing: keyless (OIDC) + key-based; verify signatures
- add required permissions for id-token/packages and reference necessary secrets
2025-10-21 00:22:32 +02:00
Marc Schäfer
ec05686523 ci(actions): pin action versions to commit SHAs for security
- Pin actions/checkout to SHA for v5.0.0
- Pin docker/setup-qemu-action to SHA for v3.6.0
- Pin docker/setup-buildx-action to SHA for v3.11.1
- Pin docker/login-action to SHA for v3.6.0
- Pin actions/setup-go to SHA for v6.0.0
- Pin actions/upload-artifact to SHA for v4.6.2
2025-10-21 00:21:28 +02:00
Owen Schwartz
915e7e44d1 Merge pull request #165 from marcschaeferger/ghcr
feat(actions): Sync Images from Docker to GHCR
2025-10-20 12:32:41 -07:00
Marc Schäfer
a729b91ac3 feat(actions): Sync Images from Docker to GHCR 2025-10-20 21:30:31 +02:00
Owen
ddc37658df Update domain 2025-10-19 15:12:15 -07:00
Owen
7c780f7a4f Merge branch 'dev' of github.com:fosrl/newt into dev 2025-10-16 21:09:41 -07:00
Owen
6b1c1ed077 Merge branch 'main' of github.com:fosrl/newt 2025-10-16 21:06:33 -07:00
Owen Schwartz
7a07437b22 Merge pull request #162 from marcschaeferger/otel
Adding OpenTelemetry Metrics and Tracing
2025-10-16 20:48:37 -07:00
Owen
d63d8d6f5e Add log message that the server is on 2025-10-16 20:42:02 -07:00
Owen
bda1d04f67 Add documentation for cli and reporg 2025-10-16 20:39:41 -07:00
Owen
7f8ee37c7f Update runner 2025-10-16 17:51:25 -07:00
Marc Schäfer
6d2073a478 Remove Coolify Code 2025-10-11 18:46:02 +02:00
Owen Schwartz
6048f244f1 Merge pull request #158 from fosrl/dependabot/go_modules/prod-patch-updates-46361b25de
Bump github.com/docker/docker from 28.5.0+incompatible to 28.5.1+incompatible in the prod-patch-updates group
2025-10-11 09:41:30 -07:00
Owen Schwartz
9fec22a53b Merge pull request #159 from fosrl/dependabot/go_modules/prod-minor-updates-a55d2abe4a
Bump the prod-minor-updates group with 2 updates
2025-10-11 09:41:20 -07:00
Marc Schäfer
c086e69dd0 Adding OpenTelemetry Metrics and Tracing 2025-10-11 18:19:51 +02:00
dependabot[bot]
c729ab5fc6 Bump the prod-minor-updates group with 2 updates
Bumps the prod-minor-updates group with 2 updates: [golang.org/x/crypto](https://github.com/golang/crypto) and [golang.org/x/net](https://github.com/golang/net).


Updates `golang.org/x/crypto` from 0.42.0 to 0.43.0
- [Commits](https://github.com/golang/crypto/compare/v0.42.0...v0.43.0)

Updates `golang.org/x/net` from 0.45.0 to 0.46.0
- [Commits](https://github.com/golang/net/compare/v0.45.0...v0.46.0)

---
updated-dependencies:
- dependency-name: golang.org/x/crypto
  dependency-version: 0.43.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-minor-updates
- dependency-name: golang.org/x/net
  dependency-version: 0.46.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: prod-minor-updates
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-10-09 09:34:56 +00:00
dependabot[bot]
552617cbb5 Bump github.com/docker/docker in the prod-patch-updates group
Bumps the prod-patch-updates group with 1 update: [github.com/docker/docker](https://github.com/docker/docker).


Updates `github.com/docker/docker` from 28.5.0+incompatible to 28.5.1+incompatible
- [Release notes](https://github.com/docker/docker/releases)
- [Commits](https://github.com/docker/docker/compare/v28.5.0...v28.5.1)

---
updated-dependencies:
- dependency-name: github.com/docker/docker
  dependency-version: 28.5.1+incompatible
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: prod-patch-updates
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-10-09 09:34:49 +00:00
27 changed files with 151 additions and 2783 deletions

View File

@@ -25,7 +25,7 @@ concurrency:
jobs:
release:
name: Build and Release
runs-on: ubuntu-latest
runs-on: amd64-runner
# Job-level timeout to avoid runaway or stuck runs
timeout-minutes: 120
env:
@@ -78,13 +78,6 @@ jobs:
echo "Built & pushed to: ${{ env.DOCKERHUB_IMAGE }}:${TAG}"
shell: bash
- name: Login in to GHCR
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Install skopeo + jq
# skopeo: copy/inspect images between registries
# jq: JSON parsing tool used to extract digest values
@@ -94,6 +87,11 @@ jobs:
skopeo --version
shell: bash
- name: Login to GHCR
run: |
skopeo login ghcr.io -u "${{ github.actor }}" -p "${{ secrets.GITHUB_TOKEN }}"
shell: bash
- name: Copy tag from Docker Hub to GHCR
# Mirror the already-built image (all architectures) to GHCR so we can sign it
run: |

View File

@@ -14,7 +14,7 @@ env:
jobs:
mirror-and-dual-sign:
runs-on: ubuntu-latest
runs-on: amd64-runner
steps:
- name: Install skopeo + jq
run: |
@@ -36,17 +36,11 @@ jobs:
run: |
skopeo login ghcr.io -u "${{ github.actor }}" -p "${{ secrets.GITHUB_TOKEN }}"
# >>> IMPORTANT: Auth for cosign (docker-config) <<<
# Auth for cosign (docker-config)
- name: Docker login to GHCR (for cosign)
run: |
echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
# Optional (if Docker Hub private / tight limits)
# - name: Login to Docker Hub (skopeo and cosign share this via docker login)
# run: |
# echo "${{ secrets.DOCKERHUB_TOKEN }}" | docker login docker.io -u "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin
# skopeo login docker.io -u "${{ secrets.DOCKERHUB_USERNAME }}" -p "${{ secrets.DOCKERHUB_TOKEN }}"
- name: List source tags
run: |
set -euo pipefail
@@ -136,4 +130,3 @@ jobs:
echo "Skipped : $skipped"
echo "Verified OK : $v_ok"
echo "Errors : $errs"

View File

@@ -11,7 +11,7 @@ on:
jobs:
test:
runs-on: ubuntu-latest
runs-on: amd64-runner
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

1
.gitignore vendored
View File

@@ -6,4 +6,3 @@ nohup.out
*.iml
certs/
newt_arm64
.env

View File

@@ -4,11 +4,7 @@ Contributions are welcome!
Please see the contribution and local development guide on the docs page before getting started:
https://docs.fossorial.io/development
For ideas about what features to work on and our future plans, please see the roadmap:
https://docs.fossorial.io/roadmap
https://docs.pangolin.net/development/contributing
### Licensing Considerations
@@ -21,4 +17,4 @@ By creating this pull request, I grant the project maintainers an unlimited,
perpetual license to use, modify, and redistribute these contributions under any terms they
choose, including both the AGPLv3 and the Fossorial Commercial license terms. I
represent that I have the right to grant this license for all contributed content.
```
```

View File

@@ -1,5 +1,3 @@
#ghcr.io/marcschaeferger/newt-private:1.0.0-otel
#tademsh/newt:1.0.0-otel
FROM golang:1.25-alpine AS builder
# Install git and ca-certificates
@@ -11,13 +9,8 @@ WORKDIR /app
# Copy go mod and sum files
COPY go.mod go.sum ./
# Coolify specific Test - set Go proxy to direct to avoid issues
# ENV GOSUMDB=off
ENV GOPROXY=https://goproxy.io,https://proxy.golang.org,direct
RUN go env | grep -E 'GOPROXY|GOSUMDB|GOPRIVATE' && go mod download
# Download all dependencies
#RUN go mod download
RUN go mod download
# Copy the source code into the container
COPY . .

202
README.md
View File

@@ -1,18 +1,15 @@
<!-- markdownlint-disable MD033 -->
# Newt
[![PkgGoDev](https://pkg.go.dev/badge/github.com/fosrl/newt)](https://pkg.go.dev/github.com/fosrl/newt)
[![GitHub License](https://img.shields.io/github/license/fosrl/newt)](https://github.com/fosrl/newt/blob/main/LICENSE)
[![Go Report Card](https://goreportcard.com/badge/github.com/fosrl/newt)](https://goreportcard.com/report/github.com/fosrl/newt)
Newt is a fully user space [WireGuard](https://www.wireguard.com/) tunnel client and TCP/UDP proxy, designed to securely expose private resources controlled by Pangolin. By using Newt, you don't need to manage complex WireGuard tunnels and NATing.
## Installation and Documentation
### Installation and Documentation
Newt is used with Pangolin and Gerbil as part of the larger system. See documentation below:
- [Full Documentation](https://docs.fossorial.io)
- Observability Quickstart: see `docs/observability.md` — canonical Prometheus/OTel Collector quickstart and smoke tests
- [Full Documentation](https://docs.pangolin.net)
## Preview
@@ -36,91 +33,137 @@ When Newt receives WireGuard control messages, it will use the information encod
## CLI Args
### Core Configuration
- `id`: Newt ID generated by Pangolin to identify the client.
- `secret`: A unique secret (not shared and kept private) used to authenticate the client ID with the websocket in order to receive commands.
- `endpoint`: The endpoint where both Gerbil and Pangolin reside in order to connect to the websocket.
- `mtu` (optional): MTU for the internal WG interface. Default: 1280
- `dns` (optional): DNS server to use to resolve the endpoint. Default: 9.9.9.9
- `blueprint-file` (optional): Path to blueprint file to define Pangolin resources and configurations.
- `no-cloud` (optional): Don't fail over to the cloud when using managed nodes in Pangolin Cloud. Default: false
- `log-level` (optional): The log level to use (DEBUG, INFO, WARN, ERROR, FATAL). Default: INFO
- `enforce-hc-cert` (optional): Enforce certificate validation for health checks. Default: false (accepts any cert)
### Docker Integration
- `docker-socket` (optional): Set the Docker socket to use the container discovery integration
- `ping-interval` (optional): Interval for pinging the server. Default: 3s
- `ping-timeout` (optional): Timeout for each ping. Default: 5s
- `updown` (optional): A script to be called when targets are added or removed.
- `tls-client-cert` (optional): Client certificate (p12 or pfx) for mTLS. See [mTLS](#mtls)
- `tls-client-cert` (optional): Path to client certificate (PEM format, optional if using PKCS12). See [mTLS](#mtls)
- `tls-client-key` (optional): Path to private key for mTLS (PEM format, optional if using PKCS12)
- `tls-ca-cert` (optional): Path to CA certificate to verify server (PEM format, optional if using PKCS12)
- `docker-enforce-network-validation` (optional): Validate the container target is on the same network as the newt process. Default: false
- `health-file` (optional): Check if connection to WG server (pangolin) is ok. creates a file if ok, removes it if not ok. Can be used with docker healtcheck to restart newt
### Accpet Client Connection
- `accept-clients` (optional): Enable WireGuard server mode to accept incoming newt client connections. Default: false
- `generateAndSaveKeyTo` (optional): Path to save generated private key
- `native` (optional): Use native WireGuard interface when accepting clients (requires WireGuard kernel module and Linux, must run as root). Default: false (uses userspace netstack)
- `interface` (optional): Name of the WireGuard interface. Default: newt
- `keep-interface` (optional): Keep the WireGuard interface. Default: false
- `blueprint-file` (optional): Path to blueprint file to define Pangolin resources and configurations.
- `no-cloud` (optional): Don't fail over to the cloud when using managed nodes in Pangolin Cloud. Default: false
### Metrics & Observability
- `metrics` (optional): Enable Prometheus /metrics exporter. Default: true
- `otlp` (optional): Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT. Default: false
- `metrics-admin-addr` (optional): Admin/metrics bind address. Default: 127.0.0.1:2112
- `metrics-async-bytes` (optional): Enable async bytes counting (background flush; lower hot path overhead). Default: false
- `region` (optional): Optional region resource attribute for telemetry and metrics.
### Network Configuration
- `mtu` (optional): MTU for the internal WG interface. Default: 1280
- `dns` (optional): DNS server to use to resolve the endpoint. Default: 9.9.9.9
- `ping-interval` (optional): Interval for pinging the server. Default: 3s
- `ping-timeout` (optional): Timeout for each ping. Default: 5s
### Security & TLS
- `enforce-hc-cert` (optional): Enforce certificate validation for health checks. Default: false (accepts any cert)
- `tls-client-cert` (optional): Client certificate (p12 or pfx) for mTLS or path to client certificate (PEM format). See [mTLS](#mtls)
- `tls-client-key` (optional): Path to private key for mTLS (PEM format, optional if using PKCS12)
- `tls-ca-cert` (optional): Path to CA certificate to verify server (PEM format, optional if using PKCS12)
### Monitoring & Health
- `health-file` (optional): Check if connection to WG server (pangolin) is ok. creates a file if ok, removes it if not ok. Can be used with docker healtcheck to restart newt
- `updown` (optional): A script to be called when targets are added or removed.
## Environment Variables
All CLI arguments can be set using environment variables as an alternative to command line flags. Environment variables are particularly useful when running Newt in containerized environments.
### Core Configuration
- `PANGOLIN_ENDPOINT`: Endpoint of your pangolin server (equivalent to `--endpoint`)
- `NEWT_ID`: Newt ID generated by Pangolin (equivalent to `--id`)
- `NEWT_SECRET`: Newt secret for authentication (equivalent to `--secret`)
- `MTU`: MTU for the internal WG interface. Default: 1280 (equivalent to `--mtu`)
- `DNS`: DNS server to use to resolve the endpoint. Default: 9.9.9.9 (equivalent to `--dns`)
- `CONFIG_FILE`: Load the config json from this file instead of in the home folder.
- `BLUEPRINT_FILE`: Path to blueprint file to define Pangolin resources and configurations. (equivalent to `--blueprint-file`)
- `NO_CLOUD`: Don't fail over to the cloud when using managed nodes in Pangolin Cloud. Default: false (equivalent to `--no-cloud`)
- `LOG_LEVEL`: Log level (DEBUG, INFO, WARN, ERROR, FATAL). Default: INFO (equivalent to `--log-level`)
### Docker Integration
- `DOCKER_SOCKET`: Path to Docker socket for container discovery (equivalent to `--docker-socket`)
- `PING_INTERVAL`: Interval for pinging the server. Default: 3s (equivalent to `--ping-interval`)
- `PING_TIMEOUT`: Timeout for each ping. Default: 5s (equivalent to `--ping-timeout`)
- `UPDOWN_SCRIPT`: Path to updown script for target add/remove events (equivalent to `--updown`)
- `TLS_CLIENT_CERT`: Path to client certificate for mTLS (equivalent to `--tls-client-cert`)
- `TLS_CLIENT_CERT`: Path to client certificate for mTLS (equivalent to `--tls-client-cert`)
- `TLS_CLIENT_KEY`: Path to private key for mTLS (equivalent to `--tls-client-key`)
- `TLS_CA_CERT`: Path to CA certificate to verify server (equivalent to `--tls-ca-cert`)
- `DOCKER_ENFORCE_NETWORK_VALIDATION`: Validate container targets are on same network. Default: false (equivalent to `--docker-enforce-network-validation`)
- `ENFORCE_HC_CERT`: Enforce certificate validation for health checks. Default: false (equivalent to `--enforce-hc-cert`)
- `HEALTH_FILE`: Path to health file for connection monitoring (equivalent to `--health-file`)
### Accept Client Connections
- `ACCEPT_CLIENTS`: Enable WireGuard server mode. Default: false (equivalent to `--accept-clients`)
- `GENERATE_AND_SAVE_KEY_TO`: Path to save generated private key (equivalent to `--generateAndSaveKeyTo`)
- `USE_NATIVE_INTERFACE`: Use native WireGuard interface (Linux only). Default: false (equivalent to `--native`)
- `INTERFACE`: Name of the WireGuard interface. Default: newt (equivalent to `--interface`)
- `KEEP_INTERFACE`: Keep the WireGuard interface after shutdown. Default: false (equivalent to `--keep-interface`)
- `CONFIG_FILE`: Load the config json from this file instead of in the home folder.
- `BLUEPRINT_FILE`: Path to blueprint file to define Pangolin resources and configurations. (equivalent to `--blueprint-file`)
- `NO_CLOUD`: Don't fail over to the cloud when using managed nodes in Pangolin Cloud. Default: false (equivalent to `--no-cloud`)
### Monitoring & Health
- `HEALTH_FILE`: Path to health file for connection monitoring (equivalent to `--health-file`)
- `UPDOWN_SCRIPT`: Path to updown script for target add/remove events (equivalent to `--updown`)
### Metrics & Observability
- `NEWT_METRICS_PROMETHEUS_ENABLED`: Enable Prometheus /metrics exporter. Default: true (equivalent to `--metrics`)
- `NEWT_METRICS_OTLP_ENABLED`: Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT. Default: false (equivalent to `--otlp`)
- `NEWT_ADMIN_ADDR`: Admin/metrics bind address. Default: 127.0.0.1:2112 (equivalent to `--metrics-admin-addr`)
- `NEWT_METRICS_ASYNC_BYTES`: Enable async bytes counting (background flush; lower hot path overhead). Default: false (equivalent to `--metrics-async-bytes`)
- `NEWT_REGION`: Optional region resource attribute for telemetry and metrics (equivalent to `--region`)
### Network Configuration
- `MTU`: MTU for the internal WG interface. Default: 1280 (equivalent to `--mtu`)
- `DNS`: DNS server to use to resolve the endpoint. Default: 9.9.9.9 (equivalent to `--dns`)
- `PING_INTERVAL`: Interval for pinging the server. Default: 3s (equivalent to `--ping-interval`)
- `PING_TIMEOUT`: Timeout for each ping. Default: 5s (equivalent to `--ping-timeout`)
### Security & TLS
- `ENFORCE_HC_CERT`: Enforce certificate validation for health checks. Default: false (equivalent to `--enforce-hc-cert`)
- `TLS_CLIENT_CERT`: Path to client certificate for mTLS (equivalent to `--tls-client-cert`)
- `TLS_CLIENT_KEY`: Path to private key for mTLS (equivalent to `--tls-client-key`)
- `TLS_CA_CERT`: Path to CA certificate to verify server (equivalent to `--tls-ca-cert`)
- `SKIP_TLS_VERIFY`: Skip TLS verification for server connections. Default: false
## Loading secrets from files
You can use `CONFIG_FILE` to define a location of a config file to store the credentials between runs.
You can use `CONFIG_FILE` to define a location of a config file to store the credentials between runs.
```sh
```
$ cat ~/.config/newt-client/config.json
{
"id": "spmzu8rbpzj1qq6",
"secret": "f6v61mjutwme2kkydbw3fjo227zl60a2tsf5psw9r25hgae3",
"endpoint": "https://pangolin.fossorial.io",
"endpoint": "https://app.pangolin.net",
"tlsClientCert": ""
}
```
This file is also written to when newt first starts up. So you do not need to run every time with --id and secret if you have run it once!
This file is also written to when newt first starts up. So you do not need to run every time with --id and secret if you have run it once!
Default locations:
Default locations:
- **macOS**: `~/Library/Application Support/newt-client/config.json`
- **Windows**: `%PROGRAMDATA%\newt\newt-client\config.json`
- **Linux/Others**: `~/.config/newt-client/config.json`
<!-- Observability Quickstart moved to docs/observability.md (canonical). -->
## Examples
**Note**: When both environment variables and CLI arguments are provided, CLI arguments take precedence.
- Example:
- Example:
```bash
newt \
@@ -167,16 +210,16 @@ When the `--accept-clients` flag is enabled (or `ACCEPT_CLIENTS=true` environmen
In client acceptance mode, Newt:
- **Creates a WireGuard service** that can accept incoming connections from other WireGuard clients
- **Starts a connection testing server** (WGTester) that responds to connectivity checks from remote clients
- **Manages peer configurations** dynamically based on Pangolin's instructions
- **Enables bidirectional communication** between the Newt instance and connected clients
- **Creates a WireGuard service** that can accept incoming connections from other WireGuard clients
- **Starts a connection testing server** (WGTester) that responds to connectivity checks from remote clients
- **Manages peer configurations** dynamically based on Pangolin's instructions
- **Enables bidirectional communication** between the Newt instance and connected clients
### Use Cases
- **Site-to-site connectivity**: Connect multiple locations through a central Newt instance
- **Client access to private networks**: Allow remote clients to access resources behind the Newt instance
- **Development environments**: Provide developers secure access to internal services
- **Site-to-site connectivity**: Connect multiple locations through a central Newt instance
- **Client access to private networks**: Allow remote clients to access resources behind the Newt instance
- **Development environments**: Provide developers secure access to internal services
### Client Tunneling Modes
@@ -186,11 +229,11 @@ Newt supports two WireGuard tunneling modes:
By default, Newt uses a fully userspace WireGuard implementation using [netstack](https://github.com/WireGuard/wireguard-go/blob/master/tun/netstack/examples/http_server.go). This mode:
- **Does not require root privileges**
- **Works on all supported platforms** (Linux, Windows, macOS)
- **Does not require WireGuard kernel module** to be installed
- **Runs entirely in userspace** - no system network interface is created
- **Is containerization-friendly** - works seamlessly in Docker containers
- **Does not require root privileges**
- **Works on all supported platforms** (Linux, Windows, macOS)
- **Does not require WireGuard kernel module** to be installed
- **Runs entirely in userspace** - no system network interface is created
- **Is containerization-friendly** - works seamlessly in Docker containers
This is the recommended mode for most deployments, especially containerized environments.
@@ -200,11 +243,11 @@ In this mode, TCP and UDP is proxied out of newt from the remote client using TC
When using the `--native` flag or setting `USE_NATIVE_INTERFACE=true`, Newt uses the native WireGuard kernel module. This mode:
- **Requires root privileges** to create and manage network interfaces
- **Only works on Linux** with the WireGuard kernel module installed
- **Creates a real network interface** (e.g., `newt0`) on the system
- **May offer better performance** for high-throughput scenarios
- **Requires proper network permissions** and may conflict with existing network configurations
- **Requires root privileges** to create and manage network interfaces
- **Only works on Linux** with the WireGuard kernel module installed
- **Creates a real network interface** (e.g., `newt0`) on the system
- **May offer better performance** for high-throughput scenarios
- **Requires proper network permissions** and may conflict with existing network configurations
In this mode it functions like a traditional VPN interface - all data arrives on the interface and you must get it to the destination (or access things locally).
@@ -236,10 +279,10 @@ services:
When client acceptance is enabled:
- **WGTester Server**: Runs on `port + 1` (e.g., if WireGuard uses port 51820, WGTester uses 51821)
- **Connection Testing**: Responds to UDP packets with magic header `0xDEADBEEF` for connectivity verification
- **Dynamic Configuration**: Peer configurations are managed remotely through Pangolin
- **Proxy Integration**: Can work with both userspace (netstack) and native WireGuard modes
- **WGTester Server**: Runs on `port + 1` (e.g., if WireGuard uses port 51820, WGTester uses 51821)
- **Connection Testing**: Responds to UDP packets with magic header `0xDEADBEEF` for connectivity verification
- **Dynamic Configuration**: Peer configurations are managed remotely through Pangolin
- **Proxy Integration**: Can work with both userspace (netstack) and native WireGuard modes
**Note**: Client acceptance mode requires coordination with Pangolin for peer management and configuration distribution.
@@ -253,23 +296,24 @@ You can specify the Docker socket path using the `--docker-socket` CLI argument
Supported values include:
- Local UNIX socket (default):
- Local UNIX socket (default):
>You must mount the socket file into the container using a volume, so Newt can access it.
`unix:///var/run/docker.sock`
- TCP socket (e.g., via Docker Socket Proxy):
- TCP socket (e.g., via Docker Socket Proxy):
`tcp://localhost:2375`
- HTTP/HTTPS endpoints (e.g., remote Docker APIs):
- HTTP/HTTPS endpoints (e.g., remote Docker APIs):
`http://your-host:2375`
- SSH connections (experimental, requires SSH setup):
- SSH connections (experimental, requires SSH setup):
`ssh://user@host`
```yaml
services:
newt:
@@ -284,17 +328,16 @@ services:
- NEWT_SECRET=nnisrfsdfc7prqsp9ewo1dvtvci50j5uiqotez00dgap0ii2
- DOCKER_SOCKET=unix:///var/run/docker.sock
```
>If you previously used just a path like `/var/run/docker.sock`, it still works — Newt assumes it is a UNIX socket by default.
#### Hostnames vs IPs
When the Docker Socket Integration is used, depending on the network which Newt is run with, either the hostname (generally considered the container name) or the IP address of the container will be sent to Pangolin. Here are some of the scenarios where IPs or hostname of the container will be utilised:
- **Running in Network Mode 'host'**: IP addresses will be used
- **Running in Network Mode 'bridge'**: IP addresses will be used
- **Running in docker-compose without a network specification**: Docker compose creates a network for the compose by default, hostnames will be used
- **Running on docker-compose with defined network**: Hostnames will be used
- **Running in Network Mode 'host'**: IP addresses will be used
- **Running in Network Mode 'bridge'**: IP addresses will be used
- **Running in docker-compose without a network specification**: Docker compose creates a network for the compose by default, hostnames will be used
- **Running on docker-compose with defined network**: Hostnames will be used
### Docker Enforce Network Validation
@@ -330,12 +373,12 @@ Newt supports mutual TLS (mTLS) authentication if the server is configured to re
> This is the original method and still supported.
- File must contain:
* File must contain:
- Client private key
- Public certificate
- CA certificate
- Encrypted `.p12` files are **not supported**
* Client private key
* Public certificate
* CA certificate
* Encrypted `.p12` files are **not supported**
Example:
@@ -351,9 +394,9 @@ newt \
You can now provide separate files for:
- `--tls-client-cert`: client certificate (`.crt` or `.pem`)
- `--tls-client-key`: client private key (`.key` or `.pem`)
- `--tls-ca-cert`: CA cert to verify the server
* `--tls-client-cert`: client certificate (`.crt` or `.pem`)
* `--tls-client-key`: client private key (`.key` or `.pem`)
* `--tls-ca-cert`: CA cert to verify the server
Example:
@@ -367,6 +410,7 @@ newt \
--tls-ca-cert ./ca.crt
```
```yaml
services:
newt:

View File

@@ -3,7 +3,7 @@
If you discover a security vulnerability, please follow the steps below to responsibly disclose it to us:
1. **Do not create a public GitHub issue or discussion post.** This could put the security of other users at risk.
2. Send a detailed report to [security@fossorial.io](mailto:security@fossorial.io) or send a **private** message to a maintainer on [Discord](https://discord.gg/HCJR8Xhme4). Include:
2. Send a detailed report to [security@pangolin.net](mailto:security@pangolin.net) or send a **private** message to a maintainer on [Discord](https://discord.gg/HCJR8Xhme4). Include:
- Description and location of the vulnerability.
- Potential impact of the vulnerability.

View File

@@ -12,9 +12,9 @@ resources:
sso-roles:
- Member
sso-users:
- owen@fossorial.io
- owen@pangolin.net
whitelist-users:
- owen@fossorial.io
- owen@pangolin.net
targets:
# - site: glossy-plains-viscacha-rat
- hostname: localhost

View File

@@ -1,32 +0,0 @@
services:
otel-collector:
image: otel/opentelemetry-collector:0.111.0
command: ["--config=/etc/otelcol/config.yaml"]
volumes:
- ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro
ports:
- "4317:4317" # OTLP gRPC in
- "8889:8889" # Prometheus scrape out
newt:
build: .
image: newt:dev
environment:
OTEL_SERVICE_NAME: newt
NEWT_METRICS_PROMETHEUS_ENABLED: "true"
NEWT_METRICS_OTLP_ENABLED: "true"
OTEL_EXPORTER_OTLP_ENDPOINT: "otel-collector:4317"
OTEL_EXPORTER_OTLP_INSECURE: "true"
OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE: "cumulative"
NEWT_ADMIN_ADDR: "0.0.0.0:2112"
ports:
- "2112:2112"
depends_on:
- otel-collector
prometheus:
image: prom/prometheus:v2.55.0
volumes:
- ./examples/prometheus.yml:/etc/prometheus/prometheus.yml:ro
ports:
- "9090:9090"

View File

@@ -1,3 +1,4 @@
name: Newt-Metrics
services:
# Recommended Variant A: Direct Prometheus scrape of Newt (/metrics)
# Optional: You may add the Collector service and enable OTLP export, but do NOT

View File

@@ -1,75 +0,0 @@
# Newt Metrics: Recommendations, Gaps, and Roadmap
This document captures the current state of Newt metrics, prioritized fixes, and a pragmatic roadmap for near-term improvements.
1) Current setup (summary)
- Export: Prometheus exposition (default), optional OTLP (gRPC)
- Existing instruments:
- Sites: newt_site_registrations_total, newt_site_online (0/1), newt_site_last_heartbeat_timestamp_seconds
- Tunnel/Traffic: newt_tunnel_sessions, newt_tunnel_bytes_total, newt_tunnel_latency_seconds, newt_tunnel_reconnects_total
- Connection lifecycle: newt_connection_attempts_total, newt_connection_errors_total
- Operations: newt_config_reloads_total, process_start_time_seconds, newt_build_info
- Operations: newt_config_reloads_total, process_start_time_seconds, newt_config_apply_seconds, newt_cert_rotation_total
- Build metadata: newt_build_info
- Control plane: newt_websocket_connect_latency_seconds, newt_websocket_messages_total, newt_websocket_connected, newt_websocket_reconnects_total
- Proxy: newt_proxy_active_connections, newt_proxy_buffer_bytes, newt_proxy_async_backlog_bytes, newt_proxy_drops_total, newt_proxy_accept_total, newt_proxy_connection_duration_seconds, newt_proxy_connections_total
- Go runtime: GC, heap, goroutines via runtime instrumentation
2) Main issues addressed now
- Attribute filter (allow-list) extended to include site_id and region in addition to existing keys (tunnel_id, transport, protocol, direction, result, reason, error_type, version, commit).
- site_id and region propagation: site_id/region remain resource attributes. Metric labels mirror them on per-site gauges and counters by default; set `NEWT_METRICS_INCLUDE_SITE_LABELS=false` to drop them for multi-tenant scrapes.
- Label semantics clarified:
- transport: control-plane mechanism (e.g., websocket, wireguard)
- protocol: L4 payload type (tcp, udp)
- newt_tunnel_bytes_total uses protocol and direction, not transport.
- Robustness improvements: removed duplicate clear logic on reconnect; avoided empty site_id by reading NEWT_SITE_ID/NEWT_ID and OTEL_RESOURCE_ATTRIBUTES.
3) Remaining gaps and deviations
- Some call sites still need initiator label on reconnect outcomes (client vs server). This is planned.
- Config apply duration and cert rotation counters are planned.
- Registration and config reload failures are not yet emitted; add failure code paths so result labels expose churn.
- Document using `process_start_time_seconds` (and `time()` in PromQL) to derive uptime; no explicit restart counter is needed.
- Metric helpers often use `context.Background()`. Where lightweight contexts exist (e.g., HTTP handlers), propagate them to ease future correlation.
- Tracing coverage is limited to admin HTTP and WebSocket connect spans; extend to blueprint fetches, proxy accept loops, and WireGuard updates when OTLP is enabled.
4) Roadmap (phased)
- Phase 1 (done in this iteration)
- Fix attribute filter (site_id, region)
- Propagate site_id (and optional region) across metrics
- Correct label semantics (transport vs protocol); fix sessions transport labelling
- Documentation alignment
- Phase 2 (next)
- Reconnect: add initiator label (client/server)
- Config & PKI: newt_config_apply_seconds{phase,result}; newt_cert_rotation_total{result}
- WebSocket disconnect and keepalive failure counters
- Proxy connection lifecycle metrics (accept totals, duration histogram)
- Pangolin blueprint/config fetch latency and status metrics
- Certificate rotation duration histogram to complement success/failure counter
5) Operational guidance
- Do not double scrape: scrape either Newt (/metrics) or the Collectors Prometheus exporter (not both) to avoid double-counting cumulative counters.
- For high cardinality tunnel_id, consider relabeling or dropping per-tunnel series in Prometheus to control cardinality.
- OTLP troubleshooting: enable TLS via OTEL_EXPORTER_OTLP_CERTIFICATE, use OTEL_EXPORTER_OTLP_HEADERS for auth; verify endpoint reachability.
6) Example alerts/recording rules (suggestions)
- Reconnect spikes:
- increase(newt_tunnel_reconnects_total[5m]) by (site_id)
- Sustained connection errors:
- rate(newt_connection_errors_total[5m]) by (site_id,transport,error_type)
- Heartbeat gaps:
- max_over_time(time() - newt_site_last_heartbeat_timestamp_seconds[15m]) by (site_id)
- Proxy drops:
- increase(newt_proxy_drops_total[5m]) by (site_id,protocol)
- WebSocket connect p95 (when added):
- histogram_quantile(0.95, sum(rate(newt_websocket_connect_latency_seconds_bucket[5m])) by (le,site_id))
7) Collector configuration
- Direct scrape variant requires no attribute promotion since site_id is already a metric label.
- Transform/promote variant remains optional for environments that rely on resource-to-label promotion.

View File

@@ -1,247 +0,0 @@
<!-- markdownlint-disable MD033 -->
# OpenTelemetry Observability for Newt
This document describes how Newt exposes metrics using the OpenTelemetry (OTel) Go SDK, how to enable Prometheus scraping, and how to send data to an OpenTelemetry Collector for further export.
Goals
- Provide a /metrics endpoint in Prometheus exposition format (via OTel Prometheus exporter)
- Keep metrics backend-agnostic; optional OTLP export to a Collector
- Use OTel semantic conventions where applicable and enforce SI units
- Low-cardinality, stable labels only
Enable via flags (ENV mirrors)
- --metrics (default: true) ↔ NEWT_METRICS_PROMETHEUS_ENABLED
- --metrics-admin-addr (default: 127.0.0.1:2112) ↔ NEWT_ADMIN_ADDR
- --otlp (default: false) ↔ NEWT_METRICS_OTLP_ENABLED
Enable exporters via environment variables (no code changes required)
- NEWT_METRICS_PROMETHEUS_ENABLED=true|false (default: true)
- NEWT_METRICS_OTLP_ENABLED=true|false (default: false)
- OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317
- OTEL_EXPORTER_OTLP_INSECURE=true|false (default: true for dev)
- OTEL_SERVICE_NAME=newt (default)
- OTEL_SERVICE_VERSION=<version>
- OTEL_RESOURCE_ATTRIBUTES=service.instance.id=<id>,site_id=<id>
- OTEL_METRIC_EXPORT_INTERVAL=15s (default)
- NEWT_ADMIN_ADDR=127.0.0.1:2112 (default admin HTTP with /metrics)
- NEWT_METRICS_INCLUDE_SITE_LABELS=true|false (default: true; disable to drop site_id/region as metric labels and rely on resource attributes only)
Runtime behavior
- When Prometheus exporter is enabled, Newt serves /metrics on NEWT_ADMIN_ADDR (default :2112)
- When OTLP is enabled, metrics and traces are exported to OTLP gRPC endpoint
- Go runtime metrics (goroutines, GC, memory) are exported automatically
Metric catalog (current)
Unless otherwise noted, `site_id` and `region` are available via resource attributes and, by default, as metric labels. Set `NEWT_METRICS_INCLUDE_SITE_LABELS=false` to drop them from counter/histogram label sets in high-cardinality environments.
| Metric | Instrument | Key attributes | Purpose | Example |
| --- | --- | --- | --- | --- |
| `newt_build_info` | Observable gauge (Int64) | `version`, `commit`, `site_id`, `region` (optional when site labels enabled) | Emits build metadata with value `1` for scrape-time verification. | `newt_build_info{version="1.5.0"} 1` |
| `newt_site_registrations_total` | Counter (Int64) | `result` (`success`/`failure`), `site_id`, `region` (optional) | Counts Pangolin registration attempts. | `newt_site_registrations_total{result="success",site_id="acme-edge-1"} 1` |
| `newt_site_online` | Observable gauge (Int64) | `site_id` | Reports whether the site is currently connected (`1`) or offline (`0`). | `newt_site_online{site_id="acme-edge-1"} 1` |
| `newt_site_last_heartbeat_timestamp_seconds` | Observable gauge (Float64) | `site_id` | Unix timestamp of the most recent Pangolin heartbeat (derive age via `time() - metric`). | `newt_site_last_heartbeat_timestamp_seconds{site_id="acme-edge-1"} 1.728e+09` |
| `newt_tunnel_sessions` | Observable gauge (Int64) | `site_id`, `tunnel_id` (when enabled) | Counts active tunnel sessions per peer; collapses to per-site when tunnel IDs are disabled. | `newt_tunnel_sessions{site_id="acme-edge-1",tunnel_id="wgpub..."} 3` |
| `newt_tunnel_bytes_total` | Counter (Int64) | `direction` (`ingress`/`egress`), `protocol` (`tcp`/`udp`), `tunnel_id` (optional), `site_id`, `region` (optional) | Measures proxied traffic volume across tunnels. | `newt_tunnel_bytes_total{direction="ingress",protocol="tcp",site_id="acme-edge-1"} 4096` |
| `newt_tunnel_latency_seconds` | Histogram (Float64) | `transport` (e.g., `wireguard`), `tunnel_id` (optional), `site_id`, `region` (optional) | Captures RTT or configuration-driven latency samples. | `newt_tunnel_latency_seconds_bucket{transport="wireguard",le="0.5"} 42` |
| `newt_tunnel_reconnects_total` | Counter (Int64) | `initiator` (`client`/`server`), `reason` (enumerated), `tunnel_id` (optional), `site_id`, `region` (optional) | Tracks reconnect causes for troubleshooting flaps. | `newt_tunnel_reconnects_total{initiator="client",reason="timeout",site_id="acme-edge-1"} 5` |
| `newt_connection_attempts_total` | Counter (Int64) | `transport` (`auth`/`websocket`), `result`, `site_id`, `region` (optional) | Measures control-plane dial attempts and their outcomes. | `newt_connection_attempts_total{transport="websocket",result="success",site_id="acme-edge-1"} 8` |
| `newt_connection_errors_total` | Counter (Int64) | `transport`, `error_type`, `site_id`, `region` (optional) | Buckets connection failures by normalized error class. | `newt_connection_errors_total{transport="websocket",error_type="tls_handshake",site_id="acme-edge-1"} 1` |
| `newt_config_reloads_total` | Counter (Int64) | `result`, `site_id`, `region` (optional) | Counts remote blueprint/config reloads. | `newt_config_reloads_total{result="success",site_id="acme-edge-1"} 3` |
| `process_start_time_seconds` | Observable gauge (Float64) | — | Unix timestamp of the Newt process start time (use `time() - process_start_time_seconds` for uptime). | `process_start_time_seconds 1.728e+09` |
| `newt_config_apply_seconds` | Histogram (Float64) | `phase` (`interface`/`peer`), `result`, `site_id`, `region` (optional) | Measures time spent applying WireGuard configuration phases. | `newt_config_apply_seconds_sum{phase="peer",result="success",site_id="acme-edge-1"} 0.48` |
| `newt_cert_rotation_total` | Counter (Int64) | `result`, `site_id`, `region` (optional) | Tracks client certificate rotation attempts. | `newt_cert_rotation_total{result="success",site_id="acme-edge-1"} 2` |
| `newt_websocket_connect_latency_seconds` | Histogram (Float64) | `transport="websocket"`, `result`, `error_type` (on failure), `site_id`, `region` (optional) | Measures WebSocket dial latency and exposes failure buckets. | `newt_websocket_connect_latency_seconds_bucket{result="success",le="0.5",site_id="acme-edge-1"} 9` |
| `newt_websocket_messages_total` | Counter (Int64) | `direction` (`in`/`out`), `msg_type` (`text`/`ping`/`pong`), `site_id`, `region` (optional) | Accounts for control WebSocket traffic volume by type. | `newt_websocket_messages_total{direction="out",msg_type="ping",site_id="acme-edge-1"} 12` |
| `newt_websocket_connected` | Observable gauge (Int64) | `site_id`, `region` (optional) | Reports current WebSocket connectivity (`1` when connected). | `newt_websocket_connected{site_id="acme-edge-1"} 1` |
| `newt_websocket_reconnects_total` | Counter (Int64) | `reason` (`tls_handshake`, `dial_timeout`, `io_error`, `ping_write`, `timeout`, etc.), `site_id`, `region` (optional) | Counts reconnect attempts with normalized reasons for failure analysis. | `newt_websocket_reconnects_total{reason="timeout",site_id="acme-edge-1"} 3` |
| `newt_proxy_active_connections` | Observable gauge (Int64) | `protocol` (`tcp`/`udp`), `direction` (`ingress`/`egress`), `tunnel_id` (optional), `site_id`, `region` (optional) | Current proxy connections per tunnel and protocol. | `newt_proxy_active_connections{protocol="tcp",direction="egress",site_id="acme-edge-1"} 4` |
| `newt_proxy_buffer_bytes` | Observable gauge (Int64) | `protocol`, `direction`, `tunnel_id` (optional), `site_id`, `region` (optional) | Volume of buffered data awaiting flush in proxy queues. | `newt_proxy_buffer_bytes{protocol="udp",direction="egress",site_id="acme-edge-1"} 2048` |
| `newt_proxy_async_backlog_bytes` | Observable gauge (Int64) | `protocol`, `direction`, `tunnel_id` (optional), `site_id`, `region` (optional) | Tracks async write backlog when deferred flushing is enabled. | `newt_proxy_async_backlog_bytes{protocol="tcp",direction="egress",site_id="acme-edge-1"} 512` |
| `newt_proxy_drops_total` | Counter (Int64) | `protocol`, `tunnel_id` (optional), `site_id`, `region` (optional) | Counts proxy drop events caused by downstream write errors. | `newt_proxy_drops_total{protocol="udp",site_id="acme-edge-1"} 1` |
| `newt_proxy_connections_total` | Counter (Int64) | `event` (`opened`/`closed`), `protocol`, `tunnel_id` (optional), `site_id`, `region` (optional) | Tracks proxy connection lifecycle events for rate/SLO calculations. | `newt_proxy_connections_total{event="opened",protocol="tcp",site_id="acme-edge-1"} 10` |
Conventions
- Durations in seconds (unit: s), names end with _seconds
- Sizes in bytes (unit: By), names end with _bytes
- Counters end with _total
- Labels must be low-cardinality and stable
Histogram buckets
- Latency (seconds): 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30
Local quickstart
1) Direct Prometheus scrape (do not also scrape the Collector)
NEWT_METRICS_PROMETHEUS_ENABLED=true \
NEWT_METRICS_OTLP_ENABLED=false \
NEWT_ADMIN_ADDR="127.0.0.1:2112" \
./newt
curl -s <http://localhost:2112/metrics> | grep ^newt_
2) Using the Collector (compose-style)
NEWT_METRICS_PROMETHEUS_ENABLED=true \
NEWT_METRICS_OTLP_ENABLED=true \
OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317 \
OTEL_EXPORTER_OTLP_INSECURE=true \
OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE=cumulative \
./newt
Collector config example: examples/otel-collector.yaml
Prometheus scrape config: examples/prometheus.yml
Adding new metrics
- Use helpers in internal/telemetry/metrics.go for counters/histograms
- Keep labels low-cardinality
- Add observable gauges through SetObservableCallback
Optional tracing
- When --otlp is enabled, you can wrap outbound HTTP clients with otelhttp.NewTransport to create spans for HTTP requests to Pangolin. This affects traces only and does not add metric labels.
OTLP TLS example
- Enable TLS to Collector with a custom CA and headers:
```sh
NEWT_METRICS_OTLP_ENABLED=true \
OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317 \
OTEL_EXPORTER_OTLP_INSECURE=false \
OTEL_EXPORTER_OTLP_CERTIFICATE=/etc/otel/custom-ca.pem \
OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer abc123,tenant=acme" \
./newt
```
Prometheus scrape strategy (choose one)
Important: Do not scrape both Newt (2112) and the Collectors Prometheus exporter (8889) at the same time for the same process. Doing so will double-count cumulative counters.
A) Scrape Newt directly:
```yaml
global:
scrape_interval: 15s
scrape_configs:
- job_name: newt
static_configs:
- targets: ["newt:2112"]
```
B) Scrape the Collectors Prometheus exporter:
```yaml
global:
scrape_interval: 15s
scrape_configs:
- job_name: otel-collector
static_configs:
- targets: ["otel-collector:8889"]
```
Reason mapping (source → reason)
- Server instructs reconnect/terminate → server_request
- Heartbeat/Ping threshold exceeded → timeout
- Peer closed connection gracefully → peer_close
- Route/Interface change detected → network_change
- Auth/token failure (HTTP 401/403) → auth_error
- TLS/WG handshake error → handshake_error
- Config reloaded/applied (causing reconnection) → config_change
- Other/unclassified errors → error
PromQL snippets
- Throughput in (5m):
```sh
sum(rate(newt_tunnel_bytes_total{direction="ingress"}[5m]))
```
- P95 latency (seconds):
```sh
histogram_quantile(0.95, sum(rate(newt_tunnel_latency_seconds_bucket[5m])) by (le))
```
- Active sessions:
```sh
sum(newt_tunnel_sessions)
```
Compatibility notes
- Gauges do not use the _total suffix (e.g., newt_tunnel_sessions).
- site_id/region remain resource attributes. Metric labels for these fields appear on per-site gauges (e.g., `newt_site_online`) and, by default, on counters/histograms; disable them with `NEWT_METRICS_INCLUDE_SITE_LABELS=false` if needed. `tunnel_id` is a metric label (WireGuard public key). Never expose secrets in labels.
- NEWT_METRICS_INCLUDE_TUNNEL_ID (default: true) toggles whether tunnel_id is included as a label on bytes/sessions/proxy/reconnect metrics. Disable in high-cardinality environments.
- Avoid double-scraping: scrape either Newt (/metrics) or the Collector's Prometheus exporter, not both.
- Prometheus does not accept remote_write; use Mimir/Cortex/VM/Thanos-Receive for remote_write.
- No free text in labels; use only the enumerated constants for reason, protocol (tcp|udp), and transport (e.g., websocket|wireguard).
Further reading
- See docs/METRICS_RECOMMENDATIONS.md for roadmap, label guidance (transport vs protocol), and example alerts.
Cardinality tips
- tunnel_id can grow in larger fleets. Use relabeling to drop or retain a subset, for example:
```yaml
# Drop all tunnel_id on bytes to reduce series
- source_labels: [__name__]
regex: newt_tunnel_bytes_total
action: keep
- action: labeldrop
regex: tunnel_id
# Or drop only high-churn tunnels
- source_labels: [tunnel_id]
regex: .*
action: drop
```
Quickstart: direkte Prometheus-Erfassung (empfohlen)
```sh
# Start (direkter /metrics-Scrape, keine Doppel-Erfassung)
docker compose -f docker-compose.metrics.yml up -d
# Smoke-Checks
./scripts/smoke-metrics.sh
# Tunnel-IDs ausblenden (optional):
# EXPECT_TUNNEL_ID=false NEWT_METRICS_INCLUDE_TUNNEL_ID=false ./scripts/smoke-metrics.sh
```
- Prometheus UI: <http://localhost:9090>
- Standard-Scrape-Intervall: 15s
- Kein OTLP aktiv (NEWT_METRICS_OTLP_ENABLED=false in docker-compose.metrics.yml)
Häufige PromQL-Schnelltests
```yaml
# Online-Status einer Site in den letzten 5 Minuten
max_over_time(newt_site_online{site_id="$site"}[5m])
# TCP egress-Bytes pro Site/Tunnel (10m)
sum by (site_id, tunnel_id) (increase(newt_tunnel_bytes_total{protocol="tcp",direction="egress"}[10m]))
# WebSocket-Connect P95
histogram_quantile(0.95, sum by (le, site_id) (rate(newt_websocket_connect_latency_seconds_bucket[5m])))
# Reconnects nach Initiator
increase(newt_tunnel_reconnects_total{site_id="$site"}[30m]) by (initiator, reason)
```
Troubleshooting
- curl :2112/metrics ensure endpoint is reachable and includes newt_* metrics
- Check Collector logs for OTLP connection issues
- Verify Prometheus Targets are UP and scraping Newt or Collector

View File

@@ -1,129 +0,0 @@
# Newt OpenTelemetry Review
## Overview
This document summarises the current OpenTelemetry (OTel) instrumentation in Newt, assesses
compliance with OTel guidelines, and lists concrete improvements to pursue before release.
It is based on the implementation in `internal/telemetry` and the call-sites that emit
metrics and traces across the code base.
## Current metric instrumentation
All instruments are registered in `internal/telemetry/metrics.go`. They are grouped
into site, tunnel, connection, configuration, build, WebSocket, and proxy domains.
A global attribute filter (see `buildMeterProvider`) constrains exposed label keys to
`site_id`, `region`, and a curated list of low-cardinality dimensions so that Prometheus
exports stay bounded.
- **Site lifecycle**: `newt_site_registrations_total`, `newt_site_online`, and
`newt_site_last_heartbeat_timestamp_seconds` capture registration attempts and liveness. They
are fed either manually (`IncSiteRegistration`) or via the `TelemetryView` state
callback that publishes observable gauges for the active site.
- **Tunnel health and usage**: Counters and histograms track bytes, latency, reconnects,
and active sessions per tunnel (`newt_tunnel_*` family). Attribute helpers respect
the `NEWT_METRICS_INCLUDE_TUNNEL_ID` toggle to keep cardinality manageable on larger
fleets.
- **Connection attempts**: `newt_connection_attempts_total` and
`newt_connection_errors_total` are emitted throughout the WebSocket client to classify
authentication, dial, and transport failures.
- **Operations/configuration**: `newt_config_reloads_total`,
`process_start_time_seconds`, `newt_config_apply_seconds`, and
`newt_cert_rotation_total` provide visibility into blueprint reloads, process boots,
configuration timings, and certificate rotation outcomes.
- **Build metadata**: `newt_build_info` records the binary version/commit together
with optional site metadata when build information is supplied at startup.
- **WebSocket control-plane**: `newt_websocket_connect_latency_seconds`,
`newt_websocket_messages_total`, `newt_websocket_connected`, and
`newt_websocket_reconnects_total` report connect latency, ping/pong/text activity,
connection state, and reconnect reasons.
- **Proxy data-plane**: Observable gauges (`newt_proxy_active_connections`,
`newt_proxy_buffer_bytes`, `newt_proxy_async_backlog_bytes`) plus counters for
drops, accepts, connection lifecycle events (`newt_proxy_connections_total`), and
duration histograms (`newt_proxy_connection_duration_seconds`) surface backlog,
drop behaviour, and churn alongside per-protocol byte counters.
Refer to `docs/observability.md` for a tabular catalogue with instrument types,
attributes, and sample exposition lines.
## Tracing coverage
Tracing is optional and enabled only when OTLP export is configured. When active:
- The admin HTTP mux is wrapped with `otelhttp.NewHandler`, producing spans for
`/metrics` and `/healthz` requests.
- The WebSocket dial path creates a `ws.connect` span around the gRPC-based handshake.
No other subsystems currently create spans, so data-plane operations, blueprint fetches,
Docker discovery, and WireGuard reconfiguration happen without trace context.
## Guideline & best-practice alignment
The implementation adheres to most OTel Go recommendations:
- **Naming & units** Every instrument follows the `newt_*` prefix with `_total`
suffixes for counters and `_seconds`/`_bytes` unit conventions. Histograms are
registered with explicit second-based buckets.
- **Resource attributes** Service name/version and optional `site_id`/`region`
populate the `resource.Resource`. Metric labels mirror these by default (and on
per-site gauges) but can be disabled with `NEWT_METRICS_INCLUDE_SITE_LABELS=false`
to avoid unnecessary cardinality growth.
- **Attribute hygiene** A single attribute filter (`sdkmetric.WithView`) enforces
the allow-list of label keys to prevent accidental high-cardinality emission.
- **Runtime metrics** Go runtime instrumentation is enabled automatically through
`runtime.Start`.
- **Configuration via environment** `telemetry.FromEnv` honours `OTEL_*` variables
alongside `NEWT_*` overrides so operators can configure exporters without code
changes.
- **Shutdown handling** `Setup.Shutdown` iterates exporters in reverse order to
flush buffers before process exit.
## Adjustments & improvements
The review identified a few actionable adjustments:
1. **Record registration failures** `newt_site_registrations_total` is currently
incremented only on success. Emit `result="failure"` samples whenever Pangolin
rejects a registration or credential exchange so operators can alert on churn.
2. **Surface config reload failures** `telemetry.IncConfigReload` is invoked with
`result="success"` only. Callers should record a failure result when blueprint
parsing or application aborts before success counters are incremented.
3. **Expose robust uptime** Document using `time() - process_start_time_seconds`
to derive uptime now that the restart counter has been replaced with a timestamp
gauge.
4. **Propagate contexts where available** Many emitters call metric helpers with
`context.Background()`. Passing real contexts (when inexpensive) would allow future
exporters to correlate spans and metrics.
5. **Extend tracing coverage** Instrument critical flows such as blueprint fetches,
WireGuard reconfiguration, proxy accept loops, and Docker discovery to provide end
to end visibility when OTLP tracing is enabled.
## Metrics to add before release
Prioritised additions that would close visibility gaps:
1. **Config reload error taxonomy** Split reload attempts into a dedicated
`newt_config_reload_errors_total{phase}` counter to make blueprint validation failures
visible alongside the existing success counter.
2. **Config source visibility** Export `newt_config_source_info{source,version}` so
operators can audit the active blueprint origin/commit during incidents.
3. **Certificate expiry** Emit `newt_cert_expiry_timestamp_seconds` (per cert) to
enable proactive alerts before mTLS credentials lapse.
4. **Blueprint/config pull latency** Measuring Pangolin blueprint fetch durations and
HTTP status distribution would expose slow control-plane operations.
5. **Tunnel setup latency** Histograms for DNS resolution and tunnel handshakes would
help correlate connect latency spikes with network dependencies.
These metrics rely on data that is already available in the code paths mentioned
above and would round out operational dashboards.
## Tracing wishlist
To benefit from tracing when OTLP is active, add spans around:
- Pangolin REST calls (wrap the HTTP client with `otelhttp.NewTransport`).
- Docker discovery cycles and target registration callbacks.
- WireGuard reconfiguration (interface bring-up, peer updates).
- Proxy dial/accept loops for both TCP and UDP targets.
Capturing these stages will let operators correlate latency spikes with reconnects
and proxy drops using distributed traces in addition to the metric signals.

10
go.mod
View File

@@ -3,7 +3,7 @@ module github.com/fosrl/newt
go 1.25
require (
github.com/docker/docker v28.5.0+incompatible
github.com/docker/docker v28.5.1+incompatible
github.com/google/gopacket v1.1.19
github.com/gorilla/websocket v1.5.3
github.com/prometheus/client_golang v1.23.2
@@ -18,12 +18,12 @@ require (
go.opentelemetry.io/otel/sdk v1.38.0
go.opentelemetry.io/otel/sdk/metric v1.38.0
golang.org/x/crypto v0.43.0
golang.org/x/net v0.45.0
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792
golang.org/x/net v0.46.0
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb
golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10
google.golang.org/grpc v1.76.0
gopkg.in/yaml.v3 v3.0.1
google.golang.org/grpc v1.76.0
gvisor.dev/gvisor v0.0.0-20250503011706-39ed1f5ac29c
software.sslmate.com/src/go-pkcs12 v0.6.0
)
@@ -75,10 +75,6 @@ require (
golang.org/x/text v0.30.0 // indirect
golang.org/x/time v0.12.0 // indirect
golang.org/x/tools v0.37.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect
go.opentelemetry.io/otel v1.37.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.36.0 // indirect
go.opentelemetry.io/otel/metric v1.37.0 // indirect
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 // indirect

55
go.sum
View File

@@ -1,5 +1,3 @@
github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg=
github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
github.com/Microsoft/go-winio v0.6.0 h1:slsWYD/zyx7lCXoZVlvQrj0hPTM1HI4+v1sIda2yDvg=
github.com/Microsoft/go-winio v0.6.0/go.mod h1:cTAf44im0RAYeL23bpB+fzCyDH2MJiz2BO69KH/soAE=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
@@ -12,20 +10,14 @@ github.com/containerd/errdefs v0.3.0 h1:FSZgGOeK4yuT/+DnF07/Olde/q4KBoMsaamhXxIM
github.com/containerd/errdefs v0.3.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M=
github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE=
github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk=
github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I=
github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk=
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
github.com/docker/docker v28.5.0+incompatible h1:ZdSQoRUE9XxhFI/B8YLvhnEFMmYN9Pp8Egd2qcaFk1E=
github.com/docker/docker v28.5.0+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk=
github.com/docker/docker v28.5.1+incompatible h1:Bm8DchhSD2J6PsFzxC35TZo4TLGR2PdW/E69rU45NhM=
github.com/docker/docker v28.5.1+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk=
github.com/docker/go-connections v0.6.0 h1:LlMG9azAe1TqfR7sO+NJttz1gy6KO7VJBh+pMmjSD94=
github.com/docker/go-connections v0.6.0/go.mod h1:AahvXYshr6JgfUJGdDCs2b5EZG/vmaMAntpSFH5BFKE=
github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw=
github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
@@ -33,8 +25,6 @@ github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg=
github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
@@ -51,31 +41,16 @@ github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnV
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs=
github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA=
github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w=
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/mdlayher/genetlink v1.3.2 h1:KdrNKe+CTu+IbZnm/GVUMXSqBBLqcGpRDa0xkQy56gw=
github.com/mdlayher/genetlink v1.3.2/go.mod h1:tcC3pkCrPUGIKKsCsp0B3AdaaKuHtaxoJRz3cc+528o=
github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g=
github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw=
github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos=
github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ=
github.com/mikioh/ipaddr v0.0.0-20190404000644-d465c8ab6721 h1:RlZweED6sbSArvlE924+mUcZuXKLBHA35U7LN621Bws=
github.com/mikioh/ipaddr v0.0.0-20190404000644-d465c8ab6721/go.mod h1:Ickgr2WtCLZ2MDGd4Gr0geeCH5HybhRJbonOgQpvSxc=
github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo=
github.com/moby/sys/atomicwriter v0.1.0 h1:kw5D/EqkBwsBFi0ss9v1VG3wIkVhzGvLklJ+w3A14Sw=
github.com/moby/sys/atomicwriter v0.1.0/go.mod h1:Ul8oqv2ZMNHOceF643P6FKPXeCmYtlQMvpizfsSoaWs=
github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU=
github.com/moby/sys/sequential v0.6.0/go.mod h1:uyv8EUTrca5PnDsdMGXhZe6CCe8U/UiTWd+lL+7b/Ko=
github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ=
github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc=
github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A=
github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
@@ -85,8 +60,6 @@ github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQ
github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
@@ -97,12 +70,6 @@ github.com/prometheus/otlptranslator v0.0.2 h1:+1CdeLVrRQ6Psmhnobldo0kTp96Rj80DR
github.com/prometheus/otlptranslator v0.0.2/go.mod h1:P8AwMgdD7XEr6QRUJ2QWLpiAZTgTE2UYgjlu3svompI=
github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0=
github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw=
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/vishvananda/netlink v1.3.1 h1:3AEMt62VKqz90r0tmNhog0r/PpWKmrEShJU0wJW6bV0=
github.com/vishvananda/netlink v1.3.1/go.mod h1:ARtKouGSTGchR8aMwmkzC0qiNPrrWO5JS/XMVl45+b4=
github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zdEY=
@@ -121,7 +88,6 @@ go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZF
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 h1:aTL7F04bJHUlztTsNGJ2l+6he8c+y/b//eR0jjjemT4=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0/go.mod h1:kldtb7jDTeol0l3ewcmd8SDvx3EmIE7lyvqbasU3QC4=
go.opentelemetry.io/otel/exporters/prometheus v0.60.0 h1:cGtQxGvZbnrWdC2GyjZi0PDKVSLWP/Jocix3QWfXtbo=
go.opentelemetry.io/otel/exporters/prometheus v0.60.0/go.mod h1:hkd1EekxNo69PTV4OWFGZcKQiIqg0RfuWExcPKFvepk=
@@ -135,24 +101,21 @@ go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJr
go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs=
go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4=
go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04=
golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0=
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc=
golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY=
golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
golang.org/x/mod v0.28.0 h1:gQBtGhjxykdjY9YhZpSlZIsbnaE2+PgjfLWUQTnoZ1U=
golang.org/x/mod v0.28.0/go.mod h1:yfB/L0NOf/kmEbXjzCPOx1iK1fRutOydrCMsqRhEBxI=
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 h1:R9PFI6EUdfVKgwKjZef7QIwGcBKu86OEFpJ9nUEP2l4=
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.45.0 h1:RLBg5JKixCy82FtLJpeNlVM0nrSqpCRYzVU1n8kj0tM=
golang.org/x/net v0.45.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY=
golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug=
golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
@@ -170,7 +133,6 @@ golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg=
golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.37.0 h1:DVSRzp7FwePZW356yEAChSdNcQo6Nsp+fex1SUW09lE=
golang.org/x/tools v0.37.0/go.mod h1:MBN5QPQtLMHVdvsbtarmTNukZDdgwdwlO5qGacAzF0w=
golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 h1:B82qJJgjvYKsXS9jeunTOisW56dUokqW/FOteYJJ/yg=
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2/go.mod h1:deeaetjYA+DHMHg+sMSMI58GrEteJUUzzw7en6TJQcI=
@@ -178,8 +140,6 @@ golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb h1:whnFRlWMcXI9d+Z
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb/go.mod h1:rpwXGsirqLqN2L0JDJQlwOboGHmptD5ZD6T2VmcqhTw=
golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10 h1:3GDAcqdIg1ozBNLgPy4SLT84nfcBjr6rhGtXYtrkWLU=
golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10/go.mod h1:T97yPqesLiNrOYxkwmhMI0ZIlJDm+p0PMR8eRVeR5tQ=
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 h1:BIRfGDEjiHRrk0QKZe3Xv2ieMhtgRGeLcZQ0mIVn4EY=
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5/go.mod h1:j3QtIyytwqGr1JUDtYXwtMXWPKsEa5LtzIFN1Wn5WvE=
google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 h1:eaY8u2EuxbRv7c3NiGK0/NedzVsCcV6hDuU5qPX5EGE=
@@ -188,14 +148,9 @@ google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A=
google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c=
google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
google.golang.org/genproto v0.0.0-20230920204549-e6e6cdab5c13 h1:vlzZttNJGVqTsRFU9AmdnrcO1Znh8Ew9kCD//yjigk0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gotest.tools/v3 v3.4.0 h1:ZazjZUfuVeZGLAmlKKuyv3IKP5orXcwtOwDQH6YVr6o=
gotest.tools/v3 v3.4.0/go.mod h1:CtbdzLSsqVhDgMtKsx03ird5YTGB3ar27v0u/yKBW5g=
gvisor.dev/gvisor v0.0.0-20250503011706-39ed1f5ac29c h1:m/r7OM+Y2Ty1sgBQ7Qb27VgIMBW8ZZhT4gLnUyDIhzI=
gvisor.dev/gvisor v0.0.0-20250503011706-39ed1f5ac29c/go.mod h1:3r5CMtNQMKIvBlrmM9xWUNamjKBYPOWyXOjmg5Kts3g=
software.sslmate.com/src/go-pkcs12 v0.6.0 h1:f3sQittAeF+pao32Vb+mkli+ZyT+VwKaD014qFGq6oU=

View File

@@ -1,7 +0,0 @@
newt_connection_attempts_total
newt_websocket_connected
newt_websocket_reconnects_total
newt_proxy_connections_total
newt_build_info
process_start_time_seconds

10
main.go
View File

@@ -142,8 +142,8 @@ var (
adminAddr string
region string
metricsAsyncBytes bool
blueprintFile string
noCloud bool
blueprintFile string
noCloud bool
// New mTLS configuration variables
tlsClientCert string
@@ -169,10 +169,6 @@ func main() {
updownScript = os.Getenv("UPDOWN_SCRIPT")
interfaceName = os.Getenv("INTERFACE")
generateAndSaveKeyTo = os.Getenv("GENERATE_AND_SAVE_KEY_TO")
keepInterfaceEnv := os.Getenv("KEEP_INTERFACE")
acceptClientsEnv := os.Getenv("ACCEPT_CLIENTS")
useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE")
enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT")
// Metrics/observability env mirrors
metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED")
@@ -181,6 +177,7 @@ func main() {
regionEnv := os.Getenv("NEWT_REGION")
asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES")
keepInterfaceEnv := os.Getenv("KEEP_INTERFACE")
keepInterface = keepInterfaceEnv == "true"
acceptClientsEnv := os.Getenv("ACCEPT_CLIENTS")
acceptClients = acceptClientsEnv == "true"
@@ -394,6 +391,7 @@ func main() {
}
if tel != nil {
// Admin HTTP server (exposes /metrics when Prometheus exporter is enabled)
logger.Info("Starting metrics server on %s", tcfg.AdminAddr)
mux := http.NewServeMux()
mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) })
if tel.PrometheusHandler != nil {

View File

@@ -1,802 +0,0 @@
diff --git a/Dockerfile b/Dockerfile
index b9c4d29..b9b6dea 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -22,6 +22,9 @@ RUN apk --no-cache add ca-certificates tzdata
COPY --from=builder /newt /usr/local/bin/
COPY entrypoint.sh /
+# Admin/metrics endpoint (Prometheus scrape)
+EXPOSE 2112
+
RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
-CMD ["newt"]
\ No newline at end of file
+CMD ["newt"]
diff --git a/go.mod b/go.mod
index d475835..5909955 100644
--- a/go.mod
+++ b/go.mod
@@ -7,6 +7,14 @@ require (
github.com/google/gopacket v1.1.19
github.com/gorilla/websocket v1.5.3
github.com/vishvananda/netlink v1.3.1
+ go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0
+ go.opentelemetry.io/contrib/instrumentation/runtime v0.62.0
+ go.opentelemetry.io/otel v1.37.0
+ go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.37.0
+ go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.37.0
+ go.opentelemetry.io/otel/sdk/metric v1.37.0
+ go.opentelemetry.io/otel/sdk/trace v1.37.0
+ go.opentelemetry.io/otel/semconv v1.26.0
golang.org/x/crypto v0.42.0
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792
golang.org/x/net v0.44.0
diff --git a/main.go b/main.go
index 12849b1..c223b75 100644
--- a/main.go
+++ b/main.go
@@ -1,7 +1,9 @@
package main
import (
+ "context"
"encoding/json"
+ "errors"
"flag"
"fmt"
"net"
@@ -22,6 +24,9 @@ import (
"github.com/fosrl/newt/updates"
"github.com/fosrl/newt/websocket"
+ "github.com/fosrl/newt/internal/state"
+ "github.com/fosrl/newt/internal/telemetry"
+ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
"golang.zx2c4.com/wireguard/conn"
"golang.zx2c4.com/wireguard/device"
"golang.zx2c4.com/wireguard/tun"
@@ -116,6 +121,13 @@ var (
healthMonitor *healthcheck.Monitor
enforceHealthcheckCert bool
+ // Observability/metrics flags
+ metricsEnabled bool
+ otlpEnabled bool
+ adminAddr string
+ region string
+ metricsAsyncBytes bool
+
// New mTLS configuration variables
tlsClientCert string
tlsClientKey string
@@ -126,6 +138,10 @@ var (
)
func main() {
+ // Prepare context for graceful shutdown and signal handling
+ ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
+ defer stop()
+
// if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values
endpoint = os.Getenv("PANGOLIN_ENDPOINT")
id = os.Getenv("NEWT_ID")
@@ -141,6 +157,13 @@ func main() {
useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE")
enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT")
+ // Metrics/observability env mirrors
+ metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED")
+ otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED")
+ adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR")
+ regionEnv := os.Getenv("NEWT_REGION")
+ asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES")
+
keepInterface = keepInterfaceEnv == "true"
acceptClients = acceptClientsEnv == "true"
useNativeInterface = useNativeInterfaceEnv == "true"
@@ -272,6 +295,35 @@ func main() {
flag.StringVar(&healthFile, "health-file", "", "Path to health file (if unset, health file won't be written)")
}
+ // Metrics/observability flags (mirror ENV if unset)
+ if metricsEnabledEnv == "" {
+ flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter")
+ } else {
+ if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil { metricsEnabled = v } else { metricsEnabled = true }
+ }
+ if otlpEnabledEnv == "" {
+ flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT")
+ } else {
+ if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil { otlpEnabled = v }
+ }
+ if adminAddrEnv == "" {
+ flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address")
+ } else {
+ adminAddr = adminAddrEnv
+ }
+ // Async bytes toggle
+ if asyncBytesEnv == "" {
+ flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)")
+ } else {
+ if v, err := strconv.ParseBool(asyncBytesEnv); err == nil { metricsAsyncBytes = v }
+ }
+ // Optional region flag (resource attribute)
+ if regionEnv == "" {
+ flag.StringVar(&region, "region", "", "Optional region resource attribute (also NEWT_REGION)")
+ } else {
+ region = regionEnv
+ }
+
// do a --version check
version := flag.Bool("version", false, "Print the version")
@@ -286,6 +338,50 @@ func main() {
loggerLevel := parseLogLevel(logLevel)
logger.GetLogger().SetLevel(parseLogLevel(logLevel))
+ // Initialize telemetry after flags are parsed (so flags override env)
+ tcfg := telemetry.FromEnv()
+ tcfg.PromEnabled = metricsEnabled
+ tcfg.OTLPEnabled = otlpEnabled
+ if adminAddr != "" { tcfg.AdminAddr = adminAddr }
+ // Resource attributes (if available)
+ tcfg.SiteID = id
+ tcfg.Region = region
+ // Build info
+ tcfg.BuildVersion = newtVersion
+ tcfg.BuildCommit = os.Getenv("NEWT_COMMIT")
+
+ tel, telErr := telemetry.Init(ctx, tcfg)
+ if telErr != nil {
+ logger.Warn("Telemetry init failed: %v", telErr)
+ }
+ if tel != nil {
+ // Admin HTTP server (exposes /metrics when Prometheus exporter is enabled)
+ mux := http.NewServeMux()
+ mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) })
+ if tel.PrometheusHandler != nil {
+ mux.Handle("/metrics", tel.PrometheusHandler)
+ }
+ admin := &http.Server{
+ Addr: tcfg.AdminAddr,
+ Handler: otelhttp.NewHandler(mux, "newt-admin"),
+ ReadTimeout: 5 * time.Second,
+ WriteTimeout: 10 * time.Second,
+ ReadHeaderTimeout: 5 * time.Second,
+ IdleTimeout: 30 * time.Second,
+ }
+ go func() {
+ if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
+ logger.Warn("admin http error: %v", err)
+ }
+ }()
+ defer func() {
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+ _ = admin.Shutdown(ctx)
+ }()
+ defer func() { _ = tel.Shutdown(context.Background()) }()
+ }
+
newtVersion := "version_replaceme"
if *version {
fmt.Println("Newt version " + newtVersion)
@@ -557,7 +653,10 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
}
// Use reliable ping for initial connection test
logger.Debug("Testing initial connection with reliable ping...")
- _, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
+ lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
+ if err == nil && wgData.PublicKey != "" {
+ telemetry.ObserveTunnelLatency(context.Background(), "", wgData.PublicKey, "wireguard", lat.Seconds())
+ }
if err != nil {
logger.Warn("Initial reliable ping failed, but continuing: %v", err)
} else {
@@ -570,14 +669,20 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
// as the pings will continue in the background
if !connected {
logger.Debug("Starting ping check")
- pingStopChan = startPingCheck(tnet, wgData.ServerIP, client)
+ pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey)
}
// Create proxy manager
pm = proxy.NewProxyManager(tnet)
+ pm.SetAsyncBytes(metricsAsyncBytes)
+ // Set tunnel_id for metrics (WireGuard peer public key)
+ pm.SetTunnelID(wgData.PublicKey)
connected = true
+ // telemetry: record a successful site registration (omit region unless available)
+ telemetry.IncSiteRegistration(context.Background(), id, "", "success")
+
// add the targets if there are any
if len(wgData.Targets.TCP) > 0 {
updateTargets(pm, "add", wgData.TunnelIP, "tcp", TargetData{Targets: wgData.Targets.TCP})
@@ -611,10 +716,25 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) {
logger.Info("Received reconnect message")
+ if wgData.PublicKey != "" {
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
+ }
// Close the WireGuard device and TUN
closeWgTunnel()
+ // Clear metrics attrs and sessions for the tunnel
+ if pm != nil {
+ pm.ClearTunnelID()
+ state.Global().ClearTunnel(wgData.PublicKey)
+ }
+
+ // Clear metrics attrs and sessions for the tunnel
+ if pm != nil {
+ pm.ClearTunnelID()
+ state.Global().ClearTunnel(wgData.PublicKey)
+ }
+
// Mark as disconnected
connected = false
@@ -631,6 +751,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) {
logger.Info("Received termination message")
+ if wgData.PublicKey != "" {
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
+ }
// Close the WireGuard device and TUN
closeWgTunnel()
diff --git a/proxy/manager.go b/proxy/manager.go
index bf10322..86c47a8 100644
--- a/proxy/manager.go
+++ b/proxy/manager.go
@@ -1,16 +1,22 @@
package proxy
import (
+ "context"
"fmt"
"io"
"net"
+ "os"
"strings"
"sync"
+ "sync/atomic"
"time"
+ "github.com/fosrl/newt/internal/state"
+ "github.com/fosrl/newt/internal/telemetry"
"github.com/fosrl/newt/logger"
"golang.zx2c4.com/wireguard/tun/netstack"
"gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"
+ "go.opentelemetry.io/otel/attribute"
)
// Target represents a proxy target with its address and port
@@ -28,6 +34,52 @@ type ProxyManager struct {
udpConns []*gonet.UDPConn
running bool
mutex sync.RWMutex
+
+ // telemetry (multi-tunnel)
+ currentTunnelID string
+ tunnels map[string]*tunnelEntry
+ asyncBytes bool
+ flushStop chan struct{}
+}
+
+// tunnelEntry holds per-tunnel attributes and (optional) async counters.
+type tunnelEntry struct {
+ attrInTCP attribute.Set
+ attrOutTCP attribute.Set
+ attrInUDP attribute.Set
+ attrOutUDP attribute.Set
+
+ bytesInTCP atomic.Uint64
+ bytesOutTCP atomic.Uint64
+ bytesInUDP atomic.Uint64
+ bytesOutUDP atomic.Uint64
+}
+
+// countingWriter wraps an io.Writer and adds bytes to OTel counter using a pre-built attribute set.
+type countingWriter struct {
+ ctx context.Context
+ w io.Writer
+ set attribute.Set
+ pm *ProxyManager
+ ent *tunnelEntry
+ out bool // false=in, true=out
+ proto string // "tcp" or "udp"
+}
+
+func (cw *countingWriter) Write(p []byte) (int, error) {
+ n, err := cw.w.Write(p)
+ if n > 0 {
+ if cw.pm != nil && cw.pm.asyncBytes && cw.ent != nil {
+ if cw.proto == "tcp" {
+ if cw.out { cw.ent.bytesOutTCP.Add(uint64(n)) } else { cw.ent.bytesInTCP.Add(uint64(n)) }
+ } else if cw.proto == "udp" {
+ if cw.out { cw.ent.bytesOutUDP.Add(uint64(n)) } else { cw.ent.bytesInUDP.Add(uint64(n)) }
+ }
+ } else {
+ telemetry.AddTunnelBytesSet(cw.ctx, int64(n), cw.set)
+ }
+ }
+ return n, err
}
// NewProxyManager creates a new proxy manager instance
@@ -38,9 +90,46 @@ func NewProxyManager(tnet *netstack.Net) *ProxyManager {
udpTargets: make(map[string]map[int]string),
listeners: make([]*gonet.TCPListener, 0),
udpConns: make([]*gonet.UDPConn, 0),
+ tunnels: make(map[string]*tunnelEntry),
}
}
+// SetTunnelID sets the WireGuard peer public key used as tunnel_id label.
+func (pm *ProxyManager) SetTunnelID(id string) {
+ pm.mutex.Lock()
+ defer pm.mutex.Unlock()
+ pm.currentTunnelID = id
+ if _, ok := pm.tunnels[id]; !ok {
+ pm.tunnels[id] = &tunnelEntry{}
+ }
+ e := pm.tunnels[id]
+ e.attrInTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "tcp"))
+ e.attrOutTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "tcp"))
+ e.attrInUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "udp"))
+ e.attrOutUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "udp"))
+}
+
+// ClearTunnelID clears cached attribute sets for the current tunnel.
+func (pm *ProxyManager) ClearTunnelID() {
+ pm.mutex.Lock()
+ defer pm.mutex.Unlock()
+ id := pm.currentTunnelID
+ if id == "" { return }
+ if e, ok := pm.tunnels[id]; ok {
+ // final flush for this tunnel
+ inTCP := e.bytesInTCP.Swap(0)
+ outTCP := e.bytesOutTCP.Swap(0)
+ inUDP := e.bytesInUDP.Swap(0)
+ outUDP := e.bytesOutUDP.Swap(0)
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
+ delete(pm.tunnels, id)
+ }
+ pm.currentTunnelID = ""
+}
+
// init function without tnet
func NewProxyManagerWithoutTNet() *ProxyManager {
return &ProxyManager{
@@ -160,6 +249,57 @@ func (pm *ProxyManager) Start() error {
return nil
}
+func (pm *ProxyManager) SetAsyncBytes(b bool) {
+ pm.mutex.Lock()
+ defer pm.mutex.Unlock()
+ pm.asyncBytes = b
+ if b && pm.flushStop == nil {
+ pm.flushStop = make(chan struct{})
+ go pm.flushLoop()
+ }
+}
+func (pm *ProxyManager) flushLoop() {
+ flushInterval := 2 * time.Second
+ if v := os.Getenv("OTEL_METRIC_EXPORT_INTERVAL"); v != "" {
+ if d, err := time.ParseDuration(v); err == nil && d > 0 {
+ if d/2 < flushInterval { flushInterval = d / 2 }
+ }
+ }
+ ticker := time.NewTicker(flushInterval)
+ defer ticker.Stop()
+ for {
+ select {
+ case <-ticker.C:
+ pm.mutex.RLock()
+ for _, e := range pm.tunnels {
+ inTCP := e.bytesInTCP.Swap(0)
+ outTCP := e.bytesOutTCP.Swap(0)
+ inUDP := e.bytesInUDP.Swap(0)
+ outUDP := e.bytesOutUDP.Swap(0)
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
+ }
+ pm.mutex.RUnlock()
+ case <-pm.flushStop:
+ pm.mutex.RLock()
+ for _, e := range pm.tunnels {
+ inTCP := e.bytesInTCP.Swap(0)
+ outTCP := e.bytesOutTCP.Swap(0)
+ inUDP := e.bytesInUDP.Swap(0)
+ outUDP := e.bytesOutUDP.Swap(0)
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
+ }
+ pm.mutex.RUnlock()
+ return
+ }
+ }
+}
+
func (pm *ProxyManager) Stop() error {
pm.mutex.Lock()
defer pm.mutex.Unlock()
@@ -236,6 +376,14 @@ func (pm *ProxyManager) startTarget(proto, listenIP string, port int, targetAddr
return nil
}
+// getEntry returns per-tunnel entry or nil.
+func (pm *ProxyManager) getEntry(id string) *tunnelEntry {
+ pm.mutex.RLock()
+ e := pm.tunnels[id]
+ pm.mutex.RUnlock()
+ return e
+}
+
func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) {
for {
conn, err := listener.Accept()
@@ -257,6 +405,9 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string)
continue
}
+// Count sessions only once per accepted TCP connection
+ if pm.tunnelID != "" { state.Global().IncSessions(pm.tunnelID) }
+
go func() {
target, err := net.Dial("tcp", targetAddr)
if err != nil {
@@ -265,24 +416,33 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string)
return
}
+ // already incremented on accept
+
// Create a WaitGroup to ensure both copy operations complete
var wg sync.WaitGroup
wg.Add(2)
+ // client -> target (direction=in)
go func() {
defer wg.Done()
- io.Copy(target, conn)
- target.Close()
+e := pm.getEntry(pm.currentTunnelID)
+cw := &countingWriter{ctx: context.Background(), w: target, set: e.attrInTCP, pm: pm, ent: e, out: false, proto: "tcp"}
+ _, _ = io.Copy(cw, conn)
+ _ = target.Close()
}()
+ // target -> client (direction=out)
go func() {
defer wg.Done()
- io.Copy(conn, target)
- conn.Close()
+e := pm.getEntry(pm.currentTunnelID)
+cw := &countingWriter{ctx: context.Background(), w: conn, set: e.attrOutTCP, pm: pm, ent: e, out: true, proto: "tcp"}
+ _, _ = io.Copy(cw, target)
+ _ = conn.Close()
}()
- // Wait for both copies to complete
+ // Wait for both copies to complete then session -1
wg.Wait()
+ if pm.tunnelID != "" { state.Global().DecSessions(pm.tunnelID) }
}()
}
}
@@ -326,6 +486,14 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
}
clientKey := remoteAddr.String()
+ // bytes from client -> target (direction=in)
+if pm.currentTunnelID != "" && n > 0 {
+if pm.asyncBytes {
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(n)) }
+ } else {
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrInUDP) }
+ }
+ }
clientsMutex.RLock()
targetConn, exists := clientConns[clientKey]
clientsMutex.RUnlock()
@@ -366,6 +534,15 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
return // defer will handle cleanup
}
+ // bytes from target -> client (direction=out)
+ if pm.currentTunnelID != "" && n > 0 {
+ if pm.asyncBytes {
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesOutUDP.Add(uint64(n)) }
+ } else {
+if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrOutUDP) }
+ }
+ }
+
_, err = conn.WriteTo(buffer[:n], remoteAddr)
if err != nil {
logger.Error("Error writing to client: %v", err)
@@ -375,13 +552,19 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
}(clientKey, targetConn, remoteAddr)
}
- _, err = targetConn.Write(buffer[:n])
+ written, err := targetConn.Write(buffer[:n])
if err != nil {
logger.Error("Error writing to target: %v", err)
targetConn.Close()
clientsMutex.Lock()
delete(clientConns, clientKey)
clientsMutex.Unlock()
+} else if pm.currentTunnelID != "" && written > 0 {
+ if pm.asyncBytes {
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(written)) }
+ } else {
+if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(written), e.attrInUDP) }
+ }
}
}
}
diff --git a/util.go b/util.go
index 7d6da4f..c1f4915 100644
--- a/util.go
+++ b/util.go
@@ -17,6 +17,7 @@ import (
"github.com/fosrl/newt/logger"
"github.com/fosrl/newt/proxy"
"github.com/fosrl/newt/websocket"
+ "github.com/fosrl/newt/internal/telemetry"
"golang.org/x/net/icmp"
"golang.org/x/net/ipv4"
"golang.zx2c4.com/wireguard/device"
@@ -229,7 +230,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background")
}
-func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client) chan struct{} {
+func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
maxInterval := 6 * time.Second
currentInterval := pingInterval
consecutiveFailures := 0
@@ -292,6 +293,9 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
if !connectionLost {
connectionLost = true
logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
+ if tunnelID != "" {
+ telemetry.IncReconnect(context.Background(), "", tunnelID, telemetry.ReasonTimeout)
+ }
stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second)
// Send registration message to the server for backward compatibility
err := client.SendMessage("newt/wg/register", map[string]interface{}{
@@ -318,6 +322,10 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
} else {
// Track recent latencies
recentLatencies = append(recentLatencies, latency)
+ // Record tunnel latency (limit sampling to this periodic check)
+ if tunnelID != "" {
+ telemetry.ObserveTunnelLatency(context.Background(), "", tunnelID, "wireguard", latency.Seconds())
+ }
if len(recentLatencies) > 10 {
recentLatencies = recentLatencies[1:]
}
diff --git a/websocket/client.go b/websocket/client.go
index 0c0664a..c9ac264 100644
--- a/websocket/client.go
+++ b/websocket/client.go
@@ -18,6 +18,10 @@ import (
"github.com/fosrl/newt/logger"
"github.com/gorilla/websocket"
+
+ "context"
+ "github.com/fosrl/newt/internal/telemetry"
+ "go.opentelemetry.io/otel"
)
type Client struct {
@@ -287,6 +291,7 @@ func (c *Client) getToken() (string, error) {
}
resp, err := client.Do(req)
if err != nil {
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", classifyConnError(err))
return "", fmt.Errorf("failed to request new token: %w", err)
}
defer resp.Body.Close()
@@ -294,6 +299,18 @@ func (c *Client) getToken() (string, error) {
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(resp.Body)
logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "failure")
+ bin := "http_other"
+ if resp.StatusCode >= 500 {
+ bin = "http_5xx"
+ } else if resp.StatusCode >= 400 {
+ bin = "http_4xx"
+ }
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", bin)
+ // Reconnect reason mapping for auth failures
+ if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonAuthError)
+ }
return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
}
@@ -312,10 +329,33 @@ func (c *Client) getToken() (string, error) {
}
logger.Debug("Received token: %s", tokenResp.Data.Token)
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "success")
return tokenResp.Data.Token, nil
}
+// classifyConnError maps common errors to low-cardinality error_type labels
+func classifyConnError(err error) string {
+ if err == nil {
+ return ""
+ }
+ msg := strings.ToLower(err.Error())
+ switch {
+ case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"):
+ return "tls"
+ case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout"):
+ return "timeout"
+ case strings.Contains(msg, "no such host") || strings.Contains(msg, "dns"):
+ return "dns"
+ case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"):
+ return "auth"
+ case strings.Contains(msg, "broken pipe") || strings.Contains(msg, "connection reset") || strings.Contains(msg, "connection refused") || strings.Contains(msg, "use of closed network connection") || strings.Contains(msg, "network is unreachable"):
+ return "io"
+ default:
+ return "other"
+ }
+}
+
func (c *Client) connectWithRetry() {
for {
select {
@@ -337,6 +377,10 @@ func (c *Client) establishConnection() error {
// Get token for authentication
token, err := c.getToken()
if err != nil {
+ // telemetry: connection attempt failed before dialing
+ // site_id isn't globally available here; use client ID as site_id (low cardinality)
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", classifyConnError(err))
return fmt.Errorf("failed to get token: %w", err)
}
@@ -369,7 +413,11 @@ func (c *Client) establishConnection() error {
q.Set("clientType", c.clientType)
u.RawQuery = q.Encode()
- // Connect to WebSocket
+ // Connect to WebSocket (optional span)
+ tr := otel.Tracer("newt")
+ spanCtx, span := tr.Start(context.Background(), "ws.connect")
+ defer span.End()
+
dialer := websocket.DefaultDialer
// Use new TLS configuration method
@@ -391,11 +439,23 @@ func (c *Client) establishConnection() error {
logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable")
}
- conn, _, err := dialer.Dial(u.String(), nil)
+conn, _, err := dialer.DialContext(spanCtx, u.String(), nil)
if err != nil {
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
+ etype := classifyConnError(err)
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", etype)
+ // Map handshake-related errors to reconnect reasons where appropriate
+ if etype == "tls" {
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonHandshakeError)
+ } else if etype == "timeout" {
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonTimeout)
+ } else {
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonError)
+ }
return fmt.Errorf("failed to connect to WebSocket: %w", err)
}
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "success")
c.conn = conn
c.setConnected(true)
diff --git a/wg/wg.go b/wg/wg.go
index 3cee1a9..a765279 100644
--- a/wg/wg.go
+++ b/wg/wg.go
@@ -3,6 +3,7 @@
package wg
import (
+ "context"
"encoding/json"
"errors"
"fmt"
@@ -23,6 +24,8 @@ import (
"golang.zx2c4.com/wireguard/conn"
"golang.zx2c4.com/wireguard/wgctrl"
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
+
+ "github.com/fosrl/newt/internal/telemetry"
)
type WgConfig struct {
@@ -298,6 +301,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) {
s.stopGetConfig = nil
}
+ // telemetry: config reload success
+ telemetry.IncConfigReload(context.Background(), "success")
+ // Optional reconnect reason mapping: config change
+ if s.serverPubKey != "" {
+ telemetry.IncReconnect(context.Background(), "", s.serverPubKey, telemetry.ReasonConfigChange)
+ }
+
// Ensure the WireGuard interface and peers are configured
if err := s.ensureWireguardInterface(config); err != nil {
logger.Error("Failed to ensure WireGuard interface: %v", err)
diff --git a/wgnetstack/wgnetstack.go b/wgnetstack/wgnetstack.go
index 6684c40..09f160e 100644
--- a/wgnetstack/wgnetstack.go
+++ b/wgnetstack/wgnetstack.go
@@ -1,6 +1,7 @@
package wgnetstack
import (
+ "context"
"crypto/rand"
"encoding/base64"
"encoding/hex"
@@ -26,6 +27,8 @@ import (
"golang.zx2c4.com/wireguard/tun"
"golang.zx2c4.com/wireguard/tun/netstack"
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
+
+ "github.com/fosrl/newt/internal/telemetry"
)
type WgConfig struct {
@@ -240,14 +243,20 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str
return service, nil
}
+// ReportRTT allows reporting native RTTs to telemetry, rate-limited externally.
+func (s *WireGuardService) ReportRTT(seconds float64) {
+ if s.serverPubKey == "" { return }
+ telemetry.ObserveTunnelLatency(context.Background(), "", s.serverPubKey, "wireguard", seconds)
+}
+
func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) {
logger.Debug("Received: %+v", msg)
// if there is no wgData or pm, we can't add targets
if s.TunnelIP == "" || s.proxyManager == nil {
logger.Info("No tunnel IP or proxy manager available")
- return
- }
+ return
+}
targetData, err := parseTargetData(msg.Data)
if err != nil {

View File

@@ -1,301 +0,0 @@
diff --git a/proxy/manager.go b/proxy/manager.go
index bf10322..86c47a8 100644
--- a/proxy/manager.go
+++ b/proxy/manager.go
@@ -1,16 +1,22 @@
package proxy
import (
+ "context"
"fmt"
"io"
"net"
+ "os"
"strings"
"sync"
+ "sync/atomic"
"time"
+ "github.com/fosrl/newt/internal/state"
+ "github.com/fosrl/newt/internal/telemetry"
"github.com/fosrl/newt/logger"
"golang.zx2c4.com/wireguard/tun/netstack"
"gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"
+ "go.opentelemetry.io/otel/attribute"
)
// Target represents a proxy target with its address and port
@@ -28,6 +34,52 @@ type ProxyManager struct {
udpConns []*gonet.UDPConn
running bool
mutex sync.RWMutex
+
+ // telemetry (multi-tunnel)
+ currentTunnelID string
+ tunnels map[string]*tunnelEntry
+ asyncBytes bool
+ flushStop chan struct{}
+}
+
+// tunnelEntry holds per-tunnel attributes and (optional) async counters.
+type tunnelEntry struct {
+ attrInTCP attribute.Set
+ attrOutTCP attribute.Set
+ attrInUDP attribute.Set
+ attrOutUDP attribute.Set
+
+ bytesInTCP atomic.Uint64
+ bytesOutTCP atomic.Uint64
+ bytesInUDP atomic.Uint64
+ bytesOutUDP atomic.Uint64
+}
+
+// countingWriter wraps an io.Writer and adds bytes to OTel counter using a pre-built attribute set.
+type countingWriter struct {
+ ctx context.Context
+ w io.Writer
+ set attribute.Set
+ pm *ProxyManager
+ ent *tunnelEntry
+ out bool // false=in, true=out
+ proto string // "tcp" or "udp"
+}
+
+func (cw *countingWriter) Write(p []byte) (int, error) {
+ n, err := cw.w.Write(p)
+ if n > 0 {
+ if cw.pm != nil && cw.pm.asyncBytes && cw.ent != nil {
+ if cw.proto == "tcp" {
+ if cw.out { cw.ent.bytesOutTCP.Add(uint64(n)) } else { cw.ent.bytesInTCP.Add(uint64(n)) }
+ } else if cw.proto == "udp" {
+ if cw.out { cw.ent.bytesOutUDP.Add(uint64(n)) } else { cw.ent.bytesInUDP.Add(uint64(n)) }
+ }
+ } else {
+ telemetry.AddTunnelBytesSet(cw.ctx, int64(n), cw.set)
+ }
+ }
+ return n, err
}
// NewProxyManager creates a new proxy manager instance
@@ -38,9 +90,46 @@ func NewProxyManager(tnet *netstack.Net) *ProxyManager {
udpTargets: make(map[string]map[int]string),
listeners: make([]*gonet.TCPListener, 0),
udpConns: make([]*gonet.UDPConn, 0),
+ tunnels: make(map[string]*tunnelEntry),
}
}
+// SetTunnelID sets the WireGuard peer public key used as tunnel_id label.
+func (pm *ProxyManager) SetTunnelID(id string) {
+ pm.mutex.Lock()
+ defer pm.mutex.Unlock()
+ pm.currentTunnelID = id
+ if _, ok := pm.tunnels[id]; !ok {
+ pm.tunnels[id] = &tunnelEntry{}
+ }
+ e := pm.tunnels[id]
+ e.attrInTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "tcp"))
+ e.attrOutTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "tcp"))
+ e.attrInUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "udp"))
+ e.attrOutUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "udp"))
+}
+
+// ClearTunnelID clears cached attribute sets for the current tunnel.
+func (pm *ProxyManager) ClearTunnelID() {
+ pm.mutex.Lock()
+ defer pm.mutex.Unlock()
+ id := pm.currentTunnelID
+ if id == "" { return }
+ if e, ok := pm.tunnels[id]; ok {
+ // final flush for this tunnel
+ inTCP := e.bytesInTCP.Swap(0)
+ outTCP := e.bytesOutTCP.Swap(0)
+ inUDP := e.bytesInUDP.Swap(0)
+ outUDP := e.bytesOutUDP.Swap(0)
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
+ delete(pm.tunnels, id)
+ }
+ pm.currentTunnelID = ""
+}
+
// init function without tnet
func NewProxyManagerWithoutTNet() *ProxyManager {
return &ProxyManager{
@@ -160,6 +249,57 @@ func (pm *ProxyManager) Start() error {
return nil
}
+func (pm *ProxyManager) SetAsyncBytes(b bool) {
+ pm.mutex.Lock()
+ defer pm.mutex.Unlock()
+ pm.asyncBytes = b
+ if b && pm.flushStop == nil {
+ pm.flushStop = make(chan struct{})
+ go pm.flushLoop()
+ }
+}
+func (pm *ProxyManager) flushLoop() {
+ flushInterval := 2 * time.Second
+ if v := os.Getenv("OTEL_METRIC_EXPORT_INTERVAL"); v != "" {
+ if d, err := time.ParseDuration(v); err == nil && d > 0 {
+ if d/2 < flushInterval { flushInterval = d / 2 }
+ }
+ }
+ ticker := time.NewTicker(flushInterval)
+ defer ticker.Stop()
+ for {
+ select {
+ case <-ticker.C:
+ pm.mutex.RLock()
+ for _, e := range pm.tunnels {
+ inTCP := e.bytesInTCP.Swap(0)
+ outTCP := e.bytesOutTCP.Swap(0)
+ inUDP := e.bytesInUDP.Swap(0)
+ outUDP := e.bytesOutUDP.Swap(0)
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
+ }
+ pm.mutex.RUnlock()
+ case <-pm.flushStop:
+ pm.mutex.RLock()
+ for _, e := range pm.tunnels {
+ inTCP := e.bytesInTCP.Swap(0)
+ outTCP := e.bytesOutTCP.Swap(0)
+ inUDP := e.bytesInUDP.Swap(0)
+ outUDP := e.bytesOutUDP.Swap(0)
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
+ }
+ pm.mutex.RUnlock()
+ return
+ }
+ }
+}
+
func (pm *ProxyManager) Stop() error {
pm.mutex.Lock()
defer pm.mutex.Unlock()
@@ -236,6 +376,14 @@ func (pm *ProxyManager) startTarget(proto, listenIP string, port int, targetAddr
return nil
}
+// getEntry returns per-tunnel entry or nil.
+func (pm *ProxyManager) getEntry(id string) *tunnelEntry {
+ pm.mutex.RLock()
+ e := pm.tunnels[id]
+ pm.mutex.RUnlock()
+ return e
+}
+
func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) {
for {
conn, err := listener.Accept()
@@ -257,6 +405,9 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string)
continue
}
+// Count sessions only once per accepted TCP connection
+ if pm.tunnelID != "" { state.Global().IncSessions(pm.tunnelID) }
+
go func() {
target, err := net.Dial("tcp", targetAddr)
if err != nil {
@@ -265,24 +416,33 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string)
return
}
+ // already incremented on accept
+
// Create a WaitGroup to ensure both copy operations complete
var wg sync.WaitGroup
wg.Add(2)
+ // client -> target (direction=in)
go func() {
defer wg.Done()
- io.Copy(target, conn)
- target.Close()
+e := pm.getEntry(pm.currentTunnelID)
+cw := &countingWriter{ctx: context.Background(), w: target, set: e.attrInTCP, pm: pm, ent: e, out: false, proto: "tcp"}
+ _, _ = io.Copy(cw, conn)
+ _ = target.Close()
}()
+ // target -> client (direction=out)
go func() {
defer wg.Done()
- io.Copy(conn, target)
- conn.Close()
+e := pm.getEntry(pm.currentTunnelID)
+cw := &countingWriter{ctx: context.Background(), w: conn, set: e.attrOutTCP, pm: pm, ent: e, out: true, proto: "tcp"}
+ _, _ = io.Copy(cw, target)
+ _ = conn.Close()
}()
- // Wait for both copies to complete
+ // Wait for both copies to complete then session -1
wg.Wait()
+ if pm.tunnelID != "" { state.Global().DecSessions(pm.tunnelID) }
}()
}
}
@@ -326,6 +486,14 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
}
clientKey := remoteAddr.String()
+ // bytes from client -> target (direction=in)
+if pm.currentTunnelID != "" && n > 0 {
+if pm.asyncBytes {
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(n)) }
+ } else {
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrInUDP) }
+ }
+ }
clientsMutex.RLock()
targetConn, exists := clientConns[clientKey]
clientsMutex.RUnlock()
@@ -366,6 +534,15 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
return // defer will handle cleanup
}
+ // bytes from target -> client (direction=out)
+ if pm.currentTunnelID != "" && n > 0 {
+ if pm.asyncBytes {
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesOutUDP.Add(uint64(n)) }
+ } else {
+if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrOutUDP) }
+ }
+ }
+
_, err = conn.WriteTo(buffer[:n], remoteAddr)
if err != nil {
logger.Error("Error writing to client: %v", err)
@@ -375,13 +552,19 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
}(clientKey, targetConn, remoteAddr)
}
- _, err = targetConn.Write(buffer[:n])
+ written, err := targetConn.Write(buffer[:n])
if err != nil {
logger.Error("Error writing to target: %v", err)
targetConn.Close()
clientsMutex.Lock()
delete(clientConns, clientKey)
clientsMutex.Unlock()
+} else if pm.currentTunnelID != "" && written > 0 {
+ if pm.asyncBytes {
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(written)) }
+ } else {
+if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(written), e.attrInUDP) }
+ }
}
}
}

View File

@@ -1,422 +0,0 @@
diff --git a/main.go b/main.go
index 12849b1..c223b75 100644
--- a/main.go
+++ b/main.go
@@ -1,7 +1,9 @@
package main
import (
+ "context"
"encoding/json"
+ "errors"
"flag"
"fmt"
"net"
@@ -22,6 +24,9 @@ import (
"github.com/fosrl/newt/updates"
"github.com/fosrl/newt/websocket"
+ "github.com/fosrl/newt/internal/state"
+ "github.com/fosrl/newt/internal/telemetry"
+ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
"golang.zx2c4.com/wireguard/conn"
"golang.zx2c4.com/wireguard/device"
"golang.zx2c4.com/wireguard/tun"
@@ -116,6 +121,13 @@ var (
healthMonitor *healthcheck.Monitor
enforceHealthcheckCert bool
+ // Observability/metrics flags
+ metricsEnabled bool
+ otlpEnabled bool
+ adminAddr string
+ region string
+ metricsAsyncBytes bool
+
// New mTLS configuration variables
tlsClientCert string
tlsClientKey string
@@ -126,6 +138,10 @@ var (
)
func main() {
+ // Prepare context for graceful shutdown and signal handling
+ ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
+ defer stop()
+
// if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values
endpoint = os.Getenv("PANGOLIN_ENDPOINT")
id = os.Getenv("NEWT_ID")
@@ -141,6 +157,13 @@ func main() {
useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE")
enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT")
+ // Metrics/observability env mirrors
+ metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED")
+ otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED")
+ adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR")
+ regionEnv := os.Getenv("NEWT_REGION")
+ asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES")
+
keepInterface = keepInterfaceEnv == "true"
acceptClients = acceptClientsEnv == "true"
useNativeInterface = useNativeInterfaceEnv == "true"
@@ -272,6 +295,35 @@ func main() {
flag.StringVar(&healthFile, "health-file", "", "Path to health file (if unset, health file won't be written)")
}
+ // Metrics/observability flags (mirror ENV if unset)
+ if metricsEnabledEnv == "" {
+ flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter")
+ } else {
+ if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil { metricsEnabled = v } else { metricsEnabled = true }
+ }
+ if otlpEnabledEnv == "" {
+ flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT")
+ } else {
+ if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil { otlpEnabled = v }
+ }
+ if adminAddrEnv == "" {
+ flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address")
+ } else {
+ adminAddr = adminAddrEnv
+ }
+ // Async bytes toggle
+ if asyncBytesEnv == "" {
+ flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)")
+ } else {
+ if v, err := strconv.ParseBool(asyncBytesEnv); err == nil { metricsAsyncBytes = v }
+ }
+ // Optional region flag (resource attribute)
+ if regionEnv == "" {
+ flag.StringVar(&region, "region", "", "Optional region resource attribute (also NEWT_REGION)")
+ } else {
+ region = regionEnv
+ }
+
// do a --version check
version := flag.Bool("version", false, "Print the version")
@@ -286,6 +338,50 @@ func main() {
loggerLevel := parseLogLevel(logLevel)
logger.GetLogger().SetLevel(parseLogLevel(logLevel))
+ // Initialize telemetry after flags are parsed (so flags override env)
+ tcfg := telemetry.FromEnv()
+ tcfg.PromEnabled = metricsEnabled
+ tcfg.OTLPEnabled = otlpEnabled
+ if adminAddr != "" { tcfg.AdminAddr = adminAddr }
+ // Resource attributes (if available)
+ tcfg.SiteID = id
+ tcfg.Region = region
+ // Build info
+ tcfg.BuildVersion = newtVersion
+ tcfg.BuildCommit = os.Getenv("NEWT_COMMIT")
+
+ tel, telErr := telemetry.Init(ctx, tcfg)
+ if telErr != nil {
+ logger.Warn("Telemetry init failed: %v", telErr)
+ }
+ if tel != nil {
+ // Admin HTTP server (exposes /metrics when Prometheus exporter is enabled)
+ mux := http.NewServeMux()
+ mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) })
+ if tel.PrometheusHandler != nil {
+ mux.Handle("/metrics", tel.PrometheusHandler)
+ }
+ admin := &http.Server{
+ Addr: tcfg.AdminAddr,
+ Handler: otelhttp.NewHandler(mux, "newt-admin"),
+ ReadTimeout: 5 * time.Second,
+ WriteTimeout: 10 * time.Second,
+ ReadHeaderTimeout: 5 * time.Second,
+ IdleTimeout: 30 * time.Second,
+ }
+ go func() {
+ if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
+ logger.Warn("admin http error: %v", err)
+ }
+ }()
+ defer func() {
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+ _ = admin.Shutdown(ctx)
+ }()
+ defer func() { _ = tel.Shutdown(context.Background()) }()
+ }
+
newtVersion := "version_replaceme"
if *version {
fmt.Println("Newt version " + newtVersion)
@@ -557,7 +653,10 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
}
// Use reliable ping for initial connection test
logger.Debug("Testing initial connection with reliable ping...")
- _, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
+ lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
+ if err == nil && wgData.PublicKey != "" {
+ telemetry.ObserveTunnelLatency(context.Background(), "", wgData.PublicKey, "wireguard", lat.Seconds())
+ }
if err != nil {
logger.Warn("Initial reliable ping failed, but continuing: %v", err)
} else {
@@ -570,14 +669,20 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
// as the pings will continue in the background
if !connected {
logger.Debug("Starting ping check")
- pingStopChan = startPingCheck(tnet, wgData.ServerIP, client)
+ pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey)
}
// Create proxy manager
pm = proxy.NewProxyManager(tnet)
+ pm.SetAsyncBytes(metricsAsyncBytes)
+ // Set tunnel_id for metrics (WireGuard peer public key)
+ pm.SetTunnelID(wgData.PublicKey)
connected = true
+ // telemetry: record a successful site registration (omit region unless available)
+ telemetry.IncSiteRegistration(context.Background(), id, "", "success")
+
// add the targets if there are any
if len(wgData.Targets.TCP) > 0 {
updateTargets(pm, "add", wgData.TunnelIP, "tcp", TargetData{Targets: wgData.Targets.TCP})
@@ -611,10 +716,25 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) {
logger.Info("Received reconnect message")
+ if wgData.PublicKey != "" {
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
+ }
// Close the WireGuard device and TUN
closeWgTunnel()
+ // Clear metrics attrs and sessions for the tunnel
+ if pm != nil {
+ pm.ClearTunnelID()
+ state.Global().ClearTunnel(wgData.PublicKey)
+ }
+
+ // Clear metrics attrs and sessions for the tunnel
+ if pm != nil {
+ pm.ClearTunnelID()
+ state.Global().ClearTunnel(wgData.PublicKey)
+ }
+
// Mark as disconnected
connected = false
@@ -631,6 +751,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) {
logger.Info("Received termination message")
+ if wgData.PublicKey != "" {
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
+ }
// Close the WireGuard device and TUN
closeWgTunnel()
diff --git a/util.go b/util.go
index 7d6da4f..c1f4915 100644
--- a/util.go
+++ b/util.go
@@ -17,6 +17,7 @@ import (
"github.com/fosrl/newt/logger"
"github.com/fosrl/newt/proxy"
"github.com/fosrl/newt/websocket"
+ "github.com/fosrl/newt/internal/telemetry"
"golang.org/x/net/icmp"
"golang.org/x/net/ipv4"
"golang.zx2c4.com/wireguard/device"
@@ -229,7 +230,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background")
}
-func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client) chan struct{} {
+func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
maxInterval := 6 * time.Second
currentInterval := pingInterval
consecutiveFailures := 0
@@ -292,6 +293,9 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
if !connectionLost {
connectionLost = true
logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
+ if tunnelID != "" {
+ telemetry.IncReconnect(context.Background(), "", tunnelID, telemetry.ReasonTimeout)
+ }
stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second)
// Send registration message to the server for backward compatibility
err := client.SendMessage("newt/wg/register", map[string]interface{}{
@@ -318,6 +322,10 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
} else {
// Track recent latencies
recentLatencies = append(recentLatencies, latency)
+ // Record tunnel latency (limit sampling to this periodic check)
+ if tunnelID != "" {
+ telemetry.ObserveTunnelLatency(context.Background(), "", tunnelID, "wireguard", latency.Seconds())
+ }
if len(recentLatencies) > 10 {
recentLatencies = recentLatencies[1:]
}
diff --git a/websocket/client.go b/websocket/client.go
index 0c0664a..c9ac264 100644
--- a/websocket/client.go
+++ b/websocket/client.go
@@ -18,6 +18,10 @@ import (
"github.com/fosrl/newt/logger"
"github.com/gorilla/websocket"
+
+ "context"
+ "github.com/fosrl/newt/internal/telemetry"
+ "go.opentelemetry.io/otel"
)
type Client struct {
@@ -287,6 +291,7 @@ func (c *Client) getToken() (string, error) {
}
resp, err := client.Do(req)
if err != nil {
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", classifyConnError(err))
return "", fmt.Errorf("failed to request new token: %w", err)
}
defer resp.Body.Close()
@@ -294,6 +299,18 @@ func (c *Client) getToken() (string, error) {
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(resp.Body)
logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "failure")
+ bin := "http_other"
+ if resp.StatusCode >= 500 {
+ bin = "http_5xx"
+ } else if resp.StatusCode >= 400 {
+ bin = "http_4xx"
+ }
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", bin)
+ // Reconnect reason mapping for auth failures
+ if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonAuthError)
+ }
return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
}
@@ -312,10 +329,33 @@ func (c *Client) getToken() (string, error) {
}
logger.Debug("Received token: %s", tokenResp.Data.Token)
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "success")
return tokenResp.Data.Token, nil
}
+// classifyConnError maps common errors to low-cardinality error_type labels
+func classifyConnError(err error) string {
+ if err == nil {
+ return ""
+ }
+ msg := strings.ToLower(err.Error())
+ switch {
+ case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"):
+ return "tls"
+ case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout"):
+ return "timeout"
+ case strings.Contains(msg, "no such host") || strings.Contains(msg, "dns"):
+ return "dns"
+ case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"):
+ return "auth"
+ case strings.Contains(msg, "broken pipe") || strings.Contains(msg, "connection reset") || strings.Contains(msg, "connection refused") || strings.Contains(msg, "use of closed network connection") || strings.Contains(msg, "network is unreachable"):
+ return "io"
+ default:
+ return "other"
+ }
+}
+
func (c *Client) connectWithRetry() {
for {
select {
@@ -337,6 +377,10 @@ func (c *Client) establishConnection() error {
// Get token for authentication
token, err := c.getToken()
if err != nil {
+ // telemetry: connection attempt failed before dialing
+ // site_id isn't globally available here; use client ID as site_id (low cardinality)
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", classifyConnError(err))
return fmt.Errorf("failed to get token: %w", err)
}
@@ -369,7 +413,11 @@ func (c *Client) establishConnection() error {
q.Set("clientType", c.clientType)
u.RawQuery = q.Encode()
- // Connect to WebSocket
+ // Connect to WebSocket (optional span)
+ tr := otel.Tracer("newt")
+ spanCtx, span := tr.Start(context.Background(), "ws.connect")
+ defer span.End()
+
dialer := websocket.DefaultDialer
// Use new TLS configuration method
@@ -391,11 +439,23 @@ func (c *Client) establishConnection() error {
logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable")
}
- conn, _, err := dialer.Dial(u.String(), nil)
+conn, _, err := dialer.DialContext(spanCtx, u.String(), nil)
if err != nil {
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
+ etype := classifyConnError(err)
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", etype)
+ // Map handshake-related errors to reconnect reasons where appropriate
+ if etype == "tls" {
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonHandshakeError)
+ } else if etype == "timeout" {
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonTimeout)
+ } else {
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonError)
+ }
return fmt.Errorf("failed to connect to WebSocket: %w", err)
}
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "success")
c.conn = conn
c.setConnected(true)
diff --git a/wg/wg.go b/wg/wg.go
index 3cee1a9..a765279 100644
--- a/wg/wg.go
+++ b/wg/wg.go
@@ -3,6 +3,7 @@
package wg
import (
+ "context"
"encoding/json"
"errors"
"fmt"
@@ -23,6 +24,8 @@ import (
"golang.zx2c4.com/wireguard/conn"
"golang.zx2c4.com/wireguard/wgctrl"
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
+
+ "github.com/fosrl/newt/internal/telemetry"
)
type WgConfig struct {
@@ -298,6 +301,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) {
s.stopGetConfig = nil
}
+ // telemetry: config reload success
+ telemetry.IncConfigReload(context.Background(), "success")
+ // Optional reconnect reason mapping: config change
+ if s.serverPubKey != "" {
+ telemetry.IncReconnect(context.Background(), "", s.serverPubKey, telemetry.ReasonConfigChange)
+ }
+
// Ensure the WireGuard interface and peers are configured
if err := s.ensureWireguardInterface(config); err != nil {
logger.Error("Failed to ensure WireGuard interface: %v", err)

View File

@@ -1,466 +0,0 @@
diff --git a/main.go b/main.go
index 12849b1..c223b75 100644
--- a/main.go
+++ b/main.go
@@ -1,7 +1,9 @@
package main
import (
+ "context"
"encoding/json"
+ "errors"
"flag"
"fmt"
"net"
@@ -22,6 +24,9 @@ import (
"github.com/fosrl/newt/updates"
"github.com/fosrl/newt/websocket"
+ "github.com/fosrl/newt/internal/state"
+ "github.com/fosrl/newt/internal/telemetry"
+ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
"golang.zx2c4.com/wireguard/conn"
"golang.zx2c4.com/wireguard/device"
"golang.zx2c4.com/wireguard/tun"
@@ -116,6 +121,13 @@ var (
healthMonitor *healthcheck.Monitor
enforceHealthcheckCert bool
+ // Observability/metrics flags
+ metricsEnabled bool
+ otlpEnabled bool
+ adminAddr string
+ region string
+ metricsAsyncBytes bool
+
// New mTLS configuration variables
tlsClientCert string
tlsClientKey string
@@ -126,6 +138,10 @@ var (
)
func main() {
+ // Prepare context for graceful shutdown and signal handling
+ ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
+ defer stop()
+
// if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values
endpoint = os.Getenv("PANGOLIN_ENDPOINT")
id = os.Getenv("NEWT_ID")
@@ -141,6 +157,13 @@ func main() {
useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE")
enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT")
+ // Metrics/observability env mirrors
+ metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED")
+ otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED")
+ adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR")
+ regionEnv := os.Getenv("NEWT_REGION")
+ asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES")
+
keepInterface = keepInterfaceEnv == "true"
acceptClients = acceptClientsEnv == "true"
useNativeInterface = useNativeInterfaceEnv == "true"
@@ -272,6 +295,35 @@ func main() {
flag.StringVar(&healthFile, "health-file", "", "Path to health file (if unset, health file won't be written)")
}
+ // Metrics/observability flags (mirror ENV if unset)
+ if metricsEnabledEnv == "" {
+ flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter")
+ } else {
+ if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil { metricsEnabled = v } else { metricsEnabled = true }
+ }
+ if otlpEnabledEnv == "" {
+ flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT")
+ } else {
+ if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil { otlpEnabled = v }
+ }
+ if adminAddrEnv == "" {
+ flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address")
+ } else {
+ adminAddr = adminAddrEnv
+ }
+ // Async bytes toggle
+ if asyncBytesEnv == "" {
+ flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)")
+ } else {
+ if v, err := strconv.ParseBool(asyncBytesEnv); err == nil { metricsAsyncBytes = v }
+ }
+ // Optional region flag (resource attribute)
+ if regionEnv == "" {
+ flag.StringVar(&region, "region", "", "Optional region resource attribute (also NEWT_REGION)")
+ } else {
+ region = regionEnv
+ }
+
// do a --version check
version := flag.Bool("version", false, "Print the version")
@@ -286,6 +338,50 @@ func main() {
loggerLevel := parseLogLevel(logLevel)
logger.GetLogger().SetLevel(parseLogLevel(logLevel))
+ // Initialize telemetry after flags are parsed (so flags override env)
+ tcfg := telemetry.FromEnv()
+ tcfg.PromEnabled = metricsEnabled
+ tcfg.OTLPEnabled = otlpEnabled
+ if adminAddr != "" { tcfg.AdminAddr = adminAddr }
+ // Resource attributes (if available)
+ tcfg.SiteID = id
+ tcfg.Region = region
+ // Build info
+ tcfg.BuildVersion = newtVersion
+ tcfg.BuildCommit = os.Getenv("NEWT_COMMIT")
+
+ tel, telErr := telemetry.Init(ctx, tcfg)
+ if telErr != nil {
+ logger.Warn("Telemetry init failed: %v", telErr)
+ }
+ if tel != nil {
+ // Admin HTTP server (exposes /metrics when Prometheus exporter is enabled)
+ mux := http.NewServeMux()
+ mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) })
+ if tel.PrometheusHandler != nil {
+ mux.Handle("/metrics", tel.PrometheusHandler)
+ }
+ admin := &http.Server{
+ Addr: tcfg.AdminAddr,
+ Handler: otelhttp.NewHandler(mux, "newt-admin"),
+ ReadTimeout: 5 * time.Second,
+ WriteTimeout: 10 * time.Second,
+ ReadHeaderTimeout: 5 * time.Second,
+ IdleTimeout: 30 * time.Second,
+ }
+ go func() {
+ if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
+ logger.Warn("admin http error: %v", err)
+ }
+ }()
+ defer func() {
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+ _ = admin.Shutdown(ctx)
+ }()
+ defer func() { _ = tel.Shutdown(context.Background()) }()
+ }
+
newtVersion := "version_replaceme"
if *version {
fmt.Println("Newt version " + newtVersion)
@@ -557,7 +653,10 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
}
// Use reliable ping for initial connection test
logger.Debug("Testing initial connection with reliable ping...")
- _, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
+ lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
+ if err == nil && wgData.PublicKey != "" {
+ telemetry.ObserveTunnelLatency(context.Background(), "", wgData.PublicKey, "wireguard", lat.Seconds())
+ }
if err != nil {
logger.Warn("Initial reliable ping failed, but continuing: %v", err)
} else {
@@ -570,14 +669,20 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
// as the pings will continue in the background
if !connected {
logger.Debug("Starting ping check")
- pingStopChan = startPingCheck(tnet, wgData.ServerIP, client)
+ pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey)
}
// Create proxy manager
pm = proxy.NewProxyManager(tnet)
+ pm.SetAsyncBytes(metricsAsyncBytes)
+ // Set tunnel_id for metrics (WireGuard peer public key)
+ pm.SetTunnelID(wgData.PublicKey)
connected = true
+ // telemetry: record a successful site registration (omit region unless available)
+ telemetry.IncSiteRegistration(context.Background(), id, "", "success")
+
// add the targets if there are any
if len(wgData.Targets.TCP) > 0 {
updateTargets(pm, "add", wgData.TunnelIP, "tcp", TargetData{Targets: wgData.Targets.TCP})
@@ -611,10 +716,25 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) {
logger.Info("Received reconnect message")
+ if wgData.PublicKey != "" {
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
+ }
// Close the WireGuard device and TUN
closeWgTunnel()
+ // Clear metrics attrs and sessions for the tunnel
+ if pm != nil {
+ pm.ClearTunnelID()
+ state.Global().ClearTunnel(wgData.PublicKey)
+ }
+
+ // Clear metrics attrs and sessions for the tunnel
+ if pm != nil {
+ pm.ClearTunnelID()
+ state.Global().ClearTunnel(wgData.PublicKey)
+ }
+
// Mark as disconnected
connected = false
@@ -631,6 +751,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) {
logger.Info("Received termination message")
+ if wgData.PublicKey != "" {
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
+ }
// Close the WireGuard device and TUN
closeWgTunnel()
diff --git a/util.go b/util.go
index 7d6da4f..c1f4915 100644
--- a/util.go
+++ b/util.go
@@ -17,6 +17,7 @@ import (
"github.com/fosrl/newt/logger"
"github.com/fosrl/newt/proxy"
"github.com/fosrl/newt/websocket"
+ "github.com/fosrl/newt/internal/telemetry"
"golang.org/x/net/icmp"
"golang.org/x/net/ipv4"
"golang.zx2c4.com/wireguard/device"
@@ -229,7 +230,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background")
}
-func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client) chan struct{} {
+func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
maxInterval := 6 * time.Second
currentInterval := pingInterval
consecutiveFailures := 0
@@ -292,6 +293,9 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
if !connectionLost {
connectionLost = true
logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
+ if tunnelID != "" {
+ telemetry.IncReconnect(context.Background(), "", tunnelID, telemetry.ReasonTimeout)
+ }
stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second)
// Send registration message to the server for backward compatibility
err := client.SendMessage("newt/wg/register", map[string]interface{}{
@@ -318,6 +322,10 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
} else {
// Track recent latencies
recentLatencies = append(recentLatencies, latency)
+ // Record tunnel latency (limit sampling to this periodic check)
+ if tunnelID != "" {
+ telemetry.ObserveTunnelLatency(context.Background(), "", tunnelID, "wireguard", latency.Seconds())
+ }
if len(recentLatencies) > 10 {
recentLatencies = recentLatencies[1:]
}
diff --git a/websocket/client.go b/websocket/client.go
index 0c0664a..c9ac264 100644
--- a/websocket/client.go
+++ b/websocket/client.go
@@ -18,6 +18,10 @@ import (
"github.com/fosrl/newt/logger"
"github.com/gorilla/websocket"
+
+ "context"
+ "github.com/fosrl/newt/internal/telemetry"
+ "go.opentelemetry.io/otel"
)
type Client struct {
@@ -287,6 +291,7 @@ func (c *Client) getToken() (string, error) {
}
resp, err := client.Do(req)
if err != nil {
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", classifyConnError(err))
return "", fmt.Errorf("failed to request new token: %w", err)
}
defer resp.Body.Close()
@@ -294,6 +299,18 @@ func (c *Client) getToken() (string, error) {
if resp.StatusCode != http.StatusOK {
body, _ := io.ReadAll(resp.Body)
logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "failure")
+ bin := "http_other"
+ if resp.StatusCode >= 500 {
+ bin = "http_5xx"
+ } else if resp.StatusCode >= 400 {
+ bin = "http_4xx"
+ }
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", bin)
+ // Reconnect reason mapping for auth failures
+ if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonAuthError)
+ }
return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
}
@@ -312,10 +329,33 @@ func (c *Client) getToken() (string, error) {
}
logger.Debug("Received token: %s", tokenResp.Data.Token)
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "success")
return tokenResp.Data.Token, nil
}
+// classifyConnError maps common errors to low-cardinality error_type labels
+func classifyConnError(err error) string {
+ if err == nil {
+ return ""
+ }
+ msg := strings.ToLower(err.Error())
+ switch {
+ case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"):
+ return "tls"
+ case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout"):
+ return "timeout"
+ case strings.Contains(msg, "no such host") || strings.Contains(msg, "dns"):
+ return "dns"
+ case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"):
+ return "auth"
+ case strings.Contains(msg, "broken pipe") || strings.Contains(msg, "connection reset") || strings.Contains(msg, "connection refused") || strings.Contains(msg, "use of closed network connection") || strings.Contains(msg, "network is unreachable"):
+ return "io"
+ default:
+ return "other"
+ }
+}
+
func (c *Client) connectWithRetry() {
for {
select {
@@ -337,6 +377,10 @@ func (c *Client) establishConnection() error {
// Get token for authentication
token, err := c.getToken()
if err != nil {
+ // telemetry: connection attempt failed before dialing
+ // site_id isn't globally available here; use client ID as site_id (low cardinality)
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", classifyConnError(err))
return fmt.Errorf("failed to get token: %w", err)
}
@@ -369,7 +413,11 @@ func (c *Client) establishConnection() error {
q.Set("clientType", c.clientType)
u.RawQuery = q.Encode()
- // Connect to WebSocket
+ // Connect to WebSocket (optional span)
+ tr := otel.Tracer("newt")
+ spanCtx, span := tr.Start(context.Background(), "ws.connect")
+ defer span.End()
+
dialer := websocket.DefaultDialer
// Use new TLS configuration method
@@ -391,11 +439,23 @@ func (c *Client) establishConnection() error {
logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable")
}
- conn, _, err := dialer.Dial(u.String(), nil)
+conn, _, err := dialer.DialContext(spanCtx, u.String(), nil)
if err != nil {
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
+ etype := classifyConnError(err)
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", etype)
+ // Map handshake-related errors to reconnect reasons where appropriate
+ if etype == "tls" {
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonHandshakeError)
+ } else if etype == "timeout" {
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonTimeout)
+ } else {
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonError)
+ }
return fmt.Errorf("failed to connect to WebSocket: %w", err)
}
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "success")
c.conn = conn
c.setConnected(true)
diff --git a/wg/wg.go b/wg/wg.go
index 3cee1a9..a765279 100644
--- a/wg/wg.go
+++ b/wg/wg.go
@@ -3,6 +3,7 @@
package wg
import (
+ "context"
"encoding/json"
"errors"
"fmt"
@@ -23,6 +24,8 @@ import (
"golang.zx2c4.com/wireguard/conn"
"golang.zx2c4.com/wireguard/wgctrl"
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
+
+ "github.com/fosrl/newt/internal/telemetry"
)
type WgConfig struct {
@@ -298,6 +301,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) {
s.stopGetConfig = nil
}
+ // telemetry: config reload success
+ telemetry.IncConfigReload(context.Background(), "success")
+ // Optional reconnect reason mapping: config change
+ if s.serverPubKey != "" {
+ telemetry.IncReconnect(context.Background(), "", s.serverPubKey, telemetry.ReasonConfigChange)
+ }
+
// Ensure the WireGuard interface and peers are configured
if err := s.ensureWireguardInterface(config); err != nil {
logger.Error("Failed to ensure WireGuard interface: %v", err)
diff --git a/wgnetstack/wgnetstack.go b/wgnetstack/wgnetstack.go
index 6684c40..09f160e 100644
--- a/wgnetstack/wgnetstack.go
+++ b/wgnetstack/wgnetstack.go
@@ -1,6 +1,7 @@
package wgnetstack
import (
+ "context"
"crypto/rand"
"encoding/base64"
"encoding/hex"
@@ -26,6 +27,8 @@ import (
"golang.zx2c4.com/wireguard/tun"
"golang.zx2c4.com/wireguard/tun/netstack"
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
+
+ "github.com/fosrl/newt/internal/telemetry"
)
type WgConfig struct {
@@ -240,14 +243,20 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str
return service, nil
}
+// ReportRTT allows reporting native RTTs to telemetry, rate-limited externally.
+func (s *WireGuardService) ReportRTT(seconds float64) {
+ if s.serverPubKey == "" { return }
+ telemetry.ObserveTunnelLatency(context.Background(), "", s.serverPubKey, "wireguard", seconds)
+}
+
func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) {
logger.Debug("Received: %+v", msg)
// if there is no wgData or pm, we can't add targets
if s.TunnelIP == "" || s.proxyManager == nil {
logger.Info("No tunnel IP or proxy manager available")
- return
- }
+ return
+}
targetData, err := parseTargetData(msg.Data)
if err != nil {

View File

@@ -1,44 +0,0 @@
diff --git a/wgnetstack/wgnetstack.go b/wgnetstack/wgnetstack.go
index 6684c40..09f160e 100644
--- a/wgnetstack/wgnetstack.go
+++ b/wgnetstack/wgnetstack.go
@@ -1,6 +1,7 @@
package wgnetstack
import (
+ "context"
"crypto/rand"
"encoding/base64"
"encoding/hex"
@@ -26,6 +27,8 @@ import (
"golang.zx2c4.com/wireguard/tun"
"golang.zx2c4.com/wireguard/tun/netstack"
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
+
+ "github.com/fosrl/newt/internal/telemetry"
)
type WgConfig struct {
@@ -240,14 +243,20 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str
return service, nil
}
+// ReportRTT allows reporting native RTTs to telemetry, rate-limited externally.
+func (s *WireGuardService) ReportRTT(seconds float64) {
+ if s.serverPubKey == "" { return }
+ telemetry.ObserveTunnelLatency(context.Background(), "", s.serverPubKey, "wireguard", seconds)
+}
+
func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) {
logger.Debug("Received: %+v", msg)
// if there is no wgData or pm, we can't add targets
if s.TunnelIP == "" || s.proxyManager == nil {
logger.Info("No tunnel IP or proxy manager available")
- return
- }
+ return
+}
targetData, err := parseTargetData(msg.Data)
if err != nil {

View File

@@ -1,25 +0,0 @@
# How to apply patches
These patches were generated from the working tree without commits. You can apply them in one shot or in topic order.
One shot (recommended during review):
```bash
git apply patches/00_all_changes.patch
```
Topic order:
```bash
git apply patches/01_proxy_multitunnel.patch
git apply patches/02_reconnect_rtt.patch
git apply patches/03_constants_docs.patch
```
Rollback (restore to HEAD and clean untracked files):
```bash
git restore --source=HEAD --worktree --staged .
git clean -fd
```

View File

@@ -1,55 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
NEWTHOST=${NEWTHOST:-localhost}
NEWTPORT=${NEWTPORT:-2112}
METRICS_URL="http://${NEWTHOST}:${NEWTPORT}/metrics"
probe() {
local name=$1
local pattern=$2
echo "[probe] ${name}"
curl -sf "${METRICS_URL}" | grep -E "${pattern}" || {
echo "[warn] ${name} not found"
return 1
}
}
# Basic presence
probe "newt_* presence" "^newt_" || true
# Site gauges with site_id
probe "site_online with site_id" "^newt_site_online\{.*site_id=\"[^\"]+\"" || true
probe "last_heartbeat with site_id" "^newt_site_last_heartbeat_timestamp_seconds\{.*site_id=\"[^\"]+\"" || true
# Bytes with direction ingress/egress and protocol
probe "tunnel bytes ingress" "^newt_tunnel_bytes_total\{.*direction=\"ingress\".*protocol=\"(tcp|udp)\"" || true
probe "tunnel bytes egress" "^newt_tunnel_bytes_total\{.*direction=\"egress\".*protocol=\"(tcp|udp)\"" || true
# Optional: verify absence/presence of tunnel_id based on EXPECT_TUNNEL_ID (default true)
EXPECT_TUNNEL_ID=${EXPECT_TUNNEL_ID:-true}
if [ "$EXPECT_TUNNEL_ID" = "false" ]; then
echo "[probe] ensure tunnel_id label is absent when NEWT_METRICS_INCLUDE_TUNNEL_ID=false"
! curl -sf "${METRICS_URL}" | grep -q "tunnel_id=\"" || { echo "[fail] tunnel_id present but EXPECT_TUNNEL_ID=false"; exit 1; }
else
echo "[probe] ensure tunnel_id label is present (default)"
curl -sf "${METRICS_URL}" | grep -q "tunnel_id=\"" || { echo "[warn] tunnel_id not found (may be expected if no tunnel is active)"; }
fi
# WebSocket metrics (when OTLP/WS used)
probe "websocket connect latency buckets" "^newt_websocket_connect_latency_seconds_bucket" || true
probe "websocket messages total" "^newt_websocket_messages_total\{.*(direction|msg_type)=" || true
probe "websocket connected gauge" "^newt_websocket_connected" || true
probe "websocket reconnects total" "^newt_websocket_reconnects_total\{" || true
# Proxy metrics (when proxy active)
probe "proxy active connections" "^newt_proxy_active_connections\{" || true
probe "proxy buffer bytes" "^newt_proxy_buffer_bytes\{" || true
probe "proxy drops total" "^newt_proxy_drops_total\{" || true
probe "proxy connections total" "^newt_proxy_connections_total\{" || true
# Config apply
probe "config apply seconds buckets" "^newt_config_apply_seconds_bucket\{" || true
echo "Smoke checks completed (warnings above are acceptable if the feature isn't exercised yet)."