mirror of
https://github.com/fosrl/newt.git
synced 2026-03-13 02:14:56 -05:00
Compare commits
85 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e07439a366 | ||
|
|
9f43d4ce6d | ||
|
|
5888553c50 | ||
|
|
f63b1b689f | ||
|
|
7f104d1a0c | ||
|
|
9de29e7e00 | ||
|
|
cf611fe849 | ||
|
|
23e2731473 | ||
|
|
186b51e000 | ||
|
|
d21f4951e9 | ||
|
|
e04c654292 | ||
|
|
e43fbebcb8 | ||
|
|
1afed32562 | ||
|
|
46384e6242 | ||
|
|
52e4a57cc1 | ||
|
|
1a9f6c4685 | ||
|
|
b6f5458ad9 | ||
|
|
4ef9737862 | ||
|
|
b68777e83a | ||
|
|
8d26de5f4d | ||
|
|
c32828128f | ||
|
|
3cd7329d8b | ||
|
|
3490220803 | ||
|
|
bd62da4cc9 | ||
|
|
8d0e6be2c7 | ||
|
|
b62e18622e | ||
|
|
89274eb9a8 | ||
|
|
77d56596ab | ||
|
|
6ec0ab813c | ||
|
|
fef9e8c76b | ||
|
|
ae5129a7c7 | ||
|
|
ed127a2d61 | ||
|
|
20ddbb5382 | ||
|
|
5cbda35637 | ||
|
|
60196455d1 | ||
|
|
84e659acde | ||
|
|
e16881b7c8 | ||
|
|
587e829e42 | ||
|
|
ee2f8899ff | ||
|
|
744a741556 | ||
|
|
aea80200e0 | ||
|
|
b20f7a02b2 | ||
|
|
f28d90595b | ||
|
|
4a90e36a44 | ||
|
|
9ace45e71f | ||
|
|
75d5e695d6 | ||
|
|
d74065a71b | ||
|
|
f86031f458 | ||
|
|
31f70e5032 | ||
|
|
31514f26df | ||
|
|
09fcb36963 | ||
|
|
83c3ae5cf9 | ||
|
|
1e88fb86b4 | ||
|
|
62407b0c74 | ||
|
|
d91c6ef168 | ||
|
|
59e8d79404 | ||
|
|
d907ae9e84 | ||
|
|
d745aa79d4 | ||
|
|
427ab67bb5 | ||
|
|
a86b14d97d | ||
|
|
f8fd8e1bc5 | ||
|
|
0b5e662abc | ||
|
|
bd55269b39 | ||
|
|
3e9c74a65b | ||
|
|
922591b269 | ||
|
|
cfe52caa4a | ||
|
|
d31d08c1c8 | ||
|
|
9ac4cee48d | ||
|
|
b53fb70778 | ||
|
|
0f83489f11 | ||
|
|
09e9bd9493 | ||
|
|
2d4f656852 | ||
|
|
8f7f9c417c | ||
|
|
660adcc72d | ||
|
|
0d55e35784 | ||
|
|
ceef228665 | ||
|
|
496ff0734c | ||
|
|
a89f13870c | ||
|
|
85394d3255 | ||
|
|
0405aebb45 | ||
|
|
9c0f4599b8 | ||
|
|
fd6b1ae323 | ||
|
|
831ae2d9c5 | ||
|
|
a63a27e3ab | ||
|
|
34d558a5a2 |
5
.env.example
Normal file
5
.env.example
Normal file
@@ -0,0 +1,5 @@
|
||||
# Copy this file to .env and fill in your values
|
||||
# Required for connecting to Pangolin service
|
||||
PANGOLIN_ENDPOINT=https://example.com
|
||||
NEWT_ID=changeme-id
|
||||
NEWT_SECRET=changeme-secret
|
||||
188
.github/workflows/cicd.yml
vendored
188
.github/workflows/cicd.yml
vendored
@@ -1,64 +1,160 @@
|
||||
name: CI/CD Pipeline
|
||||
|
||||
# CI/CD workflow for building, publishing, mirroring, signing container images and building release binaries.
|
||||
# Actions are pinned to specific SHAs to reduce supply-chain risk. This workflow triggers on tag push events.
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write # for GHCR push
|
||||
id-token: write # for Cosign Keyless (OIDC) Signing
|
||||
|
||||
# Required secrets:
|
||||
# - DOCKER_HUB_USERNAME / DOCKER_HUB_ACCESS_TOKEN: push to Docker Hub
|
||||
# - GITHUB_TOKEN: used for GHCR login and OIDC keyless signing
|
||||
# - COSIGN_PRIVATE_KEY / COSIGN_PASSWORD / COSIGN_PUBLIC_KEY: for key-based signing
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "*"
|
||||
push:
|
||||
tags:
|
||||
- "*"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
release:
|
||||
name: Build and Release
|
||||
runs-on: ubuntu-latest
|
||||
release:
|
||||
name: Build and Release
|
||||
runs-on: ubuntu-latest
|
||||
# Job-level timeout to avoid runaway or stuck runs
|
||||
timeout-minutes: 120
|
||||
env:
|
||||
# Target images
|
||||
DOCKERHUB_IMAGE: docker.io/${{ secrets.DOCKER_HUB_USERNAME }}/${{ github.event.repository.name }}
|
||||
GHCR_IMAGE: ghcr.io/${{ github.repository_owner }}/${{ github.event.repository.name }}
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v5
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_HUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
|
||||
with:
|
||||
registry: docker.io
|
||||
username: ${{ secrets.DOCKER_HUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
|
||||
|
||||
- name: Extract tag name
|
||||
id: get-tag
|
||||
run: echo "TAG=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV
|
||||
- name: Extract tag name
|
||||
id: get-tag
|
||||
run: echo "TAG=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV
|
||||
shell: bash
|
||||
|
||||
- name: Install Go
|
||||
uses: actions/setup-go@v6
|
||||
with:
|
||||
go-version: 1.25
|
||||
- name: Install Go
|
||||
uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0
|
||||
with:
|
||||
go-version: 1.25
|
||||
|
||||
- name: Update version in main.go
|
||||
run: |
|
||||
TAG=${{ env.TAG }}
|
||||
if [ -f main.go ]; then
|
||||
sed -i 's/version_replaceme/'"$TAG"'/' main.go
|
||||
echo "Updated main.go with version $TAG"
|
||||
else
|
||||
echo "main.go not found"
|
||||
fi
|
||||
- name: Update version in main.go
|
||||
run: |
|
||||
TAG=${{ env.TAG }}
|
||||
if [ -f main.go ]; then
|
||||
sed -i 's/version_replaceme/'"$TAG"'/' main.go
|
||||
echo "Updated main.go with version $TAG"
|
||||
else
|
||||
echo "main.go not found"
|
||||
fi
|
||||
shell: bash
|
||||
|
||||
- name: Build and push Docker images
|
||||
run: |
|
||||
TAG=${{ env.TAG }}
|
||||
make docker-build-release tag=$TAG
|
||||
- name: Build and push Docker images (Docker Hub)
|
||||
run: |
|
||||
TAG=${{ env.TAG }}
|
||||
make docker-build-release tag=$TAG
|
||||
echo "Built & pushed to: ${{ env.DOCKERHUB_IMAGE }}:${TAG}"
|
||||
shell: bash
|
||||
|
||||
- name: Build binaries
|
||||
run: |
|
||||
make go-build-release
|
||||
- name: Login in to GHCR
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Upload artifacts from /bin
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: binaries
|
||||
path: bin/
|
||||
- name: Install skopeo + jq
|
||||
# skopeo: copy/inspect images between registries
|
||||
# jq: JSON parsing tool used to extract digest values
|
||||
run: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y skopeo jq
|
||||
skopeo --version
|
||||
shell: bash
|
||||
|
||||
- name: Copy tag from Docker Hub to GHCR
|
||||
# Mirror the already-built image (all architectures) to GHCR so we can sign it
|
||||
run: |
|
||||
set -euo pipefail
|
||||
TAG=${{ env.TAG }}
|
||||
echo "Copying ${{ env.DOCKERHUB_IMAGE }}:${TAG} -> ${{ env.GHCR_IMAGE }}:${TAG}"
|
||||
skopeo copy --all --retry-times 3 \
|
||||
docker://$DOCKERHUB_IMAGE:$TAG \
|
||||
docker://$GHCR_IMAGE:$TAG
|
||||
shell: bash
|
||||
|
||||
- name: Install cosign
|
||||
# cosign is used to sign and verify container images (key and keyless)
|
||||
uses: sigstore/cosign-installer@faadad0cce49287aee09b3a48701e75088a2c6ad # v4.0.0
|
||||
|
||||
- name: Dual-sign and verify (GHCR & Docker Hub)
|
||||
# Sign each image by digest using keyless (OIDC) and key-based signing,
|
||||
# then verify both the public key signature and the keyless OIDC signature.
|
||||
env:
|
||||
TAG: ${{ env.TAG }}
|
||||
COSIGN_PRIVATE_KEY: ${{ secrets.COSIGN_PRIVATE_KEY }}
|
||||
COSIGN_PASSWORD: ${{ secrets.COSIGN_PASSWORD }}
|
||||
COSIGN_PUBLIC_KEY: ${{ secrets.COSIGN_PUBLIC_KEY }}
|
||||
COSIGN_YES: "true"
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
issuer="https://token.actions.githubusercontent.com"
|
||||
id_regex="^https://github.com/${{ github.repository }}/.+" # accept this repo (all workflows/refs)
|
||||
|
||||
for IMAGE in "${GHCR_IMAGE}" "${DOCKERHUB_IMAGE}"; do
|
||||
echo "Processing ${IMAGE}:${TAG}"
|
||||
|
||||
DIGEST="$(skopeo inspect --retry-times 3 docker://${IMAGE}:${TAG} | jq -r '.Digest')"
|
||||
REF="${IMAGE}@${DIGEST}"
|
||||
echo "Resolved digest: ${REF}"
|
||||
|
||||
echo "==> cosign sign (keyless) --recursive ${REF}"
|
||||
cosign sign --recursive "${REF}"
|
||||
|
||||
echo "==> cosign sign (key) --recursive ${REF}"
|
||||
cosign sign --key env://COSIGN_PRIVATE_KEY --recursive "${REF}"
|
||||
|
||||
echo "==> cosign verify (public key) ${REF}"
|
||||
cosign verify --key env://COSIGN_PUBLIC_KEY "${REF}" -o text
|
||||
|
||||
echo "==> cosign verify (keyless policy) ${REF}"
|
||||
cosign verify \
|
||||
--certificate-oidc-issuer "${issuer}" \
|
||||
--certificate-identity-regexp "${id_regex}" \
|
||||
"${REF}" -o text
|
||||
done
|
||||
shell: bash
|
||||
|
||||
- name: Build binaries
|
||||
run: |
|
||||
make go-build-release
|
||||
shell: bash
|
||||
|
||||
- name: Upload artifacts from /bin
|
||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: binaries
|
||||
path: bin/
|
||||
|
||||
139
.github/workflows/mirror.yaml
vendored
Normal file
139
.github/workflows/mirror.yaml
vendored
Normal file
@@ -0,0 +1,139 @@
|
||||
name: Mirror & Sign (Docker Hub to GHCR)
|
||||
|
||||
on:
|
||||
workflow_dispatch: {}
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
id-token: write # for keyless OIDC
|
||||
|
||||
env:
|
||||
SOURCE_IMAGE: docker.io/fosrl/newt
|
||||
DEST_IMAGE: ghcr.io/${{ github.repository_owner }}/${{ github.event.repository.name }}
|
||||
|
||||
jobs:
|
||||
mirror-and-dual-sign:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Install skopeo + jq
|
||||
run: |
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y skopeo jq
|
||||
skopeo --version
|
||||
|
||||
- name: Install cosign
|
||||
uses: sigstore/cosign-installer@faadad0cce49287aee09b3a48701e75088a2c6ad # v4.0.0
|
||||
|
||||
- name: Input check
|
||||
run: |
|
||||
test -n "${SOURCE_IMAGE}" || (echo "SOURCE_IMAGE is empty" && exit 1)
|
||||
echo "Source : ${SOURCE_IMAGE}"
|
||||
echo "Target : ${DEST_IMAGE}"
|
||||
|
||||
# Auth for skopeo (containers-auth)
|
||||
- name: Skopeo login to GHCR
|
||||
run: |
|
||||
skopeo login ghcr.io -u "${{ github.actor }}" -p "${{ secrets.GITHUB_TOKEN }}"
|
||||
|
||||
# >>> IMPORTANT: Auth for cosign (docker-config) <<<
|
||||
- name: Docker login to GHCR (for cosign)
|
||||
run: |
|
||||
echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
|
||||
|
||||
# Optional (if Docker Hub private / tight limits)
|
||||
# - name: Login to Docker Hub (skopeo and cosign share this via docker login)
|
||||
# run: |
|
||||
# echo "${{ secrets.DOCKERHUB_TOKEN }}" | docker login docker.io -u "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin
|
||||
# skopeo login docker.io -u "${{ secrets.DOCKERHUB_USERNAME }}" -p "${{ secrets.DOCKERHUB_TOKEN }}"
|
||||
|
||||
- name: List source tags
|
||||
run: |
|
||||
set -euo pipefail
|
||||
skopeo list-tags --retry-times 3 docker://"${SOURCE_IMAGE}" \
|
||||
| jq -r '.Tags[]' | sort -u > src-tags.txt
|
||||
echo "Found source tags: $(wc -l < src-tags.txt)"
|
||||
head -n 20 src-tags.txt || true
|
||||
|
||||
- name: List destination tags (skip existing)
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if skopeo list-tags --retry-times 3 docker://"${DEST_IMAGE}" >/tmp/dst.json 2>/dev/null; then
|
||||
jq -r '.Tags[]' /tmp/dst.json | sort -u > dst-tags.txt
|
||||
else
|
||||
: > dst-tags.txt
|
||||
fi
|
||||
echo "Existing destination tags: $(wc -l < dst-tags.txt)"
|
||||
|
||||
- name: Mirror, dual-sign, and verify
|
||||
env:
|
||||
# keyless
|
||||
COSIGN_YES: "true"
|
||||
# key-based
|
||||
COSIGN_PRIVATE_KEY: ${{ secrets.COSIGN_PRIVATE_KEY }}
|
||||
COSIGN_PASSWORD: ${{ secrets.COSIGN_PASSWORD }}
|
||||
# verify
|
||||
COSIGN_PUBLIC_KEY: ${{ secrets.COSIGN_PUBLIC_KEY }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
copied=0; skipped=0; v_ok=0; errs=0
|
||||
|
||||
issuer="https://token.actions.githubusercontent.com"
|
||||
id_regex="^https://github.com/${{ github.repository }}/.+"
|
||||
|
||||
while read -r tag; do
|
||||
[ -z "$tag" ] && continue
|
||||
|
||||
if grep -Fxq "$tag" dst-tags.txt; then
|
||||
echo "::notice ::Skip (exists) ${DEST_IMAGE}:${tag}"
|
||||
skipped=$((skipped+1))
|
||||
continue
|
||||
fi
|
||||
|
||||
echo "==> Copy ${SOURCE_IMAGE}:${tag} → ${DEST_IMAGE}:${tag}"
|
||||
if ! skopeo copy --all --retry-times 3 \
|
||||
docker://"${SOURCE_IMAGE}:${tag}" docker://"${DEST_IMAGE}:${tag}"; then
|
||||
echo "::warning title=Copy failed::${SOURCE_IMAGE}:${tag}"
|
||||
errs=$((errs+1)); continue
|
||||
fi
|
||||
copied=$((copied+1))
|
||||
|
||||
digest="$(skopeo inspect --retry-times 3 docker://"${DEST_IMAGE}:${tag}" | jq -r '.Digest')"
|
||||
ref="${DEST_IMAGE}@${digest}"
|
||||
|
||||
echo "==> cosign sign (keyless) --recursive ${ref}"
|
||||
if ! cosign sign --recursive "${ref}"; then
|
||||
echo "::warning title=Keyless sign failed::${ref}"
|
||||
errs=$((errs+1))
|
||||
fi
|
||||
|
||||
echo "==> cosign sign (key) --recursive ${ref}"
|
||||
if ! cosign sign --key env://COSIGN_PRIVATE_KEY --recursive "${ref}"; then
|
||||
echo "::warning title=Key sign failed::${ref}"
|
||||
errs=$((errs+1))
|
||||
fi
|
||||
|
||||
echo "==> cosign verify (public key) ${ref}"
|
||||
if ! cosign verify --key env://COSIGN_PUBLIC_KEY "${ref}" -o text; then
|
||||
echo "::warning title=Verify(pubkey) failed::${ref}"
|
||||
errs=$((errs+1))
|
||||
fi
|
||||
|
||||
echo "==> cosign verify (keyless policy) ${ref}"
|
||||
if ! cosign verify \
|
||||
--certificate-oidc-issuer "${issuer}" \
|
||||
--certificate-identity-regexp "${id_regex}" \
|
||||
"${ref}" -o text; then
|
||||
echo "::warning title=Verify(keyless) failed::${ref}"
|
||||
errs=$((errs+1))
|
||||
else
|
||||
v_ok=$((v_ok+1))
|
||||
fi
|
||||
done < src-tags.txt
|
||||
|
||||
echo "---- Summary ----"
|
||||
echo "Copied : $copied"
|
||||
echo "Skipped : $skipped"
|
||||
echo "Verified OK : $v_ok"
|
||||
echo "Errors : $errs"
|
||||
|
||||
4
.github/workflows/test.yml
vendored
4
.github/workflows/test.yml
vendored
@@ -14,10 +14,10 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v6
|
||||
uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0
|
||||
with:
|
||||
go-version: 1.25
|
||||
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -6,3 +6,4 @@ nohup.out
|
||||
*.iml
|
||||
certs/
|
||||
newt_arm64
|
||||
.env
|
||||
|
||||
19
Dockerfile
19
Dockerfile
@@ -1,19 +1,29 @@
|
||||
#ghcr.io/marcschaeferger/newt-private:1.0.0-otel
|
||||
#tademsh/newt:1.0.0-otel
|
||||
FROM golang:1.25-alpine AS builder
|
||||
|
||||
# Install git and ca-certificates
|
||||
RUN apk --no-cache add ca-certificates git tzdata
|
||||
|
||||
# Set the working directory inside the container
|
||||
WORKDIR /app
|
||||
|
||||
# Copy go mod and sum files
|
||||
COPY go.mod go.sum ./
|
||||
|
||||
# Coolify specific Test - set Go proxy to direct to avoid issues
|
||||
# ENV GOSUMDB=off
|
||||
ENV GOPROXY=https://goproxy.io,https://proxy.golang.org,direct
|
||||
RUN go env | grep -E 'GOPROXY|GOSUMDB|GOPRIVATE' && go mod download
|
||||
|
||||
# Download all dependencies
|
||||
RUN go mod download
|
||||
#RUN go mod download
|
||||
|
||||
# Copy the source code into the container
|
||||
COPY . .
|
||||
|
||||
# Build the application
|
||||
RUN CGO_ENABLED=0 GOOS=linux go build -o /newt
|
||||
RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /newt
|
||||
|
||||
FROM alpine:3.22 AS runner
|
||||
|
||||
@@ -22,6 +32,9 @@ RUN apk --no-cache add ca-certificates tzdata
|
||||
COPY --from=builder /newt /usr/local/bin/
|
||||
COPY entrypoint.sh /
|
||||
|
||||
# Admin/metrics endpoint (Prometheus scrape)
|
||||
EXPOSE 2112
|
||||
|
||||
RUN chmod +x /entrypoint.sh
|
||||
ENTRYPOINT ["/entrypoint.sh"]
|
||||
CMD ["newt"]
|
||||
CMD ["newt"]
|
||||
|
||||
96
README.md
96
README.md
@@ -1,15 +1,18 @@
|
||||
<!-- markdownlint-disable MD033 -->
|
||||
# Newt
|
||||
|
||||
[](https://pkg.go.dev/github.com/fosrl/newt)
|
||||
[](https://github.com/fosrl/newt/blob/main/LICENSE)
|
||||
[](https://goreportcard.com/report/github.com/fosrl/newt)
|
||||
|
||||
Newt is a fully user space [WireGuard](https://www.wireguard.com/) tunnel client and TCP/UDP proxy, designed to securely expose private resources controlled by Pangolin. By using Newt, you don't need to manage complex WireGuard tunnels and NATing.
|
||||
|
||||
### Installation and Documentation
|
||||
## Installation and Documentation
|
||||
|
||||
Newt is used with Pangolin and Gerbil as part of the larger system. See documentation below:
|
||||
|
||||
- [Full Documentation](https://docs.fossorial.io)
|
||||
- [Full Documentation](https://docs.fossorial.io)
|
||||
- Observability Quickstart: see `docs/observability.md` — canonical Prometheus/OTel Collector quickstart and smoke tests
|
||||
|
||||
## Preview
|
||||
|
||||
@@ -91,9 +94,9 @@ All CLI arguments can be set using environment variables as an alternative to co
|
||||
|
||||
## Loading secrets from files
|
||||
|
||||
You can use `CONFIG_FILE` to define a location of a config file to store the credentials between runs.
|
||||
You can use `CONFIG_FILE` to define a location of a config file to store the credentials between runs.
|
||||
|
||||
```
|
||||
```sh
|
||||
$ cat ~/.config/newt-client/config.json
|
||||
{
|
||||
"id": "spmzu8rbpzj1qq6",
|
||||
@@ -103,19 +106,21 @@ $ cat ~/.config/newt-client/config.json
|
||||
}
|
||||
```
|
||||
|
||||
This file is also written to when newt first starts up. So you do not need to run every time with --id and secret if you have run it once!
|
||||
This file is also written to when newt first starts up. So you do not need to run every time with --id and secret if you have run it once!
|
||||
|
||||
Default locations:
|
||||
Default locations:
|
||||
|
||||
- **macOS**: `~/Library/Application Support/newt-client/config.json`
|
||||
- **Windows**: `%PROGRAMDATA%\newt\newt-client\config.json`
|
||||
- **Linux/Others**: `~/.config/newt-client/config.json`
|
||||
|
||||
<!-- Observability Quickstart moved to docs/observability.md (canonical). -->
|
||||
|
||||
## Examples
|
||||
|
||||
**Note**: When both environment variables and CLI arguments are provided, CLI arguments take precedence.
|
||||
|
||||
- Example:
|
||||
- Example:
|
||||
|
||||
```bash
|
||||
newt \
|
||||
@@ -162,16 +167,16 @@ When the `--accept-clients` flag is enabled (or `ACCEPT_CLIENTS=true` environmen
|
||||
|
||||
In client acceptance mode, Newt:
|
||||
|
||||
- **Creates a WireGuard service** that can accept incoming connections from other WireGuard clients
|
||||
- **Starts a connection testing server** (WGTester) that responds to connectivity checks from remote clients
|
||||
- **Manages peer configurations** dynamically based on Pangolin's instructions
|
||||
- **Enables bidirectional communication** between the Newt instance and connected clients
|
||||
- **Creates a WireGuard service** that can accept incoming connections from other WireGuard clients
|
||||
- **Starts a connection testing server** (WGTester) that responds to connectivity checks from remote clients
|
||||
- **Manages peer configurations** dynamically based on Pangolin's instructions
|
||||
- **Enables bidirectional communication** between the Newt instance and connected clients
|
||||
|
||||
### Use Cases
|
||||
|
||||
- **Site-to-site connectivity**: Connect multiple locations through a central Newt instance
|
||||
- **Client access to private networks**: Allow remote clients to access resources behind the Newt instance
|
||||
- **Development environments**: Provide developers secure access to internal services
|
||||
- **Site-to-site connectivity**: Connect multiple locations through a central Newt instance
|
||||
- **Client access to private networks**: Allow remote clients to access resources behind the Newt instance
|
||||
- **Development environments**: Provide developers secure access to internal services
|
||||
|
||||
### Client Tunneling Modes
|
||||
|
||||
@@ -181,11 +186,11 @@ Newt supports two WireGuard tunneling modes:
|
||||
|
||||
By default, Newt uses a fully userspace WireGuard implementation using [netstack](https://github.com/WireGuard/wireguard-go/blob/master/tun/netstack/examples/http_server.go). This mode:
|
||||
|
||||
- **Does not require root privileges**
|
||||
- **Works on all supported platforms** (Linux, Windows, macOS)
|
||||
- **Does not require WireGuard kernel module** to be installed
|
||||
- **Runs entirely in userspace** - no system network interface is created
|
||||
- **Is containerization-friendly** - works seamlessly in Docker containers
|
||||
- **Does not require root privileges**
|
||||
- **Works on all supported platforms** (Linux, Windows, macOS)
|
||||
- **Does not require WireGuard kernel module** to be installed
|
||||
- **Runs entirely in userspace** - no system network interface is created
|
||||
- **Is containerization-friendly** - works seamlessly in Docker containers
|
||||
|
||||
This is the recommended mode for most deployments, especially containerized environments.
|
||||
|
||||
@@ -195,11 +200,11 @@ In this mode, TCP and UDP is proxied out of newt from the remote client using TC
|
||||
|
||||
When using the `--native` flag or setting `USE_NATIVE_INTERFACE=true`, Newt uses the native WireGuard kernel module. This mode:
|
||||
|
||||
- **Requires root privileges** to create and manage network interfaces
|
||||
- **Only works on Linux** with the WireGuard kernel module installed
|
||||
- **Creates a real network interface** (e.g., `newt0`) on the system
|
||||
- **May offer better performance** for high-throughput scenarios
|
||||
- **Requires proper network permissions** and may conflict with existing network configurations
|
||||
- **Requires root privileges** to create and manage network interfaces
|
||||
- **Only works on Linux** with the WireGuard kernel module installed
|
||||
- **Creates a real network interface** (e.g., `newt0`) on the system
|
||||
- **May offer better performance** for high-throughput scenarios
|
||||
- **Requires proper network permissions** and may conflict with existing network configurations
|
||||
|
||||
In this mode it functions like a traditional VPN interface - all data arrives on the interface and you must get it to the destination (or access things locally).
|
||||
|
||||
@@ -231,10 +236,10 @@ services:
|
||||
|
||||
When client acceptance is enabled:
|
||||
|
||||
- **WGTester Server**: Runs on `port + 1` (e.g., if WireGuard uses port 51820, WGTester uses 51821)
|
||||
- **Connection Testing**: Responds to UDP packets with magic header `0xDEADBEEF` for connectivity verification
|
||||
- **Dynamic Configuration**: Peer configurations are managed remotely through Pangolin
|
||||
- **Proxy Integration**: Can work with both userspace (netstack) and native WireGuard modes
|
||||
- **WGTester Server**: Runs on `port + 1` (e.g., if WireGuard uses port 51820, WGTester uses 51821)
|
||||
- **Connection Testing**: Responds to UDP packets with magic header `0xDEADBEEF` for connectivity verification
|
||||
- **Dynamic Configuration**: Peer configurations are managed remotely through Pangolin
|
||||
- **Proxy Integration**: Can work with both userspace (netstack) and native WireGuard modes
|
||||
|
||||
**Note**: Client acceptance mode requires coordination with Pangolin for peer management and configuration distribution.
|
||||
|
||||
@@ -248,24 +253,23 @@ You can specify the Docker socket path using the `--docker-socket` CLI argument
|
||||
|
||||
Supported values include:
|
||||
|
||||
- Local UNIX socket (default):
|
||||
- Local UNIX socket (default):
|
||||
>You must mount the socket file into the container using a volume, so Newt can access it.
|
||||
|
||||
`unix:///var/run/docker.sock`
|
||||
|
||||
- TCP socket (e.g., via Docker Socket Proxy):
|
||||
- TCP socket (e.g., via Docker Socket Proxy):
|
||||
|
||||
`tcp://localhost:2375`
|
||||
|
||||
- HTTP/HTTPS endpoints (e.g., remote Docker APIs):
|
||||
- HTTP/HTTPS endpoints (e.g., remote Docker APIs):
|
||||
|
||||
`http://your-host:2375`
|
||||
|
||||
- SSH connections (experimental, requires SSH setup):
|
||||
- SSH connections (experimental, requires SSH setup):
|
||||
|
||||
`ssh://user@host`
|
||||
|
||||
|
||||
```yaml
|
||||
services:
|
||||
newt:
|
||||
@@ -280,16 +284,17 @@ services:
|
||||
- NEWT_SECRET=nnisrfsdfc7prqsp9ewo1dvtvci50j5uiqotez00dgap0ii2
|
||||
- DOCKER_SOCKET=unix:///var/run/docker.sock
|
||||
```
|
||||
|
||||
>If you previously used just a path like `/var/run/docker.sock`, it still works — Newt assumes it is a UNIX socket by default.
|
||||
|
||||
#### Hostnames vs IPs
|
||||
|
||||
When the Docker Socket Integration is used, depending on the network which Newt is run with, either the hostname (generally considered the container name) or the IP address of the container will be sent to Pangolin. Here are some of the scenarios where IPs or hostname of the container will be utilised:
|
||||
|
||||
- **Running in Network Mode 'host'**: IP addresses will be used
|
||||
- **Running in Network Mode 'bridge'**: IP addresses will be used
|
||||
- **Running in docker-compose without a network specification**: Docker compose creates a network for the compose by default, hostnames will be used
|
||||
- **Running on docker-compose with defined network**: Hostnames will be used
|
||||
- **Running in Network Mode 'host'**: IP addresses will be used
|
||||
- **Running in Network Mode 'bridge'**: IP addresses will be used
|
||||
- **Running in docker-compose without a network specification**: Docker compose creates a network for the compose by default, hostnames will be used
|
||||
- **Running on docker-compose with defined network**: Hostnames will be used
|
||||
|
||||
### Docker Enforce Network Validation
|
||||
|
||||
@@ -325,12 +330,12 @@ Newt supports mutual TLS (mTLS) authentication if the server is configured to re
|
||||
|
||||
> This is the original method and still supported.
|
||||
|
||||
* File must contain:
|
||||
- File must contain:
|
||||
|
||||
* Client private key
|
||||
* Public certificate
|
||||
* CA certificate
|
||||
* Encrypted `.p12` files are **not supported**
|
||||
- Client private key
|
||||
- Public certificate
|
||||
- CA certificate
|
||||
- Encrypted `.p12` files are **not supported**
|
||||
|
||||
Example:
|
||||
|
||||
@@ -346,9 +351,9 @@ newt \
|
||||
|
||||
You can now provide separate files for:
|
||||
|
||||
* `--tls-client-cert`: client certificate (`.crt` or `.pem`)
|
||||
* `--tls-client-key`: client private key (`.key` or `.pem`)
|
||||
* `--tls-ca-cert`: CA cert to verify the server
|
||||
- `--tls-client-cert`: client certificate (`.crt` or `.pem`)
|
||||
- `--tls-client-key`: client private key (`.key` or `.pem`)
|
||||
- `--tls-ca-cert`: CA cert to verify the server
|
||||
|
||||
Example:
|
||||
|
||||
@@ -362,7 +367,6 @@ newt \
|
||||
--tls-ca-cert ./ca.crt
|
||||
```
|
||||
|
||||
|
||||
```yaml
|
||||
services:
|
||||
newt:
|
||||
|
||||
32
docker-compose-coolify.yml
Normal file
32
docker-compose-coolify.yml
Normal file
@@ -0,0 +1,32 @@
|
||||
services:
|
||||
otel-collector:
|
||||
image: otel/opentelemetry-collector:0.111.0
|
||||
command: ["--config=/etc/otelcol/config.yaml"]
|
||||
volumes:
|
||||
- ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro
|
||||
ports:
|
||||
- "4317:4317" # OTLP gRPC in
|
||||
- "8889:8889" # Prometheus scrape out
|
||||
|
||||
newt:
|
||||
build: .
|
||||
image: newt:dev
|
||||
environment:
|
||||
OTEL_SERVICE_NAME: newt
|
||||
NEWT_METRICS_PROMETHEUS_ENABLED: "true"
|
||||
NEWT_METRICS_OTLP_ENABLED: "true"
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT: "otel-collector:4317"
|
||||
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
||||
OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE: "cumulative"
|
||||
NEWT_ADMIN_ADDR: "0.0.0.0:2112"
|
||||
ports:
|
||||
- "2112:2112"
|
||||
depends_on:
|
||||
- otel-collector
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:v2.55.0
|
||||
volumes:
|
||||
- ./examples/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
ports:
|
||||
- "9090:9090"
|
||||
41
docker-compose.metrics.collector.yml
Normal file
41
docker-compose.metrics.collector.yml
Normal file
@@ -0,0 +1,41 @@
|
||||
services:
|
||||
newt:
|
||||
build: .
|
||||
image: newt:dev
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
- NEWT_METRICS_PROMETHEUS_ENABLED=false # important: disable direct /metrics scraping
|
||||
- NEWT_METRICS_OTLP_ENABLED=true # OTLP to the Collector
|
||||
# optional:
|
||||
# - NEWT_METRICS_INCLUDE_TUNNEL_ID=false
|
||||
# When using the Collector pattern, do NOT map the Newt admin/metrics port
|
||||
# (2112) on the application service. Mapping 2112 here can cause port
|
||||
# conflicts and may result in duplicated Prometheus scraping (app AND
|
||||
# collector being scraped for the same metrics). Instead either:
|
||||
# - leave ports unset on the app service (recommended), or
|
||||
# - map 2112 only on a dedicated metrics/collector service that is
|
||||
# responsible for exposing metrics to Prometheus.
|
||||
# Example: do NOT map here
|
||||
# ports: []
|
||||
# Example: map 2112 only on a collector service
|
||||
# collector:
|
||||
# ports:
|
||||
# - "2112:2112" # collector's prometheus exporter (scraped by Prometheus)
|
||||
|
||||
otel-collector:
|
||||
image: otel/opentelemetry-collector-contrib:latest
|
||||
command: ["--config=/etc/otelcol/config.yaml"]
|
||||
volumes:
|
||||
- ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro
|
||||
ports:
|
||||
- "4317:4317" # OTLP gRPC
|
||||
- "8889:8889" # Prometheus Exporter (scraped by Prometheus)
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
volumes:
|
||||
- ./examples/prometheus.with-collector.yml:/etc/prometheus/prometheus.yml:ro
|
||||
ports:
|
||||
- "9090:9090"
|
||||
|
||||
55
docker-compose.metrics.yml
Normal file
55
docker-compose.metrics.yml
Normal file
@@ -0,0 +1,55 @@
|
||||
services:
|
||||
# Recommended Variant A: Direct Prometheus scrape of Newt (/metrics)
|
||||
# Optional: You may add the Collector service and enable OTLP export, but do NOT
|
||||
# scrape both Newt and the Collector for the same process.
|
||||
|
||||
newt:
|
||||
build: .
|
||||
image: newt:dev
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
OTEL_SERVICE_NAME: newt
|
||||
NEWT_METRICS_PROMETHEUS_ENABLED: "true"
|
||||
NEWT_METRICS_OTLP_ENABLED: "false" # avoid double-scrape by default
|
||||
NEWT_ADMIN_ADDR: ":2112"
|
||||
# Base NEWT configuration
|
||||
PANGOLIN_ENDPOINT: ${PANGOLIN_ENDPOINT}
|
||||
NEWT_ID: ${NEWT_ID}
|
||||
NEWT_SECRET: ${NEWT_SECRET}
|
||||
LOG_LEVEL: "DEBUG"
|
||||
ports:
|
||||
- "2112:2112"
|
||||
|
||||
# Optional Variant B: Enable the Collector and switch Prometheus scrape to it.
|
||||
# collector:
|
||||
# image: otel/opentelemetry-collector-contrib:0.136.0
|
||||
# command: ["--config=/etc/otelcol/config.yaml"]
|
||||
# volumes:
|
||||
# - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro
|
||||
# ports:
|
||||
# - "4317:4317" # OTLP gRPC in
|
||||
# - "8889:8889" # Prometheus scrape out
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:v3.6.0
|
||||
volumes:
|
||||
- ./examples/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
ports:
|
||||
- "9090:9090"
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:12.2.0
|
||||
container_name: newt-metrics-grafana
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_USER=admin
|
||||
- GF_SECURITY_ADMIN_PASSWORD=admin
|
||||
ports:
|
||||
- "3005:3000"
|
||||
depends_on:
|
||||
- prometheus
|
||||
volumes:
|
||||
- ./examples/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
|
||||
- ./examples/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro
|
||||
- ./examples/grafana/dashboards:/var/lib/grafana/dashboards:ro
|
||||
75
docs/METRICS_RECOMMENDATIONS.md
Normal file
75
docs/METRICS_RECOMMENDATIONS.md
Normal file
@@ -0,0 +1,75 @@
|
||||
# Newt Metrics: Recommendations, Gaps, and Roadmap
|
||||
|
||||
This document captures the current state of Newt metrics, prioritized fixes, and a pragmatic roadmap for near-term improvements.
|
||||
|
||||
1) Current setup (summary)
|
||||
|
||||
- Export: Prometheus exposition (default), optional OTLP (gRPC)
|
||||
- Existing instruments:
|
||||
- Sites: newt_site_registrations_total, newt_site_online (0/1), newt_site_last_heartbeat_timestamp_seconds
|
||||
- Tunnel/Traffic: newt_tunnel_sessions, newt_tunnel_bytes_total, newt_tunnel_latency_seconds, newt_tunnel_reconnects_total
|
||||
- Connection lifecycle: newt_connection_attempts_total, newt_connection_errors_total
|
||||
- Operations: newt_config_reloads_total, process_start_time_seconds, newt_build_info
|
||||
- Operations: newt_config_reloads_total, process_start_time_seconds, newt_config_apply_seconds, newt_cert_rotation_total
|
||||
- Build metadata: newt_build_info
|
||||
- Control plane: newt_websocket_connect_latency_seconds, newt_websocket_messages_total, newt_websocket_connected, newt_websocket_reconnects_total
|
||||
- Proxy: newt_proxy_active_connections, newt_proxy_buffer_bytes, newt_proxy_async_backlog_bytes, newt_proxy_drops_total, newt_proxy_accept_total, newt_proxy_connection_duration_seconds, newt_proxy_connections_total
|
||||
- Go runtime: GC, heap, goroutines via runtime instrumentation
|
||||
|
||||
2) Main issues addressed now
|
||||
|
||||
- Attribute filter (allow-list) extended to include site_id and region in addition to existing keys (tunnel_id, transport, protocol, direction, result, reason, error_type, version, commit).
|
||||
- site_id and region propagation: site_id/region remain resource attributes. Metric labels mirror them on per-site gauges and counters by default; set `NEWT_METRICS_INCLUDE_SITE_LABELS=false` to drop them for multi-tenant scrapes.
|
||||
- Label semantics clarified:
|
||||
- transport: control-plane mechanism (e.g., websocket, wireguard)
|
||||
- protocol: L4 payload type (tcp, udp)
|
||||
- newt_tunnel_bytes_total uses protocol and direction, not transport.
|
||||
- Robustness improvements: removed duplicate clear logic on reconnect; avoided empty site_id by reading NEWT_SITE_ID/NEWT_ID and OTEL_RESOURCE_ATTRIBUTES.
|
||||
|
||||
3) Remaining gaps and deviations
|
||||
|
||||
- Some call sites still need initiator label on reconnect outcomes (client vs server). This is planned.
|
||||
- Config apply duration and cert rotation counters are planned.
|
||||
- Registration and config reload failures are not yet emitted; add failure code paths so result labels expose churn.
|
||||
- Document using `process_start_time_seconds` (and `time()` in PromQL) to derive uptime; no explicit restart counter is needed.
|
||||
- Metric helpers often use `context.Background()`. Where lightweight contexts exist (e.g., HTTP handlers), propagate them to ease future correlation.
|
||||
- Tracing coverage is limited to admin HTTP and WebSocket connect spans; extend to blueprint fetches, proxy accept loops, and WireGuard updates when OTLP is enabled.
|
||||
|
||||
4) Roadmap (phased)
|
||||
|
||||
- Phase 1 (done in this iteration)
|
||||
- Fix attribute filter (site_id, region)
|
||||
- Propagate site_id (and optional region) across metrics
|
||||
- Correct label semantics (transport vs protocol); fix sessions transport labelling
|
||||
- Documentation alignment
|
||||
- Phase 2 (next)
|
||||
- Reconnect: add initiator label (client/server)
|
||||
- Config & PKI: newt_config_apply_seconds{phase,result}; newt_cert_rotation_total{result}
|
||||
- WebSocket disconnect and keepalive failure counters
|
||||
- Proxy connection lifecycle metrics (accept totals, duration histogram)
|
||||
- Pangolin blueprint/config fetch latency and status metrics
|
||||
- Certificate rotation duration histogram to complement success/failure counter
|
||||
|
||||
5) Operational guidance
|
||||
|
||||
- Do not double scrape: scrape either Newt (/metrics) or the Collector’s Prometheus exporter (not both) to avoid double-counting cumulative counters.
|
||||
- For high cardinality tunnel_id, consider relabeling or dropping per-tunnel series in Prometheus to control cardinality.
|
||||
- OTLP troubleshooting: enable TLS via OTEL_EXPORTER_OTLP_CERTIFICATE, use OTEL_EXPORTER_OTLP_HEADERS for auth; verify endpoint reachability.
|
||||
|
||||
6) Example alerts/recording rules (suggestions)
|
||||
|
||||
- Reconnect spikes:
|
||||
- increase(newt_tunnel_reconnects_total[5m]) by (site_id)
|
||||
- Sustained connection errors:
|
||||
- rate(newt_connection_errors_total[5m]) by (site_id,transport,error_type)
|
||||
- Heartbeat gaps:
|
||||
- max_over_time(time() - newt_site_last_heartbeat_timestamp_seconds[15m]) by (site_id)
|
||||
- Proxy drops:
|
||||
- increase(newt_proxy_drops_total[5m]) by (site_id,protocol)
|
||||
- WebSocket connect p95 (when added):
|
||||
- histogram_quantile(0.95, sum(rate(newt_websocket_connect_latency_seconds_bucket[5m])) by (le,site_id))
|
||||
|
||||
7) Collector configuration
|
||||
|
||||
- Direct scrape variant requires no attribute promotion since site_id is already a metric label.
|
||||
- Transform/promote variant remains optional for environments that rely on resource-to-label promotion.
|
||||
247
docs/observability.md
Normal file
247
docs/observability.md
Normal file
@@ -0,0 +1,247 @@
|
||||
<!-- markdownlint-disable MD033 -->
|
||||
# OpenTelemetry Observability for Newt
|
||||
|
||||
This document describes how Newt exposes metrics using the OpenTelemetry (OTel) Go SDK, how to enable Prometheus scraping, and how to send data to an OpenTelemetry Collector for further export.
|
||||
|
||||
Goals
|
||||
|
||||
- Provide a /metrics endpoint in Prometheus exposition format (via OTel Prometheus exporter)
|
||||
- Keep metrics backend-agnostic; optional OTLP export to a Collector
|
||||
- Use OTel semantic conventions where applicable and enforce SI units
|
||||
- Low-cardinality, stable labels only
|
||||
|
||||
Enable via flags (ENV mirrors)
|
||||
|
||||
- --metrics (default: true) ↔ NEWT_METRICS_PROMETHEUS_ENABLED
|
||||
- --metrics-admin-addr (default: 127.0.0.1:2112) ↔ NEWT_ADMIN_ADDR
|
||||
- --otlp (default: false) ↔ NEWT_METRICS_OTLP_ENABLED
|
||||
|
||||
Enable exporters via environment variables (no code changes required)
|
||||
|
||||
- NEWT_METRICS_PROMETHEUS_ENABLED=true|false (default: true)
|
||||
- NEWT_METRICS_OTLP_ENABLED=true|false (default: false)
|
||||
- OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317
|
||||
- OTEL_EXPORTER_OTLP_INSECURE=true|false (default: true for dev)
|
||||
- OTEL_SERVICE_NAME=newt (default)
|
||||
- OTEL_SERVICE_VERSION=<version>
|
||||
- OTEL_RESOURCE_ATTRIBUTES=service.instance.id=<id>,site_id=<id>
|
||||
- OTEL_METRIC_EXPORT_INTERVAL=15s (default)
|
||||
- NEWT_ADMIN_ADDR=127.0.0.1:2112 (default admin HTTP with /metrics)
|
||||
- NEWT_METRICS_INCLUDE_SITE_LABELS=true|false (default: true; disable to drop site_id/region as metric labels and rely on resource attributes only)
|
||||
|
||||
Runtime behavior
|
||||
|
||||
- When Prometheus exporter is enabled, Newt serves /metrics on NEWT_ADMIN_ADDR (default :2112)
|
||||
- When OTLP is enabled, metrics and traces are exported to OTLP gRPC endpoint
|
||||
- Go runtime metrics (goroutines, GC, memory) are exported automatically
|
||||
|
||||
Metric catalog (current)
|
||||
|
||||
Unless otherwise noted, `site_id` and `region` are available via resource attributes and, by default, as metric labels. Set `NEWT_METRICS_INCLUDE_SITE_LABELS=false` to drop them from counter/histogram label sets in high-cardinality environments.
|
||||
|
||||
| Metric | Instrument | Key attributes | Purpose | Example |
|
||||
| --- | --- | --- | --- | --- |
|
||||
| `newt_build_info` | Observable gauge (Int64) | `version`, `commit`, `site_id`, `region` (optional when site labels enabled) | Emits build metadata with value `1` for scrape-time verification. | `newt_build_info{version="1.5.0"} 1` |
|
||||
| `newt_site_registrations_total` | Counter (Int64) | `result` (`success`/`failure`), `site_id`, `region` (optional) | Counts Pangolin registration attempts. | `newt_site_registrations_total{result="success",site_id="acme-edge-1"} 1` |
|
||||
| `newt_site_online` | Observable gauge (Int64) | `site_id` | Reports whether the site is currently connected (`1`) or offline (`0`). | `newt_site_online{site_id="acme-edge-1"} 1` |
|
||||
| `newt_site_last_heartbeat_timestamp_seconds` | Observable gauge (Float64) | `site_id` | Unix timestamp of the most recent Pangolin heartbeat (derive age via `time() - metric`). | `newt_site_last_heartbeat_timestamp_seconds{site_id="acme-edge-1"} 1.728e+09` |
|
||||
| `newt_tunnel_sessions` | Observable gauge (Int64) | `site_id`, `tunnel_id` (when enabled) | Counts active tunnel sessions per peer; collapses to per-site when tunnel IDs are disabled. | `newt_tunnel_sessions{site_id="acme-edge-1",tunnel_id="wgpub..."} 3` |
|
||||
| `newt_tunnel_bytes_total` | Counter (Int64) | `direction` (`ingress`/`egress`), `protocol` (`tcp`/`udp`), `tunnel_id` (optional), `site_id`, `region` (optional) | Measures proxied traffic volume across tunnels. | `newt_tunnel_bytes_total{direction="ingress",protocol="tcp",site_id="acme-edge-1"} 4096` |
|
||||
| `newt_tunnel_latency_seconds` | Histogram (Float64) | `transport` (e.g., `wireguard`), `tunnel_id` (optional), `site_id`, `region` (optional) | Captures RTT or configuration-driven latency samples. | `newt_tunnel_latency_seconds_bucket{transport="wireguard",le="0.5"} 42` |
|
||||
| `newt_tunnel_reconnects_total` | Counter (Int64) | `initiator` (`client`/`server`), `reason` (enumerated), `tunnel_id` (optional), `site_id`, `region` (optional) | Tracks reconnect causes for troubleshooting flaps. | `newt_tunnel_reconnects_total{initiator="client",reason="timeout",site_id="acme-edge-1"} 5` |
|
||||
| `newt_connection_attempts_total` | Counter (Int64) | `transport` (`auth`/`websocket`), `result`, `site_id`, `region` (optional) | Measures control-plane dial attempts and their outcomes. | `newt_connection_attempts_total{transport="websocket",result="success",site_id="acme-edge-1"} 8` |
|
||||
| `newt_connection_errors_total` | Counter (Int64) | `transport`, `error_type`, `site_id`, `region` (optional) | Buckets connection failures by normalized error class. | `newt_connection_errors_total{transport="websocket",error_type="tls_handshake",site_id="acme-edge-1"} 1` |
|
||||
| `newt_config_reloads_total` | Counter (Int64) | `result`, `site_id`, `region` (optional) | Counts remote blueprint/config reloads. | `newt_config_reloads_total{result="success",site_id="acme-edge-1"} 3` |
|
||||
| `process_start_time_seconds` | Observable gauge (Float64) | — | Unix timestamp of the Newt process start time (use `time() - process_start_time_seconds` for uptime). | `process_start_time_seconds 1.728e+09` |
|
||||
| `newt_config_apply_seconds` | Histogram (Float64) | `phase` (`interface`/`peer`), `result`, `site_id`, `region` (optional) | Measures time spent applying WireGuard configuration phases. | `newt_config_apply_seconds_sum{phase="peer",result="success",site_id="acme-edge-1"} 0.48` |
|
||||
| `newt_cert_rotation_total` | Counter (Int64) | `result`, `site_id`, `region` (optional) | Tracks client certificate rotation attempts. | `newt_cert_rotation_total{result="success",site_id="acme-edge-1"} 2` |
|
||||
| `newt_websocket_connect_latency_seconds` | Histogram (Float64) | `transport="websocket"`, `result`, `error_type` (on failure), `site_id`, `region` (optional) | Measures WebSocket dial latency and exposes failure buckets. | `newt_websocket_connect_latency_seconds_bucket{result="success",le="0.5",site_id="acme-edge-1"} 9` |
|
||||
| `newt_websocket_messages_total` | Counter (Int64) | `direction` (`in`/`out`), `msg_type` (`text`/`ping`/`pong`), `site_id`, `region` (optional) | Accounts for control WebSocket traffic volume by type. | `newt_websocket_messages_total{direction="out",msg_type="ping",site_id="acme-edge-1"} 12` |
|
||||
| `newt_websocket_connected` | Observable gauge (Int64) | `site_id`, `region` (optional) | Reports current WebSocket connectivity (`1` when connected). | `newt_websocket_connected{site_id="acme-edge-1"} 1` |
|
||||
| `newt_websocket_reconnects_total` | Counter (Int64) | `reason` (`tls_handshake`, `dial_timeout`, `io_error`, `ping_write`, `timeout`, etc.), `site_id`, `region` (optional) | Counts reconnect attempts with normalized reasons for failure analysis. | `newt_websocket_reconnects_total{reason="timeout",site_id="acme-edge-1"} 3` |
|
||||
| `newt_proxy_active_connections` | Observable gauge (Int64) | `protocol` (`tcp`/`udp`), `direction` (`ingress`/`egress`), `tunnel_id` (optional), `site_id`, `region` (optional) | Current proxy connections per tunnel and protocol. | `newt_proxy_active_connections{protocol="tcp",direction="egress",site_id="acme-edge-1"} 4` |
|
||||
| `newt_proxy_buffer_bytes` | Observable gauge (Int64) | `protocol`, `direction`, `tunnel_id` (optional), `site_id`, `region` (optional) | Volume of buffered data awaiting flush in proxy queues. | `newt_proxy_buffer_bytes{protocol="udp",direction="egress",site_id="acme-edge-1"} 2048` |
|
||||
| `newt_proxy_async_backlog_bytes` | Observable gauge (Int64) | `protocol`, `direction`, `tunnel_id` (optional), `site_id`, `region` (optional) | Tracks async write backlog when deferred flushing is enabled. | `newt_proxy_async_backlog_bytes{protocol="tcp",direction="egress",site_id="acme-edge-1"} 512` |
|
||||
| `newt_proxy_drops_total` | Counter (Int64) | `protocol`, `tunnel_id` (optional), `site_id`, `region` (optional) | Counts proxy drop events caused by downstream write errors. | `newt_proxy_drops_total{protocol="udp",site_id="acme-edge-1"} 1` |
|
||||
| `newt_proxy_connections_total` | Counter (Int64) | `event` (`opened`/`closed`), `protocol`, `tunnel_id` (optional), `site_id`, `region` (optional) | Tracks proxy connection lifecycle events for rate/SLO calculations. | `newt_proxy_connections_total{event="opened",protocol="tcp",site_id="acme-edge-1"} 10` |
|
||||
|
||||
Conventions
|
||||
|
||||
- Durations in seconds (unit: s), names end with _seconds
|
||||
- Sizes in bytes (unit: By), names end with _bytes
|
||||
- Counters end with _total
|
||||
- Labels must be low-cardinality and stable
|
||||
|
||||
Histogram buckets
|
||||
|
||||
- Latency (seconds): 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30
|
||||
|
||||
Local quickstart
|
||||
|
||||
1) Direct Prometheus scrape (do not also scrape the Collector)
|
||||
NEWT_METRICS_PROMETHEUS_ENABLED=true \
|
||||
NEWT_METRICS_OTLP_ENABLED=false \
|
||||
NEWT_ADMIN_ADDR="127.0.0.1:2112" \
|
||||
./newt
|
||||
|
||||
curl -s <http://localhost:2112/metrics> | grep ^newt_
|
||||
|
||||
2) Using the Collector (compose-style)
|
||||
NEWT_METRICS_PROMETHEUS_ENABLED=true \
|
||||
NEWT_METRICS_OTLP_ENABLED=true \
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317 \
|
||||
OTEL_EXPORTER_OTLP_INSECURE=true \
|
||||
OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE=cumulative \
|
||||
./newt
|
||||
|
||||
Collector config example: examples/otel-collector.yaml
|
||||
Prometheus scrape config: examples/prometheus.yml
|
||||
|
||||
Adding new metrics
|
||||
|
||||
- Use helpers in internal/telemetry/metrics.go for counters/histograms
|
||||
- Keep labels low-cardinality
|
||||
- Add observable gauges through SetObservableCallback
|
||||
|
||||
Optional tracing
|
||||
|
||||
- When --otlp is enabled, you can wrap outbound HTTP clients with otelhttp.NewTransport to create spans for HTTP requests to Pangolin. This affects traces only and does not add metric labels.
|
||||
|
||||
OTLP TLS example
|
||||
|
||||
- Enable TLS to Collector with a custom CA and headers:
|
||||
|
||||
```sh
|
||||
NEWT_METRICS_OTLP_ENABLED=true \
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317 \
|
||||
OTEL_EXPORTER_OTLP_INSECURE=false \
|
||||
OTEL_EXPORTER_OTLP_CERTIFICATE=/etc/otel/custom-ca.pem \
|
||||
OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer abc123,tenant=acme" \
|
||||
./newt
|
||||
```
|
||||
|
||||
Prometheus scrape strategy (choose one)
|
||||
|
||||
Important: Do not scrape both Newt (2112) and the Collector’s Prometheus exporter (8889) at the same time for the same process. Doing so will double-count cumulative counters.
|
||||
|
||||
A) Scrape Newt directly:
|
||||
|
||||
```yaml
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
scrape_configs:
|
||||
- job_name: newt
|
||||
static_configs:
|
||||
- targets: ["newt:2112"]
|
||||
```
|
||||
|
||||
B) Scrape the Collector’s Prometheus exporter:
|
||||
|
||||
```yaml
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
scrape_configs:
|
||||
- job_name: otel-collector
|
||||
static_configs:
|
||||
- targets: ["otel-collector:8889"]
|
||||
```
|
||||
|
||||
Reason mapping (source → reason)
|
||||
|
||||
- Server instructs reconnect/terminate → server_request
|
||||
- Heartbeat/Ping threshold exceeded → timeout
|
||||
- Peer closed connection gracefully → peer_close
|
||||
- Route/Interface change detected → network_change
|
||||
- Auth/token failure (HTTP 401/403) → auth_error
|
||||
- TLS/WG handshake error → handshake_error
|
||||
- Config reloaded/applied (causing reconnection) → config_change
|
||||
- Other/unclassified errors → error
|
||||
|
||||
PromQL snippets
|
||||
|
||||
- Throughput in (5m):
|
||||
|
||||
```sh
|
||||
sum(rate(newt_tunnel_bytes_total{direction="ingress"}[5m]))
|
||||
```
|
||||
|
||||
- P95 latency (seconds):
|
||||
|
||||
```sh
|
||||
histogram_quantile(0.95, sum(rate(newt_tunnel_latency_seconds_bucket[5m])) by (le))
|
||||
```
|
||||
|
||||
- Active sessions:
|
||||
|
||||
```sh
|
||||
sum(newt_tunnel_sessions)
|
||||
```
|
||||
|
||||
Compatibility notes
|
||||
|
||||
- Gauges do not use the _total suffix (e.g., newt_tunnel_sessions).
|
||||
- site_id/region remain resource attributes. Metric labels for these fields appear on per-site gauges (e.g., `newt_site_online`) and, by default, on counters/histograms; disable them with `NEWT_METRICS_INCLUDE_SITE_LABELS=false` if needed. `tunnel_id` is a metric label (WireGuard public key). Never expose secrets in labels.
|
||||
- NEWT_METRICS_INCLUDE_TUNNEL_ID (default: true) toggles whether tunnel_id is included as a label on bytes/sessions/proxy/reconnect metrics. Disable in high-cardinality environments.
|
||||
- Avoid double-scraping: scrape either Newt (/metrics) or the Collector's Prometheus exporter, not both.
|
||||
- Prometheus does not accept remote_write; use Mimir/Cortex/VM/Thanos-Receive for remote_write.
|
||||
- No free text in labels; use only the enumerated constants for reason, protocol (tcp|udp), and transport (e.g., websocket|wireguard).
|
||||
|
||||
Further reading
|
||||
|
||||
- See docs/METRICS_RECOMMENDATIONS.md for roadmap, label guidance (transport vs protocol), and example alerts.
|
||||
|
||||
Cardinality tips
|
||||
|
||||
- tunnel_id can grow in larger fleets. Use relabeling to drop or retain a subset, for example:
|
||||
|
||||
```yaml
|
||||
# Drop all tunnel_id on bytes to reduce series
|
||||
- source_labels: [__name__]
|
||||
regex: newt_tunnel_bytes_total
|
||||
action: keep
|
||||
- action: labeldrop
|
||||
regex: tunnel_id
|
||||
|
||||
# Or drop only high-churn tunnels
|
||||
- source_labels: [tunnel_id]
|
||||
regex: .*
|
||||
action: drop
|
||||
```
|
||||
|
||||
Quickstart: direkte Prometheus-Erfassung (empfohlen)
|
||||
|
||||
```sh
|
||||
# Start (direkter /metrics-Scrape, keine Doppel-Erfassung)
|
||||
docker compose -f docker-compose.metrics.yml up -d
|
||||
|
||||
# Smoke-Checks
|
||||
./scripts/smoke-metrics.sh
|
||||
# Tunnel-IDs ausblenden (optional):
|
||||
# EXPECT_TUNNEL_ID=false NEWT_METRICS_INCLUDE_TUNNEL_ID=false ./scripts/smoke-metrics.sh
|
||||
```
|
||||
|
||||
- Prometheus UI: <http://localhost:9090>
|
||||
- Standard-Scrape-Intervall: 15s
|
||||
- Kein OTLP aktiv (NEWT_METRICS_OTLP_ENABLED=false in docker-compose.metrics.yml)
|
||||
|
||||
Häufige PromQL-Schnelltests
|
||||
|
||||
```yaml
|
||||
# Online-Status einer Site in den letzten 5 Minuten
|
||||
max_over_time(newt_site_online{site_id="$site"}[5m])
|
||||
|
||||
# TCP egress-Bytes pro Site/Tunnel (10m)
|
||||
sum by (site_id, tunnel_id) (increase(newt_tunnel_bytes_total{protocol="tcp",direction="egress"}[10m]))
|
||||
|
||||
# WebSocket-Connect P95
|
||||
histogram_quantile(0.95, sum by (le, site_id) (rate(newt_websocket_connect_latency_seconds_bucket[5m])))
|
||||
|
||||
# Reconnects nach Initiator
|
||||
increase(newt_tunnel_reconnects_total{site_id="$site"}[30m]) by (initiator, reason)
|
||||
```
|
||||
|
||||
Troubleshooting
|
||||
|
||||
- curl :2112/metrics – ensure endpoint is reachable and includes newt_* metrics
|
||||
- Check Collector logs for OTLP connection issues
|
||||
- Verify Prometheus Targets are UP and scraping Newt or Collector
|
||||
129
docs/otel-review.md
Normal file
129
docs/otel-review.md
Normal file
@@ -0,0 +1,129 @@
|
||||
# Newt OpenTelemetry Review
|
||||
|
||||
## Overview
|
||||
|
||||
This document summarises the current OpenTelemetry (OTel) instrumentation in Newt, assesses
|
||||
compliance with OTel guidelines, and lists concrete improvements to pursue before release.
|
||||
It is based on the implementation in `internal/telemetry` and the call-sites that emit
|
||||
metrics and traces across the code base.
|
||||
|
||||
## Current metric instrumentation
|
||||
|
||||
All instruments are registered in `internal/telemetry/metrics.go`. They are grouped
|
||||
into site, tunnel, connection, configuration, build, WebSocket, and proxy domains.
|
||||
A global attribute filter (see `buildMeterProvider`) constrains exposed label keys to
|
||||
`site_id`, `region`, and a curated list of low-cardinality dimensions so that Prometheus
|
||||
exports stay bounded.
|
||||
|
||||
- **Site lifecycle**: `newt_site_registrations_total`, `newt_site_online`, and
|
||||
`newt_site_last_heartbeat_timestamp_seconds` capture registration attempts and liveness. They
|
||||
are fed either manually (`IncSiteRegistration`) or via the `TelemetryView` state
|
||||
callback that publishes observable gauges for the active site.
|
||||
- **Tunnel health and usage**: Counters and histograms track bytes, latency, reconnects,
|
||||
and active sessions per tunnel (`newt_tunnel_*` family). Attribute helpers respect
|
||||
the `NEWT_METRICS_INCLUDE_TUNNEL_ID` toggle to keep cardinality manageable on larger
|
||||
fleets.
|
||||
- **Connection attempts**: `newt_connection_attempts_total` and
|
||||
`newt_connection_errors_total` are emitted throughout the WebSocket client to classify
|
||||
authentication, dial, and transport failures.
|
||||
- **Operations/configuration**: `newt_config_reloads_total`,
|
||||
`process_start_time_seconds`, `newt_config_apply_seconds`, and
|
||||
`newt_cert_rotation_total` provide visibility into blueprint reloads, process boots,
|
||||
configuration timings, and certificate rotation outcomes.
|
||||
- **Build metadata**: `newt_build_info` records the binary version/commit together
|
||||
with optional site metadata when build information is supplied at startup.
|
||||
- **WebSocket control-plane**: `newt_websocket_connect_latency_seconds`,
|
||||
`newt_websocket_messages_total`, `newt_websocket_connected`, and
|
||||
`newt_websocket_reconnects_total` report connect latency, ping/pong/text activity,
|
||||
connection state, and reconnect reasons.
|
||||
- **Proxy data-plane**: Observable gauges (`newt_proxy_active_connections`,
|
||||
`newt_proxy_buffer_bytes`, `newt_proxy_async_backlog_bytes`) plus counters for
|
||||
drops, accepts, connection lifecycle events (`newt_proxy_connections_total`), and
|
||||
duration histograms (`newt_proxy_connection_duration_seconds`) surface backlog,
|
||||
drop behaviour, and churn alongside per-protocol byte counters.
|
||||
|
||||
Refer to `docs/observability.md` for a tabular catalogue with instrument types,
|
||||
attributes, and sample exposition lines.
|
||||
|
||||
## Tracing coverage
|
||||
|
||||
Tracing is optional and enabled only when OTLP export is configured. When active:
|
||||
|
||||
- The admin HTTP mux is wrapped with `otelhttp.NewHandler`, producing spans for
|
||||
`/metrics` and `/healthz` requests.
|
||||
- The WebSocket dial path creates a `ws.connect` span around the gRPC-based handshake.
|
||||
|
||||
No other subsystems currently create spans, so data-plane operations, blueprint fetches,
|
||||
Docker discovery, and WireGuard reconfiguration happen without trace context.
|
||||
|
||||
## Guideline & best-practice alignment
|
||||
|
||||
The implementation adheres to most OTel Go recommendations:
|
||||
|
||||
- **Naming & units** – Every instrument follows the `newt_*` prefix with `_total`
|
||||
suffixes for counters and `_seconds`/`_bytes` unit conventions. Histograms are
|
||||
registered with explicit second-based buckets.
|
||||
- **Resource attributes** – Service name/version and optional `site_id`/`region`
|
||||
populate the `resource.Resource`. Metric labels mirror these by default (and on
|
||||
per-site gauges) but can be disabled with `NEWT_METRICS_INCLUDE_SITE_LABELS=false`
|
||||
to avoid unnecessary cardinality growth.
|
||||
- **Attribute hygiene** – A single attribute filter (`sdkmetric.WithView`) enforces
|
||||
the allow-list of label keys to prevent accidental high-cardinality emission.
|
||||
- **Runtime metrics** – Go runtime instrumentation is enabled automatically through
|
||||
`runtime.Start`.
|
||||
- **Configuration via environment** – `telemetry.FromEnv` honours `OTEL_*` variables
|
||||
alongside `NEWT_*` overrides so operators can configure exporters without code
|
||||
changes.
|
||||
- **Shutdown handling** – `Setup.Shutdown` iterates exporters in reverse order to
|
||||
flush buffers before process exit.
|
||||
|
||||
## Adjustments & improvements
|
||||
|
||||
The review identified a few actionable adjustments:
|
||||
|
||||
1. **Record registration failures** – `newt_site_registrations_total` is currently
|
||||
incremented only on success. Emit `result="failure"` samples whenever Pangolin
|
||||
rejects a registration or credential exchange so operators can alert on churn.
|
||||
2. **Surface config reload failures** – `telemetry.IncConfigReload` is invoked with
|
||||
`result="success"` only. Callers should record a failure result when blueprint
|
||||
parsing or application aborts before success counters are incremented.
|
||||
3. **Expose robust uptime** – Document using `time() - process_start_time_seconds`
|
||||
to derive uptime now that the restart counter has been replaced with a timestamp
|
||||
gauge.
|
||||
4. **Propagate contexts where available** – Many emitters call metric helpers with
|
||||
`context.Background()`. Passing real contexts (when inexpensive) would allow future
|
||||
exporters to correlate spans and metrics.
|
||||
5. **Extend tracing coverage** – Instrument critical flows such as blueprint fetches,
|
||||
WireGuard reconfiguration, proxy accept loops, and Docker discovery to provide end
|
||||
to end visibility when OTLP tracing is enabled.
|
||||
|
||||
## Metrics to add before release
|
||||
|
||||
Prioritised additions that would close visibility gaps:
|
||||
|
||||
1. **Config reload error taxonomy** – Split reload attempts into a dedicated
|
||||
`newt_config_reload_errors_total{phase}` counter to make blueprint validation failures
|
||||
visible alongside the existing success counter.
|
||||
2. **Config source visibility** – Export `newt_config_source_info{source,version}` so
|
||||
operators can audit the active blueprint origin/commit during incidents.
|
||||
3. **Certificate expiry** – Emit `newt_cert_expiry_timestamp_seconds` (per cert) to
|
||||
enable proactive alerts before mTLS credentials lapse.
|
||||
4. **Blueprint/config pull latency** – Measuring Pangolin blueprint fetch durations and
|
||||
HTTP status distribution would expose slow control-plane operations.
|
||||
5. **Tunnel setup latency** – Histograms for DNS resolution and tunnel handshakes would
|
||||
help correlate connect latency spikes with network dependencies.
|
||||
|
||||
These metrics rely on data that is already available in the code paths mentioned
|
||||
above and would round out operational dashboards.
|
||||
|
||||
## Tracing wishlist
|
||||
|
||||
To benefit from tracing when OTLP is active, add spans around:
|
||||
|
||||
- Pangolin REST calls (wrap the HTTP client with `otelhttp.NewTransport`).
|
||||
- Docker discovery cycles and target registration callbacks.
|
||||
- WireGuard reconfiguration (interface bring-up, peer updates).
|
||||
- Proxy dial/accept loops for both TCP and UDP targets.
|
||||
|
||||
Capturing these stages will let operators correlate latency spikes with reconnects
|
||||
and proxy drops using distributed traces in addition to the metric signals.
|
||||
898
examples/grafana/dashboards/newt-overview.json
Normal file
898
examples/grafana/dashboards/newt-overview.json
Normal file
@@ -0,0 +1,898 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"decimals": 0,
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 500
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"pluginVersion": "11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "go_goroutine_count",
|
||||
"instant": true,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Goroutines",
|
||||
"transformations": [],
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"decimals": 1,
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 256
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 512
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"pluginVersion": "11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "go_memory_gc_goal_bytes / 1024 / 1024",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "GC Target Heap (MiB)",
|
||||
"transformations": [],
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"decimals": 2,
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 10
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 25
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "ops"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"pluginVersion": "11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(http_server_request_duration_seconds_count[$__rate_interval]))",
|
||||
"instant": false,
|
||||
"legendFormat": "req/s",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "HTTP Requests / s",
|
||||
"transformations": [],
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"decimals": 3,
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "orange",
|
||||
"value": 0.1
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 0.5
|
||||
}
|
||||
]
|
||||
},
|
||||
"unit": "ops"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 0
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"pluginVersion": "11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(newt_connection_errors_total{site_id=~\"$site_id\"}[$__rate_interval]))",
|
||||
"instant": false,
|
||||
"legendFormat": "errors/s",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Connection Errors / s",
|
||||
"transformations": [],
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {},
|
||||
"mappings": [],
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 7
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(go_memory_used_bytes)",
|
||||
"legendFormat": "Used",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "go_memory_gc_goal_bytes",
|
||||
"legendFormat": "GC Goal",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Go Heap Usage vs GC Goal",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {},
|
||||
"decimals": 0,
|
||||
"mappings": [],
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 7
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "rate(go_memory_allocations_total[$__rate_interval])",
|
||||
"legendFormat": "Allocations/s",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "rate(go_memory_allocated_bytes_total[$__rate_interval])",
|
||||
"legendFormat": "Allocated bytes/s",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "Allocation Activity",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {},
|
||||
"mappings": [],
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 16
|
||||
},
|
||||
"id": 7,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.5, sum(rate(http_server_request_duration_seconds_bucket[$__rate_interval])) by (le))",
|
||||
"legendFormat": "p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[$__rate_interval])) by (le))",
|
||||
"legendFormat": "p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket[$__rate_interval])) by (le))",
|
||||
"legendFormat": "p99",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"title": "HTTP Request Duration Quantiles",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {},
|
||||
"mappings": [],
|
||||
"unit": "ops"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 16
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(http_server_request_duration_seconds_count[$__rate_interval])) by (http_response_status_code)",
|
||||
"legendFormat": "{{http_response_status_code}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "HTTP Requests by Status",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {},
|
||||
"mappings": [],
|
||||
"unit": "ops"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 25
|
||||
},
|
||||
"id": 9,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(newt_connection_attempts_total{site_id=~\"$site_id\"}[$__rate_interval])) by (transport, result)",
|
||||
"legendFormat": "{{transport}} • {{result}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Connection Attempts by Transport/Result",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {},
|
||||
"mappings": [],
|
||||
"unit": "ops"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 25
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(newt_connection_errors_total{site_id=~\"$site_id\"}[$__rate_interval])) by (transport, error_type)",
|
||||
"legendFormat": "{{transport}} • {{error_type}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Connection Errors by Type",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {},
|
||||
"decimals": 3,
|
||||
"mappings": [],
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 34
|
||||
},
|
||||
"id": 11,
|
||||
"options": {
|
||||
"legend": {
|
||||
"calcs": [],
|
||||
"displayMode": "table",
|
||||
"placement": "right"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"pluginVersion": "11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.5, sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le))",
|
||||
"legendFormat": "p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.95, sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le))",
|
||||
"legendFormat": "p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le))",
|
||||
"legendFormat": "p99",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"title": "Tunnel Latency Quantiles",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"cards": {},
|
||||
"color": {
|
||||
"cardColor": "#b4ff00",
|
||||
"colorScale": "sqrt",
|
||||
"colorScheme": "interpolateTurbo"
|
||||
},
|
||||
"dataFormat": "tsbuckets",
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {},
|
||||
"mappings": [],
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 34
|
||||
},
|
||||
"heatmap": {},
|
||||
"hideZeroBuckets": true,
|
||||
"id": 12,
|
||||
"legend": {
|
||||
"show": false
|
||||
},
|
||||
"options": {
|
||||
"calculate": true,
|
||||
"cellGap": 2,
|
||||
"cellSize": "auto",
|
||||
"color": {
|
||||
"exponent": 0.5
|
||||
},
|
||||
"exemplars": {
|
||||
"color": "rgba(255,255,255,0.7)"
|
||||
},
|
||||
"filterValues": {
|
||||
"le": 1e-9
|
||||
},
|
||||
"legend": {
|
||||
"show": false
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"show": true
|
||||
},
|
||||
"xAxis": {
|
||||
"show": true
|
||||
},
|
||||
"yAxis": {
|
||||
"decimals": 3,
|
||||
"show": true
|
||||
}
|
||||
},
|
||||
"pluginVersion": "11.1.0",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le)",
|
||||
"format": "heatmap",
|
||||
"legendFormat": "{{le}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Tunnel Latency Bucket Rate",
|
||||
"type": "heatmap"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 39,
|
||||
"style": "dark",
|
||||
"tags": [
|
||||
"newt",
|
||||
"otel"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "Prometheus",
|
||||
"value": "prometheus"
|
||||
},
|
||||
"hide": 0,
|
||||
"label": "Datasource",
|
||||
"name": "DS_PROMETHEUS",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"definition": "label_values(target_info, site_id)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Site",
|
||||
"multi": true,
|
||||
"name": "site_id",
|
||||
"options": [],
|
||||
"query": {
|
||||
"query": "label_values(target_info, site_id)",
|
||||
"refId": "SiteIdVar"
|
||||
},
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"definition": "label_values(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\"}, tunnel_id)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Tunnel",
|
||||
"multi": true,
|
||||
"name": "tunnel_id",
|
||||
"options": [],
|
||||
"query": {
|
||||
"query": "label_values(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\"}, tunnel_id)",
|
||||
"refId": "TunnelVar"
|
||||
},
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "browser",
|
||||
"title": "Newt Overview",
|
||||
"uid": "newt-overview",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
9
examples/grafana/provisioning/dashboards/dashboard.yaml
Normal file
9
examples/grafana/provisioning/dashboards/dashboard.yaml
Normal file
@@ -0,0 +1,9 @@
|
||||
apiVersion: 1
|
||||
providers:
|
||||
- name: "newt"
|
||||
folder: "Newt"
|
||||
type: file
|
||||
disableDeletion: false
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
@@ -0,0 +1,9 @@
|
||||
apiVersion: 1
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
uid: prometheus
|
||||
isDefault: true
|
||||
editable: true
|
||||
61
examples/otel-collector.yaml
Normal file
61
examples/otel-collector.yaml
Normal file
@@ -0,0 +1,61 @@
|
||||
# Variant A: Direct scrape of Newt (/metrics) via Prometheus (no Collector needed)
|
||||
# Note: Newt already exposes labels like site_id, protocol, direction. Do not promote
|
||||
# resource attributes into labels when scraping Newt directly.
|
||||
#
|
||||
# Example Prometheus scrape config:
|
||||
# global:
|
||||
# scrape_interval: 15s
|
||||
# scrape_configs:
|
||||
# - job_name: newt
|
||||
# static_configs:
|
||||
# - targets: ["newt:2112"]
|
||||
#
|
||||
# Variant B: Use OTEL Collector (Newt -> OTLP -> Collector -> Prometheus)
|
||||
# This pipeline scrapes metrics from the Collector's Prometheus exporter.
|
||||
# Labels are already on datapoints; promotion from resource is OPTIONAL and typically NOT required.
|
||||
# If you enable transform/promote below, ensure you do not duplicate labels.
|
||||
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: ":4317"
|
||||
|
||||
processors:
|
||||
memory_limiter:
|
||||
check_interval: 5s
|
||||
limit_percentage: 80
|
||||
spike_limit_percentage: 25
|
||||
resourcedetection:
|
||||
detectors: [env, system]
|
||||
timeout: 5s
|
||||
batch: {}
|
||||
# OPTIONAL: Only enable if you need to promote resource attributes to labels.
|
||||
# WARNING: Newt already provides site_id as a label; avoid double-promotion.
|
||||
# transform/promote:
|
||||
# error_mode: ignore
|
||||
# metric_statements:
|
||||
# - context: datapoint
|
||||
# statements:
|
||||
# - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where resource.attributes["service.instance.id"] != nil
|
||||
# - set(attributes["site_id"], resource.attributes["site_id"]) where resource.attributes["site_id"] != nil
|
||||
|
||||
exporters:
|
||||
prometheus:
|
||||
endpoint: ":8889"
|
||||
send_timestamps: true
|
||||
# prometheusremotewrite:
|
||||
# endpoint: http://mimir:9009/api/v1/push
|
||||
debug:
|
||||
verbosity: basic
|
||||
|
||||
service:
|
||||
pipelines:
|
||||
metrics:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, resourcedetection, batch] # add transform/promote if you really need it
|
||||
exporters: [prometheus]
|
||||
traces:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, resourcedetection, batch]
|
||||
exporters: [debug]
|
||||
16
examples/prometheus.with-collector.yml
Normal file
16
examples/prometheus.with-collector.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
# IMPORTANT: Do not scrape Newt directly; scrape only the Collector!
|
||||
- job_name: 'otel-collector'
|
||||
static_configs:
|
||||
- targets: ['otel-collector:8889']
|
||||
|
||||
# optional: limit metric cardinality
|
||||
relabel_configs:
|
||||
- action: labeldrop
|
||||
regex: 'tunnel_id'
|
||||
# - action: keep
|
||||
# source_labels: [site_id]
|
||||
# regex: '(site-a|site-b)'
|
||||
21
examples/prometheus.yml
Normal file
21
examples/prometheus.yml
Normal file
@@ -0,0 +1,21 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'newt'
|
||||
scrape_interval: 15s
|
||||
static_configs:
|
||||
- targets: ['newt:2112'] # /metrics
|
||||
relabel_configs:
|
||||
# optional: drop tunnel_id
|
||||
- action: labeldrop
|
||||
regex: 'tunnel_id'
|
||||
# optional: allow only specific sites
|
||||
- action: keep
|
||||
source_labels: [site_id]
|
||||
regex: '(site-a|site-b)'
|
||||
|
||||
# WARNING: Do not enable this together with the 'newt' job above or you will double-count.
|
||||
# - job_name: 'otel-collector'
|
||||
# static_configs:
|
||||
# - targets: ['otel-collector:8889']
|
||||
54
go.mod
54
go.mod
@@ -6,29 +6,46 @@ require (
|
||||
github.com/docker/docker v28.5.0+incompatible
|
||||
github.com/google/gopacket v1.1.19
|
||||
github.com/gorilla/websocket v1.5.3
|
||||
github.com/prometheus/client_golang v1.23.2
|
||||
github.com/vishvananda/netlink v1.3.1
|
||||
golang.org/x/crypto v0.42.0
|
||||
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0
|
||||
go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0
|
||||
go.opentelemetry.io/otel v1.38.0
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0
|
||||
go.opentelemetry.io/otel/exporters/prometheus v0.60.0
|
||||
go.opentelemetry.io/otel/metric v1.38.0
|
||||
go.opentelemetry.io/otel/sdk v1.38.0
|
||||
go.opentelemetry.io/otel/sdk/metric v1.38.0
|
||||
golang.org/x/crypto v0.43.0
|
||||
golang.org/x/net v0.45.0
|
||||
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792
|
||||
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb
|
||||
golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10
|
||||
gopkg.in/yaml.v3 v3.0.1
|
||||
google.golang.org/grpc v1.76.0
|
||||
gvisor.dev/gvisor v0.0.0-20250503011706-39ed1f5ac29c
|
||||
software.sslmate.com/src/go-pkcs12 v0.6.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/Microsoft/go-winio v0.6.2 // indirect
|
||||
github.com/containerd/errdefs v1.0.0 // indirect
|
||||
github.com/Microsoft/go-winio v0.6.0 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/cenkalti/backoff/v5 v5.0.3 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/containerd/errdefs v0.3.0 // indirect
|
||||
github.com/containerd/errdefs/pkg v0.3.0 // indirect
|
||||
github.com/distribution/reference v0.6.0 // indirect
|
||||
github.com/docker/go-connections v0.5.0 // indirect
|
||||
github.com/docker/go-units v0.5.0 // indirect
|
||||
github.com/docker/go-connections v0.6.0 // indirect
|
||||
github.com/docker/go-units v0.4.0 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.4 // indirect
|
||||
github.com/go-logr/logr v1.4.3 // indirect
|
||||
github.com/go-logr/stdr v1.2.2 // indirect
|
||||
github.com/google/btree v1.1.3 // indirect
|
||||
github.com/google/go-cmp v0.7.0 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc // indirect
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect
|
||||
github.com/josharian/native v1.1.0 // indirect
|
||||
github.com/mdlayher/genetlink v1.3.2 // indirect
|
||||
github.com/mdlayher/netlink v1.7.2 // indirect
|
||||
@@ -37,18 +54,33 @@ require (
|
||||
github.com/moby/sys/atomicwriter v0.1.0 // indirect
|
||||
github.com/moby/term v0.5.2 // indirect
|
||||
github.com/morikuni/aec v1.0.0 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/opencontainers/go-digest v1.0.0 // indirect
|
||||
github.com/opencontainers/image-spec v1.1.1 // indirect
|
||||
github.com/opencontainers/image-spec v1.1.0 // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/prometheus/client_model v0.6.2 // indirect
|
||||
github.com/prometheus/common v0.66.1 // indirect
|
||||
github.com/prometheus/otlptranslator v0.0.2 // indirect
|
||||
github.com/prometheus/procfs v0.17.0 // indirect
|
||||
github.com/vishvananda/netns v0.0.5 // indirect
|
||||
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 // indirect
|
||||
go.opentelemetry.io/otel/trace v1.38.0 // indirect
|
||||
go.opentelemetry.io/proto/otlp v1.7.1 // indirect
|
||||
go.yaml.in/yaml/v2 v2.4.2 // indirect
|
||||
golang.org/x/mod v0.28.0 // indirect
|
||||
golang.org/x/sync v0.17.0 // indirect
|
||||
golang.org/x/sys v0.37.0 // indirect
|
||||
golang.org/x/text v0.30.0 // indirect
|
||||
golang.org/x/time v0.12.0 // indirect
|
||||
golang.org/x/tools v0.37.0 // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect
|
||||
go.opentelemetry.io/otel v1.37.0 // indirect
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.36.0 // indirect
|
||||
go.opentelemetry.io/otel/metric v1.37.0 // indirect
|
||||
go.opentelemetry.io/otel/trace v1.37.0 // indirect
|
||||
golang.org/x/sync v0.16.0 // indirect
|
||||
golang.org/x/sys v0.36.0 // indirect
|
||||
golang.org/x/time v0.12.0 // indirect
|
||||
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 // indirect
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 // indirect
|
||||
google.golang.org/protobuf v1.36.8 // indirect
|
||||
)
|
||||
|
||||
146
go.sum
146
go.sum
@@ -1,12 +1,15 @@
|
||||
github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg=
|
||||
github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
|
||||
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
|
||||
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
|
||||
github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4=
|
||||
github.com/cenkalti/backoff/v5 v5.0.2 h1:rIfFVxEf1QsI7E1ZHfp/B4DF/6QBAUhmgkxc0H7Zss8=
|
||||
github.com/cenkalti/backoff/v5 v5.0.2/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
|
||||
github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI=
|
||||
github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M=
|
||||
github.com/Microsoft/go-winio v0.6.0 h1:slsWYD/zyx7lCXoZVlvQrj0hPTM1HI4+v1sIda2yDvg=
|
||||
github.com/Microsoft/go-winio v0.6.0/go.mod h1:cTAf44im0RAYeL23bpB+fzCyDH2MJiz2BO69KH/soAE=
|
||||
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
||||
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
|
||||
github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM=
|
||||
github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
|
||||
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
||||
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
github.com/containerd/errdefs v0.3.0 h1:FSZgGOeK4yuT/+DnF07/Olde/q4KBoMsaamhXxIMDp4=
|
||||
github.com/containerd/errdefs v0.3.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M=
|
||||
github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE=
|
||||
github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk=
|
||||
github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I=
|
||||
@@ -17,8 +20,10 @@ github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5Qvfr
|
||||
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
|
||||
github.com/docker/docker v28.5.0+incompatible h1:ZdSQoRUE9XxhFI/B8YLvhnEFMmYN9Pp8Egd2qcaFk1E=
|
||||
github.com/docker/docker v28.5.0+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk=
|
||||
github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c=
|
||||
github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc=
|
||||
github.com/docker/go-connections v0.6.0 h1:LlMG9azAe1TqfR7sO+NJttz1gy6KO7VJBh+pMmjSD94=
|
||||
github.com/docker/go-connections v0.6.0/go.mod h1:AahvXYshr6JgfUJGdDCs2b5EZG/vmaMAntpSFH5BFKE=
|
||||
github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw=
|
||||
github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
|
||||
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
|
||||
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
|
||||
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
|
||||
@@ -28,6 +33,8 @@ github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
|
||||
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
|
||||
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
|
||||
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
|
||||
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
|
||||
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
|
||||
github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg=
|
||||
github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4=
|
||||
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||
@@ -38,14 +45,20 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
|
||||
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo=
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI=
|
||||
github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc h1:GN2Lv3MGO7AS6PrRoT6yV5+wkrOpcszoIsO4+4ds248=
|
||||
github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc/go.mod h1:+JKpmjMGhpgPL+rXZ5nsZieVzvarn86asRlBg4uNGnk=
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU=
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs=
|
||||
github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA=
|
||||
github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w=
|
||||
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
|
||||
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
|
||||
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
|
||||
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
|
||||
github.com/mdlayher/genetlink v1.3.2 h1:KdrNKe+CTu+IbZnm/GVUMXSqBBLqcGpRDa0xkQy56gw=
|
||||
github.com/mdlayher/genetlink v1.3.2/go.mod h1:tcC3pkCrPUGIKKsCsp0B3AdaaKuHtaxoJRz3cc+528o=
|
||||
github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g=
|
||||
@@ -64,71 +77,100 @@ github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ=
|
||||
github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc=
|
||||
github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A=
|
||||
github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
||||
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
|
||||
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
|
||||
github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040=
|
||||
github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M=
|
||||
github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
|
||||
github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM=
|
||||
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
|
||||
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
|
||||
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
|
||||
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
|
||||
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
|
||||
github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
|
||||
github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
|
||||
github.com/prometheus/otlptranslator v0.0.2 h1:+1CdeLVrRQ6Psmhnobldo0kTp96Rj80DRXRd5OSnMEQ=
|
||||
github.com/prometheus/otlptranslator v0.0.2/go.mod h1:P8AwMgdD7XEr6QRUJ2QWLpiAZTgTE2UYgjlu3svompI=
|
||||
github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0=
|
||||
github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw=
|
||||
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
|
||||
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
|
||||
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
|
||||
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
|
||||
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
|
||||
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
github.com/vishvananda/netlink v1.3.1 h1:3AEMt62VKqz90r0tmNhog0r/PpWKmrEShJU0wJW6bV0=
|
||||
github.com/vishvananda/netlink v1.3.1/go.mod h1:ARtKouGSTGchR8aMwmkzC0qiNPrrWO5JS/XMVl45+b4=
|
||||
github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zdEY=
|
||||
github.com/vishvananda/netns v0.0.5/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM=
|
||||
go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
|
||||
go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 h1:Hf9xI/XLML9ElpiHVDNwvqI0hIFlzV8dgIr35kV1kRU=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0/go.mod h1:NfchwuyNoMcZ5MLHwPrODwUF1HWCXWrL31s8gSAdIKY=
|
||||
go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ=
|
||||
go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.36.0 h1:dNzwXjZKpMpE2JhmO+9HsPl42NIXFIFSUSSs0fiqra0=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.36.0/go.mod h1:90PoxvaEB5n6AOdZvi+yWJQoE95U8Dhhw2bSyRqnTD0=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.36.0 h1:nRVXXvf78e00EwY6Wp0YII8ww2JVWshZ20HfTlE11AM=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.36.0/go.mod h1:r49hO7CgrxY9Voaj3Xe8pANWtr0Oq916d0XAmOoCZAQ=
|
||||
go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE=
|
||||
go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E=
|
||||
go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI=
|
||||
go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg=
|
||||
go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc=
|
||||
go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps=
|
||||
go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4=
|
||||
go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0=
|
||||
go.opentelemetry.io/proto/otlp v1.6.0 h1:jQjP+AQyTf+Fe7OKj/MfkDrmK4MNVtw2NpXsf9fefDI=
|
||||
go.opentelemetry.io/proto/otlp v1.6.0/go.mod h1:cicgGehlFuNdgZkcALOCh3VE6K/u2tAjzlRhDwmVpZc=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg=
|
||||
go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0 h1:PeBoRj6af6xMI7qCupwFvTbbnd49V7n5YpG6pg8iDYQ=
|
||||
go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0/go.mod h1:ingqBCtMCe8I4vpz/UVzCW6sxoqgZB37nao91mLQ3Bw=
|
||||
go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8=
|
||||
go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 h1:vl9obrcoWVKp/lwl8tRE33853I8Xru9HFbw/skNeLs8=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0/go.mod h1:GAXRxmLJcVM3u22IjTg74zWBrRCKq8BnOqUVLodpcpw=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 h1:aTL7F04bJHUlztTsNGJ2l+6he8c+y/b//eR0jjjemT4=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0/go.mod h1:kldtb7jDTeol0l3ewcmd8SDvx3EmIE7lyvqbasU3QC4=
|
||||
go.opentelemetry.io/otel/exporters/prometheus v0.60.0 h1:cGtQxGvZbnrWdC2GyjZi0PDKVSLWP/Jocix3QWfXtbo=
|
||||
go.opentelemetry.io/otel/exporters/prometheus v0.60.0/go.mod h1:hkd1EekxNo69PTV4OWFGZcKQiIqg0RfuWExcPKFvepk=
|
||||
go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA=
|
||||
go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI=
|
||||
go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E=
|
||||
go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg=
|
||||
go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM=
|
||||
go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA=
|
||||
go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE=
|
||||
go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs=
|
||||
go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4=
|
||||
go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE=
|
||||
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
||||
go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
|
||||
go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||
golang.org/x/crypto v0.42.0 h1:chiH31gIWm57EkTXpwnqf8qeuMUi0yekh6mT2AvFlqI=
|
||||
golang.org/x/crypto v0.42.0/go.mod h1:4+rDnOTJhQCx2q7/j6rAN5XDw8kPjeaXEUR2eL94ix8=
|
||||
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 h1:R9PFI6EUdfVKgwKjZef7QIwGcBKu86OEFpJ9nUEP2l4=
|
||||
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc=
|
||||
golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04=
|
||||
golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0=
|
||||
golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY=
|
||||
golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
|
||||
golang.org/x/mod v0.28.0 h1:gQBtGhjxykdjY9YhZpSlZIsbnaE2+PgjfLWUQTnoZ1U=
|
||||
golang.org/x/mod v0.28.0/go.mod h1:yfB/L0NOf/kmEbXjzCPOx1iK1fRutOydrCMsqRhEBxI=
|
||||
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 h1:R9PFI6EUdfVKgwKjZef7QIwGcBKu86OEFpJ9nUEP2l4=
|
||||
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc=
|
||||
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.45.0 h1:RLBg5JKixCy82FtLJpeNlVM0nrSqpCRYzVU1n8kj0tM=
|
||||
golang.org/x/net v0.45.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw=
|
||||
golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
|
||||
golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug=
|
||||
golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k=
|
||||
golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||
golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
|
||||
golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk=
|
||||
golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4=
|
||||
golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k=
|
||||
golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM=
|
||||
golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE=
|
||||
golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg=
|
||||
golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
|
||||
golang.org/x/tools v0.37.0 h1:DVSRzp7FwePZW356yEAChSdNcQo6Nsp+fex1SUW09lE=
|
||||
golang.org/x/tools v0.37.0/go.mod h1:MBN5QPQtLMHVdvsbtarmTNukZDdgwdwlO5qGacAzF0w=
|
||||
golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
|
||||
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 h1:B82qJJgjvYKsXS9jeunTOisW56dUokqW/FOteYJJ/yg=
|
||||
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2/go.mod h1:deeaetjYA+DHMHg+sMSMI58GrEteJUUzzw7en6TJQcI=
|
||||
@@ -136,15 +178,17 @@ golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb h1:whnFRlWMcXI9d+Z
|
||||
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb/go.mod h1:rpwXGsirqLqN2L0JDJQlwOboGHmptD5ZD6T2VmcqhTw=
|
||||
golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10 h1:3GDAcqdIg1ozBNLgPy4SLT84nfcBjr6rhGtXYtrkWLU=
|
||||
golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10/go.mod h1:T97yPqesLiNrOYxkwmhMI0ZIlJDm+p0PMR8eRVeR5tQ=
|
||||
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
|
||||
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 h1:BIRfGDEjiHRrk0QKZe3Xv2ieMhtgRGeLcZQ0mIVn4EY=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5/go.mod h1:j3QtIyytwqGr1JUDtYXwtMXWPKsEa5LtzIFN1Wn5WvE=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 h1:eaY8u2EuxbRv7c3NiGK0/NedzVsCcV6hDuU5qPX5EGE=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5/go.mod h1:M4/wBTSeyLxupu3W3tJtOgB14jILAS/XWPSSa3TAlJc=
|
||||
google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A=
|
||||
google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c=
|
||||
google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
|
||||
google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
|
||||
google.golang.org/genproto v0.0.0-20230920204549-e6e6cdab5c13 h1:vlzZttNJGVqTsRFU9AmdnrcO1Znh8Ew9kCD//yjigk0=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20250519155744-55703ea1f237 h1:Kog3KlB4xevJlAcbbbzPfRG0+X9fdoGM+UBRKVz6Wr0=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20250519155744-55703ea1f237/go.mod h1:ezi0AVyMKDWy5xAncvjLWH7UcLBB5n7y2fQ8MzjJcto=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20250519155744-55703ea1f237 h1:cJfm9zPbe1e873mHJzmQ1nwVEeRDU/T1wXDK2kUSU34=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20250519155744-55703ea1f237/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A=
|
||||
google.golang.org/grpc v1.72.1 h1:HR03wO6eyZ7lknl75XlxABNVLLFc2PAb6mHlYh756mA=
|
||||
google.golang.org/grpc v1.72.1/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM=
|
||||
google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY=
|
||||
google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||
|
||||
80
internal/state/telemetry_view.go
Normal file
80
internal/state/telemetry_view.go
Normal file
@@ -0,0 +1,80 @@
|
||||
package state
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/fosrl/newt/internal/telemetry"
|
||||
)
|
||||
|
||||
// TelemetryView is a minimal, thread-safe implementation to feed observables.
|
||||
// Since one Newt process represents one site, we expose a single logical site.
|
||||
// site_id is a resource attribute, so we do not emit per-site labels here.
|
||||
type TelemetryView struct {
|
||||
online atomic.Bool
|
||||
lastHBUnix atomic.Int64 // unix seconds
|
||||
// per-tunnel sessions
|
||||
sessMu sync.RWMutex
|
||||
sessions map[string]*atomic.Int64
|
||||
}
|
||||
|
||||
var (
|
||||
globalView atomic.Pointer[TelemetryView]
|
||||
)
|
||||
|
||||
// Global returns a singleton TelemetryView.
|
||||
func Global() *TelemetryView {
|
||||
if v := globalView.Load(); v != nil { return v }
|
||||
v := &TelemetryView{ sessions: make(map[string]*atomic.Int64) }
|
||||
globalView.Store(v)
|
||||
telemetry.RegisterStateView(v)
|
||||
return v
|
||||
}
|
||||
|
||||
// Instrumentation helpers
|
||||
func (v *TelemetryView) IncSessions(tunnelID string) {
|
||||
v.sessMu.Lock(); defer v.sessMu.Unlock()
|
||||
c := v.sessions[tunnelID]
|
||||
if c == nil { c = &atomic.Int64{}; v.sessions[tunnelID] = c }
|
||||
c.Add(1)
|
||||
}
|
||||
func (v *TelemetryView) DecSessions(tunnelID string) {
|
||||
v.sessMu.Lock(); defer v.sessMu.Unlock()
|
||||
if c := v.sessions[tunnelID]; c != nil {
|
||||
c.Add(-1)
|
||||
if c.Load() <= 0 { delete(v.sessions, tunnelID) }
|
||||
}
|
||||
}
|
||||
func (v *TelemetryView) ClearTunnel(tunnelID string) {
|
||||
v.sessMu.Lock(); defer v.sessMu.Unlock()
|
||||
delete(v.sessions, tunnelID)
|
||||
}
|
||||
func (v *TelemetryView) SetOnline(b bool) { v.online.Store(b) }
|
||||
func (v *TelemetryView) TouchHeartbeat() { v.lastHBUnix.Store(time.Now().Unix()) }
|
||||
|
||||
// --- telemetry.StateView interface ---
|
||||
|
||||
func (v *TelemetryView) ListSites() []string { return []string{"self"} }
|
||||
func (v *TelemetryView) Online(_ string) (bool, bool) { return v.online.Load(), true }
|
||||
func (v *TelemetryView) LastHeartbeat(_ string) (time.Time, bool) {
|
||||
sec := v.lastHBUnix.Load()
|
||||
if sec == 0 { return time.Time{}, false }
|
||||
return time.Unix(sec, 0), true
|
||||
}
|
||||
func (v *TelemetryView) ActiveSessions(_ string) (int64, bool) {
|
||||
// aggregated sessions (not used for per-tunnel gauge)
|
||||
v.sessMu.RLock(); defer v.sessMu.RUnlock()
|
||||
var sum int64
|
||||
for _, c := range v.sessions { if c != nil { sum += c.Load() } }
|
||||
return sum, true
|
||||
}
|
||||
|
||||
// Extended accessor used by telemetry callback to publish per-tunnel samples.
|
||||
func (v *TelemetryView) SessionsByTunnel() map[string]int64 {
|
||||
v.sessMu.RLock(); defer v.sessMu.RUnlock()
|
||||
out := make(map[string]int64, len(v.sessions))
|
||||
for id, c := range v.sessions { if c != nil && c.Load() > 0 { out[id] = c.Load() } }
|
||||
return out
|
||||
}
|
||||
|
||||
19
internal/telemetry/constants.go
Normal file
19
internal/telemetry/constants.go
Normal file
@@ -0,0 +1,19 @@
|
||||
package telemetry
|
||||
|
||||
// Protocol labels (low-cardinality)
|
||||
const (
|
||||
ProtocolTCP = "tcp"
|
||||
ProtocolUDP = "udp"
|
||||
)
|
||||
|
||||
// Reconnect reason bins (fixed, low-cardinality)
|
||||
const (
|
||||
ReasonServerRequest = "server_request"
|
||||
ReasonTimeout = "timeout"
|
||||
ReasonPeerClose = "peer_close"
|
||||
ReasonNetworkChange = "network_change"
|
||||
ReasonAuthError = "auth_error"
|
||||
ReasonHandshakeError = "handshake_error"
|
||||
ReasonConfigChange = "config_change"
|
||||
ReasonError = "error"
|
||||
)
|
||||
32
internal/telemetry/constants_test.go
Normal file
32
internal/telemetry/constants_test.go
Normal file
@@ -0,0 +1,32 @@
|
||||
package telemetry
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestAllowedConstants(t *testing.T) {
|
||||
allowedReasons := map[string]struct{}{
|
||||
ReasonServerRequest: {},
|
||||
ReasonTimeout: {},
|
||||
ReasonPeerClose: {},
|
||||
ReasonNetworkChange: {},
|
||||
ReasonAuthError: {},
|
||||
ReasonHandshakeError: {},
|
||||
ReasonConfigChange: {},
|
||||
ReasonError: {},
|
||||
}
|
||||
for k := range allowedReasons {
|
||||
if k == "" {
|
||||
t.Fatalf("empty reason constant")
|
||||
}
|
||||
}
|
||||
|
||||
allowedProtocols := map[string]struct{}{
|
||||
ProtocolTCP: {},
|
||||
ProtocolUDP: {},
|
||||
}
|
||||
for k := range allowedProtocols {
|
||||
if k == "" {
|
||||
t.Fatalf("empty protocol constant")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
542
internal/telemetry/metrics.go
Normal file
542
internal/telemetry/metrics.go
Normal file
@@ -0,0 +1,542 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/metric"
|
||||
)
|
||||
|
||||
// Instruments and helpers for Newt metrics following the naming, units, and
|
||||
// low-cardinality label guidance from the issue description.
|
||||
//
|
||||
// Counters end with _total, durations are in seconds, sizes in bytes.
|
||||
// Only low-cardinality stable labels are supported: tunnel_id,
|
||||
// transport, direction, result, reason, error_type.
|
||||
var (
|
||||
initOnce sync.Once
|
||||
|
||||
meter metric.Meter
|
||||
|
||||
// Site / Registration
|
||||
mSiteRegistrations metric.Int64Counter
|
||||
mSiteOnline metric.Int64ObservableGauge
|
||||
mSiteLastHeartbeat metric.Float64ObservableGauge
|
||||
|
||||
// Tunnel / Sessions
|
||||
mTunnelSessions metric.Int64ObservableGauge
|
||||
mTunnelBytes metric.Int64Counter
|
||||
mTunnelLatency metric.Float64Histogram
|
||||
mReconnects metric.Int64Counter
|
||||
|
||||
// Connection / NAT
|
||||
mConnAttempts metric.Int64Counter
|
||||
mConnErrors metric.Int64Counter
|
||||
|
||||
// Config/Restart
|
||||
mConfigReloads metric.Int64Counter
|
||||
mConfigApply metric.Float64Histogram
|
||||
mCertRotationTotal metric.Int64Counter
|
||||
mProcessStartTime metric.Float64ObservableGauge
|
||||
|
||||
// Build info
|
||||
mBuildInfo metric.Int64ObservableGauge
|
||||
|
||||
// WebSocket
|
||||
mWSConnectLatency metric.Float64Histogram
|
||||
mWSMessages metric.Int64Counter
|
||||
mWSDisconnects metric.Int64Counter
|
||||
mWSKeepaliveFailure metric.Int64Counter
|
||||
mWSSessionDuration metric.Float64Histogram
|
||||
mWSConnected metric.Int64ObservableGauge
|
||||
mWSReconnects metric.Int64Counter
|
||||
|
||||
// Proxy
|
||||
mProxyActiveConns metric.Int64ObservableGauge
|
||||
mProxyBufferBytes metric.Int64ObservableGauge
|
||||
mProxyAsyncBacklogByte metric.Int64ObservableGauge
|
||||
mProxyDropsTotal metric.Int64Counter
|
||||
mProxyAcceptsTotal metric.Int64Counter
|
||||
mProxyConnDuration metric.Float64Histogram
|
||||
mProxyConnectionsTotal metric.Int64Counter
|
||||
|
||||
buildVersion string
|
||||
buildCommit string
|
||||
processStartUnix = float64(time.Now().UnixNano()) / 1e9
|
||||
wsConnectedState atomic.Int64
|
||||
)
|
||||
|
||||
// Proxy connection lifecycle events.
|
||||
const (
|
||||
ProxyConnectionOpened = "opened"
|
||||
ProxyConnectionClosed = "closed"
|
||||
)
|
||||
|
||||
// attrsWithSite appends site/region labels only when explicitly enabled to keep
|
||||
// label cardinality low by default.
|
||||
func attrsWithSite(extra ...attribute.KeyValue) []attribute.KeyValue {
|
||||
attrs := make([]attribute.KeyValue, len(extra))
|
||||
copy(attrs, extra)
|
||||
if ShouldIncludeSiteLabels() {
|
||||
attrs = append(attrs, siteAttrs()...)
|
||||
}
|
||||
return attrs
|
||||
}
|
||||
|
||||
func registerInstruments() error {
|
||||
var err error
|
||||
initOnce.Do(func() {
|
||||
meter = otel.Meter("newt")
|
||||
if e := registerSiteInstruments(); e != nil {
|
||||
err = e
|
||||
return
|
||||
}
|
||||
if e := registerTunnelInstruments(); e != nil {
|
||||
err = e
|
||||
return
|
||||
}
|
||||
if e := registerConnInstruments(); e != nil {
|
||||
err = e
|
||||
return
|
||||
}
|
||||
if e := registerConfigInstruments(); e != nil {
|
||||
err = e
|
||||
return
|
||||
}
|
||||
if e := registerBuildWSProxyInstruments(); e != nil {
|
||||
err = e
|
||||
return
|
||||
}
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
func registerSiteInstruments() error {
|
||||
var err error
|
||||
mSiteRegistrations, err = meter.Int64Counter("newt_site_registrations_total",
|
||||
metric.WithDescription("Total site registration attempts"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
mSiteOnline, err = meter.Int64ObservableGauge("newt_site_online",
|
||||
metric.WithDescription("Site online (0/1)"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
mSiteLastHeartbeat, err = meter.Float64ObservableGauge("newt_site_last_heartbeat_timestamp_seconds",
|
||||
metric.WithDescription("Unix timestamp of the last site heartbeat"),
|
||||
metric.WithUnit("s"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func registerTunnelInstruments() error {
|
||||
var err error
|
||||
mTunnelSessions, err = meter.Int64ObservableGauge("newt_tunnel_sessions",
|
||||
metric.WithDescription("Active tunnel sessions"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
mTunnelBytes, err = meter.Int64Counter("newt_tunnel_bytes_total",
|
||||
metric.WithDescription("Tunnel bytes ingress/egress"),
|
||||
metric.WithUnit("By"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
mTunnelLatency, err = meter.Float64Histogram("newt_tunnel_latency_seconds",
|
||||
metric.WithDescription("Per-tunnel latency in seconds"),
|
||||
metric.WithUnit("s"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
mReconnects, err = meter.Int64Counter("newt_tunnel_reconnects_total",
|
||||
metric.WithDescription("Tunnel reconnect events"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func registerConnInstruments() error {
|
||||
var err error
|
||||
mConnAttempts, err = meter.Int64Counter("newt_connection_attempts_total",
|
||||
metric.WithDescription("Connection attempts"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
mConnErrors, err = meter.Int64Counter("newt_connection_errors_total",
|
||||
metric.WithDescription("Connection errors by type"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func registerConfigInstruments() error {
|
||||
mConfigReloads, _ = meter.Int64Counter("newt_config_reloads_total",
|
||||
metric.WithDescription("Configuration reloads"))
|
||||
mConfigApply, _ = meter.Float64Histogram("newt_config_apply_seconds",
|
||||
metric.WithDescription("Configuration apply duration in seconds"),
|
||||
metric.WithUnit("s"))
|
||||
mCertRotationTotal, _ = meter.Int64Counter("newt_cert_rotation_total",
|
||||
metric.WithDescription("Certificate rotation events (success/failure)"))
|
||||
mProcessStartTime, _ = meter.Float64ObservableGauge("process_start_time_seconds",
|
||||
metric.WithDescription("Unix timestamp of the process start time"),
|
||||
metric.WithUnit("s"))
|
||||
if mProcessStartTime != nil {
|
||||
if _, err := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error {
|
||||
o.ObserveFloat64(mProcessStartTime, processStartUnix)
|
||||
return nil
|
||||
}, mProcessStartTime); err != nil {
|
||||
otel.Handle(err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// registerBuildWSProxyInstruments creates the build-info gauge, all WebSocket
// instruments, and all proxy instruments, then wires two observable
// callbacks: one emitting newt_build_info (value 1 with version/commit
// attributes) and one emitting the WebSocket connected-state gauge from
// wsConnectedState. Creation errors are deliberately ignored (best-effort);
// callback-registration errors go to the OTel error handler. Always returns
// nil.
//
// NOTE(review): on successful build-info registration this overwrites
// obsStopper, which SetObservableCallback also assigns — confirm the two
// registrations are never both active, or they will clobber each other's
// stopper.
func registerBuildWSProxyInstruments() error {
	// Build info gauge (value 1 with version/commit attributes)
	mBuildInfo, _ = meter.Int64ObservableGauge("newt_build_info",
		metric.WithDescription("Newt build information (value is always 1)"))
	// WebSocket
	mWSConnectLatency, _ = meter.Float64Histogram("newt_websocket_connect_latency_seconds",
		metric.WithDescription("WebSocket connect latency in seconds"),
		metric.WithUnit("s"))
	mWSMessages, _ = meter.Int64Counter("newt_websocket_messages_total",
		metric.WithDescription("WebSocket messages by direction and type"))
	mWSDisconnects, _ = meter.Int64Counter("newt_websocket_disconnects_total",
		metric.WithDescription("WebSocket disconnects by reason/result"))
	mWSKeepaliveFailure, _ = meter.Int64Counter("newt_websocket_keepalive_failures_total",
		metric.WithDescription("WebSocket keepalive (ping/pong) failures"))
	mWSSessionDuration, _ = meter.Float64Histogram("newt_websocket_session_duration_seconds",
		metric.WithDescription("Duration of established WebSocket sessions"),
		metric.WithUnit("s"))
	mWSConnected, _ = meter.Int64ObservableGauge("newt_websocket_connected",
		metric.WithDescription("WebSocket connection state (1=connected, 0=disconnected)"))
	mWSReconnects, _ = meter.Int64Counter("newt_websocket_reconnects_total",
		metric.WithDescription("WebSocket reconnect attempts by reason"))
	// Proxy
	mProxyActiveConns, _ = meter.Int64ObservableGauge("newt_proxy_active_connections",
		metric.WithDescription("Proxy active connections per tunnel and protocol"))
	mProxyBufferBytes, _ = meter.Int64ObservableGauge("newt_proxy_buffer_bytes",
		metric.WithDescription("Proxy buffer bytes (may approximate async backlog)"),
		metric.WithUnit("By"))
	mProxyAsyncBacklogByte, _ = meter.Int64ObservableGauge("newt_proxy_async_backlog_bytes",
		metric.WithDescription("Unflushed async byte backlog per tunnel and protocol"),
		metric.WithUnit("By"))
	mProxyDropsTotal, _ = meter.Int64Counter("newt_proxy_drops_total",
		metric.WithDescription("Proxy drops due to write errors"))
	mProxyAcceptsTotal, _ = meter.Int64Counter("newt_proxy_accept_total",
		metric.WithDescription("Proxy connection accepts by protocol and result"))
	mProxyConnDuration, _ = meter.Float64Histogram("newt_proxy_connection_duration_seconds",
		metric.WithDescription("Duration of completed proxy connections"),
		metric.WithUnit("s"))
	mProxyConnectionsTotal, _ = meter.Int64Counter("newt_proxy_connections_total",
		metric.WithDescription("Proxy connection lifecycle events by protocol"))
	// Register a default callback for build info if version/commit set
	reg, e := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error {
		// Skip emission entirely until RegisterBuildInfo has been called.
		if buildVersion == "" && buildCommit == "" {
			return nil
		}
		attrs := []attribute.KeyValue{}
		if buildVersion != "" {
			attrs = append(attrs, attribute.String("version", buildVersion))
		}
		if buildCommit != "" {
			attrs = append(attrs, attribute.String("commit", buildCommit))
		}
		if ShouldIncludeSiteLabels() {
			attrs = append(attrs, siteAttrs()...)
		}
		o.ObserveInt64(mBuildInfo, 1, metric.WithAttributes(attrs...))
		return nil
	}, mBuildInfo)
	if e != nil {
		otel.Handle(e)
	} else {
		// Provide a functional stopper that unregisters the callback
		obsStopper = func() { _ = reg.Unregister() }
	}
	if mWSConnected != nil {
		// Emit the connected-state gauge from the atomic backing value set
		// by SetWSConnectionState.
		if regConn, err := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error {
			val := wsConnectedState.Load()
			o.ObserveInt64(mWSConnected, val, metric.WithAttributes(attrsWithSite()...))
			return nil
		}, mWSConnected); err != nil {
			otel.Handle(err)
		} else {
			wsConnStopper = func() { _ = regConn.Unregister() }
		}
	}
	return nil
}
|
||||
|
||||
// Observable registration: Newt can register a callback to report gauges.
// Call SetObservableCallback once to start observing online status, last
// heartbeat seconds, and active sessions.

var (
	// obsOnce guards the one-time registration done by SetObservableCallback.
	obsOnce sync.Once
	// obsStopper unregisters the site/tunnel observable callback.
	// NOTE(review): also assigned by the build-info registration path —
	// confirm the two never race for this slot.
	obsStopper func()
	// proxyObsOnce guards the one-time registration done by
	// SetProxyObservableCallback.
	proxyObsOnce sync.Once
	// proxyStopper unregisters the proxy observable callback.
	proxyStopper func()
	// wsConnStopper unregisters the WebSocket connected-state callback.
	wsConnStopper func()
)
|
||||
|
||||
// SetObservableCallback registers a single callback that will be invoked
|
||||
// on collection. Use the provided observer to emit values for the observable
|
||||
// gauges defined here.
|
||||
//
|
||||
// Example inside your code (where you have access to current state):
|
||||
//
|
||||
// telemetry.SetObservableCallback(func(ctx context.Context, o metric.Observer) error {
|
||||
// o.ObserveInt64(mSiteOnline, 1)
|
||||
// o.ObserveFloat64(mSiteLastHeartbeat, float64(lastHB.Unix()))
|
||||
// o.ObserveInt64(mTunnelSessions, int64(len(activeSessions)))
|
||||
// return nil
|
||||
// })
|
||||
func SetObservableCallback(cb func(context.Context, metric.Observer) error) {
|
||||
obsOnce.Do(func() {
|
||||
reg, e := meter.RegisterCallback(cb, mSiteOnline, mSiteLastHeartbeat, mTunnelSessions)
|
||||
if e != nil {
|
||||
otel.Handle(e)
|
||||
obsStopper = func() {
|
||||
// no-op: registration failed; keep stopper callable
|
||||
}
|
||||
return
|
||||
}
|
||||
// Provide a functional stopper mirroring proxy/build-info behavior
|
||||
obsStopper = func() { _ = reg.Unregister() }
|
||||
})
|
||||
}
|
||||
|
||||
// SetProxyObservableCallback registers a callback to observe proxy gauges.
|
||||
func SetProxyObservableCallback(cb func(context.Context, metric.Observer) error) {
|
||||
proxyObsOnce.Do(func() {
|
||||
reg, e := meter.RegisterCallback(cb, mProxyActiveConns, mProxyBufferBytes, mProxyAsyncBacklogByte)
|
||||
if e != nil {
|
||||
otel.Handle(e)
|
||||
proxyStopper = func() {
|
||||
// no-op: registration failed; keep stopper callable
|
||||
}
|
||||
return
|
||||
}
|
||||
// Provide a functional stopper to unregister later if needed
|
||||
proxyStopper = func() { _ = reg.Unregister() }
|
||||
})
|
||||
}
|
||||
|
||||
// Build info registration
|
||||
func RegisterBuildInfo(version, commit string) {
|
||||
buildVersion = version
|
||||
buildCommit = commit
|
||||
}
|
||||
|
||||
// Config reloads
|
||||
func IncConfigReload(ctx context.Context, result string) {
|
||||
mConfigReloads.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("result", result),
|
||||
)...))
|
||||
}
|
||||
|
||||
// Helpers for counters/histograms
|
||||
|
||||
func IncSiteRegistration(ctx context.Context, result string) {
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("result", result),
|
||||
}
|
||||
mSiteRegistrations.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||
}
|
||||
|
||||
func AddTunnelBytes(ctx context.Context, tunnelID, direction string, n int64) {
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("direction", direction),
|
||||
}
|
||||
if ShouldIncludeTunnelID() && tunnelID != "" {
|
||||
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
|
||||
}
|
||||
mTunnelBytes.Add(ctx, n, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||
}
|
||||
|
||||
// AddTunnelBytesSet adds bytes using a pre-built attribute.Set to avoid per-call allocations.
|
||||
func AddTunnelBytesSet(ctx context.Context, n int64, attrs attribute.Set) {
|
||||
mTunnelBytes.Add(ctx, n, metric.WithAttributeSet(attrs))
|
||||
}
|
||||
|
||||
// --- WebSocket helpers ---
|
||||
|
||||
func ObserveWSConnectLatency(ctx context.Context, seconds float64, result, errorType string) {
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("transport", "websocket"),
|
||||
attribute.String("result", result),
|
||||
}
|
||||
if errorType != "" {
|
||||
attrs = append(attrs, attribute.String("error_type", errorType))
|
||||
}
|
||||
mWSConnectLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||
}
|
||||
|
||||
func IncWSMessage(ctx context.Context, direction, msgType string) {
|
||||
mWSMessages.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("direction", direction),
|
||||
attribute.String("msg_type", msgType),
|
||||
)...))
|
||||
}
|
||||
|
||||
func IncWSDisconnect(ctx context.Context, reason, result string) {
|
||||
mWSDisconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("reason", reason),
|
||||
attribute.String("result", result),
|
||||
)...))
|
||||
}
|
||||
|
||||
func IncWSKeepaliveFailure(ctx context.Context, reason string) {
|
||||
mWSKeepaliveFailure.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("reason", reason),
|
||||
)...))
|
||||
}
|
||||
|
||||
// SetWSConnectionState updates the backing gauge for the WebSocket connected state.
|
||||
func SetWSConnectionState(connected bool) {
|
||||
if connected {
|
||||
wsConnectedState.Store(1)
|
||||
} else {
|
||||
wsConnectedState.Store(0)
|
||||
}
|
||||
}
|
||||
|
||||
// IncWSReconnect increments the WebSocket reconnect counter with a bounded reason label.
|
||||
func IncWSReconnect(ctx context.Context, reason string) {
|
||||
if reason == "" {
|
||||
reason = "unknown"
|
||||
}
|
||||
mWSReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("reason", reason),
|
||||
)...))
|
||||
}
|
||||
|
||||
func ObserveWSSessionDuration(ctx context.Context, seconds float64, result string) {
|
||||
mWSSessionDuration.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("result", result),
|
||||
)...))
|
||||
}
|
||||
|
||||
// --- Proxy helpers ---
|
||||
|
||||
func ObserveProxyActiveConnsObs(o metric.Observer, value int64, attrs []attribute.KeyValue) {
|
||||
o.ObserveInt64(mProxyActiveConns, value, metric.WithAttributes(attrs...))
|
||||
}
|
||||
|
||||
func ObserveProxyBufferBytesObs(o metric.Observer, value int64, attrs []attribute.KeyValue) {
|
||||
o.ObserveInt64(mProxyBufferBytes, value, metric.WithAttributes(attrs...))
|
||||
}
|
||||
|
||||
func ObserveProxyAsyncBacklogObs(o metric.Observer, value int64, attrs []attribute.KeyValue) {
|
||||
o.ObserveInt64(mProxyAsyncBacklogByte, value, metric.WithAttributes(attrs...))
|
||||
}
|
||||
|
||||
func IncProxyDrops(ctx context.Context, tunnelID, protocol string) {
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("protocol", protocol),
|
||||
}
|
||||
if ShouldIncludeTunnelID() && tunnelID != "" {
|
||||
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
|
||||
}
|
||||
mProxyDropsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||
}
|
||||
|
||||
func IncProxyAccept(ctx context.Context, tunnelID, protocol, result, reason string) {
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("protocol", protocol),
|
||||
attribute.String("result", result),
|
||||
}
|
||||
if reason != "" {
|
||||
attrs = append(attrs, attribute.String("reason", reason))
|
||||
}
|
||||
if ShouldIncludeTunnelID() && tunnelID != "" {
|
||||
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
|
||||
}
|
||||
mProxyAcceptsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||
}
|
||||
|
||||
func ObserveProxyConnectionDuration(ctx context.Context, tunnelID, protocol, result string, seconds float64) {
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("protocol", protocol),
|
||||
attribute.String("result", result),
|
||||
}
|
||||
if ShouldIncludeTunnelID() && tunnelID != "" {
|
||||
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
|
||||
}
|
||||
mProxyConnDuration.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||
}
|
||||
|
||||
// IncProxyConnectionEvent records proxy connection lifecycle events (opened/closed).
|
||||
func IncProxyConnectionEvent(ctx context.Context, tunnelID, protocol, event string) {
|
||||
if event == "" {
|
||||
event = "unknown"
|
||||
}
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("protocol", protocol),
|
||||
attribute.String("event", event),
|
||||
}
|
||||
if ShouldIncludeTunnelID() && tunnelID != "" {
|
||||
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
|
||||
}
|
||||
mProxyConnectionsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||
}
|
||||
|
||||
// --- Config/PKI helpers ---
|
||||
|
||||
func ObserveConfigApply(ctx context.Context, phase, result string, seconds float64) {
|
||||
mConfigApply.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("phase", phase),
|
||||
attribute.String("result", result),
|
||||
)...))
|
||||
}
|
||||
|
||||
func IncCertRotation(ctx context.Context, result string) {
|
||||
mCertRotationTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("result", result),
|
||||
)...))
|
||||
}
|
||||
|
||||
func ObserveTunnelLatency(ctx context.Context, tunnelID, transport string, seconds float64) {
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("transport", transport),
|
||||
}
|
||||
if ShouldIncludeTunnelID() && tunnelID != "" {
|
||||
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
|
||||
}
|
||||
mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||
}
|
||||
|
||||
func IncReconnect(ctx context.Context, tunnelID, initiator, reason string) {
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("initiator", initiator),
|
||||
attribute.String("reason", reason),
|
||||
}
|
||||
if ShouldIncludeTunnelID() && tunnelID != "" {
|
||||
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
|
||||
}
|
||||
mReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||
}
|
||||
|
||||
func IncConnAttempt(ctx context.Context, transport, result string) {
|
||||
mConnAttempts.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("transport", transport),
|
||||
attribute.String("result", result),
|
||||
)...))
|
||||
}
|
||||
|
||||
func IncConnError(ctx context.Context, transport, typ string) {
|
||||
mConnErrors.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("transport", transport),
|
||||
attribute.String("error_type", typ),
|
||||
)...))
|
||||
}
|
||||
59
internal/telemetry/metrics_test_helper.go
Normal file
59
internal/telemetry/metrics_test_helper.go
Normal file
@@ -0,0 +1,59 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// resetMetricsForTest restores the package's telemetry state to its initial
// condition so tests can re-run initialization: it re-arms the sync.Once
// guards, clears all stoppers and instrument globals, and resets the atomic
// flags/values. The WebSocket connected-state callback is unregistered (via
// wsConnStopper) before being cleared; the other stoppers are dropped
// without being invoked. Test-only helper — not safe to call concurrently
// with metric emission.
func resetMetricsForTest() {
	initOnce = sync.Once{}
	obsOnce = sync.Once{}
	proxyObsOnce = sync.Once{}
	obsStopper = nil
	proxyStopper = nil
	// Unregister the connected-state callback so a re-init can register anew.
	if wsConnStopper != nil {
		wsConnStopper()
	}
	wsConnStopper = nil
	meter = nil

	mSiteRegistrations = nil
	mSiteOnline = nil
	mSiteLastHeartbeat = nil

	mTunnelSessions = nil
	mTunnelBytes = nil
	mTunnelLatency = nil
	mReconnects = nil

	mConnAttempts = nil
	mConnErrors = nil

	mConfigReloads = nil
	mConfigApply = nil
	mCertRotationTotal = nil
	mProcessStartTime = nil

	mBuildInfo = nil

	mWSConnectLatency = nil
	mWSMessages = nil
	mWSDisconnects = nil
	mWSKeepaliveFailure = nil
	mWSSessionDuration = nil
	mWSConnected = nil
	mWSReconnects = nil

	mProxyActiveConns = nil
	mProxyBufferBytes = nil
	mProxyAsyncBacklogByte = nil
	mProxyDropsTotal = nil
	mProxyAcceptsTotal = nil
	mProxyConnDuration = nil
	mProxyConnectionsTotal = nil

	// Fresh process-start timestamp (Unix seconds) for the next init.
	processStartUnix = float64(time.Now().UnixNano()) / 1e9
	wsConnectedState.Store(0)
	includeTunnelIDVal.Store(false)
	includeSiteLabelVal.Store(false)
}
|
||||
106
internal/telemetry/state_view.go
Normal file
106
internal/telemetry/state_view.go
Normal file
@@ -0,0 +1,106 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/metric"
|
||||
)
|
||||
|
||||
// StateView provides a read-only view for observable gauges.
// Implementations must be concurrency-safe and avoid blocking operations.
// All methods should be fast and use RLocks where applicable, since they are
// invoked on every metric collection.
type StateView interface {
	// ListSites returns a stable, low-cardinality list of site IDs to expose.
	ListSites() []string
	// Online returns whether the site is online; ok is false when the site
	// is unknown to the view.
	Online(siteID string) (online bool, ok bool)
	// LastHeartbeat returns the last heartbeat time for a site; ok is false
	// when no heartbeat has been recorded.
	LastHeartbeat(siteID string) (t time.Time, ok bool)
	// ActiveSessions returns the current number of active sessions for a site (across tunnels),
	// or scoped to site if your model is site-scoped.
	ActiveSessions(siteID string) (n int64, ok bool)
}
|
||||
|
||||
var (
	// stateView holds the currently registered StateView (see
	// RegisterStateView); read by the default observable callback.
	stateView atomic.Value // of type StateView
)
|
||||
|
||||
// RegisterStateView sets the global StateView used by the default observable callback.
|
||||
func RegisterStateView(v StateView) {
|
||||
stateView.Store(v)
|
||||
// If instruments are registered, ensure a callback exists.
|
||||
if v != nil {
|
||||
SetObservableCallback(func(ctx context.Context, o metric.Observer) error {
|
||||
if any := stateView.Load(); any != nil {
|
||||
if sv, ok := any.(StateView); ok {
|
||||
for _, siteID := range sv.ListSites() {
|
||||
observeSiteOnlineFor(o, sv, siteID)
|
||||
observeLastHeartbeatFor(o, sv, siteID)
|
||||
observeSessionsFor(o, siteID, sv)
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func observeSiteOnlineFor(o metric.Observer, sv StateView, siteID string) {
|
||||
if online, ok := sv.Online(siteID); ok {
|
||||
val := int64(0)
|
||||
if online {
|
||||
val = 1
|
||||
}
|
||||
o.ObserveInt64(mSiteOnline, val, metric.WithAttributes(
|
||||
attribute.String("site_id", siteID),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
func observeLastHeartbeatFor(o metric.Observer, sv StateView, siteID string) {
|
||||
if t, ok := sv.LastHeartbeat(siteID); ok {
|
||||
ts := float64(t.UnixNano()) / 1e9
|
||||
o.ObserveFloat64(mSiteLastHeartbeat, ts, metric.WithAttributes(
|
||||
attribute.String("site_id", siteID),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
func observeSessionsFor(o metric.Observer, siteID string, any interface{}) {
|
||||
if tm, ok := any.(interface{ SessionsByTunnel() map[string]int64 }); ok {
|
||||
sessions := tm.SessionsByTunnel()
|
||||
// If tunnel_id labels are enabled, preserve existing per-tunnel observations
|
||||
if ShouldIncludeTunnelID() {
|
||||
for tid, n := range sessions {
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("site_id", siteID),
|
||||
}
|
||||
if tid != "" {
|
||||
attrs = append(attrs, attribute.String("tunnel_id", tid))
|
||||
}
|
||||
o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes(attrs...))
|
||||
}
|
||||
return
|
||||
}
|
||||
// When tunnel_id is disabled, collapse per-tunnel counts into a single site-level value
|
||||
var total int64
|
||||
for _, n := range sessions {
|
||||
total += n
|
||||
}
|
||||
// If there are no per-tunnel entries, fall back to ActiveSessions() if available
|
||||
if total == 0 {
|
||||
if svAny := stateView.Load(); svAny != nil {
|
||||
if sv, ok := svAny.(StateView); ok {
|
||||
if n, ok2 := sv.ActiveSessions(siteID); ok2 {
|
||||
total = n
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
o.ObserveInt64(mTunnelSessions, total, metric.WithAttributes(attribute.String("site_id", siteID)))
|
||||
return
|
||||
}
|
||||
}
|
||||
384
internal/telemetry/telemetry.go
Normal file
384
internal/telemetry/telemetry.go
Normal file
@@ -0,0 +1,384 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
promclient "github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
"go.opentelemetry.io/contrib/instrumentation/runtime"
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
|
||||
"go.opentelemetry.io/otel/exporters/prometheus"
|
||||
"go.opentelemetry.io/otel/sdk/metric"
|
||||
"go.opentelemetry.io/otel/sdk/resource"
|
||||
"go.opentelemetry.io/otel/sdk/trace"
|
||||
semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
|
||||
"google.golang.org/grpc/credentials"
|
||||
)
|
||||
|
||||
// Config controls telemetry initialization via env flags.
//
// Defaults align with the issue requirements:
// - Prometheus exporter enabled by default (/metrics)
// - OTLP exporter disabled by default
// - Durations in seconds, bytes in raw bytes
// - Admin HTTP server address configurable (for mounting /metrics)
type Config struct {
	// ServiceName / ServiceVersion populate the OTel resource.
	ServiceName    string
	ServiceVersion string

	// Optional resource attributes
	SiteID string
	Region string

	// Exporter toggles: Prometheus pull endpoint and OTLP push.
	PromEnabled bool
	OTLPEnabled bool

	OTLPEndpoint string // host:port
	OTLPInsecure bool   // skip TLS on the OTLP gRPC connection

	// MetricExportInterval is the OTLP periodic-reader push interval.
	MetricExportInterval time.Duration
	AdminAddr            string // e.g.: ":2112"

	// Optional build info for newt_build_info metric
	BuildVersion string
	BuildCommit  string
}
|
||||
|
||||
// FromEnv reads configuration from environment variables.
//
// NEWT_METRICS_PROMETHEUS_ENABLED (default: true)
// NEWT_METRICS_OTLP_ENABLED (default: false)
// OTEL_EXPORTER_OTLP_ENDPOINT (default: "localhost:4317")
// OTEL_EXPORTER_OTLP_INSECURE (default: true)
// OTEL_METRIC_EXPORT_INTERVAL (default: 15s)
// OTEL_SERVICE_NAME (default: "newt")
// OTEL_SERVICE_VERSION (default: "")
// NEWT_ADMIN_ADDR (default: ":2112")
//
// Site/region resolution order: NEWT_SITE_ID (then NEWT_ID) and NEWT_REGION
// first, then the site_id/region keys of OTEL_RESOURCE_ATTRIBUTES.
func FromEnv() Config {
	// Prefer explicit NEWT_* env vars, then fall back to OTEL_RESOURCE_ATTRIBUTES
	site := getenv("NEWT_SITE_ID", "")
	if site == "" {
		site = getenv("NEWT_ID", "")
	}
	region := os.Getenv("NEWT_REGION")
	if site == "" || region == "" {
		if ra := os.Getenv("OTEL_RESOURCE_ATTRIBUTES"); ra != "" {
			m := parseResourceAttributes(ra)
			if site == "" {
				site = m["site_id"]
			}
			if region == "" {
				region = m["region"]
			}
		}
	}
	return Config{
		ServiceName:          getenv("OTEL_SERVICE_NAME", "newt"),
		ServiceVersion:       os.Getenv("OTEL_SERVICE_VERSION"),
		SiteID:               site,
		Region:               region,
		PromEnabled:          getenv("NEWT_METRICS_PROMETHEUS_ENABLED", "true") == "true",
		OTLPEnabled:          getenv("NEWT_METRICS_OTLP_ENABLED", "false") == "true",
		OTLPEndpoint:         getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317"),
		OTLPInsecure:         getenv("OTEL_EXPORTER_OTLP_INSECURE", "true") == "true",
		MetricExportInterval: getdur("OTEL_METRIC_EXPORT_INTERVAL", 15*time.Second),
		AdminAddr:            getenv("NEWT_ADMIN_ADDR", ":2112"),
	}
}
|
||||
|
||||
// Setup holds initialized telemetry providers and (optionally) a /metrics handler.
// Call Shutdown when the process terminates to flush exporters.
type Setup struct {
	// MeterProvider / TracerProvider are the globally-installed providers.
	MeterProvider  *metric.MeterProvider
	TracerProvider *trace.TracerProvider

	PrometheusHandler http.Handler // nil if Prometheus exporter disabled

	// shutdowns are flush/close hooks run in reverse order by Shutdown.
	shutdowns []func(context.Context) error
}
|
||||
|
||||
// Init configures OpenTelemetry metrics and (optionally) tracing.
//
// It sets a global MeterProvider and TracerProvider, registers runtime instrumentation,
// installs recommended histogram views for *_latency_seconds, and returns a Setup with
// a Shutdown method to flush exporters. Errors from exporter setup or
// instrument registration abort initialization.
func Init(ctx context.Context, cfg Config) (*Setup, error) {
	// Configure tunnel_id label inclusion from env (default true)
	if getenv("NEWT_METRICS_INCLUDE_TUNNEL_ID", "true") == "true" {
		includeTunnelIDVal.Store(true)
	} else {
		includeTunnelIDVal.Store(false)
	}
	// Site/region label inclusion (env default "true" here).
	if getenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true") == "true" {
		includeSiteLabelVal.Store(true)
	} else {
		includeSiteLabelVal.Store(false)
	}
	res := buildResource(ctx, cfg)
	UpdateSiteInfo(cfg.SiteID, cfg.Region)

	s := &Setup{}
	readers, promHandler, shutdowns, err := setupMetricExport(ctx, cfg, res)
	if err != nil {
		return nil, err
	}
	s.PrometheusHandler = promHandler
	// Build provider
	mp := buildMeterProvider(res, readers)
	otel.SetMeterProvider(mp)
	s.MeterProvider = mp
	s.shutdowns = append(s.shutdowns, mp.Shutdown)
	// Optional tracing (setupTracing returns nil on exporter failure,
	// in which case tracing is silently skipped).
	if cfg.OTLPEnabled {
		if tp, shutdown := setupTracing(ctx, cfg, res); tp != nil {
			otel.SetTracerProvider(tp)
			s.TracerProvider = tp
			s.shutdowns = append(s.shutdowns, func(c context.Context) error {
				return errors.Join(shutdown(c), tp.Shutdown(c))
			})
		}
	}
	// Add metric exporter shutdowns (run before mp.Shutdown, since Shutdown
	// iterates in reverse order).
	s.shutdowns = append(s.shutdowns, shutdowns...)
	// Runtime metrics; failure to start is deliberately ignored.
	_ = runtime.Start(runtime.WithMeterProvider(mp))
	// Instruments
	if err := registerInstruments(); err != nil {
		return nil, err
	}
	if cfg.BuildVersion != "" || cfg.BuildCommit != "" {
		RegisterBuildInfo(cfg.BuildVersion, cfg.BuildCommit)
	}
	return s, nil
}
|
||||
|
||||
func buildResource(ctx context.Context, cfg Config) *resource.Resource {
|
||||
attrs := []attribute.KeyValue{
|
||||
semconv.ServiceName(cfg.ServiceName),
|
||||
semconv.ServiceVersion(cfg.ServiceVersion),
|
||||
}
|
||||
if cfg.SiteID != "" {
|
||||
attrs = append(attrs, attribute.String("site_id", cfg.SiteID))
|
||||
}
|
||||
if cfg.Region != "" {
|
||||
attrs = append(attrs, attribute.String("region", cfg.Region))
|
||||
}
|
||||
res, _ := resource.New(ctx, resource.WithFromEnv(), resource.WithHost(), resource.WithAttributes(attrs...))
|
||||
return res
|
||||
}
|
||||
|
||||
// setupMetricExport builds the metric readers requested by cfg: a Prometheus
// exporter backed by a private registry (returned as an http.Handler for
// mounting at /metrics) and/or an OTLP gRPC periodic reader. It returns the
// readers, the Prometheus handler (nil when disabled), and the exporter
// shutdown hooks. The resource parameter is currently unused.
func setupMetricExport(ctx context.Context, cfg Config, _ *resource.Resource) ([]metric.Reader, http.Handler, []func(context.Context) error, error) {
	var readers []metric.Reader
	var shutdowns []func(context.Context) error
	var promHandler http.Handler
	if cfg.PromEnabled {
		// Private registry so only OTel-bridged metrics are served.
		reg := promclient.NewRegistry()
		exp, err := prometheus.New(prometheus.WithRegisterer(reg))
		if err != nil {
			return nil, nil, nil, err
		}
		readers = append(readers, exp)
		promHandler = promhttp.HandlerFor(reg, promhttp.HandlerOpts{})
	}
	if cfg.OTLPEnabled {
		mopts := []otlpmetricgrpc.Option{otlpmetricgrpc.WithEndpoint(cfg.OTLPEndpoint)}
		if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 {
			mopts = append(mopts, otlpmetricgrpc.WithHeaders(hdrs))
		}
		if cfg.OTLPInsecure {
			mopts = append(mopts, otlpmetricgrpc.WithInsecure())
		} else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" {
			// NOTE(review): a TLS credential load failure is silently ignored
			// here, falling back to default credentials — confirm intended.
			if creds, cerr := credentials.NewClientTLSFromFile(certFile, ""); cerr == nil {
				mopts = append(mopts, otlpmetricgrpc.WithTLSCredentials(creds))
			}
		}
		mexp, err := otlpmetricgrpc.New(ctx, mopts...)
		if err != nil {
			return nil, nil, nil, err
		}
		readers = append(readers, metric.NewPeriodicReader(mexp, metric.WithInterval(cfg.MetricExportInterval)))
		shutdowns = append(shutdowns, mexp.Shutdown)
	}
	return readers, promHandler, shutdowns, nil
}
|
||||
|
||||
func buildMeterProvider(res *resource.Resource, readers []metric.Reader) *metric.MeterProvider {
|
||||
var mpOpts []metric.Option
|
||||
mpOpts = append(mpOpts, metric.WithResource(res))
|
||||
for _, r := range readers {
|
||||
mpOpts = append(mpOpts, metric.WithReader(r))
|
||||
}
|
||||
mpOpts = append(mpOpts, metric.WithView(metric.NewView(
|
||||
metric.Instrument{Name: "newt_*_latency_seconds"},
|
||||
metric.Stream{Aggregation: metric.AggregationExplicitBucketHistogram{Boundaries: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}}},
|
||||
)))
|
||||
mpOpts = append(mpOpts, metric.WithView(metric.NewView(
|
||||
metric.Instrument{Name: "newt_*"},
|
||||
metric.Stream{AttributeFilter: func(kv attribute.KeyValue) bool {
|
||||
k := string(kv.Key)
|
||||
switch k {
|
||||
case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "initiator", "error_type", "msg_type", "phase", "version", "commit", "site_id", "region":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}},
|
||||
)))
|
||||
return metric.NewMeterProvider(mpOpts...)
|
||||
}
|
||||
|
||||
// setupTracing builds an OTLP gRPC trace exporter and a batching
// TracerProvider using the same endpoint/header/TLS settings as the metric
// exporter. On exporter creation failure it returns (nil, nil) — the error
// is swallowed and the caller treats tracing as unavailable. Returns the
// provider and the exporter's shutdown hook.
func setupTracing(ctx context.Context, cfg Config, res *resource.Resource) (*trace.TracerProvider, func(context.Context) error) {
	topts := []otlptracegrpc.Option{otlptracegrpc.WithEndpoint(cfg.OTLPEndpoint)}
	if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 {
		topts = append(topts, otlptracegrpc.WithHeaders(hdrs))
	}
	if cfg.OTLPInsecure {
		topts = append(topts, otlptracegrpc.WithInsecure())
	} else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" {
		// NOTE(review): TLS credential load errors are silently ignored here;
		// confirm falling back to defaults is intended.
		if creds, cerr := credentials.NewClientTLSFromFile(certFile, ""); cerr == nil {
			topts = append(topts, otlptracegrpc.WithTLSCredentials(creds))
		}
	}
	exp, err := otlptracegrpc.New(ctx, topts...)
	if err != nil {
		return nil, nil
	}
	tp := trace.NewTracerProvider(trace.WithBatcher(exp), trace.WithResource(res))
	return tp, exp.Shutdown
}
|
||||
|
||||
// Shutdown flushes exporters and providers in reverse init order.
|
||||
func (s *Setup) Shutdown(ctx context.Context) error {
|
||||
var err error
|
||||
for i := len(s.shutdowns) - 1; i >= 0; i-- {
|
||||
err = errors.Join(err, s.shutdowns[i](ctx))
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// parseOTLPHeaders parses OTEL_EXPORTER_OTLP_HEADERS ("k=v,k2=v2") into a
// map. Pairs without '=' are skipped; keys and values are whitespace-trimmed.
func parseOTLPHeaders(h string) map[string]string {
	headers := make(map[string]string)
	if h == "" {
		return headers
	}
	for _, pair := range strings.Split(h, ",") {
		key, value, found := strings.Cut(strings.TrimSpace(pair), "=")
		if !found {
			continue
		}
		headers[strings.TrimSpace(key)] = strings.TrimSpace(value)
	}
	return headers
}
|
||||
|
||||
// parseResourceAttributes parses OTEL_RESOURCE_ATTRIBUTES formatted as
// k=v,k2=v2 into a map. Whitespace around entries, keys, and values is
// trimmed; malformed entries (no '=') are ignored. An empty input yields
// an empty (non-nil) map.
func parseResourceAttributes(s string) map[string]string {
	attrs := make(map[string]string)
	for _, entry := range strings.Split(s, ",") {
		if key, val, found := strings.Cut(strings.TrimSpace(entry), "="); found {
			attrs[strings.TrimSpace(key)] = strings.TrimSpace(val)
		}
	}
	return attrs
}
|
||||
|
||||
// Global site/region used to enrich metric labels, plus the label toggles.
// All four are atomic.Value so readers never race with UpdateSiteInfo or
// toggle updates; an unset value falls back to the documented default.
var (
	siteIDVal atomic.Value // string: current site_id ("" until set)
	regionVal atomic.Value // string: current region ("" until set)

	includeTunnelIDVal  atomic.Value // bool; default true
	includeSiteLabelVal atomic.Value // bool; default false
)
|
||||
|
||||
// UpdateSiteInfo updates the global site_id and region used for metric labels.
|
||||
// Thread-safe via atomic.Value: subsequent metric emissions will include
|
||||
// the new labels, prior emissions remain unchanged.
|
||||
func UpdateSiteInfo(siteID, region string) {
|
||||
if siteID != "" {
|
||||
siteIDVal.Store(siteID)
|
||||
}
|
||||
if region != "" {
|
||||
regionVal.Store(region)
|
||||
}
|
||||
}
|
||||
|
||||
func getSiteID() string {
|
||||
if v, ok := siteIDVal.Load().(string); ok {
|
||||
return v
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func getRegion() string {
|
||||
if v, ok := regionVal.Load().(string); ok {
|
||||
return v
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// siteAttrs returns label KVs for site_id and region (if set).
|
||||
func siteAttrs() []attribute.KeyValue {
|
||||
var out []attribute.KeyValue
|
||||
if s := getSiteID(); s != "" {
|
||||
out = append(out, attribute.String("site_id", s))
|
||||
}
|
||||
if r := getRegion(); r != "" {
|
||||
out = append(out, attribute.String("region", r))
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// SiteLabelKVs exposes site label KVs for other packages (e.g., proxy manager).
|
||||
func SiteLabelKVs() []attribute.KeyValue {
|
||||
if !ShouldIncludeSiteLabels() {
|
||||
return nil
|
||||
}
|
||||
return siteAttrs()
|
||||
}
|
||||
|
||||
// ShouldIncludeTunnelID returns whether tunnel_id labels should be emitted.
|
||||
func ShouldIncludeTunnelID() bool {
|
||||
if v, ok := includeTunnelIDVal.Load().(bool); ok {
|
||||
return v
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// ShouldIncludeSiteLabels returns whether site_id/region should be emitted as
|
||||
// metric labels in addition to resource attributes.
|
||||
func ShouldIncludeSiteLabels() bool {
|
||||
if v, ok := includeSiteLabelVal.Load().(bool); ok {
|
||||
return v
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// getenv returns the value of environment variable k, or the default d when
// the variable is unset or empty.
func getenv(k, d string) string {
	val := os.Getenv(k)
	if val == "" {
		return d
	}
	return val
}
|
||||
|
||||
// getdur parses environment variable k as a time.Duration, returning the
// default d when the variable is unset, empty, or fails to parse.
func getdur(k string, d time.Duration) time.Duration {
	val := os.Getenv(k)
	if val == "" {
		return d
	}
	parsed, err := time.ParseDuration(val)
	if err != nil {
		return d
	}
	return parsed
}
|
||||
53
internal/telemetry/telemetry_attrfilter_test.go
Normal file
53
internal/telemetry/telemetry_attrfilter_test.go
Normal file
@@ -0,0 +1,53 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
)
|
||||
|
||||
// Test that disallowed attributes are filtered from the exposition.
|
||||
func TestAttributeFilterDropsUnknownKeys(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
resetMetricsForTest()
|
||||
t.Setenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true")
|
||||
cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0"}
|
||||
tel, err := Init(ctx, cfg)
|
||||
if err != nil {
|
||||
t.Fatalf("init: %v", err)
|
||||
}
|
||||
defer func() { _ = tel.Shutdown(context.Background()) }()
|
||||
|
||||
if tel.PrometheusHandler == nil {
|
||||
t.Fatalf("prom handler nil")
|
||||
}
|
||||
ts := httptest.NewServer(tel.PrometheusHandler)
|
||||
defer ts.Close()
|
||||
|
||||
// Add samples with disallowed attribute keys
|
||||
for _, k := range []string{"forbidden", "site_id", "host"} {
|
||||
set := attribute.NewSet(attribute.String(k, "x"))
|
||||
AddTunnelBytesSet(ctx, 123, set)
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
resp, err := http.Get(ts.URL)
|
||||
if err != nil {
|
||||
t.Fatalf("GET: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
b, _ := io.ReadAll(resp.Body)
|
||||
body := string(b)
|
||||
if strings.Contains(body, "forbidden=") {
|
||||
t.Fatalf("unexpected forbidden attribute leaked into metrics: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "site_id=\"x\"") {
|
||||
t.Fatalf("expected allowed attribute site_id to be present in metrics, got: %s", body)
|
||||
}
|
||||
}
|
||||
76
internal/telemetry/telemetry_golden_test.go
Normal file
76
internal/telemetry/telemetry_golden_test.go
Normal file
@@ -0,0 +1,76 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Golden test that /metrics contains expected metric names.
|
||||
func TestMetricsGoldenContains(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
resetMetricsForTest()
|
||||
t.Setenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true")
|
||||
cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0", BuildVersion: "test"}
|
||||
tel, err := Init(ctx, cfg)
|
||||
if err != nil {
|
||||
t.Fatalf("telemetry init error: %v", err)
|
||||
}
|
||||
defer func() { _ = tel.Shutdown(context.Background()) }()
|
||||
|
||||
if tel.PrometheusHandler == nil {
|
||||
t.Fatalf("prom handler nil")
|
||||
}
|
||||
ts := httptest.NewServer(tel.PrometheusHandler)
|
||||
defer ts.Close()
|
||||
|
||||
// Trigger counters to ensure they appear in the scrape
|
||||
IncConnAttempt(ctx, "websocket", "success")
|
||||
IncWSReconnect(ctx, "io_error")
|
||||
IncProxyConnectionEvent(ctx, "", "tcp", ProxyConnectionOpened)
|
||||
if tel.MeterProvider != nil {
|
||||
_ = tel.MeterProvider.ForceFlush(ctx)
|
||||
}
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
var body string
|
||||
for i := 0; i < 5; i++ {
|
||||
resp, err := http.Get(ts.URL)
|
||||
if err != nil {
|
||||
t.Fatalf("GET metrics failed: %v", err)
|
||||
}
|
||||
b, _ := io.ReadAll(resp.Body)
|
||||
_ = resp.Body.Close()
|
||||
body = string(b)
|
||||
if strings.Contains(body, "newt_connection_attempts_total") {
|
||||
break
|
||||
}
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
}
|
||||
|
||||
f, err := os.Open(filepath.Join("testdata", "expected_contains.golden"))
|
||||
if err != nil {
|
||||
t.Fatalf("read golden: %v", err)
|
||||
}
|
||||
defer f.Close()
|
||||
s := bufio.NewScanner(f)
|
||||
for s.Scan() {
|
||||
needle := strings.TrimSpace(s.Text())
|
||||
if needle == "" {
|
||||
continue
|
||||
}
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("expected metrics body to contain %q. body=\n%s", needle, body)
|
||||
}
|
||||
}
|
||||
if err := s.Err(); err != nil {
|
||||
t.Fatalf("scan golden: %v", err)
|
||||
}
|
||||
}
|
||||
65
internal/telemetry/telemetry_smoke_test.go
Normal file
65
internal/telemetry/telemetry_smoke_test.go
Normal file
@@ -0,0 +1,65 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Smoke test that /metrics contains at least one newt_* metric when Prom exporter is enabled.
|
||||
func TestMetricsSmoke(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
resetMetricsForTest()
|
||||
t.Setenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true")
|
||||
cfg := Config{
|
||||
ServiceName: "newt",
|
||||
PromEnabled: true,
|
||||
OTLPEnabled: false,
|
||||
AdminAddr: "127.0.0.1:0",
|
||||
BuildVersion: "test",
|
||||
BuildCommit: "deadbeef",
|
||||
MetricExportInterval: 5 * time.Second,
|
||||
}
|
||||
tel, err := Init(ctx, cfg)
|
||||
if err != nil {
|
||||
t.Fatalf("telemetry init error: %v", err)
|
||||
}
|
||||
defer func() { _ = tel.Shutdown(context.Background()) }()
|
||||
|
||||
// Serve the Prom handler on a test server
|
||||
if tel.PrometheusHandler == nil {
|
||||
t.Fatalf("Prometheus handler nil; PromEnabled should enable it")
|
||||
}
|
||||
ts := httptest.NewServer(tel.PrometheusHandler)
|
||||
defer ts.Close()
|
||||
|
||||
// Record a simple metric and then fetch /metrics
|
||||
IncConnAttempt(ctx, "websocket", "success")
|
||||
if tel.MeterProvider != nil {
|
||||
_ = tel.MeterProvider.ForceFlush(ctx)
|
||||
}
|
||||
// Give the exporter a tick to collect
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
var body string
|
||||
for i := 0; i < 5; i++ {
|
||||
resp, err := http.Get(ts.URL)
|
||||
if err != nil {
|
||||
t.Fatalf("GET /metrics failed: %v", err)
|
||||
}
|
||||
b, _ := io.ReadAll(resp.Body)
|
||||
_ = resp.Body.Close()
|
||||
body = string(b)
|
||||
if strings.Contains(body, "newt_connection_attempts_total") {
|
||||
break
|
||||
}
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
}
|
||||
if !strings.Contains(body, "newt_connection_attempts_total") {
|
||||
t.Fatalf("expected newt_connection_attempts_total in metrics, got:\n%s", body)
|
||||
}
|
||||
}
|
||||
7
internal/telemetry/testdata/expected_contains.golden
vendored
Normal file
7
internal/telemetry/testdata/expected_contains.golden
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
newt_connection_attempts_total
|
||||
newt_websocket_connected
|
||||
newt_websocket_reconnects_total
|
||||
newt_proxy_connections_total
|
||||
newt_build_info
|
||||
|
||||
process_start_time_seconds
|
||||
205
main.go
205
main.go
@@ -1,7 +1,9 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net"
|
||||
@@ -22,6 +24,9 @@ import (
|
||||
"github.com/fosrl/newt/updates"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
|
||||
"github.com/fosrl/newt/internal/state"
|
||||
"github.com/fosrl/newt/internal/telemetry"
|
||||
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
|
||||
"golang.zx2c4.com/wireguard/conn"
|
||||
"golang.zx2c4.com/wireguard/device"
|
||||
"golang.zx2c4.com/wireguard/tun"
|
||||
@@ -91,6 +96,14 @@ func (s *stringSlice) Set(value string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
const (
|
||||
fmtErrMarshaling = "Error marshaling data: %v"
|
||||
fmtReceivedMsg = "Received: %+v"
|
||||
topicWGRegister = "newt/wg/register"
|
||||
msgNoTunnelOrProxy = "No tunnel IP or proxy manager available"
|
||||
fmtErrParsingTargetData = "Error parsing target data: %v"
|
||||
)
|
||||
|
||||
var (
|
||||
endpoint string
|
||||
id string
|
||||
@@ -120,6 +133,15 @@ var (
|
||||
preferEndpoint string
|
||||
healthMonitor *healthcheck.Monitor
|
||||
enforceHealthcheckCert bool
|
||||
// Build/version (can be overridden via -ldflags "-X main.newtVersion=...")
|
||||
newtVersion = "version_replaceme"
|
||||
|
||||
// Observability/metrics flags
|
||||
metricsEnabled bool
|
||||
otlpEnabled bool
|
||||
adminAddr string
|
||||
region string
|
||||
metricsAsyncBytes bool
|
||||
blueprintFile string
|
||||
noCloud bool
|
||||
|
||||
@@ -133,6 +155,10 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Prepare context for graceful shutdown and signal handling
|
||||
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
|
||||
defer stop()
|
||||
|
||||
// if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values
|
||||
endpoint = os.Getenv("PANGOLIN_ENDPOINT")
|
||||
id = os.Getenv("NEWT_ID")
|
||||
@@ -144,6 +170,17 @@ func main() {
|
||||
interfaceName = os.Getenv("INTERFACE")
|
||||
generateAndSaveKeyTo = os.Getenv("GENERATE_AND_SAVE_KEY_TO")
|
||||
keepInterfaceEnv := os.Getenv("KEEP_INTERFACE")
|
||||
acceptClientsEnv := os.Getenv("ACCEPT_CLIENTS")
|
||||
useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE")
|
||||
enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT")
|
||||
|
||||
// Metrics/observability env mirrors
|
||||
metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED")
|
||||
otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED")
|
||||
adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR")
|
||||
regionEnv := os.Getenv("NEWT_REGION")
|
||||
asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES")
|
||||
|
||||
keepInterface = keepInterfaceEnv == "true"
|
||||
acceptClientsEnv := os.Getenv("ACCEPT_CLIENTS")
|
||||
acceptClients = acceptClientsEnv == "true"
|
||||
@@ -286,6 +323,43 @@ func main() {
|
||||
flag.BoolVar(&noCloud, "no-cloud", false, "Disable cloud failover")
|
||||
}
|
||||
|
||||
// Metrics/observability flags (mirror ENV if unset)
|
||||
if metricsEnabledEnv == "" {
|
||||
flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter")
|
||||
} else {
|
||||
if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil {
|
||||
metricsEnabled = v
|
||||
} else {
|
||||
metricsEnabled = true
|
||||
}
|
||||
}
|
||||
if otlpEnabledEnv == "" {
|
||||
flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT")
|
||||
} else {
|
||||
if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil {
|
||||
otlpEnabled = v
|
||||
}
|
||||
}
|
||||
if adminAddrEnv == "" {
|
||||
flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address")
|
||||
} else {
|
||||
adminAddr = adminAddrEnv
|
||||
}
|
||||
// Async bytes toggle
|
||||
if asyncBytesEnv == "" {
|
||||
flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)")
|
||||
} else {
|
||||
if v, err := strconv.ParseBool(asyncBytesEnv); err == nil {
|
||||
metricsAsyncBytes = v
|
||||
}
|
||||
}
|
||||
// Optional region flag (resource attribute)
|
||||
if regionEnv == "" {
|
||||
flag.StringVar(®ion, "region", "", "Optional region resource attribute (also NEWT_REGION)")
|
||||
} else {
|
||||
region = regionEnv
|
||||
}
|
||||
|
||||
// do a --version check
|
||||
version := flag.Bool("version", false, "Print the version")
|
||||
|
||||
@@ -300,12 +374,57 @@ func main() {
|
||||
loggerLevel := parseLogLevel(logLevel)
|
||||
logger.GetLogger().SetLevel(parseLogLevel(logLevel))
|
||||
|
||||
newtVersion := "version_replaceme"
|
||||
// Initialize telemetry after flags are parsed (so flags override env)
|
||||
tcfg := telemetry.FromEnv()
|
||||
tcfg.PromEnabled = metricsEnabled
|
||||
tcfg.OTLPEnabled = otlpEnabled
|
||||
if adminAddr != "" {
|
||||
tcfg.AdminAddr = adminAddr
|
||||
}
|
||||
// Resource attributes (if available)
|
||||
tcfg.SiteID = id
|
||||
tcfg.Region = region
|
||||
// Build info
|
||||
tcfg.BuildVersion = newtVersion
|
||||
tcfg.BuildCommit = os.Getenv("NEWT_COMMIT")
|
||||
|
||||
tel, telErr := telemetry.Init(ctx, tcfg)
|
||||
if telErr != nil {
|
||||
logger.Warn("Telemetry init failed: %v", telErr)
|
||||
}
|
||||
if tel != nil {
|
||||
// Admin HTTP server (exposes /metrics when Prometheus exporter is enabled)
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) })
|
||||
if tel.PrometheusHandler != nil {
|
||||
mux.Handle("/metrics", tel.PrometheusHandler)
|
||||
}
|
||||
admin := &http.Server{
|
||||
Addr: tcfg.AdminAddr,
|
||||
Handler: otelhttp.NewHandler(mux, "newt-admin"),
|
||||
ReadTimeout: 5 * time.Second,
|
||||
WriteTimeout: 10 * time.Second,
|
||||
ReadHeaderTimeout: 5 * time.Second,
|
||||
IdleTimeout: 30 * time.Second,
|
||||
}
|
||||
go func() {
|
||||
if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
|
||||
logger.Warn("admin http error: %v", err)
|
||||
}
|
||||
}()
|
||||
defer func() {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
_ = admin.Shutdown(ctx)
|
||||
}()
|
||||
defer func() { _ = tel.Shutdown(context.Background()) }()
|
||||
}
|
||||
|
||||
if *version {
|
||||
fmt.Println("Newt version " + newtVersion)
|
||||
os.Exit(0)
|
||||
} else {
|
||||
logger.Info("Newt version " + newtVersion)
|
||||
logger.Info("Newt version %s", newtVersion)
|
||||
}
|
||||
|
||||
if err := updates.CheckForUpdate("fosrl", "newt", newtVersion); err != nil {
|
||||
@@ -376,6 +495,8 @@ func main() {
|
||||
}
|
||||
endpoint = client.GetConfig().Endpoint // Update endpoint from config
|
||||
id = client.GetConfig().ID // Update ID from config
|
||||
// Update site labels for metrics with the resolved ID
|
||||
telemetry.UpdateSiteInfo(id, region)
|
||||
|
||||
// output env var values if set
|
||||
logger.Debug("Endpoint: %v", endpoint)
|
||||
@@ -484,6 +605,10 @@ func main() {
|
||||
// Register handlers for different message types
|
||||
client.RegisterHandler("newt/wg/connect", func(msg websocket.WSMessage) {
|
||||
logger.Debug("Received registration message")
|
||||
regResult := "success"
|
||||
defer func() {
|
||||
telemetry.IncSiteRegistration(ctx, regResult)
|
||||
}()
|
||||
if stopFunc != nil {
|
||||
stopFunc() // stop the ws from sending more requests
|
||||
stopFunc = nil // reset stopFunc to nil to avoid double stopping
|
||||
@@ -502,22 +627,25 @@ func main() {
|
||||
|
||||
jsonData, err := json.Marshal(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error marshaling data: %v", err)
|
||||
logger.Info(fmtErrMarshaling, err)
|
||||
regResult = "failure"
|
||||
return
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(jsonData, &wgData); err != nil {
|
||||
logger.Info("Error unmarshaling target data: %v", err)
|
||||
regResult = "failure"
|
||||
return
|
||||
}
|
||||
|
||||
logger.Debug("Received: %+v", msg)
|
||||
logger.Debug(fmtReceivedMsg, msg)
|
||||
tun, tnet, err = netstack.CreateNetTUN(
|
||||
[]netip.Addr{netip.MustParseAddr(wgData.TunnelIP)},
|
||||
[]netip.Addr{netip.MustParseAddr(dns)},
|
||||
mtuInt)
|
||||
if err != nil {
|
||||
logger.Error("Failed to create TUN device: %v", err)
|
||||
regResult = "failure"
|
||||
}
|
||||
|
||||
setDownstreamTNetstack(tnet)
|
||||
@@ -531,6 +659,7 @@ func main() {
|
||||
host, _, err := net.SplitHostPort(wgData.Endpoint)
|
||||
if err != nil {
|
||||
logger.Error("Failed to split endpoint: %v", err)
|
||||
regResult = "failure"
|
||||
return
|
||||
}
|
||||
|
||||
@@ -539,6 +668,7 @@ func main() {
|
||||
endpoint, err := resolveDomain(wgData.Endpoint)
|
||||
if err != nil {
|
||||
logger.Error("Failed to resolve endpoint: %v", err)
|
||||
regResult = "failure"
|
||||
return
|
||||
}
|
||||
|
||||
@@ -554,12 +684,14 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
err = dev.IpcSet(config)
|
||||
if err != nil {
|
||||
logger.Error("Failed to configure WireGuard device: %v", err)
|
||||
regResult = "failure"
|
||||
}
|
||||
|
||||
// Bring up the device
|
||||
err = dev.Up()
|
||||
if err != nil {
|
||||
logger.Error("Failed to bring up WireGuard device: %v", err)
|
||||
regResult = "failure"
|
||||
}
|
||||
|
||||
logger.Debug("WireGuard device created. Lets ping the server now...")
|
||||
@@ -572,9 +704,13 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
// Use reliable ping for initial connection test
|
||||
logger.Debug("Testing initial connection with reliable ping...")
|
||||
_, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
if err == nil && wgData.PublicKey != "" {
|
||||
telemetry.ObserveTunnelLatency(ctx, wgData.PublicKey, "wireguard", lat.Seconds())
|
||||
}
|
||||
if err != nil {
|
||||
logger.Warn("Initial reliable ping failed, but continuing: %v", err)
|
||||
regResult = "failure"
|
||||
} else {
|
||||
logger.Debug("Initial connection test successful")
|
||||
}
|
||||
@@ -585,11 +721,14 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
// as the pings will continue in the background
|
||||
if !connected {
|
||||
logger.Debug("Starting ping check")
|
||||
pingStopChan = startPingCheck(tnet, wgData.ServerIP, client)
|
||||
pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey)
|
||||
}
|
||||
|
||||
// Create proxy manager
|
||||
pm = proxy.NewProxyManager(tnet)
|
||||
pm.SetAsyncBytes(metricsAsyncBytes)
|
||||
// Set tunnel_id for metrics (WireGuard peer public key)
|
||||
pm.SetTunnelID(wgData.PublicKey)
|
||||
|
||||
connected = true
|
||||
|
||||
@@ -626,10 +765,19 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received reconnect message")
|
||||
if wgData.PublicKey != "" {
|
||||
telemetry.IncReconnect(ctx, wgData.PublicKey, "server", telemetry.ReasonServerRequest)
|
||||
}
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
|
||||
// Clear metrics attrs and sessions for the tunnel
|
||||
if pm != nil {
|
||||
pm.ClearTunnelID()
|
||||
state.Global().ClearTunnel(wgData.PublicKey)
|
||||
}
|
||||
|
||||
// Mark as disconnected
|
||||
connected = false
|
||||
|
||||
@@ -648,6 +796,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received termination message")
|
||||
if wgData.PublicKey != "" {
|
||||
telemetry.IncReconnect(ctx, wgData.PublicKey, "server", telemetry.ReasonServerRequest)
|
||||
}
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
@@ -675,7 +826,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
jsonData, err := json.Marshal(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error marshaling data: %v", err)
|
||||
logger.Info(fmtErrMarshaling, err)
|
||||
return
|
||||
}
|
||||
if err := json.Unmarshal(jsonData, &exitNodeData); err != nil {
|
||||
@@ -716,7 +867,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
},
|
||||
}
|
||||
|
||||
stopFunc = client.SendMessageInterval("newt/wg/register", map[string]interface{}{
|
||||
stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{
|
||||
"publicKey": publicKey.String(),
|
||||
"pingResults": pingResults,
|
||||
"newtVersion": newtVersion,
|
||||
@@ -819,7 +970,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
|
||||
// Send the ping results to the cloud for selection
|
||||
stopFunc = client.SendMessageInterval("newt/wg/register", map[string]interface{}{
|
||||
stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{
|
||||
"publicKey": publicKey.String(),
|
||||
"pingResults": pingResults,
|
||||
"newtVersion": newtVersion,
|
||||
@@ -829,17 +980,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
})
|
||||
|
||||
client.RegisterHandler("newt/tcp/add", func(msg websocket.WSMessage) {
|
||||
logger.Debug("Received: %+v", msg)
|
||||
logger.Debug(fmtReceivedMsg, msg)
|
||||
|
||||
// if there is no wgData or pm, we can't add targets
|
||||
if wgData.TunnelIP == "" || pm == nil {
|
||||
logger.Info("No tunnel IP or proxy manager available")
|
||||
logger.Info(msgNoTunnelOrProxy)
|
||||
return
|
||||
}
|
||||
|
||||
targetData, err := parseTargetData(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error parsing target data: %v", err)
|
||||
logger.Info(fmtErrParsingTargetData, err)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -854,17 +1005,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
})
|
||||
|
||||
client.RegisterHandler("newt/udp/add", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received: %+v", msg)
|
||||
logger.Info(fmtReceivedMsg, msg)
|
||||
|
||||
// if there is no wgData or pm, we can't add targets
|
||||
if wgData.TunnelIP == "" || pm == nil {
|
||||
logger.Info("No tunnel IP or proxy manager available")
|
||||
logger.Info(msgNoTunnelOrProxy)
|
||||
return
|
||||
}
|
||||
|
||||
targetData, err := parseTargetData(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error parsing target data: %v", err)
|
||||
logger.Info(fmtErrParsingTargetData, err)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -879,17 +1030,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
})
|
||||
|
||||
client.RegisterHandler("newt/udp/remove", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received: %+v", msg)
|
||||
logger.Info(fmtReceivedMsg, msg)
|
||||
|
||||
// if there is no wgData or pm, we can't add targets
|
||||
if wgData.TunnelIP == "" || pm == nil {
|
||||
logger.Info("No tunnel IP or proxy manager available")
|
||||
logger.Info(msgNoTunnelOrProxy)
|
||||
return
|
||||
}
|
||||
|
||||
targetData, err := parseTargetData(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error parsing target data: %v", err)
|
||||
logger.Info(fmtErrParsingTargetData, err)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -904,17 +1055,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
})
|
||||
|
||||
client.RegisterHandler("newt/tcp/remove", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received: %+v", msg)
|
||||
logger.Info(fmtReceivedMsg, msg)
|
||||
|
||||
// if there is no wgData or pm, we can't add targets
|
||||
if wgData.TunnelIP == "" || pm == nil {
|
||||
logger.Info("No tunnel IP or proxy manager available")
|
||||
logger.Info(msgNoTunnelOrProxy)
|
||||
return
|
||||
}
|
||||
|
||||
targetData, err := parseTargetData(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error parsing target data: %v", err)
|
||||
logger.Info(fmtErrParsingTargetData, err)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -998,7 +1149,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
jsonData, err := json.Marshal(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error marshaling data: %v", err)
|
||||
logger.Info(fmtErrMarshaling, err)
|
||||
return
|
||||
}
|
||||
if err := json.Unmarshal(jsonData, &sshPublicKeyData); err != nil {
|
||||
@@ -1155,9 +1306,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
|
||||
if err := healthMonitor.EnableTarget(requestData.ID); err != nil {
|
||||
logger.Error("Failed to enable health check target %s: %v", requestData.ID, err)
|
||||
logger.Error("Failed to enable health check target %d: %v", requestData.ID, err)
|
||||
} else {
|
||||
logger.Info("Enabled health check target: %s", requestData.ID)
|
||||
logger.Info("Enabled health check target: %d", requestData.ID)
|
||||
}
|
||||
})
|
||||
|
||||
@@ -1180,9 +1331,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
|
||||
if err := healthMonitor.DisableTarget(requestData.ID); err != nil {
|
||||
logger.Error("Failed to disable health check target %s: %v", requestData.ID, err)
|
||||
logger.Error("Failed to disable health check target %d: %v", requestData.ID, err)
|
||||
} else {
|
||||
logger.Info("Disabled health check target: %s", requestData.ID)
|
||||
logger.Info("Disabled health check target: %d", requestData.ID)
|
||||
}
|
||||
})
|
||||
|
||||
@@ -1252,7 +1403,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
|
||||
// Send registration message to the server for backward compatibility
|
||||
err := client.SendMessage("newt/wg/register", map[string]interface{}{
|
||||
err := client.SendMessage(topicWGRegister, map[string]interface{}{
|
||||
"publicKey": publicKey.String(),
|
||||
"newtVersion": newtVersion,
|
||||
"backwardsCompatible": true,
|
||||
|
||||
802
patches/00_all_changes.patch
Normal file
802
patches/00_all_changes.patch
Normal file
@@ -0,0 +1,802 @@
|
||||
diff --git a/Dockerfile b/Dockerfile
|
||||
index b9c4d29..b9b6dea 100644
|
||||
--- a/Dockerfile
|
||||
+++ b/Dockerfile
|
||||
@@ -22,6 +22,9 @@ RUN apk --no-cache add ca-certificates tzdata
|
||||
COPY --from=builder /newt /usr/local/bin/
|
||||
COPY entrypoint.sh /
|
||||
|
||||
+# Admin/metrics endpoint (Prometheus scrape)
|
||||
+EXPOSE 2112
|
||||
+
|
||||
RUN chmod +x /entrypoint.sh
|
||||
ENTRYPOINT ["/entrypoint.sh"]
|
||||
-CMD ["newt"]
|
||||
\ No newline at end of file
|
||||
+CMD ["newt"]
|
||||
diff --git a/go.mod b/go.mod
|
||||
index d475835..5909955 100644
|
||||
--- a/go.mod
|
||||
+++ b/go.mod
|
||||
@@ -7,6 +7,14 @@ require (
|
||||
github.com/google/gopacket v1.1.19
|
||||
github.com/gorilla/websocket v1.5.3
|
||||
github.com/vishvananda/netlink v1.3.1
|
||||
+ go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0
|
||||
+ go.opentelemetry.io/contrib/instrumentation/runtime v0.62.0
|
||||
+ go.opentelemetry.io/otel v1.37.0
|
||||
+ go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.37.0
|
||||
+ go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.37.0
|
||||
+ go.opentelemetry.io/otel/sdk/metric v1.37.0
|
||||
+ go.opentelemetry.io/otel/sdk/trace v1.37.0
|
||||
+ go.opentelemetry.io/otel/semconv v1.26.0
|
||||
golang.org/x/crypto v0.42.0
|
||||
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792
|
||||
golang.org/x/net v0.44.0
|
||||
diff --git a/main.go b/main.go
|
||||
index 12849b1..c223b75 100644
|
||||
--- a/main.go
|
||||
+++ b/main.go
|
||||
@@ -1,7 +1,9 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"encoding/json"
|
||||
+ "errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net"
|
||||
@@ -22,6 +24,9 @@ import (
|
||||
"github.com/fosrl/newt/updates"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
|
||||
+ "github.com/fosrl/newt/internal/state"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
+ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
|
||||
"golang.zx2c4.com/wireguard/conn"
|
||||
"golang.zx2c4.com/wireguard/device"
|
||||
"golang.zx2c4.com/wireguard/tun"
|
||||
@@ -116,6 +121,13 @@ var (
|
||||
healthMonitor *healthcheck.Monitor
|
||||
enforceHealthcheckCert bool
|
||||
|
||||
+ // Observability/metrics flags
|
||||
+ metricsEnabled bool
|
||||
+ otlpEnabled bool
|
||||
+ adminAddr string
|
||||
+ region string
|
||||
+ metricsAsyncBytes bool
|
||||
+
|
||||
// New mTLS configuration variables
|
||||
tlsClientCert string
|
||||
tlsClientKey string
|
||||
@@ -126,6 +138,10 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
+ // Prepare context for graceful shutdown and signal handling
|
||||
+ ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
|
||||
+ defer stop()
|
||||
+
|
||||
// if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values
|
||||
endpoint = os.Getenv("PANGOLIN_ENDPOINT")
|
||||
id = os.Getenv("NEWT_ID")
|
||||
@@ -141,6 +157,13 @@ func main() {
|
||||
useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE")
|
||||
enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT")
|
||||
|
||||
+ // Metrics/observability env mirrors
|
||||
+ metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED")
|
||||
+ otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED")
|
||||
+ adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR")
|
||||
+ regionEnv := os.Getenv("NEWT_REGION")
|
||||
+ asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES")
|
||||
+
|
||||
keepInterface = keepInterfaceEnv == "true"
|
||||
acceptClients = acceptClientsEnv == "true"
|
||||
useNativeInterface = useNativeInterfaceEnv == "true"
|
||||
@@ -272,6 +295,35 @@ func main() {
|
||||
flag.StringVar(&healthFile, "health-file", "", "Path to health file (if unset, health file won't be written)")
|
||||
}
|
||||
|
||||
+ // Metrics/observability flags (mirror ENV if unset)
|
||||
+ if metricsEnabledEnv == "" {
|
||||
+ flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil { metricsEnabled = v } else { metricsEnabled = true }
|
||||
+ }
|
||||
+ if otlpEnabledEnv == "" {
|
||||
+ flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil { otlpEnabled = v }
|
||||
+ }
|
||||
+ if adminAddrEnv == "" {
|
||||
+ flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address")
|
||||
+ } else {
|
||||
+ adminAddr = adminAddrEnv
|
||||
+ }
|
||||
+ // Async bytes toggle
|
||||
+ if asyncBytesEnv == "" {
|
||||
+ flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(asyncBytesEnv); err == nil { metricsAsyncBytes = v }
|
||||
+ }
|
||||
+ // Optional region flag (resource attribute)
|
||||
+ if regionEnv == "" {
|
||||
+ flag.StringVar(®ion, "region", "", "Optional region resource attribute (also NEWT_REGION)")
|
||||
+ } else {
|
||||
+ region = regionEnv
|
||||
+ }
|
||||
+
|
||||
// do a --version check
|
||||
version := flag.Bool("version", false, "Print the version")
|
||||
|
||||
@@ -286,6 +338,50 @@ func main() {
|
||||
loggerLevel := parseLogLevel(logLevel)
|
||||
logger.GetLogger().SetLevel(parseLogLevel(logLevel))
|
||||
|
||||
+ // Initialize telemetry after flags are parsed (so flags override env)
|
||||
+ tcfg := telemetry.FromEnv()
|
||||
+ tcfg.PromEnabled = metricsEnabled
|
||||
+ tcfg.OTLPEnabled = otlpEnabled
|
||||
+ if adminAddr != "" { tcfg.AdminAddr = adminAddr }
|
||||
+ // Resource attributes (if available)
|
||||
+ tcfg.SiteID = id
|
||||
+ tcfg.Region = region
|
||||
+ // Build info
|
||||
+ tcfg.BuildVersion = newtVersion
|
||||
+ tcfg.BuildCommit = os.Getenv("NEWT_COMMIT")
|
||||
+
|
||||
+ tel, telErr := telemetry.Init(ctx, tcfg)
|
||||
+ if telErr != nil {
|
||||
+ logger.Warn("Telemetry init failed: %v", telErr)
|
||||
+ }
|
||||
+ if tel != nil {
|
||||
+ // Admin HTTP server (exposes /metrics when Prometheus exporter is enabled)
|
||||
+ mux := http.NewServeMux()
|
||||
+ mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) })
|
||||
+ if tel.PrometheusHandler != nil {
|
||||
+ mux.Handle("/metrics", tel.PrometheusHandler)
|
||||
+ }
|
||||
+ admin := &http.Server{
|
||||
+ Addr: tcfg.AdminAddr,
|
||||
+ Handler: otelhttp.NewHandler(mux, "newt-admin"),
|
||||
+ ReadTimeout: 5 * time.Second,
|
||||
+ WriteTimeout: 10 * time.Second,
|
||||
+ ReadHeaderTimeout: 5 * time.Second,
|
||||
+ IdleTimeout: 30 * time.Second,
|
||||
+ }
|
||||
+ go func() {
|
||||
+ if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
|
||||
+ logger.Warn("admin http error: %v", err)
|
||||
+ }
|
||||
+ }()
|
||||
+ defer func() {
|
||||
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
+ defer cancel()
|
||||
+ _ = admin.Shutdown(ctx)
|
||||
+ }()
|
||||
+ defer func() { _ = tel.Shutdown(context.Background()) }()
|
||||
+ }
|
||||
+
|
||||
newtVersion := "version_replaceme"
|
||||
if *version {
|
||||
fmt.Println("Newt version " + newtVersion)
|
||||
@@ -557,7 +653,10 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
// Use reliable ping for initial connection test
|
||||
logger.Debug("Testing initial connection with reliable ping...")
|
||||
- _, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
+ lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
+ if err == nil && wgData.PublicKey != "" {
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", wgData.PublicKey, "wireguard", lat.Seconds())
|
||||
+ }
|
||||
if err != nil {
|
||||
logger.Warn("Initial reliable ping failed, but continuing: %v", err)
|
||||
} else {
|
||||
@@ -570,14 +669,20 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
// as the pings will continue in the background
|
||||
if !connected {
|
||||
logger.Debug("Starting ping check")
|
||||
- pingStopChan = startPingCheck(tnet, wgData.ServerIP, client)
|
||||
+ pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey)
|
||||
}
|
||||
|
||||
// Create proxy manager
|
||||
pm = proxy.NewProxyManager(tnet)
|
||||
+ pm.SetAsyncBytes(metricsAsyncBytes)
|
||||
+ // Set tunnel_id for metrics (WireGuard peer public key)
|
||||
+ pm.SetTunnelID(wgData.PublicKey)
|
||||
|
||||
connected = true
|
||||
|
||||
+ // telemetry: record a successful site registration (omit region unless available)
|
||||
+ telemetry.IncSiteRegistration(context.Background(), id, "", "success")
|
||||
+
|
||||
// add the targets if there are any
|
||||
if len(wgData.Targets.TCP) > 0 {
|
||||
updateTargets(pm, "add", wgData.TunnelIP, "tcp", TargetData{Targets: wgData.Targets.TCP})
|
||||
@@ -611,10 +716,25 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received reconnect message")
|
||||
+ if wgData.PublicKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
|
||||
+ }
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
|
||||
+ // Clear metrics attrs and sessions for the tunnel
|
||||
+ if pm != nil {
|
||||
+ pm.ClearTunnelID()
|
||||
+ state.Global().ClearTunnel(wgData.PublicKey)
|
||||
+ }
|
||||
+
|
||||
+ // Clear metrics attrs and sessions for the tunnel
|
||||
+ if pm != nil {
|
||||
+ pm.ClearTunnelID()
|
||||
+ state.Global().ClearTunnel(wgData.PublicKey)
|
||||
+ }
|
||||
+
|
||||
// Mark as disconnected
|
||||
connected = false
|
||||
|
||||
@@ -631,6 +751,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received termination message")
|
||||
+ if wgData.PublicKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
|
||||
+ }
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
diff --git a/proxy/manager.go b/proxy/manager.go
|
||||
index bf10322..86c47a8 100644
|
||||
--- a/proxy/manager.go
|
||||
+++ b/proxy/manager.go
|
||||
@@ -1,16 +1,22 @@
|
||||
package proxy
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
+ "os"
|
||||
"strings"
|
||||
"sync"
|
||||
+ "sync/atomic"
|
||||
"time"
|
||||
|
||||
+ "github.com/fosrl/newt/internal/state"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
"github.com/fosrl/newt/logger"
|
||||
"golang.zx2c4.com/wireguard/tun/netstack"
|
||||
"gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"
|
||||
+ "go.opentelemetry.io/otel/attribute"
|
||||
)
|
||||
|
||||
// Target represents a proxy target with its address and port
|
||||
@@ -28,6 +34,52 @@ type ProxyManager struct {
|
||||
udpConns []*gonet.UDPConn
|
||||
running bool
|
||||
mutex sync.RWMutex
|
||||
+
|
||||
+ // telemetry (multi-tunnel)
|
||||
+ currentTunnelID string
|
||||
+ tunnels map[string]*tunnelEntry
|
||||
+ asyncBytes bool
|
||||
+ flushStop chan struct{}
|
||||
+}
|
||||
+
|
||||
+// tunnelEntry holds per-tunnel attributes and (optional) async counters.
|
||||
+type tunnelEntry struct {
|
||||
+ attrInTCP attribute.Set
|
||||
+ attrOutTCP attribute.Set
|
||||
+ attrInUDP attribute.Set
|
||||
+ attrOutUDP attribute.Set
|
||||
+
|
||||
+ bytesInTCP atomic.Uint64
|
||||
+ bytesOutTCP atomic.Uint64
|
||||
+ bytesInUDP atomic.Uint64
|
||||
+ bytesOutUDP atomic.Uint64
|
||||
+}
|
||||
+
|
||||
+// countingWriter wraps an io.Writer and adds bytes to OTel counter using a pre-built attribute set.
|
||||
+type countingWriter struct {
|
||||
+ ctx context.Context
|
||||
+ w io.Writer
|
||||
+ set attribute.Set
|
||||
+ pm *ProxyManager
|
||||
+ ent *tunnelEntry
|
||||
+ out bool // false=in, true=out
|
||||
+ proto string // "tcp" or "udp"
|
||||
+}
|
||||
+
|
||||
+func (cw *countingWriter) Write(p []byte) (int, error) {
|
||||
+ n, err := cw.w.Write(p)
|
||||
+ if n > 0 {
|
||||
+ if cw.pm != nil && cw.pm.asyncBytes && cw.ent != nil {
|
||||
+ if cw.proto == "tcp" {
|
||||
+ if cw.out { cw.ent.bytesOutTCP.Add(uint64(n)) } else { cw.ent.bytesInTCP.Add(uint64(n)) }
|
||||
+ } else if cw.proto == "udp" {
|
||||
+ if cw.out { cw.ent.bytesOutUDP.Add(uint64(n)) } else { cw.ent.bytesInUDP.Add(uint64(n)) }
|
||||
+ }
|
||||
+ } else {
|
||||
+ telemetry.AddTunnelBytesSet(cw.ctx, int64(n), cw.set)
|
||||
+ }
|
||||
+ }
|
||||
+ return n, err
|
||||
}
|
||||
|
||||
// NewProxyManager creates a new proxy manager instance
|
||||
@@ -38,9 +90,46 @@ func NewProxyManager(tnet *netstack.Net) *ProxyManager {
|
||||
udpTargets: make(map[string]map[int]string),
|
||||
listeners: make([]*gonet.TCPListener, 0),
|
||||
udpConns: make([]*gonet.UDPConn, 0),
|
||||
+ tunnels: make(map[string]*tunnelEntry),
|
||||
}
|
||||
}
|
||||
|
||||
+// SetTunnelID sets the WireGuard peer public key used as tunnel_id label.
|
||||
+func (pm *ProxyManager) SetTunnelID(id string) {
|
||||
+ pm.mutex.Lock()
|
||||
+ defer pm.mutex.Unlock()
|
||||
+ pm.currentTunnelID = id
|
||||
+ if _, ok := pm.tunnels[id]; !ok {
|
||||
+ pm.tunnels[id] = &tunnelEntry{}
|
||||
+ }
|
||||
+ e := pm.tunnels[id]
|
||||
+ e.attrInTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "tcp"))
|
||||
+ e.attrOutTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "tcp"))
|
||||
+ e.attrInUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "udp"))
|
||||
+ e.attrOutUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "udp"))
|
||||
+}
|
||||
+
|
||||
+// ClearTunnelID clears cached attribute sets for the current tunnel.
|
||||
+func (pm *ProxyManager) ClearTunnelID() {
|
||||
+ pm.mutex.Lock()
|
||||
+ defer pm.mutex.Unlock()
|
||||
+ id := pm.currentTunnelID
|
||||
+ if id == "" { return }
|
||||
+ if e, ok := pm.tunnels[id]; ok {
|
||||
+ // final flush for this tunnel
|
||||
+ inTCP := e.bytesInTCP.Swap(0)
|
||||
+ outTCP := e.bytesOutTCP.Swap(0)
|
||||
+ inUDP := e.bytesInUDP.Swap(0)
|
||||
+ outUDP := e.bytesOutUDP.Swap(0)
|
||||
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
|
||||
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
|
||||
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
|
||||
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
|
||||
+ delete(pm.tunnels, id)
|
||||
+ }
|
||||
+ pm.currentTunnelID = ""
|
||||
+}
|
||||
+
|
||||
// init function without tnet
|
||||
func NewProxyManagerWithoutTNet() *ProxyManager {
|
||||
return &ProxyManager{
|
||||
@@ -160,6 +249,57 @@ func (pm *ProxyManager) Start() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
+func (pm *ProxyManager) SetAsyncBytes(b bool) {
|
||||
+ pm.mutex.Lock()
|
||||
+ defer pm.mutex.Unlock()
|
||||
+ pm.asyncBytes = b
|
||||
+ if b && pm.flushStop == nil {
|
||||
+ pm.flushStop = make(chan struct{})
|
||||
+ go pm.flushLoop()
|
||||
+ }
|
||||
+}
|
||||
+func (pm *ProxyManager) flushLoop() {
|
||||
+ flushInterval := 2 * time.Second
|
||||
+ if v := os.Getenv("OTEL_METRIC_EXPORT_INTERVAL"); v != "" {
|
||||
+ if d, err := time.ParseDuration(v); err == nil && d > 0 {
|
||||
+ if d/2 < flushInterval { flushInterval = d / 2 }
|
||||
+ }
|
||||
+ }
|
||||
+ ticker := time.NewTicker(flushInterval)
|
||||
+ defer ticker.Stop()
|
||||
+ for {
|
||||
+ select {
|
||||
+ case <-ticker.C:
|
||||
+ pm.mutex.RLock()
|
||||
+ for _, e := range pm.tunnels {
|
||||
+ inTCP := e.bytesInTCP.Swap(0)
|
||||
+ outTCP := e.bytesOutTCP.Swap(0)
|
||||
+ inUDP := e.bytesInUDP.Swap(0)
|
||||
+ outUDP := e.bytesOutUDP.Swap(0)
|
||||
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
|
||||
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
|
||||
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
|
||||
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
|
||||
+ }
|
||||
+ pm.mutex.RUnlock()
|
||||
+ case <-pm.flushStop:
|
||||
+ pm.mutex.RLock()
|
||||
+ for _, e := range pm.tunnels {
|
||||
+ inTCP := e.bytesInTCP.Swap(0)
|
||||
+ outTCP := e.bytesOutTCP.Swap(0)
|
||||
+ inUDP := e.bytesInUDP.Swap(0)
|
||||
+ outUDP := e.bytesOutUDP.Swap(0)
|
||||
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
|
||||
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
|
||||
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
|
||||
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
|
||||
+ }
|
||||
+ pm.mutex.RUnlock()
|
||||
+ return
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
func (pm *ProxyManager) Stop() error {
|
||||
pm.mutex.Lock()
|
||||
defer pm.mutex.Unlock()
|
||||
@@ -236,6 +376,14 @@ func (pm *ProxyManager) startTarget(proto, listenIP string, port int, targetAddr
|
||||
return nil
|
||||
}
|
||||
|
||||
+// getEntry returns per-tunnel entry or nil.
|
||||
+func (pm *ProxyManager) getEntry(id string) *tunnelEntry {
|
||||
+ pm.mutex.RLock()
|
||||
+ e := pm.tunnels[id]
|
||||
+ pm.mutex.RUnlock()
|
||||
+ return e
|
||||
+}
|
||||
+
|
||||
func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) {
|
||||
for {
|
||||
conn, err := listener.Accept()
|
||||
@@ -257,6 +405,9 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string)
|
||||
continue
|
||||
}
|
||||
|
||||
+// Count sessions only once per accepted TCP connection
|
||||
+ if pm.tunnelID != "" { state.Global().IncSessions(pm.tunnelID) }
|
||||
+
|
||||
go func() {
|
||||
target, err := net.Dial("tcp", targetAddr)
|
||||
if err != nil {
|
||||
@@ -265,24 +416,33 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string)
|
||||
return
|
||||
}
|
||||
|
||||
+ // already incremented on accept
|
||||
+
|
||||
// Create a WaitGroup to ensure both copy operations complete
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(2)
|
||||
|
||||
+ // client -> target (direction=in)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
- io.Copy(target, conn)
|
||||
- target.Close()
|
||||
+e := pm.getEntry(pm.currentTunnelID)
|
||||
+cw := &countingWriter{ctx: context.Background(), w: target, set: e.attrInTCP, pm: pm, ent: e, out: false, proto: "tcp"}
|
||||
+ _, _ = io.Copy(cw, conn)
|
||||
+ _ = target.Close()
|
||||
}()
|
||||
|
||||
+ // target -> client (direction=out)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
- io.Copy(conn, target)
|
||||
- conn.Close()
|
||||
+e := pm.getEntry(pm.currentTunnelID)
|
||||
+cw := &countingWriter{ctx: context.Background(), w: conn, set: e.attrOutTCP, pm: pm, ent: e, out: true, proto: "tcp"}
|
||||
+ _, _ = io.Copy(cw, target)
|
||||
+ _ = conn.Close()
|
||||
}()
|
||||
|
||||
- // Wait for both copies to complete
|
||||
+ // Wait for both copies to complete then session -1
|
||||
wg.Wait()
|
||||
+ if pm.tunnelID != "" { state.Global().DecSessions(pm.tunnelID) }
|
||||
}()
|
||||
}
|
||||
}
|
||||
@@ -326,6 +486,14 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
}
|
||||
|
||||
clientKey := remoteAddr.String()
|
||||
+ // bytes from client -> target (direction=in)
|
||||
+if pm.currentTunnelID != "" && n > 0 {
|
||||
+if pm.asyncBytes {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(n)) }
|
||||
+ } else {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrInUDP) }
|
||||
+ }
|
||||
+ }
|
||||
clientsMutex.RLock()
|
||||
targetConn, exists := clientConns[clientKey]
|
||||
clientsMutex.RUnlock()
|
||||
@@ -366,6 +534,15 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
return // defer will handle cleanup
|
||||
}
|
||||
|
||||
+ // bytes from target -> client (direction=out)
|
||||
+ if pm.currentTunnelID != "" && n > 0 {
|
||||
+ if pm.asyncBytes {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesOutUDP.Add(uint64(n)) }
|
||||
+ } else {
|
||||
+if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrOutUDP) }
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
_, err = conn.WriteTo(buffer[:n], remoteAddr)
|
||||
if err != nil {
|
||||
logger.Error("Error writing to client: %v", err)
|
||||
@@ -375,13 +552,19 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
}(clientKey, targetConn, remoteAddr)
|
||||
}
|
||||
|
||||
- _, err = targetConn.Write(buffer[:n])
|
||||
+ written, err := targetConn.Write(buffer[:n])
|
||||
if err != nil {
|
||||
logger.Error("Error writing to target: %v", err)
|
||||
targetConn.Close()
|
||||
clientsMutex.Lock()
|
||||
delete(clientConns, clientKey)
|
||||
clientsMutex.Unlock()
|
||||
+} else if pm.currentTunnelID != "" && written > 0 {
|
||||
+ if pm.asyncBytes {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(written)) }
|
||||
+ } else {
|
||||
+if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(written), e.attrInUDP) }
|
||||
+ }
|
||||
}
|
||||
}
|
||||
}
|
||||
diff --git a/util.go b/util.go
|
||||
index 7d6da4f..c1f4915 100644
|
||||
--- a/util.go
|
||||
+++ b/util.go
|
||||
@@ -17,6 +17,7 @@ import (
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/fosrl/newt/proxy"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
"golang.org/x/net/icmp"
|
||||
"golang.org/x/net/ipv4"
|
||||
"golang.zx2c4.com/wireguard/device"
|
||||
@@ -229,7 +230,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
|
||||
return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background")
|
||||
}
|
||||
|
||||
-func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client) chan struct{} {
|
||||
+func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
|
||||
maxInterval := 6 * time.Second
|
||||
currentInterval := pingInterval
|
||||
consecutiveFailures := 0
|
||||
@@ -292,6 +293,9 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
|
||||
if !connectionLost {
|
||||
connectionLost = true
|
||||
logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
|
||||
+ if tunnelID != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", tunnelID, telemetry.ReasonTimeout)
|
||||
+ }
|
||||
stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second)
|
||||
// Send registration message to the server for backward compatibility
|
||||
err := client.SendMessage("newt/wg/register", map[string]interface{}{
|
||||
@@ -318,6 +322,10 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
|
||||
} else {
|
||||
// Track recent latencies
|
||||
recentLatencies = append(recentLatencies, latency)
|
||||
+ // Record tunnel latency (limit sampling to this periodic check)
|
||||
+ if tunnelID != "" {
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", tunnelID, "wireguard", latency.Seconds())
|
||||
+ }
|
||||
if len(recentLatencies) > 10 {
|
||||
recentLatencies = recentLatencies[1:]
|
||||
}
|
||||
diff --git a/websocket/client.go b/websocket/client.go
|
||||
index 0c0664a..c9ac264 100644
|
||||
--- a/websocket/client.go
|
||||
+++ b/websocket/client.go
|
||||
@@ -18,6 +18,10 @@ import (
|
||||
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/gorilla/websocket"
|
||||
+
|
||||
+ "context"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
+ "go.opentelemetry.io/otel"
|
||||
)
|
||||
|
||||
type Client struct {
|
||||
@@ -287,6 +291,7 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", classifyConnError(err))
|
||||
return "", fmt.Errorf("failed to request new token: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
@@ -294,6 +299,18 @@ func (c *Client) getToken() (string, error) {
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "failure")
|
||||
+ bin := "http_other"
|
||||
+ if resp.StatusCode >= 500 {
|
||||
+ bin = "http_5xx"
|
||||
+ } else if resp.StatusCode >= 400 {
|
||||
+ bin = "http_4xx"
|
||||
+ }
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", bin)
|
||||
+ // Reconnect reason mapping for auth failures
|
||||
+ if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonAuthError)
|
||||
+ }
|
||||
return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
@@ -312,10 +329,33 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
|
||||
logger.Debug("Received token: %s", tokenResp.Data.Token)
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "success")
|
||||
|
||||
return tokenResp.Data.Token, nil
|
||||
}
|
||||
|
||||
+// classifyConnError maps common errors to low-cardinality error_type labels
|
||||
+func classifyConnError(err error) string {
|
||||
+ if err == nil {
|
||||
+ return ""
|
||||
+ }
|
||||
+ msg := strings.ToLower(err.Error())
|
||||
+ switch {
|
||||
+ case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"):
|
||||
+ return "tls"
|
||||
+ case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout"):
|
||||
+ return "timeout"
|
||||
+ case strings.Contains(msg, "no such host") || strings.Contains(msg, "dns"):
|
||||
+ return "dns"
|
||||
+ case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"):
|
||||
+ return "auth"
|
||||
+ case strings.Contains(msg, "broken pipe") || strings.Contains(msg, "connection reset") || strings.Contains(msg, "connection refused") || strings.Contains(msg, "use of closed network connection") || strings.Contains(msg, "network is unreachable"):
|
||||
+ return "io"
|
||||
+ default:
|
||||
+ return "other"
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
func (c *Client) connectWithRetry() {
|
||||
for {
|
||||
select {
|
||||
@@ -337,6 +377,10 @@ func (c *Client) establishConnection() error {
|
||||
// Get token for authentication
|
||||
token, err := c.getToken()
|
||||
if err != nil {
|
||||
+ // telemetry: connection attempt failed before dialing
|
||||
+ // site_id isn't globally available here; use client ID as site_id (low cardinality)
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", classifyConnError(err))
|
||||
return fmt.Errorf("failed to get token: %w", err)
|
||||
}
|
||||
|
||||
@@ -369,7 +413,11 @@ func (c *Client) establishConnection() error {
|
||||
q.Set("clientType", c.clientType)
|
||||
u.RawQuery = q.Encode()
|
||||
|
||||
- // Connect to WebSocket
|
||||
+ // Connect to WebSocket (optional span)
|
||||
+ tr := otel.Tracer("newt")
|
||||
+ spanCtx, span := tr.Start(context.Background(), "ws.connect")
|
||||
+ defer span.End()
|
||||
+
|
||||
dialer := websocket.DefaultDialer
|
||||
|
||||
// Use new TLS configuration method
|
||||
@@ -391,11 +439,23 @@ func (c *Client) establishConnection() error {
|
||||
logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable")
|
||||
}
|
||||
|
||||
- conn, _, err := dialer.Dial(u.String(), nil)
|
||||
+conn, _, err := dialer.DialContext(spanCtx, u.String(), nil)
|
||||
if err != nil {
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
|
||||
+ etype := classifyConnError(err)
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", etype)
|
||||
+ // Map handshake-related errors to reconnect reasons where appropriate
|
||||
+ if etype == "tls" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonHandshakeError)
|
||||
+ } else if etype == "timeout" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonTimeout)
|
||||
+ } else {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonError)
|
||||
+ }
|
||||
return fmt.Errorf("failed to connect to WebSocket: %w", err)
|
||||
}
|
||||
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "success")
|
||||
c.conn = conn
|
||||
c.setConnected(true)
|
||||
|
||||
diff --git a/wg/wg.go b/wg/wg.go
|
||||
index 3cee1a9..a765279 100644
|
||||
--- a/wg/wg.go
|
||||
+++ b/wg/wg.go
|
||||
@@ -3,6 +3,7 @@
|
||||
package wg
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
@@ -23,6 +24,8 @@ import (
|
||||
"golang.zx2c4.com/wireguard/conn"
|
||||
"golang.zx2c4.com/wireguard/wgctrl"
|
||||
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
|
||||
+
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
)
|
||||
|
||||
type WgConfig struct {
|
||||
@@ -298,6 +301,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) {
|
||||
s.stopGetConfig = nil
|
||||
}
|
||||
|
||||
+ // telemetry: config reload success
|
||||
+ telemetry.IncConfigReload(context.Background(), "success")
|
||||
+ // Optional reconnect reason mapping: config change
|
||||
+ if s.serverPubKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", s.serverPubKey, telemetry.ReasonConfigChange)
|
||||
+ }
|
||||
+
|
||||
// Ensure the WireGuard interface and peers are configured
|
||||
if err := s.ensureWireguardInterface(config); err != nil {
|
||||
logger.Error("Failed to ensure WireGuard interface: %v", err)
|
||||
diff --git a/wgnetstack/wgnetstack.go b/wgnetstack/wgnetstack.go
|
||||
index 6684c40..09f160e 100644
|
||||
--- a/wgnetstack/wgnetstack.go
|
||||
+++ b/wgnetstack/wgnetstack.go
|
||||
@@ -1,6 +1,7 @@
|
||||
package wgnetstack
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
@@ -26,6 +27,8 @@ import (
|
||||
"golang.zx2c4.com/wireguard/tun"
|
||||
"golang.zx2c4.com/wireguard/tun/netstack"
|
||||
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
|
||||
+
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
)
|
||||
|
||||
type WgConfig struct {
|
||||
@@ -240,14 +243,20 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str
|
||||
return service, nil
|
||||
}
|
||||
|
||||
+// ReportRTT allows reporting native RTTs to telemetry, rate-limited externally.
|
||||
+func (s *WireGuardService) ReportRTT(seconds float64) {
|
||||
+ if s.serverPubKey == "" { return }
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", s.serverPubKey, "wireguard", seconds)
|
||||
+}
|
||||
+
|
||||
func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) {
|
||||
logger.Debug("Received: %+v", msg)
|
||||
|
||||
// if there is no wgData or pm, we can't add targets
|
||||
if s.TunnelIP == "" || s.proxyManager == nil {
|
||||
logger.Info("No tunnel IP or proxy manager available")
|
||||
- return
|
||||
- }
|
||||
+ return
|
||||
+}
|
||||
|
||||
targetData, err := parseTargetData(msg.Data)
|
||||
if err != nil {
|
||||
301
patches/01_proxy_multitunnel.patch
Normal file
301
patches/01_proxy_multitunnel.patch
Normal file
@@ -0,0 +1,301 @@
|
||||
diff --git a/proxy/manager.go b/proxy/manager.go
|
||||
index bf10322..86c47a8 100644
|
||||
--- a/proxy/manager.go
|
||||
+++ b/proxy/manager.go
|
||||
@@ -1,16 +1,22 @@
|
||||
package proxy
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
+ "os"
|
||||
"strings"
|
||||
"sync"
|
||||
+ "sync/atomic"
|
||||
"time"
|
||||
|
||||
+ "github.com/fosrl/newt/internal/state"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
"github.com/fosrl/newt/logger"
|
||||
"golang.zx2c4.com/wireguard/tun/netstack"
|
||||
"gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"
|
||||
+ "go.opentelemetry.io/otel/attribute"
|
||||
)
|
||||
|
||||
// Target represents a proxy target with its address and port
|
||||
@@ -28,6 +34,52 @@ type ProxyManager struct {
|
||||
udpConns []*gonet.UDPConn
|
||||
running bool
|
||||
mutex sync.RWMutex
|
||||
+
|
||||
+ // telemetry (multi-tunnel)
|
||||
+ currentTunnelID string
|
||||
+ tunnels map[string]*tunnelEntry
|
||||
+ asyncBytes bool
|
||||
+ flushStop chan struct{}
|
||||
+}
|
||||
+
|
||||
+// tunnelEntry holds per-tunnel attributes and (optional) async counters.
|
||||
+type tunnelEntry struct {
|
||||
+ attrInTCP attribute.Set
|
||||
+ attrOutTCP attribute.Set
|
||||
+ attrInUDP attribute.Set
|
||||
+ attrOutUDP attribute.Set
|
||||
+
|
||||
+ bytesInTCP atomic.Uint64
|
||||
+ bytesOutTCP atomic.Uint64
|
||||
+ bytesInUDP atomic.Uint64
|
||||
+ bytesOutUDP atomic.Uint64
|
||||
+}
|
||||
+
|
||||
+// countingWriter wraps an io.Writer and adds bytes to OTel counter using a pre-built attribute set.
|
||||
+type countingWriter struct {
|
||||
+ ctx context.Context
|
||||
+ w io.Writer
|
||||
+ set attribute.Set
|
||||
+ pm *ProxyManager
|
||||
+ ent *tunnelEntry
|
||||
+ out bool // false=in, true=out
|
||||
+ proto string // "tcp" or "udp"
|
||||
+}
|
||||
+
|
||||
+func (cw *countingWriter) Write(p []byte) (int, error) {
|
||||
+ n, err := cw.w.Write(p)
|
||||
+ if n > 0 {
|
||||
+ if cw.pm != nil && cw.pm.asyncBytes && cw.ent != nil {
|
||||
+ if cw.proto == "tcp" {
|
||||
+ if cw.out { cw.ent.bytesOutTCP.Add(uint64(n)) } else { cw.ent.bytesInTCP.Add(uint64(n)) }
|
||||
+ } else if cw.proto == "udp" {
|
||||
+ if cw.out { cw.ent.bytesOutUDP.Add(uint64(n)) } else { cw.ent.bytesInUDP.Add(uint64(n)) }
|
||||
+ }
|
||||
+ } else {
|
||||
+ telemetry.AddTunnelBytesSet(cw.ctx, int64(n), cw.set)
|
||||
+ }
|
||||
+ }
|
||||
+ return n, err
|
||||
}
|
||||
|
||||
// NewProxyManager creates a new proxy manager instance
|
||||
@@ -38,9 +90,46 @@ func NewProxyManager(tnet *netstack.Net) *ProxyManager {
|
||||
udpTargets: make(map[string]map[int]string),
|
||||
listeners: make([]*gonet.TCPListener, 0),
|
||||
udpConns: make([]*gonet.UDPConn, 0),
|
||||
+ tunnels: make(map[string]*tunnelEntry),
|
||||
}
|
||||
}
|
||||
|
||||
+// SetTunnelID sets the WireGuard peer public key used as tunnel_id label.
|
||||
+func (pm *ProxyManager) SetTunnelID(id string) {
|
||||
+ pm.mutex.Lock()
|
||||
+ defer pm.mutex.Unlock()
|
||||
+ pm.currentTunnelID = id
|
||||
+ if _, ok := pm.tunnels[id]; !ok {
|
||||
+ pm.tunnels[id] = &tunnelEntry{}
|
||||
+ }
|
||||
+ e := pm.tunnels[id]
|
||||
+ e.attrInTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "tcp"))
|
||||
+ e.attrOutTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "tcp"))
|
||||
+ e.attrInUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "udp"))
|
||||
+ e.attrOutUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "udp"))
|
||||
+}
|
||||
+
|
||||
+// ClearTunnelID clears cached attribute sets for the current tunnel.
|
||||
+func (pm *ProxyManager) ClearTunnelID() {
|
||||
+ pm.mutex.Lock()
|
||||
+ defer pm.mutex.Unlock()
|
||||
+ id := pm.currentTunnelID
|
||||
+ if id == "" { return }
|
||||
+ if e, ok := pm.tunnels[id]; ok {
|
||||
+ // final flush for this tunnel
|
||||
+ inTCP := e.bytesInTCP.Swap(0)
|
||||
+ outTCP := e.bytesOutTCP.Swap(0)
|
||||
+ inUDP := e.bytesInUDP.Swap(0)
|
||||
+ outUDP := e.bytesOutUDP.Swap(0)
|
||||
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
|
||||
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
|
||||
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
|
||||
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
|
||||
+ delete(pm.tunnels, id)
|
||||
+ }
|
||||
+ pm.currentTunnelID = ""
|
||||
+}
|
||||
+
|
||||
// init function without tnet
|
||||
func NewProxyManagerWithoutTNet() *ProxyManager {
|
||||
return &ProxyManager{
|
||||
@@ -160,6 +249,57 @@ func (pm *ProxyManager) Start() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
+func (pm *ProxyManager) SetAsyncBytes(b bool) {
|
||||
+ pm.mutex.Lock()
|
||||
+ defer pm.mutex.Unlock()
|
||||
+ pm.asyncBytes = b
|
||||
+ if b && pm.flushStop == nil {
|
||||
+ pm.flushStop = make(chan struct{})
|
||||
+ go pm.flushLoop()
|
||||
+ }
|
||||
+}
|
||||
+func (pm *ProxyManager) flushLoop() {
|
||||
+ flushInterval := 2 * time.Second
|
||||
+ if v := os.Getenv("OTEL_METRIC_EXPORT_INTERVAL"); v != "" {
|
||||
+ if d, err := time.ParseDuration(v); err == nil && d > 0 {
|
||||
+ if d/2 < flushInterval { flushInterval = d / 2 }
|
||||
+ }
|
||||
+ }
|
||||
+ ticker := time.NewTicker(flushInterval)
|
||||
+ defer ticker.Stop()
|
||||
+ for {
|
||||
+ select {
|
||||
+ case <-ticker.C:
|
||||
+ pm.mutex.RLock()
|
||||
+ for _, e := range pm.tunnels {
|
||||
+ inTCP := e.bytesInTCP.Swap(0)
|
||||
+ outTCP := e.bytesOutTCP.Swap(0)
|
||||
+ inUDP := e.bytesInUDP.Swap(0)
|
||||
+ outUDP := e.bytesOutUDP.Swap(0)
|
||||
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
|
||||
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
|
||||
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
|
||||
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
|
||||
+ }
|
||||
+ pm.mutex.RUnlock()
|
||||
+ case <-pm.flushStop:
|
||||
+ pm.mutex.RLock()
|
||||
+ for _, e := range pm.tunnels {
|
||||
+ inTCP := e.bytesInTCP.Swap(0)
|
||||
+ outTCP := e.bytesOutTCP.Swap(0)
|
||||
+ inUDP := e.bytesInUDP.Swap(0)
|
||||
+ outUDP := e.bytesOutUDP.Swap(0)
|
||||
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
|
||||
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
|
||||
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
|
||||
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
|
||||
+ }
|
||||
+ pm.mutex.RUnlock()
|
||||
+ return
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
func (pm *ProxyManager) Stop() error {
|
||||
pm.mutex.Lock()
|
||||
defer pm.mutex.Unlock()
|
||||
@@ -236,6 +376,14 @@ func (pm *ProxyManager) startTarget(proto, listenIP string, port int, targetAddr
|
||||
return nil
|
||||
}
|
||||
|
||||
+// getEntry returns per-tunnel entry or nil.
|
||||
+func (pm *ProxyManager) getEntry(id string) *tunnelEntry {
|
||||
+ pm.mutex.RLock()
|
||||
+ e := pm.tunnels[id]
|
||||
+ pm.mutex.RUnlock()
|
||||
+ return e
|
||||
+}
|
||||
+
|
||||
func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) {
|
||||
for {
|
||||
conn, err := listener.Accept()
|
||||
@@ -257,6 +405,9 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string)
|
||||
continue
|
||||
}
|
||||
|
||||
+// Count sessions only once per accepted TCP connection
|
||||
+ if pm.tunnelID != "" { state.Global().IncSessions(pm.tunnelID) }
|
||||
+
|
||||
go func() {
|
||||
target, err := net.Dial("tcp", targetAddr)
|
||||
if err != nil {
|
||||
@@ -265,24 +416,33 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string)
|
||||
return
|
||||
}
|
||||
|
||||
+ // already incremented on accept
|
||||
+
|
||||
// Create a WaitGroup to ensure both copy operations complete
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(2)
|
||||
|
||||
+ // client -> target (direction=in)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
- io.Copy(target, conn)
|
||||
- target.Close()
|
||||
+e := pm.getEntry(pm.currentTunnelID)
|
||||
+cw := &countingWriter{ctx: context.Background(), w: target, set: e.attrInTCP, pm: pm, ent: e, out: false, proto: "tcp"}
|
||||
+ _, _ = io.Copy(cw, conn)
|
||||
+ _ = target.Close()
|
||||
}()
|
||||
|
||||
+ // target -> client (direction=out)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
- io.Copy(conn, target)
|
||||
- conn.Close()
|
||||
+e := pm.getEntry(pm.currentTunnelID)
|
||||
+cw := &countingWriter{ctx: context.Background(), w: conn, set: e.attrOutTCP, pm: pm, ent: e, out: true, proto: "tcp"}
|
||||
+ _, _ = io.Copy(cw, target)
|
||||
+ _ = conn.Close()
|
||||
}()
|
||||
|
||||
- // Wait for both copies to complete
|
||||
+ // Wait for both copies to complete then session -1
|
||||
wg.Wait()
|
||||
+ if pm.tunnelID != "" { state.Global().DecSessions(pm.tunnelID) }
|
||||
}()
|
||||
}
|
||||
}
|
||||
@@ -326,6 +486,14 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
}
|
||||
|
||||
clientKey := remoteAddr.String()
|
||||
+ // bytes from client -> target (direction=in)
|
||||
+if pm.currentTunnelID != "" && n > 0 {
|
||||
+if pm.asyncBytes {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(n)) }
|
||||
+ } else {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrInUDP) }
|
||||
+ }
|
||||
+ }
|
||||
clientsMutex.RLock()
|
||||
targetConn, exists := clientConns[clientKey]
|
||||
clientsMutex.RUnlock()
|
||||
@@ -366,6 +534,15 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
return // defer will handle cleanup
|
||||
}
|
||||
|
||||
+ // bytes from target -> client (direction=out)
|
||||
+ if pm.currentTunnelID != "" && n > 0 {
|
||||
+ if pm.asyncBytes {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesOutUDP.Add(uint64(n)) }
|
||||
+ } else {
|
||||
+if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrOutUDP) }
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
_, err = conn.WriteTo(buffer[:n], remoteAddr)
|
||||
if err != nil {
|
||||
logger.Error("Error writing to client: %v", err)
|
||||
@@ -375,13 +552,19 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
}(clientKey, targetConn, remoteAddr)
|
||||
}
|
||||
|
||||
- _, err = targetConn.Write(buffer[:n])
|
||||
+ written, err := targetConn.Write(buffer[:n])
|
||||
if err != nil {
|
||||
logger.Error("Error writing to target: %v", err)
|
||||
targetConn.Close()
|
||||
clientsMutex.Lock()
|
||||
delete(clientConns, clientKey)
|
||||
clientsMutex.Unlock()
|
||||
+} else if pm.currentTunnelID != "" && written > 0 {
|
||||
+ if pm.asyncBytes {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(written)) }
|
||||
+ } else {
|
||||
+if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(written), e.attrInUDP) }
|
||||
+ }
|
||||
}
|
||||
}
|
||||
}
|
||||
422
patches/02_reconnect_reasons.patch
Normal file
422
patches/02_reconnect_reasons.patch
Normal file
@@ -0,0 +1,422 @@
|
||||
diff --git a/main.go b/main.go
|
||||
index 12849b1..c223b75 100644
|
||||
--- a/main.go
|
||||
+++ b/main.go
|
||||
@@ -1,7 +1,9 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"encoding/json"
|
||||
+ "errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net"
|
||||
@@ -22,6 +24,9 @@ import (
|
||||
"github.com/fosrl/newt/updates"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
|
||||
+ "github.com/fosrl/newt/internal/state"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
+ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
|
||||
"golang.zx2c4.com/wireguard/conn"
|
||||
"golang.zx2c4.com/wireguard/device"
|
||||
"golang.zx2c4.com/wireguard/tun"
|
||||
@@ -116,6 +121,13 @@ var (
|
||||
healthMonitor *healthcheck.Monitor
|
||||
enforceHealthcheckCert bool
|
||||
|
||||
+ // Observability/metrics flags
|
||||
+ metricsEnabled bool
|
||||
+ otlpEnabled bool
|
||||
+ adminAddr string
|
||||
+ region string
|
||||
+ metricsAsyncBytes bool
|
||||
+
|
||||
// New mTLS configuration variables
|
||||
tlsClientCert string
|
||||
tlsClientKey string
|
||||
@@ -126,6 +138,10 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
+ // Prepare context for graceful shutdown and signal handling
|
||||
+ ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
|
||||
+ defer stop()
|
||||
+
|
||||
// if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values
|
||||
endpoint = os.Getenv("PANGOLIN_ENDPOINT")
|
||||
id = os.Getenv("NEWT_ID")
|
||||
@@ -141,6 +157,13 @@ func main() {
|
||||
useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE")
|
||||
enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT")
|
||||
|
||||
+ // Metrics/observability env mirrors
|
||||
+ metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED")
|
||||
+ otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED")
|
||||
+ adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR")
|
||||
+ regionEnv := os.Getenv("NEWT_REGION")
|
||||
+ asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES")
|
||||
+
|
||||
keepInterface = keepInterfaceEnv == "true"
|
||||
acceptClients = acceptClientsEnv == "true"
|
||||
useNativeInterface = useNativeInterfaceEnv == "true"
|
||||
@@ -272,6 +295,35 @@ func main() {
|
||||
flag.StringVar(&healthFile, "health-file", "", "Path to health file (if unset, health file won't be written)")
|
||||
}
|
||||
|
||||
+ // Metrics/observability flags (mirror ENV if unset)
|
||||
+ if metricsEnabledEnv == "" {
|
||||
+ flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil { metricsEnabled = v } else { metricsEnabled = true }
|
||||
+ }
|
||||
+ if otlpEnabledEnv == "" {
|
||||
+ flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil { otlpEnabled = v }
|
||||
+ }
|
||||
+ if adminAddrEnv == "" {
|
||||
+ flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address")
|
||||
+ } else {
|
||||
+ adminAddr = adminAddrEnv
|
||||
+ }
|
||||
+ // Async bytes toggle
|
||||
+ if asyncBytesEnv == "" {
|
||||
+ flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(asyncBytesEnv); err == nil { metricsAsyncBytes = v }
|
||||
+ }
|
||||
+ // Optional region flag (resource attribute)
|
||||
+ if regionEnv == "" {
|
||||
+ flag.StringVar(®ion, "region", "", "Optional region resource attribute (also NEWT_REGION)")
|
||||
+ } else {
|
||||
+ region = regionEnv
|
||||
+ }
|
||||
+
|
||||
// do a --version check
|
||||
version := flag.Bool("version", false, "Print the version")
|
||||
|
||||
@@ -286,6 +338,50 @@ func main() {
|
||||
loggerLevel := parseLogLevel(logLevel)
|
||||
logger.GetLogger().SetLevel(parseLogLevel(logLevel))
|
||||
|
||||
+ // Initialize telemetry after flags are parsed (so flags override env)
|
||||
+ tcfg := telemetry.FromEnv()
|
||||
+ tcfg.PromEnabled = metricsEnabled
|
||||
+ tcfg.OTLPEnabled = otlpEnabled
|
||||
+ if adminAddr != "" { tcfg.AdminAddr = adminAddr }
|
||||
+ // Resource attributes (if available)
|
||||
+ tcfg.SiteID = id
|
||||
+ tcfg.Region = region
|
||||
+ // Build info
|
||||
+ tcfg.BuildVersion = newtVersion
|
||||
+ tcfg.BuildCommit = os.Getenv("NEWT_COMMIT")
|
||||
+
|
||||
+ tel, telErr := telemetry.Init(ctx, tcfg)
|
||||
+ if telErr != nil {
|
||||
+ logger.Warn("Telemetry init failed: %v", telErr)
|
||||
+ }
|
||||
+ if tel != nil {
|
||||
+ // Admin HTTP server (exposes /metrics when Prometheus exporter is enabled)
|
||||
+ mux := http.NewServeMux()
|
||||
+ mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) })
|
||||
+ if tel.PrometheusHandler != nil {
|
||||
+ mux.Handle("/metrics", tel.PrometheusHandler)
|
||||
+ }
|
||||
+ admin := &http.Server{
|
||||
+ Addr: tcfg.AdminAddr,
|
||||
+ Handler: otelhttp.NewHandler(mux, "newt-admin"),
|
||||
+ ReadTimeout: 5 * time.Second,
|
||||
+ WriteTimeout: 10 * time.Second,
|
||||
+ ReadHeaderTimeout: 5 * time.Second,
|
||||
+ IdleTimeout: 30 * time.Second,
|
||||
+ }
|
||||
+ go func() {
|
||||
+ if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
|
||||
+ logger.Warn("admin http error: %v", err)
|
||||
+ }
|
||||
+ }()
|
||||
+ defer func() {
|
||||
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
+ defer cancel()
|
||||
+ _ = admin.Shutdown(ctx)
|
||||
+ }()
|
||||
+ defer func() { _ = tel.Shutdown(context.Background()) }()
|
||||
+ }
|
||||
+
|
||||
newtVersion := "version_replaceme"
|
||||
if *version {
|
||||
fmt.Println("Newt version " + newtVersion)
|
||||
@@ -557,7 +653,10 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
// Use reliable ping for initial connection test
|
||||
logger.Debug("Testing initial connection with reliable ping...")
|
||||
- _, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
+ lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
+ if err == nil && wgData.PublicKey != "" {
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", wgData.PublicKey, "wireguard", lat.Seconds())
|
||||
+ }
|
||||
if err != nil {
|
||||
logger.Warn("Initial reliable ping failed, but continuing: %v", err)
|
||||
} else {
|
||||
@@ -570,14 +669,20 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
// as the pings will continue in the background
|
||||
if !connected {
|
||||
logger.Debug("Starting ping check")
|
||||
- pingStopChan = startPingCheck(tnet, wgData.ServerIP, client)
|
||||
+ pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey)
|
||||
}
|
||||
|
||||
// Create proxy manager
|
||||
pm = proxy.NewProxyManager(tnet)
|
||||
+ pm.SetAsyncBytes(metricsAsyncBytes)
|
||||
+ // Set tunnel_id for metrics (WireGuard peer public key)
|
||||
+ pm.SetTunnelID(wgData.PublicKey)
|
||||
|
||||
connected = true
|
||||
|
||||
+ // telemetry: record a successful site registration (omit region unless available)
|
||||
+ telemetry.IncSiteRegistration(context.Background(), id, "", "success")
|
||||
+
|
||||
// add the targets if there are any
|
||||
if len(wgData.Targets.TCP) > 0 {
|
||||
updateTargets(pm, "add", wgData.TunnelIP, "tcp", TargetData{Targets: wgData.Targets.TCP})
|
||||
@@ -611,10 +716,25 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received reconnect message")
|
||||
+ if wgData.PublicKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
|
||||
+ }
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
|
||||
+ // Clear metrics attrs and sessions for the tunnel
|
||||
+ if pm != nil {
|
||||
+ pm.ClearTunnelID()
|
||||
+ state.Global().ClearTunnel(wgData.PublicKey)
|
||||
+ }
|
||||
+
|
||||
+ // Clear metrics attrs and sessions for the tunnel
|
||||
+ if pm != nil {
|
||||
+ pm.ClearTunnelID()
|
||||
+ state.Global().ClearTunnel(wgData.PublicKey)
|
||||
+ }
|
||||
+
|
||||
// Mark as disconnected
|
||||
connected = false
|
||||
|
||||
@@ -631,6 +751,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received termination message")
|
||||
+ if wgData.PublicKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
|
||||
+ }
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
diff --git a/util.go b/util.go
|
||||
index 7d6da4f..c1f4915 100644
|
||||
--- a/util.go
|
||||
+++ b/util.go
|
||||
@@ -17,6 +17,7 @@ import (
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/fosrl/newt/proxy"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
"golang.org/x/net/icmp"
|
||||
"golang.org/x/net/ipv4"
|
||||
"golang.zx2c4.com/wireguard/device"
|
||||
@@ -229,7 +230,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
|
||||
return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background")
|
||||
}
|
||||
|
||||
-func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client) chan struct{} {
|
||||
+func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
|
||||
maxInterval := 6 * time.Second
|
||||
currentInterval := pingInterval
|
||||
consecutiveFailures := 0
|
||||
@@ -292,6 +293,9 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
|
||||
if !connectionLost {
|
||||
connectionLost = true
|
||||
logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
|
||||
+ if tunnelID != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", tunnelID, telemetry.ReasonTimeout)
|
||||
+ }
|
||||
stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second)
|
||||
// Send registration message to the server for backward compatibility
|
||||
err := client.SendMessage("newt/wg/register", map[string]interface{}{
|
||||
@@ -318,6 +322,10 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
|
||||
} else {
|
||||
// Track recent latencies
|
||||
recentLatencies = append(recentLatencies, latency)
|
||||
+ // Record tunnel latency (limit sampling to this periodic check)
|
||||
+ if tunnelID != "" {
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", tunnelID, "wireguard", latency.Seconds())
|
||||
+ }
|
||||
if len(recentLatencies) > 10 {
|
||||
recentLatencies = recentLatencies[1:]
|
||||
}
|
||||
diff --git a/websocket/client.go b/websocket/client.go
|
||||
index 0c0664a..c9ac264 100644
|
||||
--- a/websocket/client.go
|
||||
+++ b/websocket/client.go
|
||||
@@ -18,6 +18,10 @@ import (
|
||||
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/gorilla/websocket"
|
||||
+
|
||||
+ "context"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
+ "go.opentelemetry.io/otel"
|
||||
)
|
||||
|
||||
type Client struct {
|
||||
@@ -287,6 +291,7 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", classifyConnError(err))
|
||||
return "", fmt.Errorf("failed to request new token: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
@@ -294,6 +299,18 @@ func (c *Client) getToken() (string, error) {
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "failure")
|
||||
+ bin := "http_other"
|
||||
+ if resp.StatusCode >= 500 {
|
||||
+ bin = "http_5xx"
|
||||
+ } else if resp.StatusCode >= 400 {
|
||||
+ bin = "http_4xx"
|
||||
+ }
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", bin)
|
||||
+ // Reconnect reason mapping for auth failures
|
||||
+ if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonAuthError)
|
||||
+ }
|
||||
return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
@@ -312,10 +329,33 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
|
||||
logger.Debug("Received token: %s", tokenResp.Data.Token)
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "success")
|
||||
|
||||
return tokenResp.Data.Token, nil
|
||||
}
|
||||
|
||||
+// classifyConnError maps common errors to low-cardinality error_type labels
|
||||
+func classifyConnError(err error) string {
|
||||
+ if err == nil {
|
||||
+ return ""
|
||||
+ }
|
||||
+ msg := strings.ToLower(err.Error())
|
||||
+ switch {
|
||||
+ case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"):
|
||||
+ return "tls"
|
||||
+ case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout"):
|
||||
+ return "timeout"
|
||||
+ case strings.Contains(msg, "no such host") || strings.Contains(msg, "dns"):
|
||||
+ return "dns"
|
||||
+ case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"):
|
||||
+ return "auth"
|
||||
+ case strings.Contains(msg, "broken pipe") || strings.Contains(msg, "connection reset") || strings.Contains(msg, "connection refused") || strings.Contains(msg, "use of closed network connection") || strings.Contains(msg, "network is unreachable"):
|
||||
+ return "io"
|
||||
+ default:
|
||||
+ return "other"
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
func (c *Client) connectWithRetry() {
|
||||
for {
|
||||
select {
|
||||
@@ -337,6 +377,10 @@ func (c *Client) establishConnection() error {
|
||||
// Get token for authentication
|
||||
token, err := c.getToken()
|
||||
if err != nil {
|
||||
+ // telemetry: connection attempt failed before dialing
|
||||
+ // site_id isn't globally available here; use client ID as site_id (low cardinality)
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", classifyConnError(err))
|
||||
return fmt.Errorf("failed to get token: %w", err)
|
||||
}
|
||||
|
||||
@@ -369,7 +413,11 @@ func (c *Client) establishConnection() error {
|
||||
q.Set("clientType", c.clientType)
|
||||
u.RawQuery = q.Encode()
|
||||
|
||||
- // Connect to WebSocket
|
||||
+ // Connect to WebSocket (optional span)
|
||||
+ tr := otel.Tracer("newt")
|
||||
+ spanCtx, span := tr.Start(context.Background(), "ws.connect")
|
||||
+ defer span.End()
|
||||
+
|
||||
dialer := websocket.DefaultDialer
|
||||
|
||||
// Use new TLS configuration method
|
||||
@@ -391,11 +439,23 @@ func (c *Client) establishConnection() error {
|
||||
logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable")
|
||||
}
|
||||
|
||||
- conn, _, err := dialer.Dial(u.String(), nil)
|
||||
+conn, _, err := dialer.DialContext(spanCtx, u.String(), nil)
|
||||
if err != nil {
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
|
||||
+ etype := classifyConnError(err)
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", etype)
|
||||
+ // Map handshake-related errors to reconnect reasons where appropriate
|
||||
+ if etype == "tls" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonHandshakeError)
|
||||
+ } else if etype == "timeout" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonTimeout)
|
||||
+ } else {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonError)
|
||||
+ }
|
||||
return fmt.Errorf("failed to connect to WebSocket: %w", err)
|
||||
}
|
||||
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "success")
|
||||
c.conn = conn
|
||||
c.setConnected(true)
|
||||
|
||||
diff --git a/wg/wg.go b/wg/wg.go
|
||||
index 3cee1a9..a765279 100644
|
||||
--- a/wg/wg.go
|
||||
+++ b/wg/wg.go
|
||||
@@ -3,6 +3,7 @@
|
||||
package wg
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
@@ -23,6 +24,8 @@ import (
|
||||
"golang.zx2c4.com/wireguard/conn"
|
||||
"golang.zx2c4.com/wireguard/wgctrl"
|
||||
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
|
||||
+
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
)
|
||||
|
||||
type WgConfig struct {
|
||||
@@ -298,6 +301,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) {
|
||||
s.stopGetConfig = nil
|
||||
}
|
||||
|
||||
+ // telemetry: config reload success
|
||||
+ telemetry.IncConfigReload(context.Background(), "success")
|
||||
+ // Optional reconnect reason mapping: config change
|
||||
+ if s.serverPubKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", s.serverPubKey, telemetry.ReasonConfigChange)
|
||||
+ }
|
||||
+
|
||||
// Ensure the WireGuard interface and peers are configured
|
||||
if err := s.ensureWireguardInterface(config); err != nil {
|
||||
logger.Error("Failed to ensure WireGuard interface: %v", err)
|
||||
466
patches/02_reconnect_rtt.patch
Normal file
466
patches/02_reconnect_rtt.patch
Normal file
@@ -0,0 +1,466 @@
|
||||
diff --git a/main.go b/main.go
|
||||
index 12849b1..c223b75 100644
|
||||
--- a/main.go
|
||||
+++ b/main.go
|
||||
@@ -1,7 +1,9 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"encoding/json"
|
||||
+ "errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net"
|
||||
@@ -22,6 +24,9 @@ import (
|
||||
"github.com/fosrl/newt/updates"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
|
||||
+ "github.com/fosrl/newt/internal/state"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
+ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
|
||||
"golang.zx2c4.com/wireguard/conn"
|
||||
"golang.zx2c4.com/wireguard/device"
|
||||
"golang.zx2c4.com/wireguard/tun"
|
||||
@@ -116,6 +121,13 @@ var (
|
||||
healthMonitor *healthcheck.Monitor
|
||||
enforceHealthcheckCert bool
|
||||
|
||||
+ // Observability/metrics flags
|
||||
+ metricsEnabled bool
|
||||
+ otlpEnabled bool
|
||||
+ adminAddr string
|
||||
+ region string
|
||||
+ metricsAsyncBytes bool
|
||||
+
|
||||
// New mTLS configuration variables
|
||||
tlsClientCert string
|
||||
tlsClientKey string
|
||||
@@ -126,6 +138,10 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
+ // Prepare context for graceful shutdown and signal handling
|
||||
+ ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
|
||||
+ defer stop()
|
||||
+
|
||||
// if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values
|
||||
endpoint = os.Getenv("PANGOLIN_ENDPOINT")
|
||||
id = os.Getenv("NEWT_ID")
|
||||
@@ -141,6 +157,13 @@ func main() {
|
||||
useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE")
|
||||
enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT")
|
||||
|
||||
+ // Metrics/observability env mirrors
|
||||
+ metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED")
|
||||
+ otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED")
|
||||
+ adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR")
|
||||
+ regionEnv := os.Getenv("NEWT_REGION")
|
||||
+ asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES")
|
||||
+
|
||||
keepInterface = keepInterfaceEnv == "true"
|
||||
acceptClients = acceptClientsEnv == "true"
|
||||
useNativeInterface = useNativeInterfaceEnv == "true"
|
||||
@@ -272,6 +295,35 @@ func main() {
|
||||
flag.StringVar(&healthFile, "health-file", "", "Path to health file (if unset, health file won't be written)")
|
||||
}
|
||||
|
||||
+ // Metrics/observability flags (mirror ENV if unset)
|
||||
+ if metricsEnabledEnv == "" {
|
||||
+ flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil { metricsEnabled = v } else { metricsEnabled = true }
|
||||
+ }
|
||||
+ if otlpEnabledEnv == "" {
|
||||
+ flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil { otlpEnabled = v }
|
||||
+ }
|
||||
+ if adminAddrEnv == "" {
|
||||
+ flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address")
|
||||
+ } else {
|
||||
+ adminAddr = adminAddrEnv
|
||||
+ }
|
||||
+ // Async bytes toggle
|
||||
+ if asyncBytesEnv == "" {
|
||||
+ flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(asyncBytesEnv); err == nil { metricsAsyncBytes = v }
|
||||
+ }
|
||||
+ // Optional region flag (resource attribute)
|
||||
+ if regionEnv == "" {
|
||||
+ flag.StringVar(®ion, "region", "", "Optional region resource attribute (also NEWT_REGION)")
|
||||
+ } else {
|
||||
+ region = regionEnv
|
||||
+ }
|
||||
+
|
||||
// do a --version check
|
||||
version := flag.Bool("version", false, "Print the version")
|
||||
|
||||
@@ -286,6 +338,50 @@ func main() {
|
||||
loggerLevel := parseLogLevel(logLevel)
|
||||
logger.GetLogger().SetLevel(parseLogLevel(logLevel))
|
||||
|
||||
+ // Initialize telemetry after flags are parsed (so flags override env)
|
||||
+ tcfg := telemetry.FromEnv()
|
||||
+ tcfg.PromEnabled = metricsEnabled
|
||||
+ tcfg.OTLPEnabled = otlpEnabled
|
||||
+ if adminAddr != "" { tcfg.AdminAddr = adminAddr }
|
||||
+ // Resource attributes (if available)
|
||||
+ tcfg.SiteID = id
|
||||
+ tcfg.Region = region
|
||||
+ // Build info
|
||||
+ tcfg.BuildVersion = newtVersion
|
||||
+ tcfg.BuildCommit = os.Getenv("NEWT_COMMIT")
|
||||
+
|
||||
+ tel, telErr := telemetry.Init(ctx, tcfg)
|
||||
+ if telErr != nil {
|
||||
+ logger.Warn("Telemetry init failed: %v", telErr)
|
||||
+ }
|
||||
+ if tel != nil {
|
||||
+ // Admin HTTP server (exposes /metrics when Prometheus exporter is enabled)
|
||||
+ mux := http.NewServeMux()
|
||||
+ mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) })
|
||||
+ if tel.PrometheusHandler != nil {
|
||||
+ mux.Handle("/metrics", tel.PrometheusHandler)
|
||||
+ }
|
||||
+ admin := &http.Server{
|
||||
+ Addr: tcfg.AdminAddr,
|
||||
+ Handler: otelhttp.NewHandler(mux, "newt-admin"),
|
||||
+ ReadTimeout: 5 * time.Second,
|
||||
+ WriteTimeout: 10 * time.Second,
|
||||
+ ReadHeaderTimeout: 5 * time.Second,
|
||||
+ IdleTimeout: 30 * time.Second,
|
||||
+ }
|
||||
+ go func() {
|
||||
+ if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
|
||||
+ logger.Warn("admin http error: %v", err)
|
||||
+ }
|
||||
+ }()
|
||||
+ defer func() {
|
||||
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
+ defer cancel()
|
||||
+ _ = admin.Shutdown(ctx)
|
||||
+ }()
|
||||
+ defer func() { _ = tel.Shutdown(context.Background()) }()
|
||||
+ }
|
||||
+
|
||||
newtVersion := "version_replaceme"
|
||||
if *version {
|
||||
fmt.Println("Newt version " + newtVersion)
|
||||
@@ -557,7 +653,10 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
// Use reliable ping for initial connection test
|
||||
logger.Debug("Testing initial connection with reliable ping...")
|
||||
- _, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
+ lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
+ if err == nil && wgData.PublicKey != "" {
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", wgData.PublicKey, "wireguard", lat.Seconds())
|
||||
+ }
|
||||
if err != nil {
|
||||
logger.Warn("Initial reliable ping failed, but continuing: %v", err)
|
||||
} else {
|
||||
@@ -570,14 +669,20 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
// as the pings will continue in the background
|
||||
if !connected {
|
||||
logger.Debug("Starting ping check")
|
||||
- pingStopChan = startPingCheck(tnet, wgData.ServerIP, client)
|
||||
+ pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey)
|
||||
}
|
||||
|
||||
// Create proxy manager
|
||||
pm = proxy.NewProxyManager(tnet)
|
||||
+ pm.SetAsyncBytes(metricsAsyncBytes)
|
||||
+ // Set tunnel_id for metrics (WireGuard peer public key)
|
||||
+ pm.SetTunnelID(wgData.PublicKey)
|
||||
|
||||
connected = true
|
||||
|
||||
+ // telemetry: record a successful site registration (omit region unless available)
|
||||
+ telemetry.IncSiteRegistration(context.Background(), id, "", "success")
|
||||
+
|
||||
// add the targets if there are any
|
||||
if len(wgData.Targets.TCP) > 0 {
|
||||
updateTargets(pm, "add", wgData.TunnelIP, "tcp", TargetData{Targets: wgData.Targets.TCP})
|
||||
@@ -611,10 +716,25 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received reconnect message")
|
||||
+ if wgData.PublicKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
|
||||
+ }
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
|
||||
+ // Clear metrics attrs and sessions for the tunnel
|
||||
+ if pm != nil {
|
||||
+ pm.ClearTunnelID()
|
||||
+ state.Global().ClearTunnel(wgData.PublicKey)
|
||||
+ }
|
||||
+
|
||||
+ // Clear metrics attrs and sessions for the tunnel
|
||||
+ if pm != nil {
|
||||
+ pm.ClearTunnelID()
|
||||
+ state.Global().ClearTunnel(wgData.PublicKey)
|
||||
+ }
|
||||
+
|
||||
// Mark as disconnected
|
||||
connected = false
|
||||
|
||||
@@ -631,6 +751,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received termination message")
|
||||
+ if wgData.PublicKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
|
||||
+ }
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
diff --git a/util.go b/util.go
|
||||
index 7d6da4f..c1f4915 100644
|
||||
--- a/util.go
|
||||
+++ b/util.go
|
||||
@@ -17,6 +17,7 @@ import (
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/fosrl/newt/proxy"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
"golang.org/x/net/icmp"
|
||||
"golang.org/x/net/ipv4"
|
||||
"golang.zx2c4.com/wireguard/device"
|
||||
@@ -229,7 +230,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
|
||||
return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background")
|
||||
}
|
||||
|
||||
-func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client) chan struct{} {
|
||||
+func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
|
||||
maxInterval := 6 * time.Second
|
||||
currentInterval := pingInterval
|
||||
consecutiveFailures := 0
|
||||
@@ -292,6 +293,9 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
|
||||
if !connectionLost {
|
||||
connectionLost = true
|
||||
logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
|
||||
+ if tunnelID != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", tunnelID, telemetry.ReasonTimeout)
|
||||
+ }
|
||||
stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second)
|
||||
// Send registration message to the server for backward compatibility
|
||||
err := client.SendMessage("newt/wg/register", map[string]interface{}{
|
||||
@@ -318,6 +322,10 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
|
||||
} else {
|
||||
// Track recent latencies
|
||||
recentLatencies = append(recentLatencies, latency)
|
||||
+ // Record tunnel latency (limit sampling to this periodic check)
|
||||
+ if tunnelID != "" {
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", tunnelID, "wireguard", latency.Seconds())
|
||||
+ }
|
||||
if len(recentLatencies) > 10 {
|
||||
recentLatencies = recentLatencies[1:]
|
||||
}
|
||||
diff --git a/websocket/client.go b/websocket/client.go
|
||||
index 0c0664a..c9ac264 100644
|
||||
--- a/websocket/client.go
|
||||
+++ b/websocket/client.go
|
||||
@@ -18,6 +18,10 @@ import (
|
||||
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/gorilla/websocket"
|
||||
+
|
||||
+ "context"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
+ "go.opentelemetry.io/otel"
|
||||
)
|
||||
|
||||
type Client struct {
|
||||
@@ -287,6 +291,7 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", classifyConnError(err))
|
||||
return "", fmt.Errorf("failed to request new token: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
@@ -294,6 +299,18 @@ func (c *Client) getToken() (string, error) {
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "failure")
|
||||
+ bin := "http_other"
|
||||
+ if resp.StatusCode >= 500 {
|
||||
+ bin = "http_5xx"
|
||||
+ } else if resp.StatusCode >= 400 {
|
||||
+ bin = "http_4xx"
|
||||
+ }
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", bin)
|
||||
+ // Reconnect reason mapping for auth failures
|
||||
+ if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonAuthError)
|
||||
+ }
|
||||
return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
@@ -312,10 +329,33 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
|
||||
logger.Debug("Received token: %s", tokenResp.Data.Token)
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "success")
|
||||
|
||||
return tokenResp.Data.Token, nil
|
||||
}
|
||||
|
||||
+// classifyConnError maps common errors to low-cardinality error_type labels
|
||||
+func classifyConnError(err error) string {
|
||||
+ if err == nil {
|
||||
+ return ""
|
||||
+ }
|
||||
+ msg := strings.ToLower(err.Error())
|
||||
+ switch {
|
||||
+ case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"):
|
||||
+ return "tls"
|
||||
+ case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout"):
|
||||
+ return "timeout"
|
||||
+ case strings.Contains(msg, "no such host") || strings.Contains(msg, "dns"):
|
||||
+ return "dns"
|
||||
+ case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"):
|
||||
+ return "auth"
|
||||
+ case strings.Contains(msg, "broken pipe") || strings.Contains(msg, "connection reset") || strings.Contains(msg, "connection refused") || strings.Contains(msg, "use of closed network connection") || strings.Contains(msg, "network is unreachable"):
|
||||
+ return "io"
|
||||
+ default:
|
||||
+ return "other"
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
func (c *Client) connectWithRetry() {
|
||||
for {
|
||||
select {
|
||||
@@ -337,6 +377,10 @@ func (c *Client) establishConnection() error {
|
||||
// Get token for authentication
|
||||
token, err := c.getToken()
|
||||
if err != nil {
|
||||
+ // telemetry: connection attempt failed before dialing
|
||||
+ // site_id isn't globally available here; use client ID as site_id (low cardinality)
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", classifyConnError(err))
|
||||
return fmt.Errorf("failed to get token: %w", err)
|
||||
}
|
||||
|
||||
@@ -369,7 +413,11 @@ func (c *Client) establishConnection() error {
|
||||
q.Set("clientType", c.clientType)
|
||||
u.RawQuery = q.Encode()
|
||||
|
||||
- // Connect to WebSocket
|
||||
+ // Connect to WebSocket (optional span)
|
||||
+ tr := otel.Tracer("newt")
|
||||
+ spanCtx, span := tr.Start(context.Background(), "ws.connect")
|
||||
+ defer span.End()
|
||||
+
|
||||
dialer := websocket.DefaultDialer
|
||||
|
||||
// Use new TLS configuration method
|
||||
@@ -391,11 +439,23 @@ func (c *Client) establishConnection() error {
|
||||
logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable")
|
||||
}
|
||||
|
||||
- conn, _, err := dialer.Dial(u.String(), nil)
|
||||
+conn, _, err := dialer.DialContext(spanCtx, u.String(), nil)
|
||||
if err != nil {
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
|
||||
+ etype := classifyConnError(err)
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", etype)
|
||||
+ // Map handshake-related errors to reconnect reasons where appropriate
|
||||
+ if etype == "tls" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonHandshakeError)
|
||||
+ } else if etype == "timeout" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonTimeout)
|
||||
+ } else {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonError)
|
||||
+ }
|
||||
return fmt.Errorf("failed to connect to WebSocket: %w", err)
|
||||
}
|
||||
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "success")
|
||||
c.conn = conn
|
||||
c.setConnected(true)
|
||||
|
||||
diff --git a/wg/wg.go b/wg/wg.go
|
||||
index 3cee1a9..a765279 100644
|
||||
--- a/wg/wg.go
|
||||
+++ b/wg/wg.go
|
||||
@@ -3,6 +3,7 @@
|
||||
package wg
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
@@ -23,6 +24,8 @@ import (
|
||||
"golang.zx2c4.com/wireguard/conn"
|
||||
"golang.zx2c4.com/wireguard/wgctrl"
|
||||
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
|
||||
+
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
)
|
||||
|
||||
type WgConfig struct {
|
||||
@@ -298,6 +301,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) {
|
||||
s.stopGetConfig = nil
|
||||
}
|
||||
|
||||
+ // telemetry: config reload success
|
||||
+ telemetry.IncConfigReload(context.Background(), "success")
|
||||
+ // Optional reconnect reason mapping: config change
|
||||
+ if s.serverPubKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", s.serverPubKey, telemetry.ReasonConfigChange)
|
||||
+ }
|
||||
+
|
||||
// Ensure the WireGuard interface and peers are configured
|
||||
if err := s.ensureWireguardInterface(config); err != nil {
|
||||
logger.Error("Failed to ensure WireGuard interface: %v", err)
|
||||
diff --git a/wgnetstack/wgnetstack.go b/wgnetstack/wgnetstack.go
|
||||
index 6684c40..09f160e 100644
|
||||
--- a/wgnetstack/wgnetstack.go
|
||||
+++ b/wgnetstack/wgnetstack.go
|
||||
@@ -1,6 +1,7 @@
|
||||
package wgnetstack
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
@@ -26,6 +27,8 @@ import (
|
||||
"golang.zx2c4.com/wireguard/tun"
|
||||
"golang.zx2c4.com/wireguard/tun/netstack"
|
||||
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
|
||||
+
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
)
|
||||
|
||||
type WgConfig struct {
|
||||
@@ -240,14 +243,20 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str
|
||||
return service, nil
|
||||
}
|
||||
|
||||
+// ReportRTT allows reporting native RTTs to telemetry, rate-limited externally.
|
||||
+func (s *WireGuardService) ReportRTT(seconds float64) {
|
||||
+ if s.serverPubKey == "" { return }
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", s.serverPubKey, "wireguard", seconds)
|
||||
+}
|
||||
+
|
||||
func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) {
|
||||
logger.Debug("Received: %+v", msg)
|
||||
|
||||
// if there is no wgData or pm, we can't add targets
|
||||
if s.TunnelIP == "" || s.proxyManager == nil {
|
||||
logger.Info("No tunnel IP or proxy manager available")
|
||||
- return
|
||||
- }
|
||||
+ return
|
||||
+}
|
||||
|
||||
targetData, err := parseTargetData(msg.Data)
|
||||
if err != nil {
|
||||
0
patches/03_constants_docs.patch
Normal file
0
patches/03_constants_docs.patch
Normal file
44
patches/03_wg_rtt_hook.patch
Normal file
44
patches/03_wg_rtt_hook.patch
Normal file
@@ -0,0 +1,44 @@
|
||||
diff --git a/wgnetstack/wgnetstack.go b/wgnetstack/wgnetstack.go
|
||||
index 6684c40..09f160e 100644
|
||||
--- a/wgnetstack/wgnetstack.go
|
||||
+++ b/wgnetstack/wgnetstack.go
|
||||
@@ -1,6 +1,7 @@
|
||||
package wgnetstack
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
@@ -26,6 +27,8 @@ import (
|
||||
"golang.zx2c4.com/wireguard/tun"
|
||||
"golang.zx2c4.com/wireguard/tun/netstack"
|
||||
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
|
||||
+
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
)
|
||||
|
||||
type WgConfig struct {
|
||||
@@ -240,14 +243,20 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str
|
||||
return service, nil
|
||||
}
|
||||
|
||||
+// ReportRTT allows reporting native RTTs to telemetry, rate-limited externally.
|
||||
+func (s *WireGuardService) ReportRTT(seconds float64) {
|
||||
+ if s.serverPubKey == "" { return }
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", s.serverPubKey, "wireguard", seconds)
|
||||
+}
|
||||
+
|
||||
func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) {
|
||||
logger.Debug("Received: %+v", msg)
|
||||
|
||||
// if there is no wgData or pm, we can't add targets
|
||||
if s.TunnelIP == "" || s.proxyManager == nil {
|
||||
logger.Info("No tunnel IP or proxy manager available")
|
||||
- return
|
||||
- }
|
||||
+ return
|
||||
+}
|
||||
|
||||
targetData, err := parseTargetData(msg.Data)
|
||||
if err != nil {
|
||||
0
patches/04_tests_docs.patch
Normal file
0
patches/04_tests_docs.patch
Normal file
25
patches/HOWTO-APPLY.md
Normal file
25
patches/HOWTO-APPLY.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# How to apply patches
|
||||
|
||||
These patches were generated from the working tree without commits. You can apply them in one shot or in topic order.
|
||||
|
||||
One shot (recommended during review):
|
||||
|
||||
```bash
|
||||
git apply patches/00_all_changes.patch
|
||||
```
|
||||
|
||||
Topic order:
|
||||
|
||||
```bash
|
||||
git apply patches/01_proxy_multitunnel.patch
|
||||
git apply patches/02_reconnect_rtt.patch
|
||||
git apply patches/03_constants_docs.patch
|
||||
```
|
||||
|
||||
Rollback (restore to HEAD and clean untracked files):
|
||||
|
||||
```bash
|
||||
git restore --source=HEAD --worktree --staged .
|
||||
git clean -fd
|
||||
```
|
||||
|
||||
379
proxy/manager.go
379
proxy/manager.go
@@ -1,18 +1,28 @@
|
||||
package proxy
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/fosrl/newt/internal/state"
|
||||
"github.com/fosrl/newt/internal/telemetry"
|
||||
"github.com/fosrl/newt/logger"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/metric"
|
||||
"golang.zx2c4.com/wireguard/tun/netstack"
|
||||
"gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"
|
||||
)
|
||||
|
||||
const errUnsupportedProtoFmt = "unsupported protocol: %s"
|
||||
|
||||
// Target represents a proxy target with its address and port
|
||||
type Target struct {
|
||||
Address string
|
||||
@@ -28,6 +38,90 @@ type ProxyManager struct {
|
||||
udpConns []*gonet.UDPConn
|
||||
running bool
|
||||
mutex sync.RWMutex
|
||||
|
||||
// telemetry (multi-tunnel)
|
||||
currentTunnelID string
|
||||
tunnels map[string]*tunnelEntry
|
||||
asyncBytes bool
|
||||
flushStop chan struct{}
|
||||
}
|
||||
|
||||
// tunnelEntry holds per-tunnel attributes and (optional) async counters.
|
||||
type tunnelEntry struct {
|
||||
attrInTCP attribute.Set
|
||||
attrOutTCP attribute.Set
|
||||
attrInUDP attribute.Set
|
||||
attrOutUDP attribute.Set
|
||||
|
||||
bytesInTCP atomic.Uint64
|
||||
bytesOutTCP atomic.Uint64
|
||||
bytesInUDP atomic.Uint64
|
||||
bytesOutUDP atomic.Uint64
|
||||
|
||||
activeTCP atomic.Int64
|
||||
activeUDP atomic.Int64
|
||||
}
|
||||
|
||||
// countingWriter wraps an io.Writer and adds bytes to OTel counter using a pre-built attribute set.
|
||||
type countingWriter struct {
|
||||
ctx context.Context
|
||||
w io.Writer
|
||||
set attribute.Set
|
||||
pm *ProxyManager
|
||||
ent *tunnelEntry
|
||||
out bool // false=in, true=out
|
||||
proto string // "tcp" or "udp"
|
||||
}
|
||||
|
||||
func (cw *countingWriter) Write(p []byte) (int, error) {
|
||||
n, err := cw.w.Write(p)
|
||||
if n > 0 {
|
||||
if cw.pm != nil && cw.pm.asyncBytes && cw.ent != nil {
|
||||
switch cw.proto {
|
||||
case "tcp":
|
||||
if cw.out {
|
||||
cw.ent.bytesOutTCP.Add(uint64(n))
|
||||
} else {
|
||||
cw.ent.bytesInTCP.Add(uint64(n))
|
||||
}
|
||||
case "udp":
|
||||
if cw.out {
|
||||
cw.ent.bytesOutUDP.Add(uint64(n))
|
||||
} else {
|
||||
cw.ent.bytesInUDP.Add(uint64(n))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
telemetry.AddTunnelBytesSet(cw.ctx, int64(n), cw.set)
|
||||
}
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
|
||||
func classifyProxyError(err error) string {
|
||||
if err == nil {
|
||||
return ""
|
||||
}
|
||||
if errors.Is(err, net.ErrClosed) {
|
||||
return "closed"
|
||||
}
|
||||
if ne, ok := err.(net.Error); ok {
|
||||
if ne.Timeout() {
|
||||
return "timeout"
|
||||
}
|
||||
if ne.Temporary() {
|
||||
return "temporary"
|
||||
}
|
||||
}
|
||||
msg := strings.ToLower(err.Error())
|
||||
switch {
|
||||
case strings.Contains(msg, "refused"):
|
||||
return "refused"
|
||||
case strings.Contains(msg, "reset"):
|
||||
return "reset"
|
||||
default:
|
||||
return "io_error"
|
||||
}
|
||||
}
|
||||
|
||||
// NewProxyManager creates a new proxy manager instance
|
||||
@@ -38,9 +132,77 @@ func NewProxyManager(tnet *netstack.Net) *ProxyManager {
|
||||
udpTargets: make(map[string]map[int]string),
|
||||
listeners: make([]*gonet.TCPListener, 0),
|
||||
udpConns: make([]*gonet.UDPConn, 0),
|
||||
tunnels: make(map[string]*tunnelEntry),
|
||||
}
|
||||
}
|
||||
|
||||
// SetTunnelID sets the WireGuard peer public key used as tunnel_id label.
|
||||
func (pm *ProxyManager) SetTunnelID(id string) {
|
||||
pm.mutex.Lock()
|
||||
defer pm.mutex.Unlock()
|
||||
pm.currentTunnelID = id
|
||||
if _, ok := pm.tunnels[id]; !ok {
|
||||
pm.tunnels[id] = &tunnelEntry{}
|
||||
}
|
||||
e := pm.tunnels[id]
|
||||
// include site labels if available
|
||||
site := telemetry.SiteLabelKVs()
|
||||
build := func(base []attribute.KeyValue) attribute.Set {
|
||||
if telemetry.ShouldIncludeTunnelID() {
|
||||
base = append([]attribute.KeyValue{attribute.String("tunnel_id", id)}, base...)
|
||||
}
|
||||
base = append(site, base...)
|
||||
return attribute.NewSet(base...)
|
||||
}
|
||||
e.attrInTCP = build([]attribute.KeyValue{
|
||||
attribute.String("direction", "ingress"),
|
||||
attribute.String("protocol", "tcp"),
|
||||
})
|
||||
e.attrOutTCP = build([]attribute.KeyValue{
|
||||
attribute.String("direction", "egress"),
|
||||
attribute.String("protocol", "tcp"),
|
||||
})
|
||||
e.attrInUDP = build([]attribute.KeyValue{
|
||||
attribute.String("direction", "ingress"),
|
||||
attribute.String("protocol", "udp"),
|
||||
})
|
||||
e.attrOutUDP = build([]attribute.KeyValue{
|
||||
attribute.String("direction", "egress"),
|
||||
attribute.String("protocol", "udp"),
|
||||
})
|
||||
}
|
||||
|
||||
// ClearTunnelID clears cached attribute sets for the current tunnel.
|
||||
func (pm *ProxyManager) ClearTunnelID() {
|
||||
pm.mutex.Lock()
|
||||
defer pm.mutex.Unlock()
|
||||
id := pm.currentTunnelID
|
||||
if id == "" {
|
||||
return
|
||||
}
|
||||
if e, ok := pm.tunnels[id]; ok {
|
||||
// final flush for this tunnel
|
||||
inTCP := e.bytesInTCP.Swap(0)
|
||||
outTCP := e.bytesOutTCP.Swap(0)
|
||||
inUDP := e.bytesInUDP.Swap(0)
|
||||
outUDP := e.bytesOutUDP.Swap(0)
|
||||
if inTCP > 0 {
|
||||
telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP)
|
||||
}
|
||||
if outTCP > 0 {
|
||||
telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP)
|
||||
}
|
||||
if inUDP > 0 {
|
||||
telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP)
|
||||
}
|
||||
if outUDP > 0 {
|
||||
telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP)
|
||||
}
|
||||
delete(pm.tunnels, id)
|
||||
}
|
||||
pm.currentTunnelID = ""
|
||||
}
|
||||
|
||||
// init function without tnet
|
||||
func NewProxyManagerWithoutTNet() *ProxyManager {
|
||||
return &ProxyManager{
|
||||
@@ -75,7 +237,7 @@ func (pm *ProxyManager) AddTarget(proto, listenIP string, port int, targetAddr s
|
||||
}
|
||||
pm.udpTargets[listenIP][port] = targetAddr
|
||||
default:
|
||||
return fmt.Errorf("unsupported protocol: %s", proto)
|
||||
return fmt.Errorf(errUnsupportedProtoFmt, proto)
|
||||
}
|
||||
|
||||
if pm.running {
|
||||
@@ -124,13 +286,28 @@ func (pm *ProxyManager) RemoveTarget(proto, listenIP string, port int) error {
|
||||
return fmt.Errorf("target not found: %s:%d", listenIP, port)
|
||||
}
|
||||
default:
|
||||
return fmt.Errorf("unsupported protocol: %s", proto)
|
||||
return fmt.Errorf(errUnsupportedProtoFmt, proto)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Start begins listening for all configured proxy targets
|
||||
func (pm *ProxyManager) Start() error {
|
||||
// Register proxy observables once per process
|
||||
telemetry.SetProxyObservableCallback(func(ctx context.Context, o metric.Observer) error {
|
||||
pm.mutex.RLock()
|
||||
defer pm.mutex.RUnlock()
|
||||
for _, e := range pm.tunnels {
|
||||
// active connections
|
||||
telemetry.ObserveProxyActiveConnsObs(o, e.activeTCP.Load(), e.attrOutTCP.ToSlice())
|
||||
telemetry.ObserveProxyActiveConnsObs(o, e.activeUDP.Load(), e.attrOutUDP.ToSlice())
|
||||
// backlog bytes (sum of unflushed counters)
|
||||
b := int64(e.bytesInTCP.Load() + e.bytesOutTCP.Load() + e.bytesInUDP.Load() + e.bytesOutUDP.Load())
|
||||
telemetry.ObserveProxyAsyncBacklogObs(o, b, e.attrOutTCP.ToSlice())
|
||||
telemetry.ObserveProxyBufferBytesObs(o, b, e.attrOutTCP.ToSlice())
|
||||
}
|
||||
return nil
|
||||
})
|
||||
pm.mutex.Lock()
|
||||
defer pm.mutex.Unlock()
|
||||
|
||||
@@ -160,6 +337,75 @@ func (pm *ProxyManager) Start() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (pm *ProxyManager) SetAsyncBytes(b bool) {
|
||||
pm.mutex.Lock()
|
||||
defer pm.mutex.Unlock()
|
||||
pm.asyncBytes = b
|
||||
if b && pm.flushStop == nil {
|
||||
pm.flushStop = make(chan struct{})
|
||||
go pm.flushLoop()
|
||||
}
|
||||
}
|
||||
func (pm *ProxyManager) flushLoop() {
|
||||
flushInterval := 2 * time.Second
|
||||
if v := os.Getenv("OTEL_METRIC_EXPORT_INTERVAL"); v != "" {
|
||||
if d, err := time.ParseDuration(v); err == nil && d > 0 {
|
||||
if d/2 < flushInterval {
|
||||
flushInterval = d / 2
|
||||
}
|
||||
}
|
||||
}
|
||||
ticker := time.NewTicker(flushInterval)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
pm.mutex.RLock()
|
||||
for _, e := range pm.tunnels {
|
||||
inTCP := e.bytesInTCP.Swap(0)
|
||||
outTCP := e.bytesOutTCP.Swap(0)
|
||||
inUDP := e.bytesInUDP.Swap(0)
|
||||
outUDP := e.bytesOutUDP.Swap(0)
|
||||
if inTCP > 0 {
|
||||
telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP)
|
||||
}
|
||||
if outTCP > 0 {
|
||||
telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP)
|
||||
}
|
||||
if inUDP > 0 {
|
||||
telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP)
|
||||
}
|
||||
if outUDP > 0 {
|
||||
telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP)
|
||||
}
|
||||
}
|
||||
pm.mutex.RUnlock()
|
||||
case <-pm.flushStop:
|
||||
pm.mutex.RLock()
|
||||
for _, e := range pm.tunnels {
|
||||
inTCP := e.bytesInTCP.Swap(0)
|
||||
outTCP := e.bytesOutTCP.Swap(0)
|
||||
inUDP := e.bytesInUDP.Swap(0)
|
||||
outUDP := e.bytesOutUDP.Swap(0)
|
||||
if inTCP > 0 {
|
||||
telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP)
|
||||
}
|
||||
if outTCP > 0 {
|
||||
telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP)
|
||||
}
|
||||
if inUDP > 0 {
|
||||
telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP)
|
||||
}
|
||||
if outUDP > 0 {
|
||||
telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP)
|
||||
}
|
||||
}
|
||||
pm.mutex.RUnlock()
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (pm *ProxyManager) Stop() error {
|
||||
pm.mutex.Lock()
|
||||
defer pm.mutex.Unlock()
|
||||
@@ -227,7 +473,7 @@ func (pm *ProxyManager) startTarget(proto, listenIP string, port int, targetAddr
|
||||
go pm.handleUDPProxy(conn, targetAddr)
|
||||
|
||||
default:
|
||||
return fmt.Errorf("unsupported protocol: %s", proto)
|
||||
return fmt.Errorf(errUnsupportedProtoFmt, proto)
|
||||
}
|
||||
|
||||
logger.Info("Started %s proxy to %s", proto, targetAddr)
|
||||
@@ -236,54 +482,84 @@ func (pm *ProxyManager) startTarget(proto, listenIP string, port int, targetAddr
|
||||
return nil
|
||||
}
|
||||
|
||||
// getEntry returns per-tunnel entry or nil.
|
||||
func (pm *ProxyManager) getEntry(id string) *tunnelEntry {
|
||||
pm.mutex.RLock()
|
||||
e := pm.tunnels[id]
|
||||
pm.mutex.RUnlock()
|
||||
return e
|
||||
}
|
||||
|
||||
func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) {
|
||||
for {
|
||||
conn, err := listener.Accept()
|
||||
if err != nil {
|
||||
// Check if we're shutting down or the listener was closed
|
||||
telemetry.IncProxyAccept(context.Background(), pm.currentTunnelID, "tcp", "failure", classifyProxyError(err))
|
||||
if !pm.running {
|
||||
return
|
||||
}
|
||||
|
||||
// Check for specific network errors that indicate the listener is closed
|
||||
if ne, ok := err.(net.Error); ok && !ne.Temporary() {
|
||||
logger.Info("TCP listener closed, stopping proxy handler for %v", listener.Addr())
|
||||
return
|
||||
}
|
||||
|
||||
logger.Error("Error accepting TCP connection: %v", err)
|
||||
// Don't hammer the CPU if we hit a temporary error
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
continue
|
||||
}
|
||||
|
||||
go func() {
|
||||
tunnelID := pm.currentTunnelID
|
||||
telemetry.IncProxyAccept(context.Background(), tunnelID, "tcp", "success", "")
|
||||
telemetry.IncProxyConnectionEvent(context.Background(), tunnelID, "tcp", telemetry.ProxyConnectionOpened)
|
||||
if tunnelID != "" {
|
||||
state.Global().IncSessions(tunnelID)
|
||||
if e := pm.getEntry(tunnelID); e != nil {
|
||||
e.activeTCP.Add(1)
|
||||
}
|
||||
}
|
||||
|
||||
go func(tunnelID string, accepted net.Conn) {
|
||||
connStart := time.Now()
|
||||
target, err := net.Dial("tcp", targetAddr)
|
||||
if err != nil {
|
||||
logger.Error("Error connecting to target: %v", err)
|
||||
conn.Close()
|
||||
accepted.Close()
|
||||
telemetry.IncProxyAccept(context.Background(), tunnelID, "tcp", "failure", classifyProxyError(err))
|
||||
telemetry.IncProxyConnectionEvent(context.Background(), tunnelID, "tcp", telemetry.ProxyConnectionClosed)
|
||||
telemetry.ObserveProxyConnectionDuration(context.Background(), tunnelID, "tcp", "failure", time.Since(connStart).Seconds())
|
||||
return
|
||||
}
|
||||
|
||||
// Create a WaitGroup to ensure both copy operations complete
|
||||
entry := pm.getEntry(tunnelID)
|
||||
if entry == nil {
|
||||
entry = &tunnelEntry{}
|
||||
}
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(2)
|
||||
|
||||
go func() {
|
||||
go func(ent *tunnelEntry) {
|
||||
defer wg.Done()
|
||||
io.Copy(target, conn)
|
||||
target.Close()
|
||||
}()
|
||||
cw := &countingWriter{ctx: context.Background(), w: target, set: ent.attrInTCP, pm: pm, ent: ent, out: false, proto: "tcp"}
|
||||
_, _ = io.Copy(cw, accepted)
|
||||
_ = target.Close()
|
||||
}(entry)
|
||||
|
||||
go func() {
|
||||
go func(ent *tunnelEntry) {
|
||||
defer wg.Done()
|
||||
io.Copy(conn, target)
|
||||
conn.Close()
|
||||
}()
|
||||
cw := &countingWriter{ctx: context.Background(), w: accepted, set: ent.attrOutTCP, pm: pm, ent: ent, out: true, proto: "tcp"}
|
||||
_, _ = io.Copy(cw, target)
|
||||
_ = accepted.Close()
|
||||
}(entry)
|
||||
|
||||
// Wait for both copies to complete
|
||||
wg.Wait()
|
||||
}()
|
||||
if tunnelID != "" {
|
||||
state.Global().DecSessions(tunnelID)
|
||||
if e := pm.getEntry(tunnelID); e != nil {
|
||||
e.activeTCP.Add(-1)
|
||||
}
|
||||
}
|
||||
telemetry.ObserveProxyConnectionDuration(context.Background(), tunnelID, "tcp", "success", time.Since(connStart).Seconds())
|
||||
telemetry.IncProxyConnectionEvent(context.Background(), tunnelID, "tcp", telemetry.ProxyConnectionClosed)
|
||||
}(tunnelID, conn)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -326,6 +602,18 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
}
|
||||
|
||||
clientKey := remoteAddr.String()
|
||||
// bytes from client -> target (direction=in)
|
||||
if pm.currentTunnelID != "" && n > 0 {
|
||||
if pm.asyncBytes {
|
||||
if e := pm.getEntry(pm.currentTunnelID); e != nil {
|
||||
e.bytesInUDP.Add(uint64(n))
|
||||
}
|
||||
} else {
|
||||
if e := pm.getEntry(pm.currentTunnelID); e != nil {
|
||||
telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrInUDP)
|
||||
}
|
||||
}
|
||||
}
|
||||
clientsMutex.RLock()
|
||||
targetConn, exists := clientConns[clientKey]
|
||||
clientsMutex.RUnlock()
|
||||
@@ -334,28 +622,44 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
targetUDPAddr, err := net.ResolveUDPAddr("udp", targetAddr)
|
||||
if err != nil {
|
||||
logger.Error("Error resolving target address: %v", err)
|
||||
telemetry.IncProxyAccept(context.Background(), pm.currentTunnelID, "udp", "failure", "resolve")
|
||||
continue
|
||||
}
|
||||
|
||||
targetConn, err = net.DialUDP("udp", nil, targetUDPAddr)
|
||||
if err != nil {
|
||||
logger.Error("Error connecting to target: %v", err)
|
||||
telemetry.IncProxyAccept(context.Background(), pm.currentTunnelID, "udp", "failure", classifyProxyError(err))
|
||||
continue
|
||||
}
|
||||
tunnelID := pm.currentTunnelID
|
||||
telemetry.IncProxyAccept(context.Background(), tunnelID, "udp", "success", "")
|
||||
telemetry.IncProxyConnectionEvent(context.Background(), tunnelID, "udp", telemetry.ProxyConnectionOpened)
|
||||
// Only increment activeUDP after a successful DialUDP
|
||||
if e := pm.getEntry(tunnelID); e != nil {
|
||||
e.activeUDP.Add(1)
|
||||
}
|
||||
|
||||
clientsMutex.Lock()
|
||||
clientConns[clientKey] = targetConn
|
||||
clientsMutex.Unlock()
|
||||
|
||||
go func(clientKey string, targetConn *net.UDPConn, remoteAddr net.Addr) {
|
||||
go func(clientKey string, targetConn *net.UDPConn, remoteAddr net.Addr, tunnelID string) {
|
||||
start := time.Now()
|
||||
result := "success"
|
||||
defer func() {
|
||||
// Always clean up when this goroutine exits
|
||||
clientsMutex.Lock()
|
||||
if storedConn, exists := clientConns[clientKey]; exists && storedConn == targetConn {
|
||||
delete(clientConns, clientKey)
|
||||
targetConn.Close()
|
||||
if e := pm.getEntry(tunnelID); e != nil {
|
||||
e.activeUDP.Add(-1)
|
||||
}
|
||||
}
|
||||
clientsMutex.Unlock()
|
||||
telemetry.ObserveProxyConnectionDuration(context.Background(), tunnelID, "udp", result, time.Since(start).Seconds())
|
||||
telemetry.IncProxyConnectionEvent(context.Background(), tunnelID, "udp", telemetry.ProxyConnectionClosed)
|
||||
}()
|
||||
|
||||
buffer := make([]byte, 65507)
|
||||
@@ -363,25 +667,52 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
n, _, err := targetConn.ReadFromUDP(buffer)
|
||||
if err != nil {
|
||||
logger.Error("Error reading from target: %v", err)
|
||||
result = "failure"
|
||||
return // defer will handle cleanup
|
||||
}
|
||||
|
||||
// bytes from target -> client (direction=out)
|
||||
if pm.currentTunnelID != "" && n > 0 {
|
||||
if pm.asyncBytes {
|
||||
if e := pm.getEntry(pm.currentTunnelID); e != nil {
|
||||
e.bytesOutUDP.Add(uint64(n))
|
||||
}
|
||||
} else {
|
||||
if e := pm.getEntry(pm.currentTunnelID); e != nil {
|
||||
telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrOutUDP)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
_, err = conn.WriteTo(buffer[:n], remoteAddr)
|
||||
if err != nil {
|
||||
logger.Error("Error writing to client: %v", err)
|
||||
telemetry.IncProxyDrops(context.Background(), pm.currentTunnelID, "udp")
|
||||
result = "failure"
|
||||
return // defer will handle cleanup
|
||||
}
|
||||
}
|
||||
}(clientKey, targetConn, remoteAddr)
|
||||
}(clientKey, targetConn, remoteAddr, tunnelID)
|
||||
}
|
||||
|
||||
_, err = targetConn.Write(buffer[:n])
|
||||
written, err := targetConn.Write(buffer[:n])
|
||||
if err != nil {
|
||||
logger.Error("Error writing to target: %v", err)
|
||||
telemetry.IncProxyDrops(context.Background(), pm.currentTunnelID, "udp")
|
||||
targetConn.Close()
|
||||
clientsMutex.Lock()
|
||||
delete(clientConns, clientKey)
|
||||
clientsMutex.Unlock()
|
||||
} else if pm.currentTunnelID != "" && written > 0 {
|
||||
if pm.asyncBytes {
|
||||
if e := pm.getEntry(pm.currentTunnelID); e != nil {
|
||||
e.bytesInUDP.Add(uint64(written))
|
||||
}
|
||||
} else {
|
||||
if e := pm.getEntry(pm.currentTunnelID); e != nil {
|
||||
telemetry.AddTunnelBytesSet(context.Background(), int64(written), e.attrInUDP)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
55
scripts/smoke-metrics.sh
Normal file
55
scripts/smoke-metrics.sh
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
NEWTHOST=${NEWTHOST:-localhost}
|
||||
NEWTPORT=${NEWTPORT:-2112}
|
||||
METRICS_URL="http://${NEWTHOST}:${NEWTPORT}/metrics"
|
||||
|
||||
probe() {
|
||||
local name=$1
|
||||
local pattern=$2
|
||||
echo "[probe] ${name}"
|
||||
curl -sf "${METRICS_URL}" | grep -E "${pattern}" || {
|
||||
echo "[warn] ${name} not found"
|
||||
return 1
|
||||
}
|
||||
}
|
||||
|
||||
# Basic presence
|
||||
probe "newt_* presence" "^newt_" || true
|
||||
|
||||
# Site gauges with site_id
|
||||
probe "site_online with site_id" "^newt_site_online\{.*site_id=\"[^\"]+\"" || true
|
||||
probe "last_heartbeat with site_id" "^newt_site_last_heartbeat_timestamp_seconds\{.*site_id=\"[^\"]+\"" || true
|
||||
|
||||
# Bytes with direction ingress/egress and protocol
|
||||
probe "tunnel bytes ingress" "^newt_tunnel_bytes_total\{.*direction=\"ingress\".*protocol=\"(tcp|udp)\"" || true
|
||||
probe "tunnel bytes egress" "^newt_tunnel_bytes_total\{.*direction=\"egress\".*protocol=\"(tcp|udp)\"" || true
|
||||
|
||||
# Optional: verify absence/presence of tunnel_id based on EXPECT_TUNNEL_ID (default true)
|
||||
EXPECT_TUNNEL_ID=${EXPECT_TUNNEL_ID:-true}
|
||||
if [ "$EXPECT_TUNNEL_ID" = "false" ]; then
|
||||
echo "[probe] ensure tunnel_id label is absent when NEWT_METRICS_INCLUDE_TUNNEL_ID=false"
|
||||
! curl -sf "${METRICS_URL}" | grep -q "tunnel_id=\"" || { echo "[fail] tunnel_id present but EXPECT_TUNNEL_ID=false"; exit 1; }
|
||||
else
|
||||
echo "[probe] ensure tunnel_id label is present (default)"
|
||||
curl -sf "${METRICS_URL}" | grep -q "tunnel_id=\"" || { echo "[warn] tunnel_id not found (may be expected if no tunnel is active)"; }
|
||||
fi
|
||||
|
||||
# WebSocket metrics (when OTLP/WS used)
|
||||
probe "websocket connect latency buckets" "^newt_websocket_connect_latency_seconds_bucket" || true
|
||||
probe "websocket messages total" "^newt_websocket_messages_total\{.*(direction|msg_type)=" || true
|
||||
probe "websocket connected gauge" "^newt_websocket_connected" || true
|
||||
probe "websocket reconnects total" "^newt_websocket_reconnects_total\{" || true
|
||||
|
||||
# Proxy metrics (when proxy active)
|
||||
probe "proxy active connections" "^newt_proxy_active_connections\{" || true
|
||||
probe "proxy buffer bytes" "^newt_proxy_buffer_bytes\{" || true
|
||||
probe "proxy drops total" "^newt_proxy_drops_total\{" || true
|
||||
probe "proxy connections total" "^newt_proxy_connections_total\{" || true
|
||||
|
||||
# Config apply
|
||||
probe "config apply seconds buckets" "^newt_config_apply_seconds_bucket\{" || true
|
||||
|
||||
echo "Smoke checks completed (warnings above are acceptable if the feature isn't exercised yet)."
|
||||
|
||||
12
stub.go
12
stub.go
@@ -8,25 +8,27 @@ import (
|
||||
)
|
||||
|
||||
func setupClientsNative(client *websocket.Client, host string) {
|
||||
return // This function is not implemented for non-Linux systems.
|
||||
_ = client
|
||||
_ = host
|
||||
// No-op for non-Linux systems
|
||||
}
|
||||
|
||||
func closeWgServiceNative() {
|
||||
// No-op for non-Linux systems
|
||||
return
|
||||
}
|
||||
|
||||
func clientsOnConnectNative() {
|
||||
// No-op for non-Linux systems
|
||||
return
|
||||
}
|
||||
|
||||
func clientsHandleNewtConnectionNative(publicKey, endpoint string) {
|
||||
_ = publicKey
|
||||
_ = endpoint
|
||||
// No-op for non-Linux systems
|
||||
return
|
||||
}
|
||||
|
||||
func clientsAddProxyTargetNative(pm *proxy.ProxyManager, tunnelIp string) {
|
||||
_ = pm
|
||||
_ = tunnelIp
|
||||
// No-op for non-Linux systems
|
||||
return
|
||||
}
|
||||
|
||||
28
util.go
28
util.go
@@ -2,6 +2,7 @@ package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
@@ -14,6 +15,7 @@ import (
|
||||
|
||||
"math/rand"
|
||||
|
||||
"github.com/fosrl/newt/internal/telemetry"
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/fosrl/newt/proxy"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
@@ -24,6 +26,8 @@ import (
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
const msgHealthFileWriteFailed = "Failed to write health file: %v"
|
||||
|
||||
func fixKey(key string) string {
|
||||
// Remove any whitespace
|
||||
key = strings.TrimSpace(key)
|
||||
@@ -176,7 +180,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
|
||||
if healthFile != "" {
|
||||
err := os.WriteFile(healthFile, []byte("ok"), 0644)
|
||||
if err != nil {
|
||||
logger.Warn("Failed to write health file: %v", err)
|
||||
logger.Warn(msgHealthFileWriteFailed, err)
|
||||
}
|
||||
}
|
||||
return stopChan, nil
|
||||
@@ -217,11 +221,13 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
|
||||
if healthFile != "" {
|
||||
err := os.WriteFile(healthFile, []byte("ok"), 0644)
|
||||
if err != nil {
|
||||
logger.Warn("Failed to write health file: %v", err)
|
||||
logger.Warn(msgHealthFileWriteFailed, err)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
case <-pingStopChan:
|
||||
// Stop the goroutine when signaled
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
@@ -230,7 +236,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
|
||||
return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background")
|
||||
}
|
||||
|
||||
func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client) chan struct{} {
|
||||
func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
|
||||
maxInterval := 6 * time.Second
|
||||
currentInterval := pingInterval
|
||||
consecutiveFailures := 0
|
||||
@@ -293,6 +299,9 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
|
||||
if !connectionLost {
|
||||
connectionLost = true
|
||||
logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
|
||||
if tunnelID != "" {
|
||||
telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout)
|
||||
}
|
||||
stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second)
|
||||
// Send registration message to the server for backward compatibility
|
||||
err := client.SendMessage("newt/wg/register", map[string]interface{}{
|
||||
@@ -319,6 +328,10 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
|
||||
} else {
|
||||
// Track recent latencies
|
||||
recentLatencies = append(recentLatencies, latency)
|
||||
// Record tunnel latency (limit sampling to this periodic check)
|
||||
if tunnelID != "" {
|
||||
telemetry.ObserveTunnelLatency(context.Background(), tunnelID, "wireguard", latency.Seconds())
|
||||
}
|
||||
if len(recentLatencies) > 10 {
|
||||
recentLatencies = recentLatencies[1:]
|
||||
}
|
||||
@@ -468,7 +481,8 @@ func updateTargets(pm *proxy.ProxyManager, action string, tunnelIP string, proto
|
||||
continue
|
||||
}
|
||||
|
||||
if action == "add" {
|
||||
switch action {
|
||||
case "add":
|
||||
target := parts[1] + ":" + parts[2]
|
||||
|
||||
// Call updown script if provided
|
||||
@@ -494,7 +508,7 @@ func updateTargets(pm *proxy.ProxyManager, action string, tunnelIP string, proto
|
||||
// Add the new target
|
||||
pm.AddTarget(proto, tunnelIP, port, processedTarget)
|
||||
|
||||
} else if action == "remove" {
|
||||
case "remove":
|
||||
logger.Info("Removing target with port %d", port)
|
||||
|
||||
target := parts[1] + ":" + parts[2]
|
||||
@@ -512,6 +526,8 @@ func updateTargets(pm *proxy.ProxyManager, action string, tunnelIP string, proto
|
||||
logger.Error("Failed to remove target: %v", err)
|
||||
return err
|
||||
}
|
||||
default:
|
||||
logger.Info("Unknown action: %s", action)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
@@ -18,6 +19,11 @@ import (
|
||||
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/gorilla/websocket"
|
||||
|
||||
"context"
|
||||
|
||||
"github.com/fosrl/newt/internal/telemetry"
|
||||
"go.opentelemetry.io/otel"
|
||||
)
|
||||
|
||||
type Client struct {
|
||||
@@ -37,6 +43,8 @@ type Client struct {
|
||||
writeMux sync.Mutex
|
||||
clientType string // Type of client (e.g., "newt", "olm")
|
||||
tlsConfig TLSConfig
|
||||
metricsCtxMu sync.RWMutex
|
||||
metricsCtx context.Context
|
||||
configNeedsSave bool // Flag to track if config needs to be saved
|
||||
}
|
||||
|
||||
@@ -81,6 +89,26 @@ func (c *Client) OnTokenUpdate(callback func(token string)) {
|
||||
c.onTokenUpdate = callback
|
||||
}
|
||||
|
||||
func (c *Client) metricsContext() context.Context {
|
||||
c.metricsCtxMu.RLock()
|
||||
defer c.metricsCtxMu.RUnlock()
|
||||
if c.metricsCtx != nil {
|
||||
return c.metricsCtx
|
||||
}
|
||||
return context.Background()
|
||||
}
|
||||
|
||||
func (c *Client) setMetricsContext(ctx context.Context) {
|
||||
c.metricsCtxMu.Lock()
|
||||
c.metricsCtx = ctx
|
||||
c.metricsCtxMu.Unlock()
|
||||
}
|
||||
|
||||
// MetricsContext exposes the context used for telemetry emission when a connection is active.
|
||||
func (c *Client) MetricsContext() context.Context {
|
||||
return c.metricsContext()
|
||||
}
|
||||
|
||||
// NewClient creates a new websocket client
|
||||
func NewClient(clientType string, ID, secret string, endpoint string, pingInterval time.Duration, pingTimeout time.Duration, opts ...ClientOption) (*Client, error) {
|
||||
config := &Config{
|
||||
@@ -140,6 +168,7 @@ func (c *Client) Close() error {
|
||||
|
||||
// Set connection status to false
|
||||
c.setConnected(false)
|
||||
telemetry.SetWSConnectionState(false)
|
||||
|
||||
// Close the WebSocket connection gracefully
|
||||
if c.conn != nil {
|
||||
@@ -170,7 +199,11 @@ func (c *Client) SendMessage(messageType string, data interface{}) error {
|
||||
|
||||
c.writeMux.Lock()
|
||||
defer c.writeMux.Unlock()
|
||||
return c.conn.WriteJSON(msg)
|
||||
if err := c.conn.WriteJSON(msg); err != nil {
|
||||
return err
|
||||
}
|
||||
telemetry.IncWSMessage(c.metricsContext(), "out", "text")
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Client) SendMessageInterval(messageType string, data interface{}, interval time.Duration) (stop func()) {
|
||||
@@ -265,8 +298,12 @@ func (c *Client) getToken() (string, error) {
|
||||
return "", fmt.Errorf("failed to marshal token request data: %w", err)
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// Create a new request
|
||||
req, err := http.NewRequest(
|
||||
req, err := http.NewRequestWithContext(
|
||||
ctx,
|
||||
"POST",
|
||||
baseEndpoint+"/api/v1/auth/"+c.clientType+"/get-token",
|
||||
bytes.NewBuffer(jsonData),
|
||||
@@ -288,6 +325,8 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
telemetry.IncConnAttempt(ctx, "auth", "failure")
|
||||
telemetry.IncConnError(ctx, "auth", classifyConnError(err))
|
||||
return "", fmt.Errorf("failed to request new token: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
@@ -295,6 +334,16 @@ func (c *Client) getToken() (string, error) {
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
telemetry.IncConnAttempt(ctx, "auth", "failure")
|
||||
etype := "io_error"
|
||||
if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
|
||||
etype = "auth_failed"
|
||||
}
|
||||
telemetry.IncConnError(ctx, "auth", etype)
|
||||
// Reconnect reason mapping for auth failures
|
||||
if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
|
||||
telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonAuthError)
|
||||
}
|
||||
return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
@@ -313,10 +362,55 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
|
||||
logger.Debug("Received token: %s", tokenResp.Data.Token)
|
||||
telemetry.IncConnAttempt(ctx, "auth", "success")
|
||||
|
||||
return tokenResp.Data.Token, nil
|
||||
}
|
||||
|
||||
// classifyConnError maps to fixed, low-cardinality error_type values.
|
||||
// Allowed enum: dial_timeout, tls_handshake, auth_failed, io_error
|
||||
func classifyConnError(err error) string {
|
||||
if err == nil {
|
||||
return ""
|
||||
}
|
||||
msg := strings.ToLower(err.Error())
|
||||
switch {
|
||||
case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"):
|
||||
return "tls_handshake"
|
||||
case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout") || strings.Contains(msg, "deadline exceeded"):
|
||||
return "dial_timeout"
|
||||
case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"):
|
||||
return "auth_failed"
|
||||
default:
|
||||
// Group remaining network/socket errors as io_error to avoid label explosion
|
||||
return "io_error"
|
||||
}
|
||||
}
|
||||
|
||||
func classifyWSDisconnect(err error) (result, reason string) {
|
||||
if err == nil {
|
||||
return "success", "normal"
|
||||
}
|
||||
if websocket.IsCloseError(err, websocket.CloseNormalClosure) {
|
||||
return "success", "normal"
|
||||
}
|
||||
if ne, ok := err.(net.Error); ok && ne.Timeout() {
|
||||
return "error", "timeout"
|
||||
}
|
||||
if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) {
|
||||
return "error", "unexpected_close"
|
||||
}
|
||||
msg := strings.ToLower(err.Error())
|
||||
switch {
|
||||
case strings.Contains(msg, "eof"):
|
||||
return "error", "eof"
|
||||
case strings.Contains(msg, "reset"):
|
||||
return "error", "connection_reset"
|
||||
default:
|
||||
return "error", "read_error"
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Client) connectWithRetry() {
|
||||
for {
|
||||
select {
|
||||
@@ -335,9 +429,13 @@ func (c *Client) connectWithRetry() {
|
||||
}
|
||||
|
||||
func (c *Client) establishConnection() error {
|
||||
ctx := context.Background()
|
||||
|
||||
// Get token for authentication
|
||||
token, err := c.getToken()
|
||||
if err != nil {
|
||||
telemetry.IncConnAttempt(ctx, "websocket", "failure")
|
||||
telemetry.IncConnError(ctx, "websocket", classifyConnError(err))
|
||||
return fmt.Errorf("failed to get token: %w", err)
|
||||
}
|
||||
|
||||
@@ -370,7 +468,12 @@ func (c *Client) establishConnection() error {
|
||||
q.Set("clientType", c.clientType)
|
||||
u.RawQuery = q.Encode()
|
||||
|
||||
// Connect to WebSocket
|
||||
// Connect to WebSocket (optional span)
|
||||
tr := otel.Tracer("newt")
|
||||
ctx, span := tr.Start(ctx, "ws.connect")
|
||||
defer span.End()
|
||||
|
||||
start := time.Now()
|
||||
dialer := websocket.DefaultDialer
|
||||
|
||||
// Use new TLS configuration method
|
||||
@@ -392,18 +495,42 @@ func (c *Client) establishConnection() error {
|
||||
logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable")
|
||||
}
|
||||
|
||||
conn, _, err := dialer.Dial(u.String(), nil)
|
||||
conn, _, err := dialer.DialContext(ctx, u.String(), nil)
|
||||
lat := time.Since(start).Seconds()
|
||||
if err != nil {
|
||||
telemetry.IncConnAttempt(ctx, "websocket", "failure")
|
||||
etype := classifyConnError(err)
|
||||
telemetry.IncConnError(ctx, "websocket", etype)
|
||||
telemetry.ObserveWSConnectLatency(ctx, lat, "failure", etype)
|
||||
// Map handshake-related errors to reconnect reasons where appropriate
|
||||
if etype == "tls_handshake" {
|
||||
telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonHandshakeError)
|
||||
} else if etype == "dial_timeout" {
|
||||
telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonTimeout)
|
||||
} else {
|
||||
telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonError)
|
||||
}
|
||||
telemetry.IncWSReconnect(ctx, etype)
|
||||
return fmt.Errorf("failed to connect to WebSocket: %w", err)
|
||||
}
|
||||
|
||||
telemetry.IncConnAttempt(ctx, "websocket", "success")
|
||||
telemetry.ObserveWSConnectLatency(ctx, lat, "success", "")
|
||||
c.conn = conn
|
||||
c.setConnected(true)
|
||||
telemetry.SetWSConnectionState(true)
|
||||
c.setMetricsContext(ctx)
|
||||
sessionStart := time.Now()
|
||||
// Wire up pong handler for metrics
|
||||
c.conn.SetPongHandler(func(appData string) error {
|
||||
telemetry.IncWSMessage(c.metricsContext(), "in", "pong")
|
||||
return nil
|
||||
})
|
||||
|
||||
// Start the ping monitor
|
||||
go c.pingMonitor()
|
||||
// Start the read pump with disconnect detection
|
||||
go c.readPumpWithDisconnectDetection()
|
||||
go c.readPumpWithDisconnectDetection(sessionStart)
|
||||
|
||||
if c.onConnect != nil {
|
||||
err := c.saveConfig()
|
||||
@@ -496,6 +623,9 @@ func (c *Client) pingMonitor() {
|
||||
}
|
||||
c.writeMux.Lock()
|
||||
err := c.conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(c.pingTimeout))
|
||||
if err == nil {
|
||||
telemetry.IncWSMessage(c.metricsContext(), "out", "ping")
|
||||
}
|
||||
c.writeMux.Unlock()
|
||||
if err != nil {
|
||||
// Check if we're shutting down before logging error and reconnecting
|
||||
@@ -505,6 +635,8 @@ func (c *Client) pingMonitor() {
|
||||
return
|
||||
default:
|
||||
logger.Error("Ping failed: %v", err)
|
||||
telemetry.IncWSKeepaliveFailure(c.metricsContext(), "ping_write")
|
||||
telemetry.IncWSReconnect(c.metricsContext(), "ping_write")
|
||||
c.reconnect()
|
||||
return
|
||||
}
|
||||
@@ -514,17 +646,26 @@ func (c *Client) pingMonitor() {
|
||||
}
|
||||
|
||||
// readPumpWithDisconnectDetection reads messages and triggers reconnect on error
|
||||
func (c *Client) readPumpWithDisconnectDetection() {
|
||||
func (c *Client) readPumpWithDisconnectDetection(started time.Time) {
|
||||
ctx := c.metricsContext()
|
||||
disconnectReason := "shutdown"
|
||||
disconnectResult := "success"
|
||||
|
||||
defer func() {
|
||||
if c.conn != nil {
|
||||
c.conn.Close()
|
||||
}
|
||||
if !started.IsZero() {
|
||||
telemetry.ObserveWSSessionDuration(ctx, time.Since(started).Seconds(), disconnectResult)
|
||||
}
|
||||
telemetry.IncWSDisconnect(ctx, disconnectReason, disconnectResult)
|
||||
// Only attempt reconnect if we're not shutting down
|
||||
select {
|
||||
case <-c.done:
|
||||
// Shutting down, don't reconnect
|
||||
return
|
||||
default:
|
||||
telemetry.IncWSReconnect(ctx, disconnectReason)
|
||||
c.reconnect()
|
||||
}
|
||||
}()
|
||||
@@ -532,23 +673,33 @@ func (c *Client) readPumpWithDisconnectDetection() {
|
||||
for {
|
||||
select {
|
||||
case <-c.done:
|
||||
disconnectReason = "shutdown"
|
||||
disconnectResult = "success"
|
||||
return
|
||||
default:
|
||||
var msg WSMessage
|
||||
err := c.conn.ReadJSON(&msg)
|
||||
if err == nil {
|
||||
telemetry.IncWSMessage(c.metricsContext(), "in", "text")
|
||||
}
|
||||
if err != nil {
|
||||
// Check if we're shutting down before logging error
|
||||
select {
|
||||
case <-c.done:
|
||||
// Expected during shutdown, don't log as error
|
||||
logger.Debug("WebSocket connection closed during shutdown")
|
||||
disconnectReason = "shutdown"
|
||||
disconnectResult = "success"
|
||||
return
|
||||
default:
|
||||
// Unexpected error during normal operation
|
||||
if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure, websocket.CloseNormalClosure) {
|
||||
logger.Error("WebSocket read error: %v", err)
|
||||
} else {
|
||||
logger.Debug("WebSocket connection closed: %v", err)
|
||||
disconnectResult, disconnectReason = classifyWSDisconnect(err)
|
||||
if disconnectResult == "error" {
|
||||
if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure, websocket.CloseNormalClosure) {
|
||||
logger.Error("WebSocket read error: %v", err)
|
||||
} else {
|
||||
logger.Debug("WebSocket connection closed: %v", err)
|
||||
}
|
||||
}
|
||||
return // triggers reconnect via defer
|
||||
}
|
||||
@@ -565,6 +716,7 @@ func (c *Client) readPumpWithDisconnectDetection() {
|
||||
|
||||
func (c *Client) reconnect() {
|
||||
c.setConnected(false)
|
||||
telemetry.SetWSConnectionState(false)
|
||||
if c.conn != nil {
|
||||
c.conn.Close()
|
||||
c.conn = nil
|
||||
|
||||
41
wg/wg.go
41
wg/wg.go
@@ -3,6 +3,7 @@
|
||||
package wg
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
@@ -13,16 +14,19 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"math/rand"
|
||||
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/fosrl/newt/network"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
"github.com/vishvananda/netlink"
|
||||
"golang.org/x/crypto/chacha20poly1305"
|
||||
"golang.org/x/crypto/curve25519"
|
||||
"golang.org/x/exp/rand"
|
||||
"golang.zx2c4.com/wireguard/conn"
|
||||
"golang.zx2c4.com/wireguard/wgctrl"
|
||||
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
|
||||
|
||||
"github.com/fosrl/newt/internal/telemetry"
|
||||
)
|
||||
|
||||
type WgConfig struct {
|
||||
@@ -106,7 +110,7 @@ func FindAvailableUDPPort(minPort, maxPort uint16) (uint16, error) {
|
||||
}
|
||||
|
||||
// Fisher-Yates shuffle to randomize the port order
|
||||
rand.Seed(uint64(time.Now().UnixNano()))
|
||||
rand.Seed(time.Now().UnixNano())
|
||||
for i := len(portRange) - 1; i > 0; i-- {
|
||||
j := rand.Intn(i + 1)
|
||||
portRange[i], portRange[j] = portRange[j], portRange[i]
|
||||
@@ -280,6 +284,15 @@ func (s *WireGuardService) LoadRemoteConfig() error {
|
||||
}
|
||||
|
||||
func (s *WireGuardService) handleConfig(msg websocket.WSMessage) {
|
||||
ctx := context.Background()
|
||||
if s.client != nil {
|
||||
ctx = s.client.MetricsContext()
|
||||
}
|
||||
result := "success"
|
||||
defer func() {
|
||||
telemetry.IncConfigReload(ctx, result)
|
||||
}()
|
||||
|
||||
var config WgConfig
|
||||
|
||||
logger.Debug("Received message: %v", msg)
|
||||
@@ -288,11 +301,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) {
|
||||
jsonData, err := json.Marshal(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error marshaling data: %v", err)
|
||||
result = "failure"
|
||||
return
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(jsonData, &config); err != nil {
|
||||
logger.Info("Error unmarshaling target data: %v", err)
|
||||
result = "failure"
|
||||
return
|
||||
}
|
||||
s.config = config
|
||||
@@ -302,13 +317,29 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) {
|
||||
s.stopGetConfig = nil
|
||||
}
|
||||
|
||||
// Ensure the WireGuard interface and peers are configured
|
||||
if err := s.ensureWireguardInterface(config); err != nil {
|
||||
logger.Error("Failed to ensure WireGuard interface: %v", err)
|
||||
// telemetry: config reload success
|
||||
// Optional reconnect reason mapping: config change
|
||||
if s.serverPubKey != "" {
|
||||
telemetry.IncReconnect(ctx, s.serverPubKey, "client", telemetry.ReasonConfigChange)
|
||||
}
|
||||
|
||||
// Ensure the WireGuard interface and peers are configured
|
||||
start := time.Now()
|
||||
if err := s.ensureWireguardInterface(config); err != nil {
|
||||
logger.Error("Failed to ensure WireGuard interface: %v", err)
|
||||
telemetry.ObserveConfigApply(ctx, "interface", "failure", time.Since(start).Seconds())
|
||||
result = "failure"
|
||||
} else {
|
||||
telemetry.ObserveConfigApply(ctx, "interface", "success", time.Since(start).Seconds())
|
||||
}
|
||||
|
||||
startPeers := time.Now()
|
||||
if err := s.ensureWireguardPeers(config.Peers); err != nil {
|
||||
logger.Error("Failed to ensure WireGuard peers: %v", err)
|
||||
telemetry.ObserveConfigApply(ctx, "peer", "failure", time.Since(startPeers).Seconds())
|
||||
result = "failure"
|
||||
} else {
|
||||
telemetry.ObserveConfigApply(ctx, "peer", "success", time.Since(startPeers).Seconds())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package wgnetstack
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
@@ -26,6 +27,8 @@ import (
|
||||
"golang.zx2c4.com/wireguard/tun"
|
||||
"golang.zx2c4.com/wireguard/tun/netstack"
|
||||
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
|
||||
|
||||
"github.com/fosrl/newt/internal/telemetry"
|
||||
)
|
||||
|
||||
type WgConfig struct {
|
||||
@@ -242,14 +245,20 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str
|
||||
return service, nil
|
||||
}
|
||||
|
||||
// ReportRTT allows reporting native RTTs to telemetry, rate-limited externally.
|
||||
func (s *WireGuardService) ReportRTT(seconds float64) {
|
||||
if s.serverPubKey == "" { return }
|
||||
telemetry.ObserveTunnelLatency(context.Background(), s.serverPubKey, "wireguard", seconds)
|
||||
}
|
||||
|
||||
func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) {
|
||||
logger.Debug("Received: %+v", msg)
|
||||
|
||||
// if there is no wgData or pm, we can't add targets
|
||||
if s.TunnelIP == "" || s.proxyManager == nil {
|
||||
logger.Info("No tunnel IP or proxy manager available")
|
||||
return
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
targetData, err := parseTargetData(msg.Data)
|
||||
if err != nil {
|
||||
|
||||
@@ -126,7 +126,7 @@ func (s *Server) Stop() {
|
||||
s.conn.Close()
|
||||
}
|
||||
s.isRunning = false
|
||||
logger.Info(s.outputPrefix + "Server stopped")
|
||||
logger.Info("%sServer stopped", s.outputPrefix)
|
||||
}
|
||||
|
||||
// RestartWithNetstack stops the current server and restarts it with netstack
|
||||
@@ -161,7 +161,7 @@ func (s *Server) handleConnections() {
|
||||
// Set read deadline to avoid blocking forever
|
||||
err := s.conn.SetReadDeadline(time.Now().Add(1 * time.Second))
|
||||
if err != nil {
|
||||
logger.Error(s.outputPrefix+"Error setting read deadline: %v", err)
|
||||
logger.Error("%sError setting read deadline: %v", s.outputPrefix, err)
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -187,7 +187,7 @@ func (s *Server) handleConnections() {
|
||||
case <-s.shutdownCh:
|
||||
return // Don't log error if we're shutting down
|
||||
default:
|
||||
logger.Error(s.outputPrefix+"Error reading from UDP: %v", err)
|
||||
logger.Error("%sError reading from UDP: %v", s.outputPrefix, err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
@@ -219,7 +219,7 @@ func (s *Server) handleConnections() {
|
||||
copy(responsePacket[5:13], buffer[5:13])
|
||||
|
||||
// Log response being sent for debugging
|
||||
logger.Debug(s.outputPrefix+"Sending response to %s", addr.String())
|
||||
logger.Debug("%sSending response to %s", s.outputPrefix, addr.String())
|
||||
|
||||
// Send the response packet - handle both regular UDP and netstack UDP
|
||||
if s.useNetstack {
|
||||
@@ -233,9 +233,9 @@ func (s *Server) handleConnections() {
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
logger.Error(s.outputPrefix+"Error sending response: %v", err)
|
||||
logger.Error("%sError sending response: %v", s.outputPrefix, err)
|
||||
} else {
|
||||
logger.Debug(s.outputPrefix + "Response sent successfully")
|
||||
logger.Debug("%sResponse sent successfully", s.outputPrefix)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user