Track SSH connections and add endpoint

This commit is contained in:
Sebastiaan de Schaetzen 2025-03-14 13:58:30 +01:00
parent ff7b0a3a70
commit ad86ad077a

161
main.go
View File

@ -1,15 +1,22 @@
package main package main
import ( import (
"encoding/json"
"fmt"
"log" "log"
"net/http"
"os" "os"
"os/exec" "os/exec"
"strconv"
"strings"
"sync"
"time" "time"
"github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/shirou/gopsutil/v3/cpu" "github.com/shirou/gopsutil/v3/cpu"
"github.com/shirou/gopsutil/v3/disk" "github.com/shirou/gopsutil/v3/disk"
"github.com/shirou/gopsutil/v3/net" "github.com/shirou/gopsutil/v3/net"
"github.com/shirou/gopsutil/v3/process"
) )
const ( const (
@ -19,6 +26,7 @@ const (
gpuThreshold = 20.0 // percentage gpuThreshold = 20.0 // percentage
diskThreshold = 5 * 1024 * 1024 // 5 MB/s diskThreshold = 5 * 1024 * 1024 // 5 MB/s
networkThreshold = 1 * 1024 * 1024 // 1 MB/s networkThreshold = 1 * 1024 * 1024 // 1 MB/s
httpPort = 8080
) )
type ResourceUsage struct { type ResourceUsage struct {
@ -27,8 +35,20 @@ type ResourceUsage struct {
gpuUsage float64 gpuUsage float64
diskIO uint64 diskIO uint64
networkIO uint64 networkIO uint64
sshConnections int
activeUsers int
} }
type SystemStatus struct {
CurrentUsage ResourceUsage `json:"current_usage"`
Blockers []string `json:"sleep_blockers"`
}
var (
currentStatus SystemStatus
statusMutex sync.RWMutex
)
func main() { func main() {
// Check if running as root // Check if running as root
if os.Geteuid() != 0 { if os.Geteuid() != 0 {
@ -42,6 +62,9 @@ func main() {
} }
defer nvml.Shutdown() defer nvml.Shutdown()
// Start HTTP server
go startHTTPServer()
usageHistory := make([]ResourceUsage, 0) usageHistory := make([]ResourceUsage, 0)
ticker := time.NewTicker(checkInterval) ticker := time.NewTicker(checkInterval)
defer ticker.Stop() defer ticker.Stop()
@ -51,12 +74,18 @@ func main() {
log.Printf("- Average GPU usage across all GPUs < %.1f%%\n", gpuThreshold) log.Printf("- Average GPU usage across all GPUs < %.1f%%\n", gpuThreshold)
log.Printf("- Disk I/O < %.1f MB/s\n", float64(diskThreshold)/(1024*1024)) log.Printf("- Disk I/O < %.1f MB/s\n", float64(diskThreshold)/(1024*1024))
log.Printf("- Network I/O < %.1f MB/s\n", float64(networkThreshold)/(1024*1024)) log.Printf("- Network I/O < %.1f MB/s\n", float64(networkThreshold)/(1024*1024))
log.Printf("- No active SSH connections\n")
log.Printf("- No active user sessions\n")
log.Printf("Over the last %v\n", monitoringPeriod) log.Printf("Over the last %v\n", monitoringPeriod)
log.Printf("HTTP status endpoint available at http://localhost:%d/status\n", httpPort)
for range ticker.C { for range ticker.C {
usage := getCurrentUsage() usage := getCurrentUsage()
usageHistory = append(usageHistory, usage) usageHistory = append(usageHistory, usage)
// Update current status
updateSystemStatus(usage, usageHistory)
// Remove entries older than monitoring period // Remove entries older than monitoring period
cutoff := time.Now().Add(-monitoringPeriod) cutoff := time.Now().Add(-monitoringPeriod)
for i, u := range usageHistory { for i, u := range usageHistory {
@ -67,6 +96,11 @@ func main() {
} }
if len(usageHistory) > 0 && isSystemIdle(usageHistory) { if len(usageHistory) > 0 && isSystemIdle(usageHistory) {
log.Printf("System status before suspend:\n")
log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].cpuUsage)
log.Printf("- GPU: %.1f%%\n", usageHistory[len(usageHistory)-1].gpuUsage)
log.Printf("- SSH connections: %d\n", usageHistory[len(usageHistory)-1].sshConnections)
log.Printf("- Active users: %d\n", usageHistory[len(usageHistory)-1].activeUsers)
log.Println("System has been idle for the monitoring period. Suspending...") log.Println("System has been idle for the monitoring period. Suspending...")
if err := suspendSystem(); err != nil { if err := suspendSystem(); err != nil {
log.Printf("Failed to suspend system: %v", err) log.Printf("Failed to suspend system: %v", err)
@ -75,6 +109,68 @@ func main() {
} }
} }
func startHTTPServer() {
http.HandleFunc("/status", handleStatus)
if err := http.ListenAndServe(fmt.Sprintf(":%d", httpPort), nil); err != nil {
log.Printf("Failed to start HTTP server: %v", err)
}
}
func handleStatus(w http.ResponseWriter, r *http.Request) {
statusMutex.RLock()
defer statusMutex.RUnlock()
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(currentStatus)
}
func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
statusMutex.Lock()
defer statusMutex.Unlock()
currentStatus.CurrentUsage = current
currentStatus.Blockers = []string{}
if len(history) >= 2 {
// Calculate rates using last two samples
duration := history[len(history)-1].timestamp.Sub(history[len(history)-2].timestamp).Seconds()
diskIORate := float64(history[len(history)-1].diskIO-history[len(history)-2].diskIO) / duration
netIORate := float64(history[len(history)-1].networkIO-history[len(history)-2].networkIO) / duration
if current.cpuUsage >= cpuThreshold {
currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.cpuUsage, cpuThreshold))
}
if current.gpuUsage >= gpuThreshold {
currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.gpuUsage, gpuThreshold))
}
if diskIORate >= float64(diskThreshold) {
currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s",
diskIORate/(1024*1024), float64(diskThreshold)/(1024*1024)))
}
if netIORate >= float64(networkThreshold) {
currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s",
netIORate/(1024*1024), float64(networkThreshold)/(1024*1024)))
}
}
if current.sshConnections > 0 {
currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("Active SSH connections: %d", current.sshConnections))
}
if current.activeUsers > 0 {
currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("Active user sessions: %d", current.activeUsers))
}
if len(currentStatus.Blockers) == 0 {
currentStatus.Blockers = append(currentStatus.Blockers, "No blockers - system can sleep")
}
}
func getCurrentUsage() ResourceUsage { func getCurrentUsage() ResourceUsage {
usage := ResourceUsage{ usage := ResourceUsage{
timestamp: time.Now(), timestamp: time.Now(),
@ -124,20 +220,81 @@ func getCurrentUsage() ResourceUsage {
usage.networkIO = netStats[0].BytesSent + netStats[0].BytesRecv usage.networkIO = netStats[0].BytesSent + netStats[0].BytesRecv
} }
// Count SSH connections
if sshCount, err := getSSHConnectionCount(); err == nil {
usage.sshConnections = sshCount
}
// Count active user sessions
if userCount, err := getActiveUserCount(); err == nil {
usage.activeUsers = userCount
}
return usage return usage
} }
func getSSHConnectionCount() (int, error) {
processes, err := process.Processes()
if err != nil {
return 0, err
}
count := 0
for _, p := range processes {
name, err := p.Name()
if err != nil {
continue
}
if name == "sshd" {
cmdline, err := p.Cmdline()
if err != nil {
continue
}
// Only count sshd processes that are handling connections
// The main sshd process doesn't have "@" in its cmdline
if strings.Contains(cmdline, "@") {
count++
}
}
}
return count, nil
}
func getActiveUserCount() (int, error) {
out, err := exec.Command("who", "-s").Output()
if err != nil {
return 0, err
}
// Count non-empty lines
lines := strings.Split(strings.TrimSpace(string(out)), "\n")
count := 0
for _, line := range lines {
if line != "" {
count++
}
}
return count, nil
}
func isSystemIdle(history []ResourceUsage) bool { func isSystemIdle(history []ResourceUsage) bool {
if len(history) < 2 { if len(history) < 2 {
return false return false
} }
var avgCPU, avgGPU float64 var avgCPU, avgGPU float64
var maxSSHConnections, maxActiveUsers int
samples := len(history) samples := len(history)
for _, usage := range history { for _, usage := range history {
avgCPU += usage.cpuUsage avgCPU += usage.cpuUsage
avgGPU += usage.gpuUsage avgGPU += usage.gpuUsage
if usage.sshConnections > maxSSHConnections {
maxSSHConnections = usage.sshConnections
}
if usage.activeUsers > maxActiveUsers {
maxActiveUsers = usage.activeUsers
}
} }
// Calculate I/O rates using first and last samples // Calculate I/O rates using first and last samples
@ -151,7 +308,9 @@ func isSystemIdle(history []ResourceUsage) bool {
return avgCPU < cpuThreshold && return avgCPU < cpuThreshold &&
avgGPU < gpuThreshold && avgGPU < gpuThreshold &&
diskIORate < float64(diskThreshold) && diskIORate < float64(diskThreshold) &&
netIORate < float64(networkThreshold) netIORate < float64(networkThreshold) &&
maxSSHConnections == 0 &&
maxActiveUsers == 0
} }
func suspendSystem() error { func suspendSystem() error {