From ad86ad077a442f4f071baecaf8675afa3fd9301d Mon Sep 17 00:00:00 2001
From: Sebastiaan de Schaetzen
Date: Fri, 14 Mar 2025 13:58:30 +0100
Subject: [PATCH] Track SSH connections and add endpoint

---
Review notes (this section is ignored by git am and is not part of the
commit message):
 * Dropped the "strconv" import: nothing in the patch uses it, so the patched
   file did not build.
 * Added ResourceUsage.MarshalJSON: the struct has only unexported fields, so
   "current_usage" in the /status response was always an empty object.
 * handleStatus copies the status under the lock, releases the lock before
   writing, and logs encode errors.
 * updateSystemStatus computes I/O rates through counterRate, which guards
   against a zero interval and against counters that went backwards.
 * startHTTPServer uses its own mux and an http.Server with timeouts.
 * getSSHConnectionCount also matches "sshd-session" (OpenSSH 9.8 and later).
 * Whitespace of the context lines was reconstructed in gofmt style; if a hunk
   does not apply cleanly, retry with `git apply --ignore-whitespace`. The blob
   id on the index line still names the original post-image.

 main.go | 236 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 230 insertions(+), 6 deletions(-)

diff --git a/main.go b/main.go
index fe59e26..81a43c3 100644
--- a/main.go
+++ b/main.go
@@ -1,15 +1,21 @@
 package main
 
 import (
+	"encoding/json"
+	"fmt"
 	"log"
+	"net/http"
 	"os"
 	"os/exec"
+	"strings"
+	"sync"
 	"time"
 
 	"github.com/NVIDIA/go-nvml/pkg/nvml"
 	"github.com/shirou/gopsutil/v3/cpu"
 	"github.com/shirou/gopsutil/v3/disk"
 	"github.com/shirou/gopsutil/v3/net"
+	"github.com/shirou/gopsutil/v3/process"
 )
 
 const (
@@ -19,16 +25,52 @@ const (
 	gpuThreshold     = 20.0            // percentage
 	diskThreshold    = 5 * 1024 * 1024 // 5 MB/s
 	networkThreshold = 1 * 1024 * 1024 // 1 MB/s
+	httpPort         = 8080
 )
 
 type ResourceUsage struct {
-	timestamp time.Time
-	cpuUsage  float64
-	gpuUsage  float64
-	diskIO    uint64
-	networkIO uint64
+	timestamp      time.Time
+	cpuUsage       float64
+	gpuUsage       float64
+	diskIO         uint64
+	networkIO      uint64
+	sshConnections int
+	activeUsers    int
 }
 
+// MarshalJSON exposes the unexported sample fields to encoding/json. Without
+// it the "current_usage" object of the /status response is always empty,
+// because encoding/json only encodes exported struct fields.
+func (u ResourceUsage) MarshalJSON() ([]byte, error) {
+	return json.Marshal(struct {
+		Timestamp      time.Time `json:"timestamp"`
+		CPUUsage       float64   `json:"cpu_usage"`
+		GPUUsage       float64   `json:"gpu_usage"`
+		DiskIO         uint64    `json:"disk_io"`
+		NetworkIO      uint64    `json:"network_io"`
+		SSHConnections int       `json:"ssh_connections"`
+		ActiveUsers    int       `json:"active_users"`
+	}{
+		Timestamp:      u.timestamp,
+		CPUUsage:       u.cpuUsage,
+		GPUUsage:       u.gpuUsage,
+		DiskIO:         u.diskIO,
+		NetworkIO:      u.networkIO,
+		SSHConnections: u.sshConnections,
+		ActiveUsers:    u.activeUsers,
+	})
+}
+
+type SystemStatus struct {
+	CurrentUsage ResourceUsage `json:"current_usage"`
+	Blockers     []string      `json:"sleep_blockers"`
+}
+
+var (
+	currentStatus SystemStatus
+	statusMutex   sync.RWMutex
+)
+
 func main() {
 	// Check if running as root
 	if os.Geteuid() != 0 {
@@ -42,6 +84,9 @@ func main() {
 	}
 	defer nvml.Shutdown()
 
+	// Start HTTP server
+	go startHTTPServer()
+
 	usageHistory := make([]ResourceUsage, 0)
 	ticker := time.NewTicker(checkInterval)
 	defer ticker.Stop()
@@ -51,12 +96,18 @@ func main() {
 	log.Printf("- Average GPU usage across all GPUs < %.1f%%\n", gpuThreshold)
 	log.Printf("- Disk I/O < %.1f MB/s\n", float64(diskThreshold)/(1024*1024))
 	log.Printf("- Network I/O < %.1f MB/s\n", float64(networkThreshold)/(1024*1024))
+	log.Printf("- No active SSH connections\n")
+	log.Printf("- No active user sessions\n")
 	log.Printf("Over the last %v\n", monitoringPeriod)
+	log.Printf("HTTP status endpoint available at http://localhost:%d/status\n", httpPort)
 
 	for range ticker.C {
 		usage := getCurrentUsage()
 		usageHistory = append(usageHistory, usage)
 
+		// Update current status
+		updateSystemStatus(usage, usageHistory)
+
 		// Remove entries older than monitoring period
 		cutoff := time.Now().Add(-monitoringPeriod)
 		for i, u := range usageHistory {
@@ -67,6 +118,11 @@ func main() {
 		}
 
 		if len(usageHistory) > 0 && isSystemIdle(usageHistory) {
+			log.Printf("System status before suspend:\n")
+			log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].cpuUsage)
+			log.Printf("- GPU: %.1f%%\n", usageHistory[len(usageHistory)-1].gpuUsage)
+			log.Printf("- SSH connections: %d\n", usageHistory[len(usageHistory)-1].sshConnections)
+			log.Printf("- Active users: %d\n", usageHistory[len(usageHistory)-1].activeUsers)
 			log.Println("System has been idle for the monitoring period. Suspending...")
 			if err := suspendSystem(); err != nil {
 				log.Printf("Failed to suspend system: %v", err)
@@ -75,6 +131,105 @@ func main() {
 		}
 	}
 }
+// startHTTPServer serves the /status endpoint on httpPort. It blocks until the
+// server stops, so main runs it in its own goroutine.
+func startHTTPServer() {
+	mux := http.NewServeMux()
+	mux.HandleFunc("/status", handleStatus)
+
+	// Explicit timeouts keep a stalled client from holding a connection open
+	// indefinitely; http.ListenAndServe sets none.
+	srv := &http.Server{
+		Addr:         fmt.Sprintf(":%d", httpPort),
+		Handler:      mux,
+		ReadTimeout:  5 * time.Second,
+		WriteTimeout: 10 * time.Second,
+	}
+	if err := srv.ListenAndServe(); err != nil {
+		log.Printf("Failed to start HTTP server: %v", err)
+	}
+}
+
+// handleStatus writes the most recent SystemStatus as JSON.
+func handleStatus(w http.ResponseWriter, r *http.Request) {
+	// Snapshot under the lock and release it before writing, so a slow client
+	// cannot hold up updateSystemStatus in the monitoring loop.
+	statusMutex.RLock()
+	status := currentStatus
+	statusMutex.RUnlock()
+
+	w.Header().Set("Content-Type", "application/json")
+	if err := json.NewEncoder(w).Encode(status); err != nil {
+		log.Printf("Failed to write status response: %v", err)
+	}
+}
+
+// counterRate returns the average bytes per second between two readings of a
+// cumulative I/O counter. It returns 0 when no time has elapsed or when the
+// counter went backwards, where the unsigned subtraction would otherwise wrap
+// around to a huge value and be reported as a blocker.
+func counterRate(prev, cur uint64, seconds float64) float64 {
+	if seconds <= 0 || cur < prev {
+		return 0
+	}
+	return float64(cur-prev) / seconds
+}
+
+// updateSystemStatus records the latest sample and the reasons, if any, that
+// currently keep the system from sleeping. The CPU, GPU and I/O checks are
+// skipped until history holds at least two samples.
+func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
+	blockers := []string{}
+
+	if len(history) >= 2 {
+		// Calculate rates using last two samples
+		prev, last := history[len(history)-2], history[len(history)-1]
+		seconds := last.timestamp.Sub(prev.timestamp).Seconds()
+		diskIORate := counterRate(prev.diskIO, last.diskIO, seconds)
+		netIORate := counterRate(prev.networkIO, last.networkIO, seconds)
+
+		if current.cpuUsage >= cpuThreshold {
+			blockers = append(blockers,
+				fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.cpuUsage, cpuThreshold))
+		}
+		if current.gpuUsage >= gpuThreshold {
+			blockers = append(blockers,
+				fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.gpuUsage, gpuThreshold))
+		}
+		if diskIORate >= float64(diskThreshold) {
+			blockers = append(blockers,
+				fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s",
+					diskIORate/(1024*1024), float64(diskThreshold)/(1024*1024)))
+		}
+		if netIORate >= float64(networkThreshold) {
+			blockers = append(blockers,
+				fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s",
+					netIORate/(1024*1024), float64(networkThreshold)/(1024*1024)))
+		}
+	}
+
+	if current.sshConnections > 0 {
+		blockers = append(blockers,
+			fmt.Sprintf("Active SSH connections: %d", current.sshConnections))
+	}
+	if current.activeUsers > 0 {
+		blockers = append(blockers,
+			fmt.Sprintf("Active user sessions: %d", current.activeUsers))
+	}
+
+	if len(blockers) == 0 {
+		blockers = append(blockers, "No blockers - system can sleep")
+	}
+
+	// Publish a freshly built slice: handleStatus copies currentStatus under
+	// the read lock and encodes it afterwards, so a published slice must never
+	// be modified in place.
+	statusMutex.Lock()
+	defer statusMutex.Unlock()
+	currentStatus.CurrentUsage = current
+	currentStatus.Blockers = blockers
+}
+
 func getCurrentUsage() ResourceUsage {
 	usage := ResourceUsage{
 		timestamp: time.Now(),
@@ -124,20 +279,87 @@ func getCurrentUsage() ResourceUsage {
 		usage.networkIO = netStats[0].BytesSent + netStats[0].BytesRecv
 	}
 
+	// Count SSH connections
+	if sshCount, err := getSSHConnectionCount(); err == nil {
+		usage.sshConnections = sshCount
+	}
+
+	// Count active user sessions
+	if userCount, err := getActiveUserCount(); err == nil {
+		usage.activeUsers = userCount
+	}
+
 	return usage
 }
 
+// getSSHConnectionCount counts the sshd processes whose command line contains
+// "@", which is how this program tells per-connection processes from the
+// listening daemon.
+func getSSHConnectionCount() (int, error) {
+	processes, err := process.Processes()
+	if err != nil {
+		return 0, err
+	}
+
+	count := 0
+	for _, p := range processes {
+		name, err := p.Name()
+		if err != nil {
+			continue
+		}
+		// OpenSSH 9.8 and later run each connection in a separate
+		// "sshd-session" binary; older releases fork "sshd" itself.
+		if name != "sshd" && name != "sshd-session" {
+			continue
+		}
+		cmdline, err := p.Cmdline()
+		if err != nil {
+			continue
+		}
+		// Only count sshd processes that are handling connections
+		// The main sshd process doesn't have "@" in its cmdline
+		if strings.Contains(cmdline, "@") {
+			count++
+		}
+	}
+	return count, nil
+}
+
+// getActiveUserCount returns the number of non-empty lines printed by `who -s`.
+func getActiveUserCount() (int, error) {
+	out, err := exec.Command("who", "-s").Output()
+	if err != nil {
+		return 0, err
+	}
+
+	// Count non-empty lines
+	lines := strings.Split(strings.TrimSpace(string(out)), "\n")
+	count := 0
+	for _, line := range lines {
+		if line != "" {
+			count++
+		}
+	}
+	return count, nil
+}
+
 func isSystemIdle(history []ResourceUsage) bool {
 	if len(history) < 2 {
 		return false
 	}
 
 	var avgCPU, avgGPU float64
+	var maxSSHConnections, maxActiveUsers int
 	samples := len(history)
 
 	for _, usage := range history {
 		avgCPU += usage.cpuUsage
 		avgGPU += usage.gpuUsage
+		if usage.sshConnections > maxSSHConnections {
+			maxSSHConnections = usage.sshConnections
+		}
+		if usage.activeUsers > maxActiveUsers {
+			maxActiveUsers = usage.activeUsers
+		}
 	}
 
 	// Calculate I/O rates using first and last samples
@@ -151,7 +373,9 @@ func isSystemIdle(history []ResourceUsage) bool {
 	return avgCPU < cpuThreshold &&
 		avgGPU < gpuThreshold &&
 		diskIORate < float64(diskThreshold) &&
-		netIORate < float64(networkThreshold)
+		netIORate < float64(networkThreshold) &&
+		maxSSHConnections == 0 &&
+		maxActiveUsers == 0
 }
 
 func suspendSystem() error {