From 8d36e1e2453f2640bfc357441edbd923e385631a Mon Sep 17 00:00:00 2001 From: Sebastiaan de Schaetzen Date: Fri, 14 Mar 2025 14:17:38 +0100 Subject: [PATCH] Add support for ctrl+c --- main.go | 234 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 151 insertions(+), 83 deletions(-) diff --git a/main.go b/main.go index 733e903..13185a3 100644 --- a/main.go +++ b/main.go @@ -1,14 +1,17 @@ package main import ( + "context" "encoding/json" "fmt" "log" "net/http" "os" "os/exec" + "os/signal" "strings" "sync" + "syscall" "time" "github.com/NVIDIA/go-nvml/pkg/nvml" @@ -29,13 +32,14 @@ const ( ) type ResourceUsage struct { - timestamp time.Time - cpuUsage float64 - gpuUsage float64 - diskIO uint64 - networkIO uint64 - sshConnections int - activeUsers int + Timestamp time.Time `json:"timestamp"` + CpuUsage float64 `json:"cpu_usage"` + GpuUsage float64 `json:"gpu_usage"` + GpuAvailable bool `json:"gpu_available"` + DiskIO uint64 `json:"disk_io"` + NetworkIO uint64 `json:"network_io"` + SshConnections int `json:"ssh_connections"` + ActiveUsers int `json:"active_users"` } type SystemStatus struct { @@ -46,6 +50,7 @@ type SystemStatus struct { var ( currentStatus SystemStatus statusMutex sync.RWMutex + nvmlAvailable bool ) func main() { @@ -58,11 +63,22 @@ func main() { ret := nvml.Init() if ret != nvml.SUCCESS { log.Printf("Warning: Could not initialize NVML: %v", ret) + nvmlAvailable = false + } else { + nvmlAvailable = true + defer nvml.Shutdown() } - defer nvml.Shutdown() - // Start HTTP server - go startHTTPServer() + // Create a context that we'll use to shut down the application + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Set up signal handling + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + + // Start HTTP server with context + srv := startHTTPServer(ctx) usageHistory := make([]ResourceUsage, 0) ticker := time.NewTicker(checkInterval) @@ -70,49 +86,85 @@ func main() { log.Printf("Starting idle monitoring. System will suspend when:\n") log.Printf("- Average CPU usage across all cores < %.1f%%\n", cpuThreshold) - log.Printf("- Average GPU usage across all GPUs < %.1f%%\n", gpuThreshold) + if nvmlAvailable { + log.Printf("- Average GPU usage across all GPUs < %.1f%%\n", gpuThreshold) + } else { + log.Printf("- GPU monitoring disabled (NVML initialization failed)") + } log.Printf("- Disk I/O < %.1f MB/s\n", float64(diskThreshold)/(1024*1024)) log.Printf("- Network I/O < %.1f MB/s\n", float64(networkThreshold)/(1024*1024)) log.Printf("- No active SSH connections\n") log.Printf("- No active user sessions\n") log.Printf("Over the last %v\n", monitoringPeriod) log.Printf("HTTP status endpoint available at http://localhost:%d/status\n", httpPort) + log.Printf("Press Ctrl+C to exit\n") - for range ticker.C { - usage := getCurrentUsage() - usageHistory = append(usageHistory, usage) +mainLoop: + for { + select { + case <-ctx.Done(): + break mainLoop + case sig := <-sigChan: + log.Printf("Received signal %v, shutting down...", sig) + cancel() + break mainLoop + case <-ticker.C: + usage := getCurrentUsage() + usageHistory = append(usageHistory, usage) - // Update current status - updateSystemStatus(usage, usageHistory) + // Update current status + updateSystemStatus(usage, usageHistory) - // Remove entries older than monitoring period - cutoff := time.Now().Add(-monitoringPeriod) - for i, u := range usageHistory { - if u.timestamp.After(cutoff) { - usageHistory = usageHistory[i:] - break + // Remove entries older than monitoring period + cutoff := time.Now().Add(-monitoringPeriod) + for i, u := range usageHistory { + if u.Timestamp.After(cutoff) { + usageHistory = usageHistory[i:] + break + } } - } - if len(usageHistory) > 0 && isSystemIdle(usageHistory) { - log.Printf("System status before suspend:\n") - log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].cpuUsage) - log.Printf("- GPU: %.1f%%\n", usageHistory[len(usageHistory)-1].gpuUsage) - log.Printf("- SSH connections: %d\n", usageHistory[len(usageHistory)-1].sshConnections) - log.Printf("- Active users: %d\n", usageHistory[len(usageHistory)-1].activeUsers) - log.Println("System has been idle for the monitoring period. Suspending...") - if err := suspendSystem(); err != nil { - log.Printf("Failed to suspend system: %v", err) + if len(usageHistory) > 0 && isSystemIdle(usageHistory) { + log.Printf("System status before suspend:\n") + log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].CpuUsage) + if nvmlAvailable { + log.Printf("- GPU: %.1f%%\n", usageHistory[len(usageHistory)-1].GpuUsage) + } + log.Printf("- SSH connections: %d\n", usageHistory[len(usageHistory)-1].SshConnections) + log.Printf("- Active users: %d\n", usageHistory[len(usageHistory)-1].ActiveUsers) + log.Println("System has been idle for the monitoring period. Suspending...") + if err := suspendSystem(); err != nil { + log.Printf("Failed to suspend system: %v", err) + } } } } + + // Graceful shutdown of HTTP server + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second) + defer shutdownCancel() + + if err := srv.Shutdown(shutdownCtx); err != nil { + log.Printf("HTTP server shutdown error: %v", err) + } + + log.Println("Goodbye!") } -func startHTTPServer() { - http.HandleFunc("/status", handleStatus) - if err := http.ListenAndServe(fmt.Sprintf(":%d", httpPort), nil); err != nil { - log.Printf("Failed to start HTTP server: %v", err) +func startHTTPServer(ctx context.Context) *http.Server { + srv := &http.Server{ + Addr: fmt.Sprintf(":%d", httpPort), } + + http.HandleFunc("/status", handleStatus) + + go func() { + if err := srv.ListenAndServe(); err != http.ErrServerClosed { + log.Printf("HTTP server error: %v", err) + } + }() + + return srv } func handleStatus(w http.ResponseWriter, r *http.Request) { @@ -132,17 +184,17 @@ func updateSystemStatus(current ResourceUsage, history []ResourceUsage) { if len(history) >= 2 { // Calculate rates using last two samples - duration := history[len(history)-1].timestamp.Sub(history[len(history)-2].timestamp).Seconds() - diskIORate := float64(history[len(history)-1].diskIO-history[len(history)-2].diskIO) / duration - netIORate := float64(history[len(history)-1].networkIO-history[len(history)-2].networkIO) / duration + duration := history[len(history)-1].Timestamp.Sub(history[len(history)-2].Timestamp).Seconds() + diskIORate := float64(history[len(history)-1].DiskIO-history[len(history)-2].DiskIO) / duration + netIORate := float64(history[len(history)-1].NetworkIO-history[len(history)-2].NetworkIO) / duration - if current.cpuUsage >= cpuThreshold { + if current.CpuUsage >= cpuThreshold { currentStatus.Blockers = append(currentStatus.Blockers, - fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.cpuUsage, cpuThreshold)) + fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.CpuUsage, cpuThreshold)) } - if current.gpuUsage >= gpuThreshold { + if nvmlAvailable && current.GpuUsage >= gpuThreshold { currentStatus.Blockers = append(currentStatus.Blockers, - fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.gpuUsage, gpuThreshold)) + fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.GpuUsage, gpuThreshold)) } if diskIORate >= float64(diskThreshold) { currentStatus.Blockers = append(currentStatus.Blockers, @@ -156,13 +208,13 @@ func updateSystemStatus(current ResourceUsage, history []ResourceUsage) { } } - if current.sshConnections > 0 { + if current.SshConnections > 0 { currentStatus.Blockers = append(currentStatus.Blockers, - fmt.Sprintf("Active SSH connections: %d", current.sshConnections)) + fmt.Sprintf("Active SSH connections: %d", current.SshConnections)) } - if current.activeUsers > 0 { + if current.ActiveUsers > 0 { currentStatus.Blockers = append(currentStatus.Blockers, - fmt.Sprintf("Active user sessions: %d", current.activeUsers)) + fmt.Sprintf("Active user sessions: %d", current.ActiveUsers)) } if len(currentStatus.Blockers) == 0 { @@ -172,7 +224,8 @@ func updateSystemStatus(current ResourceUsage, history []ResourceUsage) { func getCurrentUsage() ResourceUsage { usage := ResourceUsage{ - timestamp: time.Now(), + Timestamp: time.Now(), + GpuAvailable: nvmlAvailable, } // Get CPU usage across all cores @@ -182,26 +235,28 @@ func getCurrentUsage() ResourceUsage { for _, percent := range cpuPercent { totalCPU += percent } - usage.cpuUsage = totalCPU / float64(len(cpuPercent)) + usage.CpuUsage = totalCPU / float64(len(cpuPercent)) } - // Get GPU usage across all GPUs - count, ret := nvml.DeviceGetCount() - if ret == nvml.SUCCESS && count > 0 { - var totalGPU float64 - var activeGPUs int - for i := 0; i < count; i++ { - device, ret := nvml.DeviceGetHandleByIndex(i) - if ret == nvml.SUCCESS { - utilization, ret := device.GetUtilizationRates() + // Get GPU usage across all GPUs if available + if nvmlAvailable { + count, ret := nvml.DeviceGetCount() + if ret == nvml.SUCCESS && count > 0 { + var totalGPU float64 + var activeGPUs int + for i := 0; i < count; i++ { + device, ret := nvml.DeviceGetHandleByIndex(i) if ret == nvml.SUCCESS { - totalGPU += float64(utilization.Gpu) - activeGPUs++ + utilization, ret := device.GetUtilizationRates() + if ret == nvml.SUCCESS { + totalGPU += float64(utilization.Gpu) + activeGPUs++ + } } } - } - if activeGPUs > 0 { - usage.gpuUsage = totalGPU / float64(activeGPUs) + if activeGPUs > 0 { + usage.GpuUsage = totalGPU / float64(activeGPUs) + } } } @@ -211,22 +266,22 @@ func getCurrentUsage() ResourceUsage { for _, stat := range diskStats { totalIO += stat.ReadBytes + stat.WriteBytes } - usage.diskIO = totalIO + usage.DiskIO = totalIO } // Get network I/O if netStats, err := net.IOCounters(false); err == nil && len(netStats) > 0 { - usage.networkIO = netStats[0].BytesSent + netStats[0].BytesRecv + usage.NetworkIO = netStats[0].BytesSent + netStats[0].BytesRecv } // Count SSH connections if sshCount, err := getSSHConnectionCount(); err == nil { - usage.sshConnections = sshCount + usage.SshConnections = sshCount } // Count active user sessions if userCount, err := getActiveUserCount(); err == nil { - usage.activeUsers = userCount + usage.ActiveUsers = userCount } return usage @@ -286,30 +341,43 @@ func isSystemIdle(history []ResourceUsage) bool { samples := len(history) for _, usage := range history { - avgCPU += usage.cpuUsage - avgGPU += usage.gpuUsage - if usage.sshConnections > maxSSHConnections { - maxSSHConnections = usage.sshConnections + avgCPU += usage.CpuUsage + if nvmlAvailable { + avgGPU += usage.GpuUsage } - if usage.activeUsers > maxActiveUsers { - maxActiveUsers = usage.activeUsers + if usage.SshConnections > maxSSHConnections { + maxSSHConnections = usage.SshConnections + } + if usage.ActiveUsers > maxActiveUsers { + maxActiveUsers = usage.ActiveUsers } } // Calculate I/O rates using first and last samples - duration := history[samples-1].timestamp.Sub(history[0].timestamp).Seconds() - diskIORate := float64(history[samples-1].diskIO-history[0].diskIO) / duration - netIORate := float64(history[samples-1].networkIO-history[0].networkIO) / duration + duration := history[samples-1].Timestamp.Sub(history[0].Timestamp).Seconds() + diskIORate := float64(history[samples-1].DiskIO-history[0].DiskIO) / duration + netIORate := float64(history[samples-1].NetworkIO-history[0].NetworkIO) / duration avgCPU /= float64(samples) - avgGPU /= float64(samples) + if nvmlAvailable { + avgGPU /= float64(samples) + } - return avgCPU < cpuThreshold && - avgGPU < gpuThreshold && - diskIORate < float64(diskThreshold) && - netIORate < float64(networkThreshold) && - maxSSHConnections == 0 && - maxActiveUsers == 0 + // Basic checks that always apply + if avgCPU >= cpuThreshold || + diskIORate >= float64(diskThreshold) || + netIORate >= float64(networkThreshold) || + maxSSHConnections > 0 || + maxActiveUsers > 0 { + return false + } + + // GPU check only if NVML is available + if nvmlAvailable && avgGPU >= gpuThreshold { + return false + } + + return true } func suspendSystem() error {