diff --git a/main.go b/main.go index e190c5f..cef4f0c 100644 --- a/main.go +++ b/main.go @@ -3,6 +3,7 @@ package main import ( "context" "encoding/json" + "errors" "fmt" "log" "net/http" @@ -37,27 +38,26 @@ type ResourceUsage struct { CpuUsage float64 `json:"cpu_usage"` GpuUsage float64 `json:"gpu_usage"` GpuAvailable bool `json:"gpu_available"` - DiskIO uint64 `json:"disk_io"` - NetworkIO uint64 `json:"network_io"` + DiskIO float64 `json:"disk_io"` + NetworkIO float64 `json:"network_io"` SshConnections int `json:"ssh_connections"` ActiveUsers int `json:"active_users"` } -type SystemStatus struct { - CurrentUsage ResourceUsage `json:"current_usage"` - Blockers []string `json:"sleep_blockers"` - InGracePeriod bool `json:"in_grace_period,omitempty"` - GraceTimeLeft string `json:"grace_time_left,omitempty"` -} - var ( - currentStatus SystemStatus - statusMutex sync.RWMutex - nvmlAvailable bool - lastResumeTime time.Time // Track when the system last resumed from sleep - lastTickTime time.Time // Track when we last processed a tick + statusMutex sync.RWMutex + blockers []string + currentStatus ResourceUsage + nvmlAvailable bool + lastBlockedTime time.Time ) +func Must(err error) { + if err != nil { + log.Fatalf("Error: %v", err) + } +} + func main() { // Check if running as root if os.Geteuid() != 0 { @@ -66,24 +66,20 @@ func main() { // Initialize NVML for GPU monitoring ret := nvml.Init() - if ret != nvml.SUCCESS { + if !errors.Is(ret, nvml.SUCCESS) { log.Printf("Warning: Could not initialize NVML: %v", ret) nvmlAvailable = false } else { nvmlAvailable = true - defer nvml.Shutdown() + defer Must(nvml.Shutdown()) } - // Create a context that we'll use to shut down the application - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - // Set up signal handling sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) // Start HTTP server with context - srv := startHTTPServer(ctx) + srv := startHTTPServer() usageHistory := make([]ResourceUsage, 0) ticker := time.NewTicker(checkInterval) @@ -108,42 +104,14 @@ func main() { mainLoop: for { select { - case <-ctx.Done(): - break mainLoop case sig := <-sigChan: log.Printf("Received signal %v, shutting down...", sig) - cancel() break mainLoop case <-ticker.C: - now := time.Now() + updateCurrentUsage() + updateSystemStatus() - // Check if we just resumed from sleep - if !lastTickTime.IsZero() { - gap := now.Sub(lastTickTime) - // If there was a significant gap, probably resumed from sleep - if gap > (checkInterval*3) && gap < time.Hour { - log.Printf("Detected system resume after gap of %v", gap) - lastResumeTime = now - } - } - lastTickTime = now - - usage := getCurrentUsage() - usageHistory = append(usageHistory, usage) - - // Update current status - updateSystemStatus(usage, usageHistory) - - // Remove entries older than monitoring period - cutoff := time.Now().Add(-monitoringPeriod) - for i, u := range usageHistory { - if u.Timestamp.After(cutoff) { - usageHistory = usageHistory[i:] - break - } - } - - if len(usageHistory) > 0 && isSystemIdle(usageHistory) { + if time.Now().Sub(lastBlockedTime) >= monitoringPeriod { log.Printf("System status before suspend:\n") log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].CpuUsage) if nvmlAvailable { @@ -155,6 +123,8 @@ mainLoop: if err := suspendSystem(); err != nil { log.Printf("Failed to suspend system: %v", err) } + lastBlockedTime = time.Now() + log.Printf("Resumed") } } } @@ -170,20 +140,7 @@ mainLoop: log.Println("Goodbye!") } -// Function to check if we're within the resume grace period -func isInsideResumeGracePeriod() bool { - return !lastResumeTime.IsZero() && time.Since(lastResumeTime) < resumeGracePeriod -} - -// Function to calculate time left in grace period -func timeLeftInGracePeriod() time.Duration { - if !isInsideResumeGracePeriod() { - return 0 - } - return resumeGracePeriod - time.Since(lastResumeTime) -} - -func startHTTPServer(ctx context.Context) *http.Server { +func startHTTPServer() *http.Server { srv := &http.Server{ Addr: fmt.Sprintf(":%d", httpPort), } @@ -191,7 +148,7 @@ func startHTTPServer(ctx context.Context) *http.Server { http.HandleFunc("/status", handleStatus) go func() { - if err := srv.ListenAndServe(); err != http.ErrServerClosed { + if err := srv.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) { log.Printf("HTTP server error: %v", err) } }() @@ -199,74 +156,62 @@ func startHTTPServer(ctx context.Context) *http.Server { return srv } -func handleStatus(w http.ResponseWriter, r *http.Request) { +func handleStatus(w http.ResponseWriter, _ *http.Request) { statusMutex.RLock() defer statusMutex.RUnlock() w.Header().Set("Content-Type", "application/json") - json.NewEncoder(w).Encode(currentStatus) + err := json.NewEncoder(w).Encode(currentStatus) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + } } -func updateSystemStatus(current ResourceUsage, history []ResourceUsage) { +func updateSystemStatus() { statusMutex.Lock() defer statusMutex.Unlock() - currentStatus.CurrentUsage = current - currentStatus.Blockers = []string{} + blockers = []string{} - // Add grace period info to status - if isInsideResumeGracePeriod() { - timeLeft := timeLeftInGracePeriod() - currentStatus.InGracePeriod = true - currentStatus.GraceTimeLeft = timeLeft.Round(time.Second).String() - currentStatus.Blockers = append(currentStatus.Blockers, - fmt.Sprintf("Resume grace period: %v remaining", timeLeft.Round(time.Second))) + if currentStatus.CpuUsage >= cpuThreshold { + blockers = append(blockers, + fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", currentStatus.CpuUsage, cpuThreshold)) + } + if nvmlAvailable && currentStatus.GpuUsage >= gpuThreshold { + blockers = append(blockers, + fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", currentStatus.GpuUsage, gpuThreshold)) + } + if currentStatus.DiskIO >= float64(diskThreshold) { + blockers = append(blockers, + fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s", + currentStatus.DiskIO/(1024*1024), float64(diskThreshold)/(1024*1024))) + } + if currentStatus.NetworkIO >= float64(networkThreshold) { + blockers = append(blockers, + fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s", + currentStatus.NetworkIO/(1024*1024), float64(networkThreshold)/(1024*1024))) + } + + if currentStatus.SshConnections > 0 { + blockers = append(blockers, + fmt.Sprintf("Active SSH connections: %d", currentStatus.SshConnections)) + } + if currentStatus.ActiveUsers > 0 { + blockers = append(blockers, + fmt.Sprintf("Active user sessions: %d", currentStatus.ActiveUsers)) + } + + if len(blockers) == 0 { + blockers = append(blockers, "No blockers - system can sleep") } else { - currentStatus.InGracePeriod = false - currentStatus.GraceTimeLeft = "" - } - - if len(history) >= 2 { - // Calculate rates using last two samples - duration := history[len(history)-1].Timestamp.Sub(history[len(history)-2].Timestamp).Seconds() - diskIORate := float64(history[len(history)-1].DiskIO-history[len(history)-2].DiskIO) / duration - netIORate := float64(history[len(history)-1].NetworkIO-history[len(history)-2].NetworkIO) / duration - - if current.CpuUsage >= cpuThreshold { - currentStatus.Blockers = append(currentStatus.Blockers, - fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.CpuUsage, cpuThreshold)) - } - if nvmlAvailable && current.GpuUsage >= gpuThreshold { - currentStatus.Blockers = append(currentStatus.Blockers, - fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.GpuUsage, gpuThreshold)) - } - if diskIORate >= float64(diskThreshold) { - currentStatus.Blockers = append(currentStatus.Blockers, - fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s", - diskIORate/(1024*1024), float64(diskThreshold)/(1024*1024))) - } - if netIORate >= float64(networkThreshold) { - currentStatus.Blockers = append(currentStatus.Blockers, - fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s", - netIORate/(1024*1024), float64(networkThreshold)/(1024*1024))) - } - } - - if current.SshConnections > 0 { - currentStatus.Blockers = append(currentStatus.Blockers, - fmt.Sprintf("Active SSH connections: %d", current.SshConnections)) - } - if current.ActiveUsers > 0 { - currentStatus.Blockers = append(currentStatus.Blockers, - fmt.Sprintf("Active user sessions: %d", current.ActiveUsers)) - } - - if len(currentStatus.Blockers) == 0 { - currentStatus.Blockers = append(currentStatus.Blockers, "No blockers - system can sleep") + lastBlockedTime = time.Now() } } -func getCurrentUsage() ResourceUsage { +func updateCurrentUsage() { + statusMutex.Lock() + defer statusMutex.Unlock() + usage := ResourceUsage{ Timestamp: time.Now(), GpuAvailable: nvmlAvailable, @@ -285,14 +230,14 @@ func getCurrentUsage() ResourceUsage { // Get GPU usage across all GPUs if available if nvmlAvailable { count, ret := nvml.DeviceGetCount() - if ret == nvml.SUCCESS && count > 0 { + if errors.Is(ret, nvml.SUCCESS) && count > 0 { var totalGPU float64 var activeGPUs int for i := 0; i < count; i++ { device, ret := nvml.DeviceGetHandleByIndex(i) - if ret == nvml.SUCCESS { + if errors.Is(ret, nvml.SUCCESS) { utilization, ret := device.GetUtilizationRates() - if ret == nvml.SUCCESS { + if errors.Is(ret, nvml.SUCCESS) { totalGPU += float64(utilization.Gpu) activeGPUs++ } @@ -310,12 +255,12 @@ func getCurrentUsage() ResourceUsage { for _, stat := range diskStats { totalIO += stat.ReadBytes + stat.WriteBytes } - usage.DiskIO = totalIO + usage.DiskIO = float64(totalIO) } // Get network I/O if netStats, err := net.IOCounters(false); err == nil && len(netStats) > 0 { - usage.NetworkIO = netStats[0].BytesSent + netStats[0].BytesRecv + usage.NetworkIO = float64(netStats[0].BytesSent + netStats[0].BytesRecv) } // Count SSH connections @@ -328,7 +273,7 @@ func getCurrentUsage() ResourceUsage { usage.ActiveUsers = userCount } - return usage + currentStatus = usage } func getSSHConnectionCount() (int, error) { @@ -375,60 +320,6 @@ func getActiveUserCount() (int, error) { return count, nil } -func isSystemIdle(history []ResourceUsage) bool { - // Don't allow sleep during grace period after resume - if isInsideResumeGracePeriod() { - return false - } - - if len(history) < 2 { - return false - } - - var avgCPU, avgGPU float64 - var maxSSHConnections, maxActiveUsers int - samples := len(history) - - for _, usage := range history { - avgCPU += usage.CpuUsage - if nvmlAvailable { - avgGPU += usage.GpuUsage - } - if usage.SshConnections > maxSSHConnections { - maxSSHConnections = usage.SshConnections - } - if usage.ActiveUsers > maxActiveUsers { - maxActiveUsers = usage.ActiveUsers - } - } - - // Calculate I/O rates using first and last samples - duration := history[samples-1].Timestamp.Sub(history[0].Timestamp).Seconds() - diskIORate := float64(history[samples-1].DiskIO-history[0].DiskIO) / duration - netIORate := float64(history[samples-1].NetworkIO-history[0].NetworkIO) / duration - - avgCPU /= float64(samples) - if nvmlAvailable { - avgGPU /= float64(samples) - } - - // Basic checks that always apply - if avgCPU >= cpuThreshold || - diskIORate >= float64(diskThreshold) || - netIORate >= float64(networkThreshold) || - maxSSHConnections > 0 || - maxActiveUsers > 0 { - return false - } - - // GPU check only if NVML is available - if nvmlAvailable && avgGPU >= gpuThreshold { - return false - } - - return true -} - func suspendSystem() error { cmd := exec.Command("systemctl", "suspend") return cmd.Run()