// Command idle-suspend monitors CPU, GPU, disk, network, SSH and login
// activity and suspends the machine (via "systemctl suspend") once it has been
// idle for a full monitoring window. A small HTTP endpoint (/status) reports
// the latest sample and the reasons currently preventing suspend.
package main

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log"
	"net/http"
	"os"
	"os/exec"
	"os/signal"
	"strings"
	"sync"
	"syscall"
	"time"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
	"github.com/shirou/gopsutil/v3/cpu"
	"github.com/shirou/gopsutil/v3/disk"
	"github.com/shirou/gopsutil/v3/net"
	"github.com/shirou/gopsutil/v3/process"
)

const (
	// checkInterval is how often a usage sample is taken.
	checkInterval = 10 * time.Second

	// monitoringPeriod is the sliding window over which idleness is judged.
	monitoringPeriod = 5 * time.Minute

	// resumeGracePeriod is the time to wait after a resume before the system
	// may be suspended again.
	resumeGracePeriod = 5 * time.Minute

	// minIdleSpan is the minimum time the retained history must cover before
	// the system may be declared idle. It is slightly shorter than
	// monitoringPeriod because pruning keeps only samples strictly younger
	// than the window and ticks are subject to scheduling jitter.
	minIdleSpan = monitoringPeriod - 2*checkInterval

	// cpuThreshold is the average CPU utilisation (percent) below which the
	// CPU counts as idle.
	cpuThreshold = 20.0

	// gpuThreshold is the average GPU utilisation (percent) below which the
	// GPUs count as idle.
	gpuThreshold = 20.0

	// diskThreshold is the disk throughput limit in bytes per second (5 MB/s).
	diskThreshold = 5 * 1024 * 1024

	// networkThreshold is the network throughput limit in bytes per second
	// (1 MB/s).
	networkThreshold = 1 * 1024 * 1024

	// httpPort is the TCP port of the status endpoint.
	httpPort = 8081
)

// ResourceUsage is a single sample of system activity. DiskIO and NetworkIO
// are cumulative byte counters as reported by gopsutil, not rates.
type ResourceUsage struct {
	Timestamp      time.Time `json:"timestamp"`
	CpuUsage       float64   `json:"cpu_usage"`
	GpuUsage       float64   `json:"gpu_usage"`
	GpuAvailable   bool      `json:"gpu_available"`
	DiskIO         uint64    `json:"disk_io"`
	NetworkIO      uint64    `json:"network_io"`
	SshConnections int       `json:"ssh_connections"`
	ActiveUsers    int       `json:"active_users"`
}

// SystemStatus is the document served by the /status endpoint.
type SystemStatus struct {
	CurrentUsage  ResourceUsage `json:"current_usage"`
	Blockers      []string      `json:"sleep_blockers"`
	InGracePeriod bool          `json:"in_grace_period,omitempty"`
	GraceTimeLeft string        `json:"grace_time_left,omitempty"`
}

var (
	// currentStatus is the latest published status; guarded by statusMutex.
	currentStatus SystemStatus

	// statusMutex guards currentStatus.
	statusMutex sync.RWMutex

	// nvmlAvailable is set once in main, before any other goroutine starts.
	nvmlAvailable bool

	// lastResumeTime records when a resume from sleep was last detected.
	// It is only touched from the monitoring loop in main.
	lastResumeTime time.Time

	// lastTickTime is the wall-clock time (monotonic reading stripped) of the
	// previously processed tick. It is only touched from the monitoring loop.
	lastTickTime time.Time
)

// main verifies root privileges, initialises GPU monitoring, starts the status
// server and then samples the system every checkInterval until it receives
// SIGINT or SIGTERM.
func main() {
	if os.Geteuid() != 0 {
		log.Fatal("This program must be run as root")
	}

	// GPU monitoring is optional: without NVML the GPU criterion is skipped.
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		log.Printf("Warning: Could not initialize NVML: %v", ret)
		nvmlAvailable = false
	} else {
		nvmlAvailable = true
		defer nvml.Shutdown()
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
	defer signal.Stop(sigChan)

	srv := startHTTPServer(ctx)

	var usageHistory []ResourceUsage
	ticker := time.NewTicker(checkInterval)
	defer ticker.Stop()

	logStartupBanner()

mainLoop:
	for {
		select {
		case <-ctx.Done():
			break mainLoop
		case sig := <-sigChan:
			log.Printf("Received signal %v, shutting down...", sig)
			cancel()
			break mainLoop
		case <-ticker.C:
			detectResume(time.Now())

			usage := getCurrentUsage()
			usageHistory = pruneHistory(append(usageHistory, usage), time.Now())
			updateSystemStatus(usage, usageHistory)

			if !isSystemIdle(usageHistory) {
				continue
			}

			logPreSuspendStatus(usageHistory[len(usageHistory)-1])
			if err := suspendSystem(); err != nil {
				log.Printf("Failed to suspend system: %v", err)
			}
			// Start a fresh window either way: after a successful suspend this
			// prevents re-suspending right after wake-up even if the resume
			// goes undetected, and after a failure it avoids retrying (and
			// logging) on every tick.
			usageHistory = usageHistory[:0]
		}
	}

	// Give in-flight status requests a moment to finish.
	shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer shutdownCancel()
	if err := srv.Shutdown(shutdownCtx); err != nil {
		log.Printf("HTTP server shutdown error: %v", err)
	}
	log.Println("Goodbye!")
}

// logStartupBanner logs the conditions under which the system will suspend.
func logStartupBanner() {
	log.Printf("Starting idle monitoring. System will suspend when:\n")
	log.Printf("- Average CPU usage across all cores < %.1f%%\n", cpuThreshold)
	if nvmlAvailable {
		log.Printf("- Average GPU usage across all GPUs < %.1f%%\n", gpuThreshold)
	} else {
		log.Printf("- GPU monitoring disabled (NVML initialization failed)")
	}
	log.Printf("- Disk I/O < %.1f MB/s\n", float64(diskThreshold)/(1024*1024))
	log.Printf("- Network I/O < %.1f MB/s\n", float64(networkThreshold)/(1024*1024))
	log.Printf("- No active SSH connections\n")
	log.Printf("- No active user sessions\n")
	log.Printf("- Over the last %v\n", monitoringPeriod)
	log.Printf("- System will not suspend for %v after resuming from sleep\n", resumeGracePeriod)
	log.Printf("HTTP status endpoint available at http://localhost:%d/status\n", httpPort)
	log.Printf("Press Ctrl+C to exit\n")
}

// logPreSuspendStatus logs the most recent sample right before suspending.
func logPreSuspendStatus(latest ResourceUsage) {
	log.Printf("System status before suspend:\n")
	log.Printf("- CPU: %.1f%%\n", latest.CpuUsage)
	if nvmlAvailable {
		log.Printf("- GPU: %.1f%%\n", latest.GpuUsage)
	}
	log.Printf("- SSH connections: %d\n", latest.SshConnections)
	log.Printf("- Active users: %d\n", latest.ActiveUsers)
	log.Println("System has been idle for the monitoring period. Suspending...")
}

// detectResume compares the wall-clock time of this tick with the previous one
// and starts the resume grace period when the gap is much larger than
// checkInterval.
//
// The comparison deliberately uses wall-clock time: per the Go time package
// documentation the monotonic clock may stop while the computer sleeps, in
// which case a monotonic difference would never reveal the suspend.
func detectResume(now time.Time) {
	wall := now.Round(0) // Round(0) strips the monotonic clock reading.
	if !lastTickTime.IsZero() {
		if gap := wall.Sub(lastTickTime); gap > 3*checkInterval {
			log.Printf("Detected system resume after gap of %v", gap)
			lastResumeTime = now
		}
	}
	lastTickTime = wall
}

// pruneHistory drops the leading samples that are older than monitoringPeriod
// relative to now. The samples are expected to be in chronological order.
func pruneHistory(history []ResourceUsage, now time.Time) []ResourceUsage {
	cutoff := now.Add(-monitoringPeriod)
	for i := range history {
		if history[i].Timestamp.After(cutoff) {
			return history[i:]
		}
	}
	return history[:0]
}

// historySpan returns the time covered by history (newest minus oldest
// sample), or zero when there are fewer than two samples.
func historySpan(history []ResourceUsage) time.Duration {
	if len(history) < 2 {
		return 0
	}
	return history[len(history)-1].Timestamp.Sub(history[0].Timestamp)
}

// ioRate converts two readings of a cumulative byte counter taken seconds
// apart into bytes per second. It returns 0 when the interval is not positive
// or the counter went backwards (e.g. a reset or a device disappearing), where
// an unsigned subtraction would otherwise wrap around to a huge value.
func ioRate(prev, cur uint64, seconds float64) float64 {
	if seconds <= 0 || cur < prev {
		return 0
	}
	return float64(cur-prev) / seconds
}

// isInsideResumeGracePeriod reports whether a resume was detected less than
// resumeGracePeriod ago.
func isInsideResumeGracePeriod() bool {
	return !lastResumeTime.IsZero() && time.Since(lastResumeTime) < resumeGracePeriod
}

// timeLeftInGracePeriod returns the remaining grace period, or 0 when the
// system is not inside one.
func timeLeftInGracePeriod() time.Duration {
	if lastResumeTime.IsZero() {
		return 0
	}
	// Read the clock once so the result can never turn negative between a
	// check and the subtraction.
	left := resumeGracePeriod - time.Since(lastResumeTime)
	if left < 0 {
		return 0
	}
	return left
}

// startHTTPServer starts the status server in the background and returns it so
// the caller can shut it down with srv.Shutdown. ctx is accepted for interface
// compatibility and is currently unused; shutdown is driven by the caller.
func startHTTPServer(ctx context.Context) *http.Server {
	// A private mux keeps the endpoint independent of http.DefaultServeMux.
	mux := http.NewServeMux()
	mux.HandleFunc("/status", handleStatus)

	srv := &http.Server{
		Addr:              fmt.Sprintf(":%d", httpPort),
		Handler:           mux,
		ReadHeaderTimeout: 5 * time.Second,
		ReadTimeout:       10 * time.Second,
		WriteTimeout:      10 * time.Second,
		IdleTimeout:       60 * time.Second,
	}

	go func() {
		if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
			log.Printf("HTTP server error: %v", err)
		}
	}()

	return srv
}

// handleStatus writes the current SystemStatus as JSON.
func handleStatus(w http.ResponseWriter, r *http.Request) {
	// Copy the status under the lock and encode afterwards so a slow client
	// cannot hold the lock and stall the monitoring loop. The copy is safe
	// because updateSystemStatus replaces the Blockers slice wholesale rather
	// than mutating it in place.
	statusMutex.RLock()
	snapshot := currentStatus
	statusMutex.RUnlock()

	w.Header().Set("Content-Type", "application/json")
	if err := json.NewEncoder(w).Encode(snapshot); err != nil {
		log.Printf("Failed to write status response: %v", err)
	}
}

// updateSystemStatus publishes the latest sample together with the list of
// reasons that currently prevent suspend. Disk and network rates are derived
// from the last two samples in history.
func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
	blockers := []string{}
	inGrace := false
	graceLeft := ""

	if isInsideResumeGracePeriod() {
		left := timeLeftInGracePeriod().Round(time.Second)
		inGrace = true
		graceLeft = left.String()
		blockers = append(blockers, fmt.Sprintf("Resume grace period: %v remaining", left))
	}

	// isSystemIdle refuses to suspend until the window is filled; surface that
	// here so the status never claims "no blockers" prematurely.
	if span := historySpan(history); span < minIdleSpan {
		blockers = append(blockers,
			fmt.Sprintf("Monitoring window not yet filled: %v of %v observed",
				span.Round(time.Second), monitoringPeriod))
	}

	if n := len(history); n >= 2 {
		prev, last := history[n-2], history[n-1]
		seconds := last.Timestamp.Sub(prev.Timestamp).Seconds()
		diskIORate := ioRate(prev.DiskIO, last.DiskIO, seconds)
		netIORate := ioRate(prev.NetworkIO, last.NetworkIO, seconds)

		if current.CpuUsage >= cpuThreshold {
			blockers = append(blockers,
				fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.CpuUsage, cpuThreshold))
		}
		if nvmlAvailable && current.GpuUsage >= gpuThreshold {
			blockers = append(blockers,
				fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.GpuUsage, gpuThreshold))
		}
		if diskIORate >= float64(diskThreshold) {
			blockers = append(blockers,
				fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s",
					diskIORate/(1024*1024), float64(diskThreshold)/(1024*1024)))
		}
		if netIORate >= float64(networkThreshold) {
			blockers = append(blockers,
				fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s",
					netIORate/(1024*1024), float64(networkThreshold)/(1024*1024)))
		}
	}

	if current.SshConnections > 0 {
		blockers = append(blockers, fmt.Sprintf("Active SSH connections: %d", current.SshConnections))
	}
	if current.ActiveUsers > 0 {
		blockers = append(blockers, fmt.Sprintf("Active user sessions: %d", current.ActiveUsers))
	}
	if len(blockers) == 0 {
		blockers = append(blockers, "No blockers - system can sleep")
	}

	statusMutex.Lock()
	defer statusMutex.Unlock()
	currentStatus = SystemStatus{
		CurrentUsage:  current,
		Blockers:      blockers,
		InGracePeriod: inGrace,
		GraceTimeLeft: graceLeft,
	}
}

// getCurrentUsage takes one sample of every monitored metric. A metric that
// cannot be read is logged and left at its zero value.
func getCurrentUsage() ResourceUsage {
	usage := ResourceUsage{
		Timestamp:    time.Now(),
		GpuAvailable: nvmlAvailable,
	}

	// Average CPU utilisation across all cores.
	if perCore, err := cpu.Percent(0, true); err != nil {
		log.Printf("Warning: reading CPU usage: %v", err)
	} else if len(perCore) > 0 {
		var total float64
		for _, pct := range perCore {
			total += pct
		}
		usage.CpuUsage = total / float64(len(perCore))
	}

	// Average GPU utilisation across all GPUs that could be queried.
	if nvmlAvailable {
		if avg, ok := averageGPUUsage(); ok {
			usage.GpuUsage = avg
		}
	}

	// Cumulative bytes read and written, summed over all disks.
	if diskStats, err := disk.IOCounters(); err != nil {
		log.Printf("Warning: reading disk I/O counters: %v", err)
	} else {
		var total uint64
		for _, stat := range diskStats {
			total += stat.ReadBytes + stat.WriteBytes
		}
		usage.DiskIO = total
	}

	// Cumulative bytes sent and received; with pernic=false only the first
	// entry is read, as in the original code.
	if netStats, err := net.IOCounters(false); err != nil {
		log.Printf("Warning: reading network I/O counters: %v", err)
	} else if len(netStats) > 0 {
		usage.NetworkIO = netStats[0].BytesSent + netStats[0].BytesRecv
	}

	if sshCount, err := getSSHConnectionCount(); err != nil {
		log.Printf("Warning: counting SSH connections: %v", err)
	} else {
		usage.SshConnections = sshCount
	}

	if userCount, err := getActiveUserCount(); err != nil {
		log.Printf("Warning: counting active users: %v", err)
	} else {
		usage.ActiveUsers = userCount
	}

	return usage
}

// averageGPUUsage returns the mean GPU utilisation (percent) over all devices
// whose utilisation could be read. The boolean is false when no device
// produced a reading.
func averageGPUUsage() (float64, bool) {
	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS || count <= 0 {
		return 0, false
	}

	var total float64
	var sampled int
	for i := 0; i < count; i++ {
		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			continue
		}
		utilization, ret := device.GetUtilizationRates()
		if ret != nvml.SUCCESS {
			continue
		}
		total += float64(utilization.Gpu)
		sampled++
	}
	if sampled == 0 {
		return 0, false
	}
	return total / float64(sampled), true
}

// getSSHConnectionCount counts SSH server processes that are handling a
// connection, recognised by an "@" in their command line (the main listener
// process has none).
func getSSHConnectionCount() (int, error) {
	processes, err := process.Processes()
	if err != nil {
		return 0, fmt.Errorf("listing processes: %w", err)
	}

	count := 0
	for _, p := range processes {
		name, err := p.Name()
		if err != nil {
			// The process may have exited in the meantime; skip it.
			continue
		}
		// NOTE(review): OpenSSH 9.8+ reportedly runs per-connection processes
		// from a separate "sshd-session" binary — confirm on the target hosts.
		if name != "sshd" && name != "sshd-session" {
			continue
		}
		cmdline, err := p.Cmdline()
		if err != nil {
			continue
		}
		if strings.Contains(cmdline, "@") {
			count++
		}
	}
	return count, nil
}

// getActiveUserCount returns the number of non-empty lines printed by
// "who -s".
func getActiveUserCount() (int, error) {
	out, err := exec.Command("who", "-s").Output()
	if err != nil {
		return 0, fmt.Errorf("running who: %w", err)
	}

	count := 0
	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		if line != "" {
			count++
		}
	}
	return count, nil
}

// isSystemIdle reports whether the samples in history show the system as idle
// for the whole monitoring window: average CPU (and GPU, when available) below
// threshold, disk and network throughput below threshold, and no SSH
// connection or user session seen in any sample. It always returns false
// during the resume grace period and while history covers less than
// minIdleSpan.
func isSystemIdle(history []ResourceUsage) bool {
	if isInsideResumeGracePeriod() {
		return false
	}

	samples := len(history)
	if samples < 2 {
		return false
	}

	// Two samples a few seconds apart must not count as "idle for the
	// monitoring period"; wait until the window is (nearly) filled.
	first, last := history[0], history[samples-1]
	span := last.Timestamp.Sub(first.Timestamp)
	if span < minIdleSpan {
		return false
	}

	var cpuSum, gpuSum float64
	for i := range history {
		sample := &history[i]
		// Any SSH connection or login seen within the window blocks suspend.
		if sample.SshConnections > 0 || sample.ActiveUsers > 0 {
			return false
		}
		cpuSum += sample.CpuUsage
		gpuSum += sample.GpuUsage
	}

	if cpuSum/float64(samples) >= cpuThreshold {
		return false
	}
	if nvmlAvailable && gpuSum/float64(samples) >= gpuThreshold {
		return false
	}

	// Throughput over the whole window, from the first and last counters.
	seconds := span.Seconds()
	if ioRate(first.DiskIO, last.DiskIO, seconds) >= float64(diskThreshold) {
		return false
	}
	if ioRate(first.NetworkIO, last.NetworkIO, seconds) >= float64(networkThreshold) {
		return false
	}

	return true
}

// suspendSystem asks systemd to suspend the machine. On failure the returned
// error includes whatever systemctl printed.
func suspendSystem() error {
	out, err := exec.Command("systemctl", "suspend").CombinedOutput()
	if err == nil {
		return nil
	}
	if msg := strings.TrimSpace(string(out)); msg != "" {
		return fmt.Errorf("systemctl suspend: %w: %s", err, msg)
	}
	return fmt.Errorf("systemctl suspend: %w", err)
}