package main import ( "context" "encoding/json" "errors" "fmt" "log" "net/http" "os" "os/exec" "os/signal" "strings" "sync" "syscall" "time" "github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/shirou/gopsutil/v3/cpu" "github.com/shirou/gopsutil/v3/disk" "github.com/shirou/gopsutil/v3/net" "github.com/shirou/gopsutil/v3/process" ) const ( checkInterval = 10 * time.Second monitoringPeriod = 3 * time.Minute cpuThreshold = 20.0 // percentage gpuThreshold = 20.0 // percentage diskThreshold = 5 * 1024 * 1024 // 5 MB/s networkThreshold = 1 * 1024 * 1024 // 1 MB/s httpPort = 8081 ) type ResourceUsage struct { Timestamp time.Time `json:"timestamp"` CpuUsage float64 `json:"cpu_usage"` GpuUsage float64 `json:"gpu_usage"` GpuAvailable bool `json:"gpu_available"` DiskIO float64 `json:"disk_io"` NetworkIO float64 `json:"network_io"` SshConnections int `json:"ssh_connections"` ActiveUsers int `json:"active_users"` HttpRequests int `json:"http_requests"` } type StatusResponse struct { Usage ResourceUsage `json:"usage"` Blockers []string `json:"blockers"` Timestamp time.Time `json:"timestamp"` SleepAt time.Time `json:"sleep_at"` } var ( statusMutex sync.RWMutex currentStatus ResourceUsage nvmlAvailable bool networkBytesIO uint64 diskBytesIO uint64 nextSleepTime time.Time ) func main() { // Check if running as root if os.Geteuid() != 0 { log.Fatal("This program must be run as root") } // Initialize NVML for GPU monitoring ret := nvml.Init() if !errors.Is(ret, nvml.SUCCESS) { log.Printf("Warning: Could not initialize NVML: %v", ret) nvmlAvailable = false } else { nvmlAvailable = true defer nvml.Shutdown() } // Set up signal handling sigChan := make(chan os.Signal, 1) signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) // Start HTTP server with context srv := startHTTPServer() ticker := time.NewTicker(checkInterval) defer ticker.Stop() log.Printf("Starting idle monitoring. System will suspend when:\n") log.Printf("- Average CPU usage across all cores < %.1f%%\n", cpuThreshold) if nvmlAvailable { log.Printf("- Average GPU usage across all GPUs < %.1f%%\n", gpuThreshold) } else { log.Printf("- GPU monitoring disabled (NVML initialization failed)") } log.Printf("- Disk I/O < %.1f MB/s\n", float64(diskThreshold)/(1024*1024)) log.Printf("- Network I/O < %.1f MB/s\n", float64(networkThreshold)/(1024*1024)) log.Printf("- No active SSH connections\n") log.Printf("- No active user sessions\n") log.Printf("- Over the last %v\n", monitoringPeriod) log.Printf("HTTP status endpoint available at http://localhost:%d/status\n", httpPort) log.Printf("Press Ctrl+C to exit\n") mainLoop: for { select { case sig := <-sigChan: log.Printf("Received signal %v, shutting down...", sig) break mainLoop case <-ticker.C: updateCurrentUsage() blocked, _ := getBlockers() if blocked { delaySleep() } if time.Now().After(nextSleepTime) { log.Printf("System status before suspend:\n") log.Printf("- CPU: %.1f%%\n", currentStatus.CpuUsage) if nvmlAvailable { log.Printf("- GPU: %.1f%%\n", currentStatus.GpuUsage) } log.Printf("- SSH connections: %d\n", currentStatus.SshConnections) log.Printf("- Active users: %d\n", currentStatus.ActiveUsers) log.Printf("- Disk I/O: %.1f MB/s\n", currentStatus.DiskIO/(1024*1024)) log.Printf("- Network I/O: %.1f MB/s\n", currentStatus.NetworkIO/(1024*1024)) log.Println("System has been idle for the monitoring period. Suspending...") if err := suspendSystem(); err != nil { log.Printf("Failed to suspend system: %v", err) } nextSleepTime = time.Now().Add(monitoringPeriod) log.Printf("Resumed") } } } // Graceful shutdown of HTTP server shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second) defer shutdownCancel() if err := srv.Shutdown(shutdownCtx); err != nil { log.Printf("HTTP server shutdown error: %v", err) } log.Println("Goodbye!") } func startHTTPServer() *http.Server { srv := &http.Server{ Addr: fmt.Sprintf(":%d", httpPort), } http.HandleFunc("/status", handleStatus) http.HandleFunc("/reset", handleReset) go func() { if err := srv.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) { log.Printf("HTTP server error: %v", err) } }() return srv } func handleStatus(w http.ResponseWriter, _ *http.Request) { statusMutex.RLock() defer statusMutex.RUnlock() _, blockers := getBlockersNoLock() response := StatusResponse{ Usage: currentStatus, Blockers: blockers, Timestamp: time.Now(), SleepAt: nextSleepTime, } w.Header().Set("Content-Type", "application/json") err := json.NewEncoder(w).Encode(response) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) } } func handleReset(w http.ResponseWriter, _ *http.Request) { statusMutex.Lock() defer statusMutex.Unlock() delaySleep() currentStatus.HttpRequests++ nextSleepTime = time.Now().Add(monitoringPeriod) w.WriteHeader(http.StatusOK) } func getBlockers() (bool, []string) { statusMutex.Lock() defer statusMutex.Unlock() return getBlockersNoLock() } func getBlockersNoLock() (bool, []string) { blocked := true blockers := []string{} if currentStatus.CpuUsage >= cpuThreshold { blockers = append(blockers, fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", currentStatus.CpuUsage, cpuThreshold)) } if nvmlAvailable && currentStatus.GpuUsage >= gpuThreshold { blockers = append(blockers, fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", currentStatus.GpuUsage, gpuThreshold)) } if currentStatus.DiskIO >= float64(diskThreshold) { blockers = append(blockers, fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s", currentStatus.DiskIO/(1024*1024), float64(diskThreshold)/(1024*1024))) } if currentStatus.NetworkIO >= float64(networkThreshold) { blockers = append(blockers, fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s", currentStatus.NetworkIO/(1024*1024), float64(networkThreshold)/(1024*1024))) } if currentStatus.SshConnections > 0 { blockers = append(blockers, fmt.Sprintf("Active SSH connections: %d", currentStatus.SshConnections)) } if currentStatus.ActiveUsers > 0 { blockers = append(blockers, fmt.Sprintf("Active user sessions: %d", currentStatus.ActiveUsers)) } if currentStatus.HttpRequests > 0 { blockers = append(blockers, fmt.Sprintf("HTTP requests received: %d", currentStatus.HttpRequests)) } if len(blockers) == 0 { blockers = append(blockers, "No blockers - system can sleep") blocked = false } return blocked, blockers } func updateCurrentUsage() { statusMutex.Lock() defer statusMutex.Unlock() usage := ResourceUsage{ Timestamp: time.Now(), GpuAvailable: nvmlAvailable, } // Get CPU usage across all cores if cpuPercent, err := cpu.Percent(0, true); err == nil && len(cpuPercent) > 0 { // Calculate average CPU usage across all cores var totalCPU float64 for _, percent := range cpuPercent { totalCPU += percent } usage.CpuUsage = totalCPU / float64(len(cpuPercent)) } // Get GPU usage across all GPUs if available if nvmlAvailable { count, ret := nvml.DeviceGetCount() if errors.Is(ret, nvml.SUCCESS) && count > 0 { var totalGPU float64 var activeGPUs int for i := 0; i < count; i++ { device, ret := nvml.DeviceGetHandleByIndex(i) if errors.Is(ret, nvml.SUCCESS) { utilization, ret := device.GetUtilizationRates() if errors.Is(ret, nvml.SUCCESS) { totalGPU += float64(utilization.Gpu) activeGPUs++ } } } if activeGPUs > 0 { usage.GpuUsage = totalGPU / float64(activeGPUs) } } } // Get disk I/O if diskStats, err := disk.IOCounters(); err == nil { var totalIO uint64 for _, stat := range diskStats { totalIO += stat.ReadBytes + stat.WriteBytes } previousDiskBytesIO := diskBytesIO diskBytesIO = totalIO diff := diskBytesIO - previousDiskBytesIO usage.DiskIO = float64(diff) / checkInterval.Seconds() } // Get network I/O if netStats, err := net.IOCounters(false); err == nil && len(netStats) > 0 { previousNetworkBytesIO := networkBytesIO networkBytesIO = netStats[0].BytesSent + netStats[0].BytesRecv diff := networkBytesIO - previousNetworkBytesIO usage.NetworkIO = float64(diff) / checkInterval.Seconds() } // Count SSH connections if sshCount, err := getSSHConnectionCount(); err == nil { usage.SshConnections = sshCount } // Count active user sessions if userCount, err := getActiveUserCount(); err == nil { usage.ActiveUsers = userCount } currentStatus = usage } func getSSHConnectionCount() (int, error) { processes, err := process.Processes() if err != nil { return 0, err } count := 0 for _, p := range processes { name, err := p.Name() if err != nil { continue } if name == "sshd" { cmdline, err := p.Cmdline() if err != nil { continue } // Only count sshd processes that are handling connections // The main sshd process doesn't have "@" in its cmdline if strings.Contains(cmdline, "@") { count++ } } } return count, nil } func getActiveUserCount() (int, error) { out, err := exec.Command("who", "-s").Output() if err != nil { return 0, err } // Count non-empty lines lines := strings.Split(strings.TrimSpace(string(out)), "\n") count := 0 for _, line := range lines { if line != "" { count++ } } return count, nil } func suspendSystem() error { cmd := exec.Command("systemctl", "suspend") return cmd.Run() } func delaySleep() { nextSleepTime = time.Now().Add(monitoringPeriod) }