Add resume grace period functionality and update system status

This commit is contained in:
Sebastiaan de Schaetzen 2025-03-26 17:07:26 +01:00
parent d0816a0e3c
commit 5a27277e7c

99
main.go
View File

@ -23,16 +23,17 @@ import (
const ( const (
checkInterval = 10 * time.Second checkInterval = 10 * time.Second
monitoringPeriod = 5 * time.Minute monitoringPeriod = 5 * time.Minute
cpuThreshold = 20.0 // percentage resumeGracePeriod = 5 * time.Minute // Time to wait after resume before allowing sleep again
gpuThreshold = 20.0 // percentage cpuThreshold = 20.0 // percentage
diskThreshold = 5 * 1024 * 1024 // 5 MB/s gpuThreshold = 20.0 // percentage
networkThreshold = 1 * 1024 * 1024 // 1 MB/s diskThreshold = 5 * 1024 * 1024 // 5 MB/s
httpPort = 8081 networkThreshold = 1 * 1024 * 1024 // 1 MB/s
httpPort = 8081
) )
type ResourceUsage struct { type ResourceUsage struct {
Timestamp time.Time `json:"timestamp"` Timestamp time.Time `json:"timestamp"`
CpuUsage float64 `json:"cpu_usage"` CpuUsage float64 `json:"cpu_usage"`
GpuUsage float64 `json:"gpu_usage"` GpuUsage float64 `json:"gpu_usage"`
GpuAvailable bool `json:"gpu_available"` GpuAvailable bool `json:"gpu_available"`
@ -43,14 +44,18 @@ type ResourceUsage struct {
} }
type SystemStatus struct { type SystemStatus struct {
CurrentUsage ResourceUsage `json:"current_usage"` CurrentUsage ResourceUsage `json:"current_usage"`
Blockers []string `json:"sleep_blockers"` Blockers []string `json:"sleep_blockers"`
InGracePeriod bool `json:"in_grace_period,omitempty"`
GraceTimeLeft string `json:"grace_time_left,omitempty"`
} }
var ( var (
currentStatus SystemStatus currentStatus SystemStatus
statusMutex sync.RWMutex statusMutex sync.RWMutex
nvmlAvailable bool nvmlAvailable bool
lastResumeTime time.Time // Track when the system last resumed from sleep
lastTickTime time.Time // Track when we last processed a tick
) )
func main() { func main() {
@ -95,7 +100,8 @@ func main() {
log.Printf("- Network I/O < %.1f MB/s\n", float64(networkThreshold)/(1024*1024)) log.Printf("- Network I/O < %.1f MB/s\n", float64(networkThreshold)/(1024*1024))
log.Printf("- No active SSH connections\n") log.Printf("- No active SSH connections\n")
log.Printf("- No active user sessions\n") log.Printf("- No active user sessions\n")
log.Printf("Over the last %v\n", monitoringPeriod) log.Printf("- Over the last %v\n", monitoringPeriod)
log.Printf("- System will not suspend for %v after resuming from sleep\n", resumeGracePeriod)
log.Printf("HTTP status endpoint available at http://localhost:%d/status\n", httpPort) log.Printf("HTTP status endpoint available at http://localhost:%d/status\n", httpPort)
log.Printf("Press Ctrl+C to exit\n") log.Printf("Press Ctrl+C to exit\n")
@ -109,6 +115,19 @@ mainLoop:
cancel() cancel()
break mainLoop break mainLoop
case <-ticker.C: case <-ticker.C:
now := time.Now()
// Compare wall-clock readings when looking for a resume. time.Now()
// carries a monotonic reading and Sub prefers it when both operands
// have one, but the monotonic clock may stop while the machine is
// suspended, which would hide the very gap we are looking for.
// Round(0) strips the monotonic reading.
wallNow := now.Round(0)
// Check if we just resumed from sleep
if !lastTickTime.IsZero() {
	gap := wallNow.Sub(lastTickTime)
	// A gap well beyond the tick interval means ticks were missed, most
	// likely because the system was suspended. There is deliberately no
	// upper bound: a long sleep (overnight, for example) must start the
	// grace period just like a short one.
	if gap > checkInterval*3 {
		log.Printf("Detected system resume after gap of %v", gap)
		lastResumeTime = now
	}
}
lastTickTime = wallNow
usage := getCurrentUsage() usage := getCurrentUsage()
usageHistory = append(usageHistory, usage) usageHistory = append(usageHistory, usage)
@ -143,7 +162,7 @@ mainLoop:
// Graceful shutdown of HTTP server // Graceful shutdown of HTTP server
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second) shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
defer shutdownCancel() defer shutdownCancel()
if err := srv.Shutdown(shutdownCtx); err != nil { if err := srv.Shutdown(shutdownCtx); err != nil {
log.Printf("HTTP server shutdown error: %v", err) log.Printf("HTTP server shutdown error: %v", err)
} }
@ -151,6 +170,19 @@ mainLoop:
log.Println("Goodbye!") log.Println("Goodbye!")
} }
// isInsideResumeGracePeriod reports whether a resume has been recorded in
// lastResumeTime and less than resumeGracePeriod has passed since then.
func isInsideResumeGracePeriod() bool {
	// A zero lastResumeTime means no resume has been observed yet.
	if lastResumeTime.IsZero() {
		return false
	}
	elapsed := time.Since(lastResumeTime)
	return elapsed < resumeGracePeriod
}
// Function to calculate time left in grace period
func timeLeftInGracePeriod() time.Duration {
if !isInsideResumeGracePeriod() {
return 0
}
return resumeGracePeriod - time.Since(lastResumeTime)
}
func startHTTPServer(ctx context.Context) *http.Server { func startHTTPServer(ctx context.Context) *http.Server {
srv := &http.Server{ srv := &http.Server{
Addr: fmt.Sprintf(":%d", httpPort), Addr: fmt.Sprintf(":%d", httpPort),
@ -182,6 +214,18 @@ func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
currentStatus.CurrentUsage = current currentStatus.CurrentUsage = current
currentStatus.Blockers = []string{} currentStatus.Blockers = []string{}
// Add grace period info to status
if isInsideResumeGracePeriod() {
timeLeft := timeLeftInGracePeriod()
currentStatus.InGracePeriod = true
currentStatus.GraceTimeLeft = timeLeft.Round(time.Second).String()
currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("Resume grace period: %v remaining", timeLeft.Round(time.Second)))
} else {
currentStatus.InGracePeriod = false
currentStatus.GraceTimeLeft = ""
}
if len(history) >= 2 { if len(history) >= 2 {
// Calculate rates using last two samples // Calculate rates using last two samples
duration := history[len(history)-1].Timestamp.Sub(history[len(history)-2].Timestamp).Seconds() duration := history[len(history)-1].Timestamp.Sub(history[len(history)-2].Timestamp).Seconds()
@ -189,31 +233,31 @@ func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
netIORate := float64(history[len(history)-1].NetworkIO-history[len(history)-2].NetworkIO) / duration netIORate := float64(history[len(history)-1].NetworkIO-history[len(history)-2].NetworkIO) / duration
if current.CpuUsage >= cpuThreshold { if current.CpuUsage >= cpuThreshold {
currentStatus.Blockers = append(currentStatus.Blockers, currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.CpuUsage, cpuThreshold)) fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.CpuUsage, cpuThreshold))
} }
if nvmlAvailable && current.GpuUsage >= gpuThreshold { if nvmlAvailable && current.GpuUsage >= gpuThreshold {
currentStatus.Blockers = append(currentStatus.Blockers, currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.GpuUsage, gpuThreshold)) fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.GpuUsage, gpuThreshold))
} }
if diskIORate >= float64(diskThreshold) { if diskIORate >= float64(diskThreshold) {
currentStatus.Blockers = append(currentStatus.Blockers, currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s", fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s",
diskIORate/(1024*1024), float64(diskThreshold)/(1024*1024))) diskIORate/(1024*1024), float64(diskThreshold)/(1024*1024)))
} }
if netIORate >= float64(networkThreshold) { if netIORate >= float64(networkThreshold) {
currentStatus.Blockers = append(currentStatus.Blockers, currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s", fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s",
netIORate/(1024*1024), float64(networkThreshold)/(1024*1024))) netIORate/(1024*1024), float64(networkThreshold)/(1024*1024)))
} }
} }
if current.SshConnections > 0 { if current.SshConnections > 0 {
currentStatus.Blockers = append(currentStatus.Blockers, currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("Active SSH connections: %d", current.SshConnections)) fmt.Sprintf("Active SSH connections: %d", current.SshConnections))
} }
if current.ActiveUsers > 0 { if current.ActiveUsers > 0 {
currentStatus.Blockers = append(currentStatus.Blockers, currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("Active user sessions: %d", current.ActiveUsers)) fmt.Sprintf("Active user sessions: %d", current.ActiveUsers))
} }
@ -224,7 +268,7 @@ func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
func getCurrentUsage() ResourceUsage { func getCurrentUsage() ResourceUsage {
usage := ResourceUsage{ usage := ResourceUsage{
Timestamp: time.Now(), Timestamp: time.Now(),
GpuAvailable: nvmlAvailable, GpuAvailable: nvmlAvailable,
} }
@ -319,7 +363,7 @@ func getActiveUserCount() (int, error) {
if err != nil { if err != nil {
return 0, err return 0, err
} }
// Count non-empty lines // Count non-empty lines
lines := strings.Split(strings.TrimSpace(string(out)), "\n") lines := strings.Split(strings.TrimSpace(string(out)), "\n")
count := 0 count := 0
@ -332,6 +376,11 @@ func getActiveUserCount() (int, error) {
} }
func isSystemIdle(history []ResourceUsage) bool { func isSystemIdle(history []ResourceUsage) bool {
// Don't allow sleep during grace period after resume
if isInsideResumeGracePeriod() {
return false
}
if len(history) < 2 { if len(history) < 2 {
return false return false
} }
@ -383,4 +432,4 @@ func isSystemIdle(history []ResourceUsage) bool {
func suspendSystem() error { func suspendSystem() error {
cmd := exec.Command("systemctl", "suspend") cmd := exec.Command("systemctl", "suspend")
return cmd.Run() return cmd.Run()
} }