Simplify system

This commit is contained in:
Sebastiaan de Schaetzen 2025-03-30 12:12:32 +02:00
parent 5a27277e7c
commit 93fe9ebed3

239
main.go
View File

@ -3,6 +3,7 @@ package main
import ( import (
"context" "context"
"encoding/json" "encoding/json"
"errors"
"fmt" "fmt"
"log" "log"
"net/http" "net/http"
@ -37,27 +38,26 @@ type ResourceUsage struct {
CpuUsage float64 `json:"cpu_usage"` CpuUsage float64 `json:"cpu_usage"`
GpuUsage float64 `json:"gpu_usage"` GpuUsage float64 `json:"gpu_usage"`
GpuAvailable bool `json:"gpu_available"` GpuAvailable bool `json:"gpu_available"`
DiskIO uint64 `json:"disk_io"` DiskIO float64 `json:"disk_io"`
NetworkIO uint64 `json:"network_io"` NetworkIO float64 `json:"network_io"`
SshConnections int `json:"ssh_connections"` SshConnections int `json:"ssh_connections"`
ActiveUsers int `json:"active_users"` ActiveUsers int `json:"active_users"`
} }
type SystemStatus struct {
CurrentUsage ResourceUsage `json:"current_usage"`
Blockers []string `json:"sleep_blockers"`
InGracePeriod bool `json:"in_grace_period,omitempty"`
GraceTimeLeft string `json:"grace_time_left,omitempty"`
}
var ( var (
currentStatus SystemStatus
statusMutex sync.RWMutex statusMutex sync.RWMutex
blockers []string
currentStatus ResourceUsage
nvmlAvailable bool nvmlAvailable bool
lastResumeTime time.Time // Track when the system last resumed from sleep lastBlockedTime time.Time
lastTickTime time.Time // Track when we last processed a tick
) )
func Must(err error) {
if err != nil {
log.Fatalf("Error: %v", err)
}
}
func main() { func main() {
// Check if running as root // Check if running as root
if os.Geteuid() != 0 { if os.Geteuid() != 0 {
@ -66,24 +66,20 @@ func main() {
// Initialize NVML for GPU monitoring // Initialize NVML for GPU monitoring
ret := nvml.Init() ret := nvml.Init()
if ret != nvml.SUCCESS { if !errors.Is(ret, nvml.SUCCESS) {
log.Printf("Warning: Could not initialize NVML: %v", ret) log.Printf("Warning: Could not initialize NVML: %v", ret)
nvmlAvailable = false nvmlAvailable = false
} else { } else {
nvmlAvailable = true nvmlAvailable = true
defer nvml.Shutdown() defer Must(nvml.Shutdown())
} }
// Create a context that we'll use to shut down the application
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// Set up signal handling // Set up signal handling
sigChan := make(chan os.Signal, 1) sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
// Start HTTP server with context // Start HTTP server with context
srv := startHTTPServer(ctx) srv := startHTTPServer()
usageHistory := make([]ResourceUsage, 0) usageHistory := make([]ResourceUsage, 0)
ticker := time.NewTicker(checkInterval) ticker := time.NewTicker(checkInterval)
@ -108,42 +104,14 @@ func main() {
mainLoop: mainLoop:
for { for {
select { select {
case <-ctx.Done():
break mainLoop
case sig := <-sigChan: case sig := <-sigChan:
log.Printf("Received signal %v, shutting down...", sig) log.Printf("Received signal %v, shutting down...", sig)
cancel()
break mainLoop break mainLoop
case <-ticker.C: case <-ticker.C:
now := time.Now() updateCurrentUsage()
updateSystemStatus()
// Check if we just resumed from sleep if time.Now().Sub(lastBlockedTime) >= monitoringPeriod {
if !lastTickTime.IsZero() {
gap := now.Sub(lastTickTime)
// If there was a significant gap, probably resumed from sleep
if gap > (checkInterval*3) && gap < time.Hour {
log.Printf("Detected system resume after gap of %v", gap)
lastResumeTime = now
}
}
lastTickTime = now
usage := getCurrentUsage()
usageHistory = append(usageHistory, usage)
// Update current status
updateSystemStatus(usage, usageHistory)
// Remove entries older than monitoring period
cutoff := time.Now().Add(-monitoringPeriod)
for i, u := range usageHistory {
if u.Timestamp.After(cutoff) {
usageHistory = usageHistory[i:]
break
}
}
if len(usageHistory) > 0 && isSystemIdle(usageHistory) {
log.Printf("System status before suspend:\n") log.Printf("System status before suspend:\n")
log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].CpuUsage) log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].CpuUsage)
if nvmlAvailable { if nvmlAvailable {
@ -155,6 +123,8 @@ mainLoop:
if err := suspendSystem(); err != nil { if err := suspendSystem(); err != nil {
log.Printf("Failed to suspend system: %v", err) log.Printf("Failed to suspend system: %v", err)
} }
lastBlockedTime = time.Now()
log.Printf("Resumed")
} }
} }
} }
@ -170,20 +140,7 @@ mainLoop:
log.Println("Goodbye!") log.Println("Goodbye!")
} }
// Function to check if we're within the resume grace period func startHTTPServer() *http.Server {
func isInsideResumeGracePeriod() bool {
return !lastResumeTime.IsZero() && time.Since(lastResumeTime) < resumeGracePeriod
}
// Function to calculate time left in grace period
func timeLeftInGracePeriod() time.Duration {
if !isInsideResumeGracePeriod() {
return 0
}
return resumeGracePeriod - time.Since(lastResumeTime)
}
func startHTTPServer(ctx context.Context) *http.Server {
srv := &http.Server{ srv := &http.Server{
Addr: fmt.Sprintf(":%d", httpPort), Addr: fmt.Sprintf(":%d", httpPort),
} }
@ -191,7 +148,7 @@ func startHTTPServer(ctx context.Context) *http.Server {
http.HandleFunc("/status", handleStatus) http.HandleFunc("/status", handleStatus)
go func() { go func() {
if err := srv.ListenAndServe(); err != http.ErrServerClosed { if err := srv.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
log.Printf("HTTP server error: %v", err) log.Printf("HTTP server error: %v", err)
} }
}() }()
@ -199,74 +156,62 @@ func startHTTPServer(ctx context.Context) *http.Server {
return srv return srv
} }
func handleStatus(w http.ResponseWriter, r *http.Request) { func handleStatus(w http.ResponseWriter, _ *http.Request) {
statusMutex.RLock() statusMutex.RLock()
defer statusMutex.RUnlock() defer statusMutex.RUnlock()
w.Header().Set("Content-Type", "application/json") w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(currentStatus) err := json.NewEncoder(w).Encode(currentStatus)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
}
} }
func updateSystemStatus(current ResourceUsage, history []ResourceUsage) { func updateSystemStatus() {
statusMutex.Lock() statusMutex.Lock()
defer statusMutex.Unlock() defer statusMutex.Unlock()
currentStatus.CurrentUsage = current blockers = []string{}
currentStatus.Blockers = []string{}
// Add grace period info to status if currentStatus.CpuUsage >= cpuThreshold {
if isInsideResumeGracePeriod() { blockers = append(blockers,
timeLeft := timeLeftInGracePeriod() fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", currentStatus.CpuUsage, cpuThreshold))
currentStatus.InGracePeriod = true
currentStatus.GraceTimeLeft = timeLeft.Round(time.Second).String()
currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("Resume grace period: %v remaining", timeLeft.Round(time.Second)))
} else {
currentStatus.InGracePeriod = false
currentStatus.GraceTimeLeft = ""
} }
if nvmlAvailable && currentStatus.GpuUsage >= gpuThreshold {
if len(history) >= 2 { blockers = append(blockers,
// Calculate rates using last two samples fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", currentStatus.GpuUsage, gpuThreshold))
duration := history[len(history)-1].Timestamp.Sub(history[len(history)-2].Timestamp).Seconds()
diskIORate := float64(history[len(history)-1].DiskIO-history[len(history)-2].DiskIO) / duration
netIORate := float64(history[len(history)-1].NetworkIO-history[len(history)-2].NetworkIO) / duration
if current.CpuUsage >= cpuThreshold {
currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.CpuUsage, cpuThreshold))
} }
if nvmlAvailable && current.GpuUsage >= gpuThreshold { if currentStatus.DiskIO >= float64(diskThreshold) {
currentStatus.Blockers = append(currentStatus.Blockers, blockers = append(blockers,
fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.GpuUsage, gpuThreshold))
}
if diskIORate >= float64(diskThreshold) {
currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s", fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s",
diskIORate/(1024*1024), float64(diskThreshold)/(1024*1024))) currentStatus.DiskIO/(1024*1024), float64(diskThreshold)/(1024*1024)))
} }
if netIORate >= float64(networkThreshold) { if currentStatus.NetworkIO >= float64(networkThreshold) {
currentStatus.Blockers = append(currentStatus.Blockers, blockers = append(blockers,
fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s", fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s",
netIORate/(1024*1024), float64(networkThreshold)/(1024*1024))) currentStatus.NetworkIO/(1024*1024), float64(networkThreshold)/(1024*1024)))
}
if currentStatus.SshConnections > 0 {
blockers = append(blockers,
fmt.Sprintf("Active SSH connections: %d", currentStatus.SshConnections))
}
if currentStatus.ActiveUsers > 0 {
blockers = append(blockers,
fmt.Sprintf("Active user sessions: %d", currentStatus.ActiveUsers))
}
if len(blockers) == 0 {
blockers = append(blockers, "No blockers - system can sleep")
} else {
lastBlockedTime = time.Now()
} }
} }
if current.SshConnections > 0 { func updateCurrentUsage() {
currentStatus.Blockers = append(currentStatus.Blockers, statusMutex.Lock()
fmt.Sprintf("Active SSH connections: %d", current.SshConnections)) defer statusMutex.Unlock()
}
if current.ActiveUsers > 0 {
currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("Active user sessions: %d", current.ActiveUsers))
}
if len(currentStatus.Blockers) == 0 {
currentStatus.Blockers = append(currentStatus.Blockers, "No blockers - system can sleep")
}
}
func getCurrentUsage() ResourceUsage {
usage := ResourceUsage{ usage := ResourceUsage{
Timestamp: time.Now(), Timestamp: time.Now(),
GpuAvailable: nvmlAvailable, GpuAvailable: nvmlAvailable,
@ -285,14 +230,14 @@ func getCurrentUsage() ResourceUsage {
// Get GPU usage across all GPUs if available // Get GPU usage across all GPUs if available
if nvmlAvailable { if nvmlAvailable {
count, ret := nvml.DeviceGetCount() count, ret := nvml.DeviceGetCount()
if ret == nvml.SUCCESS && count > 0 { if errors.Is(ret, nvml.SUCCESS) && count > 0 {
var totalGPU float64 var totalGPU float64
var activeGPUs int var activeGPUs int
for i := 0; i < count; i++ { for i := 0; i < count; i++ {
device, ret := nvml.DeviceGetHandleByIndex(i) device, ret := nvml.DeviceGetHandleByIndex(i)
if ret == nvml.SUCCESS { if errors.Is(ret, nvml.SUCCESS) {
utilization, ret := device.GetUtilizationRates() utilization, ret := device.GetUtilizationRates()
if ret == nvml.SUCCESS { if errors.Is(ret, nvml.SUCCESS) {
totalGPU += float64(utilization.Gpu) totalGPU += float64(utilization.Gpu)
activeGPUs++ activeGPUs++
} }
@ -310,12 +255,12 @@ func getCurrentUsage() ResourceUsage {
for _, stat := range diskStats { for _, stat := range diskStats {
totalIO += stat.ReadBytes + stat.WriteBytes totalIO += stat.ReadBytes + stat.WriteBytes
} }
usage.DiskIO = totalIO usage.DiskIO = float64(totalIO)
} }
// Get network I/O // Get network I/O
if netStats, err := net.IOCounters(false); err == nil && len(netStats) > 0 { if netStats, err := net.IOCounters(false); err == nil && len(netStats) > 0 {
usage.NetworkIO = netStats[0].BytesSent + netStats[0].BytesRecv usage.NetworkIO = float64(netStats[0].BytesSent + netStats[0].BytesRecv)
} }
// Count SSH connections // Count SSH connections
@ -328,7 +273,7 @@ func getCurrentUsage() ResourceUsage {
usage.ActiveUsers = userCount usage.ActiveUsers = userCount
} }
return usage currentStatus = usage
} }
func getSSHConnectionCount() (int, error) { func getSSHConnectionCount() (int, error) {
@ -375,60 +320,6 @@ func getActiveUserCount() (int, error) {
return count, nil return count, nil
} }
func isSystemIdle(history []ResourceUsage) bool {
// Don't allow sleep during grace period after resume
if isInsideResumeGracePeriod() {
return false
}
if len(history) < 2 {
return false
}
var avgCPU, avgGPU float64
var maxSSHConnections, maxActiveUsers int
samples := len(history)
for _, usage := range history {
avgCPU += usage.CpuUsage
if nvmlAvailable {
avgGPU += usage.GpuUsage
}
if usage.SshConnections > maxSSHConnections {
maxSSHConnections = usage.SshConnections
}
if usage.ActiveUsers > maxActiveUsers {
maxActiveUsers = usage.ActiveUsers
}
}
// Calculate I/O rates using first and last samples
duration := history[samples-1].Timestamp.Sub(history[0].Timestamp).Seconds()
diskIORate := float64(history[samples-1].DiskIO-history[0].DiskIO) / duration
netIORate := float64(history[samples-1].NetworkIO-history[0].NetworkIO) / duration
avgCPU /= float64(samples)
if nvmlAvailable {
avgGPU /= float64(samples)
}
// Basic checks that always apply
if avgCPU >= cpuThreshold ||
diskIORate >= float64(diskThreshold) ||
netIORate >= float64(networkThreshold) ||
maxSSHConnections > 0 ||
maxActiveUsers > 0 {
return false
}
// GPU check only if NVML is available
if nvmlAvailable && avgGPU >= gpuThreshold {
return false
}
return true
}
func suspendSystem() error { func suspendSystem() error {
cmd := exec.Command("systemctl", "suspend") cmd := exec.Command("systemctl", "suspend")
return cmd.Run() return cmd.Run()