Simplify system

2025-03-30 12:12:32 +02:00
parent 5a27277e7c
commit 93fe9ebed3
1 changed file with 72 additions and 181 deletions
--- a/main.go
+++ b/main.go
@@ -3,6 +3,7 @@ package main
 import (
 	"context"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"log"
 	"net/http"
@@ -37,27 +38,26 @@ type ResourceUsage struct {
 	CpuUsage       float64   `json:"cpu_usage"`
 	GpuUsage       float64   `json:"gpu_usage"`
 	GpuAvailable   bool      `json:"gpu_available"`
-	DiskIO         uint64    `json:"disk_io"`
+	DiskIO         float64   `json:"disk_io"`
-	NetworkIO      uint64    `json:"network_io"`
+	NetworkIO      float64   `json:"network_io"`
 	SshConnections int       `json:"ssh_connections"`
 	ActiveUsers    int       `json:"active_users"`
 }
 // SystemStatus bundles a resource usage sample with the list of reasons that
 // keep the system from sleeping. Every field carries a JSON tag so the value
 // can be serialized directly.
 type SystemStatus struct {
 	// CurrentUsage is the usage sample, emitted under "current_usage".
 	CurrentUsage  ResourceUsage `json:"current_usage"`
 	// Blockers holds human-readable sleep-blocker messages, emitted under
 	// "sleep_blockers".
 	Blockers      []string      `json:"sleep_blockers"`
 	// InGracePeriod is left out of the JSON output when false (omitempty).
 	InGracePeriod bool          `json:"in_grace_period,omitempty"`
 	// GraceTimeLeft is a textual duration; left out of the JSON output when
 	// empty (omitempty).
 	GraceTimeLeft string        `json:"grace_time_left,omitempty"`
 }
 var (
 	currentStatus  SystemStatus
 	statusMutex     sync.RWMutex
 	blockers        []string
 	currentStatus   ResourceUsage
 	nvmlAvailable   bool
-	lastResumeTime time.Time // Track when the system last resumed from sleep
+	lastBlockedTime time.Time
 	lastTickTime   time.Time // Track when we last processed a tick
 )
 // Must is a no-op when err is nil; otherwise it logs the error through
 // log.Fatalf, which terminates the program.
 func Must(err error) {
 	if err == nil {
 		return
 	}
 	log.Fatalf("Error: %v", err)
 }
 func main() {
 	// Check if running as root
 	if os.Geteuid() != 0 {
@@ -66,24 +66,20 @@ func main() {
 	// Initialize NVML for GPU monitoring
 	ret := nvml.Init()
-	if ret != nvml.SUCCESS {
+	if !errors.Is(ret, nvml.SUCCESS) {
 		log.Printf("Warning: Could not initialize NVML: %v", ret)
 		nvmlAvailable = false
 	} else {
 		nvmlAvailable = true
-		defer nvml.Shutdown()
+		defer Must(nvml.Shutdown())
 	}
 	// Create a context that we'll use to shut down the application
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
 	// Set up signal handling
 	sigChan := make(chan os.Signal, 1)
 	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
 	// Start HTTP server with context
-	srv := startHTTPServer(ctx)
+	srv := startHTTPServer()
 	usageHistory := make([]ResourceUsage, 0)
 	ticker := time.NewTicker(checkInterval)
@@ -108,42 +104,14 @@ func main() {
 mainLoop:
 	for {
 		select {
 		case <-ctx.Done():
 			break mainLoop
 		case sig := <-sigChan:
 			log.Printf("Received signal %v, shutting down...", sig)
 			cancel()
 			break mainLoop
 		case <-ticker.C:
-			now := time.Now()
+			updateCurrentUsage()
 			updateSystemStatus()
-			// Check if we just resumed from sleep
+			if time.Now().Sub(lastBlockedTime) >= monitoringPeriod {
 			if !lastTickTime.IsZero() {
 				gap := now.Sub(lastTickTime)
 				// If there was a significant gap, probably resumed from sleep
 				if gap > (checkInterval*3) && gap < time.Hour {
 					log.Printf("Detected system resume after gap of %v", gap)
 					lastResumeTime = now
 				}
 			}
 			lastTickTime = now
 			usage := getCurrentUsage()
 			usageHistory = append(usageHistory, usage)
 			// Update current status
 			updateSystemStatus(usage, usageHistory)
 			// Remove entries older than monitoring period
 			cutoff := time.Now().Add(-monitoringPeriod)
 			for i, u := range usageHistory {
 				if u.Timestamp.After(cutoff) {
 					usageHistory = usageHistory[i:]
 					break
 				}
 			}
 			if len(usageHistory) > 0 && isSystemIdle(usageHistory) {
 				log.Printf("System status before suspend:\n")
 				log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].CpuUsage)
 				if nvmlAvailable {
@@ -155,6 +123,8 @@ mainLoop:
 				if err := suspendSystem(); err != nil {
 					log.Printf("Failed to suspend system: %v", err)
 				}
 				lastBlockedTime = time.Now()
 				log.Printf("Resumed")
 			}
 		}
 	}
@@ -170,20 +140,7 @@ mainLoop:
 	log.Println("Goodbye!")
 }
-// Function to check if we're within the resume grace period
+func startHTTPServer() *http.Server {
 // isInsideResumeGracePeriod reports whether a resume has been recorded in
 // lastResumeTime and less than resumeGracePeriod has elapsed since then.
 func isInsideResumeGracePeriod() bool {
 	// A zero lastResumeTime means no resume has been recorded.
 	if lastResumeTime.IsZero() {
 		return false
 	}
 	return time.Since(lastResumeTime) < resumeGracePeriod
 }
 // timeLeftInGracePeriod returns how much of resumeGracePeriod remains since
 // lastResumeTime, or 0 when the grace period is not in effect.
 func timeLeftInGracePeriod() time.Duration {
 	if isInsideResumeGracePeriod() {
 		return resumeGracePeriod - time.Since(lastResumeTime)
 	}
 	return 0
 }
 func startHTTPServer(ctx context.Context) *http.Server {
 	srv := &http.Server{
 		Addr: fmt.Sprintf(":%d", httpPort),
 	}
@@ -191,7 +148,7 @@ func startHTTPServer(ctx context.Context) *http.Server {
 	http.HandleFunc("/status", handleStatus)
 	go func() {
-		if err := srv.ListenAndServe(); err != http.ErrServerClosed {
+		if err := srv.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
 			log.Printf("HTTP server error: %v", err)
 		}
 	}()
@@ -199,74 +156,62 @@ func startHTTPServer(ctx context.Context) *http.Server {
 	return srv
 }
-func handleStatus(w http.ResponseWriter, r *http.Request) {
+func handleStatus(w http.ResponseWriter, _ *http.Request) {
 	statusMutex.RLock()
 	defer statusMutex.RUnlock()
 	w.Header().Set("Content-Type", "application/json")
-	json.NewEncoder(w).Encode(currentStatus)
+	err := json.NewEncoder(w).Encode(currentStatus)
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 	}
 }
-func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
+func updateSystemStatus() {
 	statusMutex.Lock()
 	defer statusMutex.Unlock()
-	currentStatus.CurrentUsage = current
+	blockers = []string{}
 	currentStatus.Blockers = []string{}
-	// Add grace period info to status
+	if currentStatus.CpuUsage >= cpuThreshold {
-	if isInsideResumeGracePeriod() {
+		blockers = append(blockers,
-		timeLeft := timeLeftInGracePeriod()
+			fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", currentStatus.CpuUsage, cpuThreshold))
 		currentStatus.InGracePeriod = true
 		currentStatus.GraceTimeLeft = timeLeft.Round(time.Second).String()
 		currentStatus.Blockers = append(currentStatus.Blockers,
 			fmt.Sprintf("Resume grace period: %v remaining", timeLeft.Round(time.Second)))
 	} else {
 		currentStatus.InGracePeriod = false
 		currentStatus.GraceTimeLeft = ""
 	}
-
+	if nvmlAvailable && currentStatus.GpuUsage >= gpuThreshold {
-	if len(history) >= 2 {
+		blockers = append(blockers,
-		// Calculate rates using last two samples
+			fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", currentStatus.GpuUsage, gpuThreshold))
 		duration := history[len(history)-1].Timestamp.Sub(history[len(history)-2].Timestamp).Seconds()
 		diskIORate := float64(history[len(history)-1].DiskIO-history[len(history)-2].DiskIO) / duration
 		netIORate := float64(history[len(history)-1].NetworkIO-history[len(history)-2].NetworkIO) / duration
 		if current.CpuUsage >= cpuThreshold {
 			currentStatus.Blockers = append(currentStatus.Blockers,
 				fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.CpuUsage, cpuThreshold))
 	}
-		if nvmlAvailable && current.GpuUsage >= gpuThreshold {
+	if currentStatus.DiskIO >= float64(diskThreshold) {
-			currentStatus.Blockers = append(currentStatus.Blockers,
+		blockers = append(blockers,
 				fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.GpuUsage, gpuThreshold))
 		}
 		if diskIORate >= float64(diskThreshold) {
 			currentStatus.Blockers = append(currentStatus.Blockers,
 			fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s",
-					diskIORate/(1024*1024), float64(diskThreshold)/(1024*1024)))
+				currentStatus.DiskIO/(1024*1024), float64(diskThreshold)/(1024*1024)))
 	}
-		if netIORate >= float64(networkThreshold) {
+	if currentStatus.NetworkIO >= float64(networkThreshold) {
-			currentStatus.Blockers = append(currentStatus.Blockers,
+		blockers = append(blockers,
 			fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s",
-					netIORate/(1024*1024), float64(networkThreshold)/(1024*1024)))
+				currentStatus.NetworkIO/(1024*1024), float64(networkThreshold)/(1024*1024)))
 	}
 	if currentStatus.SshConnections > 0 {
 		blockers = append(blockers,
 			fmt.Sprintf("Active SSH connections: %d", currentStatus.SshConnections))
 	}
 	if currentStatus.ActiveUsers > 0 {
 		blockers = append(blockers,
 			fmt.Sprintf("Active user sessions: %d", currentStatus.ActiveUsers))
 	}
 	if len(blockers) == 0 {
 		blockers = append(blockers, "No blockers - system can sleep")
 	} else {
 		lastBlockedTime = time.Now()
 	}
 }
-	if current.SshConnections > 0 {
+func updateCurrentUsage() {
-		currentStatus.Blockers = append(currentStatus.Blockers,
+	statusMutex.Lock()
-			fmt.Sprintf("Active SSH connections: %d", current.SshConnections))
+	defer statusMutex.Unlock()
 	}
 	if current.ActiveUsers > 0 {
 		currentStatus.Blockers = append(currentStatus.Blockers,
 			fmt.Sprintf("Active user sessions: %d", current.ActiveUsers))
 	}
 	if len(currentStatus.Blockers) == 0 {
 		currentStatus.Blockers = append(currentStatus.Blockers, "No blockers - system can sleep")
 	}
 }
 func getCurrentUsage() ResourceUsage {
 	usage := ResourceUsage{
 		Timestamp:    time.Now(),
 		GpuAvailable: nvmlAvailable,
@@ -285,14 +230,14 @@ func getCurrentUsage() ResourceUsage {
 	// Get GPU usage across all GPUs if available
 	if nvmlAvailable {
 		count, ret := nvml.DeviceGetCount()
-		if ret == nvml.SUCCESS && count > 0 {
+		if errors.Is(ret, nvml.SUCCESS) && count > 0 {
 			var totalGPU float64
 			var activeGPUs int
 			for i := 0; i < count; i++ {
 				device, ret := nvml.DeviceGetHandleByIndex(i)
-				if ret == nvml.SUCCESS {
+				if errors.Is(ret, nvml.SUCCESS) {
 					utilization, ret := device.GetUtilizationRates()
-					if ret == nvml.SUCCESS {
+					if errors.Is(ret, nvml.SUCCESS) {
 						totalGPU += float64(utilization.Gpu)
 						activeGPUs++
 					}
@@ -310,12 +255,12 @@ func getCurrentUsage() ResourceUsage {
 		for _, stat := range diskStats {
 			totalIO += stat.ReadBytes + stat.WriteBytes
 		}
-		usage.DiskIO = totalIO
+		usage.DiskIO = float64(totalIO)
 	}
 	// Get network I/O
 	if netStats, err := net.IOCounters(false); err == nil && len(netStats) > 0 {
-		usage.NetworkIO = netStats[0].BytesSent + netStats[0].BytesRecv
+		usage.NetworkIO = float64(netStats[0].BytesSent + netStats[0].BytesRecv)
 	}
 	// Count SSH connections
@@ -328,7 +273,7 @@ func getCurrentUsage() ResourceUsage {
 		usage.ActiveUsers = userCount
 	}
-	return usage
+	currentStatus = usage
 }
 func getSSHConnectionCount() (int, error) {
@@ -375,60 +320,6 @@ func getActiveUserCount() (int, error) {
 	return count, nil
 }
 // isSystemIdle reports whether the samples in history show the system as
 // idle enough to suspend.
 //
 // It returns false while inside the resume grace period, when fewer than two
 // samples are available, or when the samples do not span a positive amount of
 // time. Otherwise CPU (and GPU, when NVML is available) usage is averaged over
 // all samples, disk and network I/O rates are computed from the first and
 // last samples, and the maximum SSH connection and active user counts are
 // taken. The system counts as idle only if every value is below its threshold
 // and no SSH connection or user session was seen in any sample.
 func isSystemIdle(history []ResourceUsage) bool {
 	// Don't allow sleep during grace period after resume
 	if isInsideResumeGracePeriod() {
 		return false
 	}
 	if len(history) < 2 {
 		return false
 	}
 	samples := len(history)
 	// The I/O rates below divide by this span. A zero span would yield +Inf or
 	// NaN, and NaN fails every ">=" comparison, silently bypassing the I/O
 	// checks; without a positive span no rate can be computed, so refuse to
 	// call the system idle.
 	duration := history[samples-1].Timestamp.Sub(history[0].Timestamp).Seconds()
 	if duration <= 0 {
 		return false
 	}
 	var avgCPU, avgGPU float64
 	var maxSSHConnections, maxActiveUsers int
 	for _, usage := range history {
 		avgCPU += usage.CpuUsage
 		if nvmlAvailable {
 			avgGPU += usage.GpuUsage
 		}
 		if usage.SshConnections > maxSSHConnections {
 			maxSSHConnections = usage.SshConnections
 		}
 		if usage.ActiveUsers > maxActiveUsers {
 			maxActiveUsers = usage.ActiveUsers
 		}
 	}
 	// Calculate I/O rates using first and last samples
 	diskIORate := float64(history[samples-1].DiskIO-history[0].DiskIO) / duration
 	netIORate := float64(history[samples-1].NetworkIO-history[0].NetworkIO) / duration
 	avgCPU /= float64(samples)
 	if nvmlAvailable {
 		avgGPU /= float64(samples)
 	}
 	// Basic checks that always apply
 	if avgCPU >= cpuThreshold ||
 		diskIORate >= float64(diskThreshold) ||
 		netIORate >= float64(networkThreshold) ||
 		maxSSHConnections > 0 ||
 		maxActiveUsers > 0 {
 		return false
 	}
 	// GPU check only if NVML is available
 	if nvmlAvailable && avgGPU >= gpuThreshold {
 		return false
 	}
 	return true
 }
 func suspendSystem() error {
 	cmd := exec.Command("systemctl", "suspend")
 	return cmd.Run()