Simplify system: replace usage history and resume grace period with a last-blocked timestamp

2025-03-30 12:12:32 +02:00
parent 5a27277e7c
commit 93fe9ebed3
1 changed files with 72 additions and 181 deletions
--- a/main.go
+++ b/main.go
@@ -3,6 +3,7 @@ package main
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"log"
 	"net/http"
@@ -37,27 +38,26 @@ type ResourceUsage struct {
 	CpuUsage       float64   `json:"cpu_usage"`
 	GpuUsage       float64   `json:"gpu_usage"`
 	GpuAvailable   bool      `json:"gpu_available"`
-	DiskIO         uint64    `json:"disk_io"`
-	NetworkIO      uint64    `json:"network_io"`
+	DiskIO         float64   `json:"disk_io"`
+	NetworkIO      float64   `json:"network_io"`
 	SshConnections int       `json:"ssh_connections"`
 	ActiveUsers    int       `json:"active_users"`
 }

-type SystemStatus struct {
-	CurrentUsage  ResourceUsage `json:"current_usage"`
-	Blockers      []string      `json:"sleep_blockers"`
-	InGracePeriod bool          `json:"in_grace_period,omitempty"`
-	GraceTimeLeft string        `json:"grace_time_left,omitempty"`
-}
-
 var (
-	currentStatus  SystemStatus
 	statusMutex     sync.RWMutex
+	blockers        []string
+	currentStatus   ResourceUsage
 	nvmlAvailable   bool
-	lastResumeTime time.Time // Track when the system last resumed from sleep
-	lastTickTime   time.Time // Track when we last processed a tick
+	lastBlockedTime = time.Now() // start the idle clock at launch; a zero time would look idle "forever" on the first tick
 )

+func Must(err error) {
+	if err != nil {
+		log.Fatalf("Error: %v", err)
+	}
+}
+
 func main() {
 	// Check if running as root
 	if os.Geteuid() != 0 {
@@ -66,24 +66,27 @@ func main() {

 	// Initialize NVML for GPU monitoring
 	ret := nvml.Init()
-	if ret != nvml.SUCCESS {
+	if !errors.Is(ret, nvml.SUCCESS) {
 		log.Printf("Warning: Could not initialize NVML: %v", ret)
 		nvmlAvailable = false
 	} else {
 		nvmlAvailable = true
-		defer nvml.Shutdown()
+		// Defer a closure so Shutdown runs at exit, not when the defer is
+		// registered; a Return of SUCCESS is still a non-nil error value, so
+		// it is compared explicitly instead of being handed to Must.
+		defer func() {
+			if ret := nvml.Shutdown(); !errors.Is(ret, nvml.SUCCESS) {
+				log.Printf("Warning: Could not shut down NVML: %v", ret)
+			}
+		}()
 	}

-	// Create a context that we'll use to shut down the application
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-
 	// Set up signal handling
 	sigChan := make(chan os.Signal, 1)
 	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

-	// Start HTTP server with context
-	srv := startHTTPServer(ctx)
+	// Start HTTP server
+	srv := startHTTPServer()

 	usageHistory := make([]ResourceUsage, 0)
 	ticker := time.NewTicker(checkInterval)
@@ -108,42 +104,17 @@ func main() {
 mainLoop:
 	for {
 		select {
-		case <-ctx.Done():
-			break mainLoop
 		case sig := <-sigChan:
 			log.Printf("Received signal %v, shutting down...", sig)
-			cancel()
 			break mainLoop
 		case <-ticker.C:
-			now := time.Now()
+			updateCurrentUsage()
+			updateSystemStatus()
+			// Keep the latest sample in usageHistory: the pre-suspend log
+			// below indexes its last element and panics on an empty slice.
+			usageHistory = append(usageHistory[:0], currentStatus)

-			// Check if we just resumed from sleep
-			if !lastTickTime.IsZero() {
-				gap := now.Sub(lastTickTime)
-				// If there was a significant gap, probably resumed from sleep
-				if gap > (checkInterval*3) && gap < time.Hour {
-					log.Printf("Detected system resume after gap of %v", gap)
-					lastResumeTime = now
-				}
-			}
-			lastTickTime = now
-
-			usage := getCurrentUsage()
-			usageHistory = append(usageHistory, usage)
-
-			// Update current status
-			updateSystemStatus(usage, usageHistory)
-
-			// Remove entries older than monitoring period
-			cutoff := time.Now().Add(-monitoringPeriod)
-			for i, u := range usageHistory {
-				if u.Timestamp.After(cutoff) {
-					usageHistory = usageHistory[i:]
-					break
-				}
-			}
-
-			if len(usageHistory) > 0 && isSystemIdle(usageHistory) {
+			if time.Since(lastBlockedTime) >= monitoringPeriod {
 				log.Printf("System status before suspend:\n")
 				log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].CpuUsage)
 				if nvmlAvailable {
@@ -155,6 +123,8 @@ mainLoop:
 				if err := suspendSystem(); err != nil {
 					log.Printf("Failed to suspend system: %v", err)
 				}
+				lastBlockedTime = time.Now()
+				log.Printf("Resumed")
 			}
 		}
 	}
@@ -170,20 +140,7 @@ mainLoop:
 	log.Println("Goodbye!")
 }

-// Function to check if we're within the resume grace period
-func isInsideResumeGracePeriod() bool {
-	return !lastResumeTime.IsZero() && time.Since(lastResumeTime) < resumeGracePeriod
-}
-
-// Function to calculate time left in grace period
-func timeLeftInGracePeriod() time.Duration {
-	if !isInsideResumeGracePeriod() {
-		return 0
-	}
-	return resumeGracePeriod - time.Since(lastResumeTime)
-}
-
-func startHTTPServer(ctx context.Context) *http.Server {
+func startHTTPServer() *http.Server {
 	srv := &http.Server{
 		Addr: fmt.Sprintf(":%d", httpPort),
 	}
@@ -191,7 +148,7 @@ func startHTTPServer(ctx context.Context) *http.Server {
 	http.HandleFunc("/status", handleStatus)

 	go func() {
-		if err := srv.ListenAndServe(); err != http.ErrServerClosed {
+		if err := srv.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
 			log.Printf("HTTP server error: %v", err)
 		}
 	}()
@@ -199,74 +156,76 @@ func startHTTPServer(ctx context.Context) *http.Server {
 	return srv
 }

-func handleStatus(w http.ResponseWriter, r *http.Request) {
+func handleStatus(w http.ResponseWriter, _ *http.Request) {
 	statusMutex.RLock()
 	defer statusMutex.RUnlock()

 	w.Header().Set("Content-Type", "application/json")
-	json.NewEncoder(w).Encode(currentStatus)
+	err := json.NewEncoder(w).Encode(currentStatus)
+	if err != nil {
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+	}
 }

-func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
+// prevSample is the sample seen by the previous updateSystemStatus call; it
+// turns the cumulative disk/network byte counters into per-second rates.
+var prevSample ResourceUsage
+
+func updateSystemStatus() {
 	statusMutex.Lock()
 	defer statusMutex.Unlock()

-	currentStatus.CurrentUsage = current
-	currentStatus.Blockers = []string{}
+	blockers = []string{}

-	// Add grace period info to status
-	if isInsideResumeGracePeriod() {
-		timeLeft := timeLeftInGracePeriod()
-		currentStatus.InGracePeriod = true
-		currentStatus.GraceTimeLeft = timeLeft.Round(time.Second).String()
-		currentStatus.Blockers = append(currentStatus.Blockers,
-			fmt.Sprintf("Resume grace period: %v remaining", timeLeft.Round(time.Second)))
-	} else {
-		currentStatus.InGracePeriod = false
-		currentStatus.GraceTimeLeft = ""
+	// DiskIO and NetworkIO are cumulative byte counters, while the thresholds
+	// are bytes per second, so compare the rate since the previous sample.
+	var diskIORate, netIORate float64
+	elapsed := currentStatus.Timestamp.Sub(prevSample.Timestamp).Seconds()
+	if !prevSample.Timestamp.IsZero() && elapsed > 0 {
+		diskIORate = (currentStatus.DiskIO - prevSample.DiskIO) / elapsed
+		netIORate = (currentStatus.NetworkIO - prevSample.NetworkIO) / elapsed
+	}
+	prevSample = currentStatus
+
+	if currentStatus.CpuUsage >= cpuThreshold {
+		blockers = append(blockers,
+			fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", currentStatus.CpuUsage, cpuThreshold))
 	}
-
-	if len(history) >= 2 {
-		// Calculate rates using last two samples
-		duration := history[len(history)-1].Timestamp.Sub(history[len(history)-2].Timestamp).Seconds()
-		diskIORate := float64(history[len(history)-1].DiskIO-history[len(history)-2].DiskIO) / duration
-		netIORate := float64(history[len(history)-1].NetworkIO-history[len(history)-2].NetworkIO) / duration
-
-		if current.CpuUsage >= cpuThreshold {
-			currentStatus.Blockers = append(currentStatus.Blockers,
-				fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.CpuUsage, cpuThreshold))
+	if nvmlAvailable && currentStatus.GpuUsage >= gpuThreshold {
+		blockers = append(blockers,
+			fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", currentStatus.GpuUsage, gpuThreshold))
 	}
-		if nvmlAvailable && current.GpuUsage >= gpuThreshold {
-			currentStatus.Blockers = append(currentStatus.Blockers,
-				fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.GpuUsage, gpuThreshold))
-		}
-		if diskIORate >= float64(diskThreshold) {
-			currentStatus.Blockers = append(currentStatus.Blockers,
+	if diskIORate >= float64(diskThreshold) {
+		blockers = append(blockers,
 			fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s",
-					diskIORate/(1024*1024), float64(diskThreshold)/(1024*1024)))
+				diskIORate/(1024*1024), float64(diskThreshold)/(1024*1024)))
 	}
-		if netIORate >= float64(networkThreshold) {
-			currentStatus.Blockers = append(currentStatus.Blockers,
+	if netIORate >= float64(networkThreshold) {
+		blockers = append(blockers,
 			fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s",
-					netIORate/(1024*1024), float64(networkThreshold)/(1024*1024)))
-		}
+				netIORate/(1024*1024), float64(networkThreshold)/(1024*1024)))
 	}

-	if current.SshConnections > 0 {
-		currentStatus.Blockers = append(currentStatus.Blockers,
-			fmt.Sprintf("Active SSH connections: %d", current.SshConnections))
+	if currentStatus.SshConnections > 0 {
+		blockers = append(blockers,
+			fmt.Sprintf("Active SSH connections: %d", currentStatus.SshConnections))
 	}
-	if current.ActiveUsers > 0 {
-		currentStatus.Blockers = append(currentStatus.Blockers,
-			fmt.Sprintf("Active user sessions: %d", current.ActiveUsers))
+	if currentStatus.ActiveUsers > 0 {
+		blockers = append(blockers,
+			fmt.Sprintf("Active user sessions: %d", currentStatus.ActiveUsers))
 	}

-	if len(currentStatus.Blockers) == 0 {
-		currentStatus.Blockers = append(currentStatus.Blockers, "No blockers - system can sleep")
+	if len(blockers) == 0 {
+		blockers = append(blockers, "No blockers - system can sleep")
+	} else {
+		lastBlockedTime = time.Now()
 	}
 }

-func getCurrentUsage() ResourceUsage {
+func updateCurrentUsage() {
+	statusMutex.Lock()
+	defer statusMutex.Unlock()
+
 	usage := ResourceUsage{
 		Timestamp:    time.Now(),
 		GpuAvailable: nvmlAvailable,
@@ -285,14 +230,14 @@ func getCurrentUsage() ResourceUsage {
 	// Get GPU usage across all GPUs if available
 	if nvmlAvailable {
 		count, ret := nvml.DeviceGetCount()
-		if ret == nvml.SUCCESS && count > 0 {
+		if errors.Is(ret, nvml.SUCCESS) && count > 0 {
 			var totalGPU float64
 			var activeGPUs int
 			for i := 0; i < count; i++ {
 				device, ret := nvml.DeviceGetHandleByIndex(i)
-				if ret == nvml.SUCCESS {
+				if errors.Is(ret, nvml.SUCCESS) {
 					utilization, ret := device.GetUtilizationRates()
-					if ret == nvml.SUCCESS {
+					if errors.Is(ret, nvml.SUCCESS) {
 						totalGPU += float64(utilization.Gpu)
 						activeGPUs++
 					}
@@ -310,12 +255,12 @@ func getCurrentUsage() ResourceUsage {
 		for _, stat := range diskStats {
 			totalIO += stat.ReadBytes + stat.WriteBytes
 		}
-		usage.DiskIO = totalIO
+		usage.DiskIO = float64(totalIO)
 	}

 	// Get network I/O
 	if netStats, err := net.IOCounters(false); err == nil && len(netStats) > 0 {
-		usage.NetworkIO = netStats[0].BytesSent + netStats[0].BytesRecv
+		usage.NetworkIO = float64(netStats[0].BytesSent + netStats[0].BytesRecv)
 	}

 	// Count SSH connections
@@ -328,7 +273,7 @@ func getCurrentUsage() ResourceUsage {
 		usage.ActiveUsers = userCount
 	}

-	return usage
+	currentStatus = usage
 }

 func getSSHConnectionCount() (int, error) {
@@ -375,60 +320,6 @@ func getActiveUserCount() (int, error) {
 	return count, nil
 }

-func isSystemIdle(history []ResourceUsage) bool {
-	// Don't allow sleep during grace period after resume
-	if isInsideResumeGracePeriod() {
-		return false
-	}
-
-	if len(history) < 2 {
-		return false
-	}
-
-	var avgCPU, avgGPU float64
-	var maxSSHConnections, maxActiveUsers int
-	samples := len(history)
-
-	for _, usage := range history {
-		avgCPU += usage.CpuUsage
-		if nvmlAvailable {
-			avgGPU += usage.GpuUsage
-		}
-		if usage.SshConnections > maxSSHConnections {
-			maxSSHConnections = usage.SshConnections
-		}
-		if usage.ActiveUsers > maxActiveUsers {
-			maxActiveUsers = usage.ActiveUsers
-		}
-	}
-
-	// Calculate I/O rates using first and last samples
-	duration := history[samples-1].Timestamp.Sub(history[0].Timestamp).Seconds()
-	diskIORate := float64(history[samples-1].DiskIO-history[0].DiskIO) / duration
-	netIORate := float64(history[samples-1].NetworkIO-history[0].NetworkIO) / duration
-
-	avgCPU /= float64(samples)
-	if nvmlAvailable {
-		avgGPU /= float64(samples)
-	}
-
-	// Basic checks that always apply
-	if avgCPU >= cpuThreshold ||
-		diskIORate >= float64(diskThreshold) ||
-		netIORate >= float64(networkThreshold) ||
-		maxSSHConnections > 0 ||
-		maxActiveUsers > 0 {
-		return false
-	}
-
-	// GPU check only if NVML is available
-	if nvmlAvailable && avgGPU >= gpuThreshold {
-		return false
-	}
-
-	return true
-}
-
 func suspendSystem() error {
 	cmd := exec.Command("systemctl", "suspend")
 	return cmd.Run()