package main

import (
	"context"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"os"
	"os/exec"
	"os/signal"
	"strings"
	"sync"
	"syscall"
	"time"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
	"github.com/shirou/gopsutil/v3/cpu"
	"github.com/shirou/gopsutil/v3/disk"
	"github.com/shirou/gopsutil/v3/net"
	"github.com/shirou/gopsutil/v3/process"
)

const (
	checkInterval     = 10 * time.Second
	monitoringPeriod  = 5 * time.Minute
	resumeGracePeriod = 5 * time.Minute // Time to wait after resume before allowing sleep again
	cpuThreshold      = 20.0            // percentage
	gpuThreshold      = 20.0            // percentage
	diskThreshold     = 5 * 1024 * 1024 // 5 MB/s
	networkThreshold  = 1 * 1024 * 1024 // 1 MB/s
	httpPort          = 8081
)

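// ResourceUsage is a single snapshot of system activity taken at one polling
// interval: average CPU and GPU utilization, cumulative disk and network I/O
// byte counters, and the number of SSH connections and active user sessions.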
type ResourceUsage struct {
	Timestamp      time.Time `json:"timestamp"`
	CpuUsage       float64   `json:"cpu_usage"`
	GpuUsage       float64   `json:"gpu_usage"`
	GpuAvailable   bool      `json:"gpu_available"`
	DiskIO         uint64    `json:"disk_io"`
	NetworkIO      uint64    `json:"network_io"`
	SshConnections int       `json:"ssh_connections"`
	ActiveUsers    int       `json:"active_users"`
}

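// SystemStatus is the payload served by the /status endpoint: the most recent
// ResourceUsage sample plus the list of reasons (if any) the system is
// currently being kept awake.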
type SystemStatus struct {
	CurrentUsage  ResourceUsage `json:"current_usage"`
	Blockers      []string      `json:"sleep_blockers"`
	InGracePeriod bool          `json:"in_grace_period,omitempty"`
	GraceTimeLeft string        `json:"grace_time_left,omitempty"`
}

var (
	currentStatus  SystemStatus
	statusMutex    sync.RWMutex
	nvmlAvailable  bool
	lastResumeTime time.Time // Track when the system last resumed from sleep
	lastTickTime   time.Time // Track when we last processed a tick
)

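// main checks for root privileges, initializes NVML when available, starts the
// HTTP status endpoint, and then polls resource usage every checkInterval,
// suspending the machine once it has been idle for the full monitoringPeriod.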
func main() {
	// Check if running as root
	if os.Geteuid() != 0 {
		log.Fatal("This program must be run as root")
	}

	// Initialize NVML for GPU monitoring
	ret := nvml.Init()
	if ret != nvml.SUCCESS {
		log.Printf("Warning: Could not initialize NVML: %v", ret)
		nvmlAvailable = false
	} else {
		nvmlAvailable = true
		defer nvml.Shutdown()
	}

	// Create a context that we'll use to shut down the application
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Set up signal handling
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

	// Start HTTP server with context
	srv := startHTTPServer(ctx)

	usageHistory := make([]ResourceUsage, 0)
	ticker := time.NewTicker(checkInterval)
	defer ticker.Stop()

	log.Printf("Starting idle monitoring. System will suspend when:\n")
	log.Printf("- Average CPU usage across all cores < %.1f%%\n", cpuThreshold)
	if nvmlAvailable {
		log.Printf("- Average GPU usage across all GPUs < %.1f%%\n", gpuThreshold)
	} else {
		log.Printf("- GPU monitoring disabled (NVML initialization failed)")
	}
	log.Printf("- Disk I/O < %.1f MB/s\n", float64(diskThreshold)/(1024*1024))
	log.Printf("- Network I/O < %.1f MB/s\n", float64(networkThreshold)/(1024*1024))
	log.Printf("- No active SSH connections\n")
	log.Printf("- No active user sessions\n")
	log.Printf("- Over the last %v\n", monitoringPeriod)
	log.Printf("- System will not suspend for %v after resuming from sleep\n", resumeGracePeriod)
	log.Printf("HTTP status endpoint available at http://localhost:%d/status\n", httpPort)
	log.Printf("Press Ctrl+C to exit\n")

mainLoop:
	for {
		select {
		case <-ctx.Done():
			break mainLoop
		case sig := <-sigChan:
			log.Printf("Received signal %v, shutting down...", sig)
			cancel()
			break mainLoop
		case <-ticker.C:
			now := time.Now()

			// Check if we just resumed from sleep
			if !lastTickTime.IsZero() {
				gap := now.Sub(lastTickTime)
				// If there was a significant gap, probably resumed from sleep
				if gap > (checkInterval*3) && gap < time.Hour {
					log.Printf("Detected system resume after gap of %v", gap)
					lastResumeTime = now
				}
			}
			lastTickTime = now

			usage := getCurrentUsage()
			usageHistory = append(usageHistory, usage)

			// Update current status
			updateSystemStatus(usage, usageHistory)

			// Remove entries older than monitoring period
			cutoff := time.Now().Add(-monitoringPeriod)
			for i, u := range usageHistory {
				if u.Timestamp.After(cutoff) {
					usageHistory = usageHistory[i:]
					break
				}
			}

			if len(usageHistory) > 0 && isSystemIdle(usageHistory) {
				log.Printf("System status before suspend:\n")
				log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].CpuUsage)
				if nvmlAvailable {
					log.Printf("- GPU: %.1f%%\n", usageHistory[len(usageHistory)-1].GpuUsage)
				}
				log.Printf("- SSH connections: %d\n", usageHistory[len(usageHistory)-1].SshConnections)
				log.Printf("- Active users: %d\n", usageHistory[len(usageHistory)-1].ActiveUsers)
				log.Println("System has been idle for the monitoring period. Suspending...")
				if err := suspendSystem(); err != nil {
					log.Printf("Failed to suspend system: %v", err)
				}
			}
		}
	}

	// Graceful shutdown of HTTP server
	shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer shutdownCancel()

	if err := srv.Shutdown(shutdownCtx); err != nil {
		log.Printf("HTTP server shutdown error: %v", err)
	}

	log.Println("Goodbye!")
}

// Function to check if we're within the resume grace period
func isInsideResumeGracePeriod() bool {
	return !lastResumeTime.IsZero() && time.Since(lastResumeTime) < resumeGracePeriod
}

// Function to calculate time left in grace period
func timeLeftInGracePeriod() time.Duration {
	if !isInsideResumeGracePeriod() {
		return 0
	}
	return resumeGracePeriod - time.Since(lastResumeTime)
}

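// startHTTPServer registers the /status handler on the default mux and starts
// listening on httpPort in a background goroutine. The returned *http.Server
// is shut down gracefully from main on exit.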
func startHTTPServer(ctx context.Context) *http.Server {
	srv := &http.Server{
		Addr: fmt.Sprintf(":%d", httpPort),
	}

	http.HandleFunc("/status", handleStatus)

	go func() {
		if err := srv.ListenAndServe(); err != http.ErrServerClosed {
			log.Printf("HTTP server error: %v", err)
		}
	}()

	return srv
}

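// handleStatus writes the current SystemStatus as JSON, for example
// (illustrative): {"current_usage":{...},"sleep_blockers":["Active SSH connections: 1"]}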
func handleStatus(w http.ResponseWriter, r *http.Request) {
	statusMutex.RLock()
	defer statusMutex.RUnlock()

	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(currentStatus)
}

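// updateSystemStatus refreshes the shared status served by /status: it stores
// the latest sample and rebuilds the list of sleep blockers (resume grace
// period, CPU/GPU usage, disk and network I/O rates, SSH connections, and
// active user sessions).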
func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
	statusMutex.Lock()
	defer statusMutex.Unlock()

	currentStatus.CurrentUsage = current
	currentStatus.Blockers = []string{}

	// Add grace period info to status
	if isInsideResumeGracePeriod() {
		timeLeft := timeLeftInGracePeriod()
		currentStatus.InGracePeriod = true
		currentStatus.GraceTimeLeft = timeLeft.Round(time.Second).String()
		currentStatus.Blockers = append(currentStatus.Blockers,
			fmt.Sprintf("Resume grace period: %v remaining", timeLeft.Round(time.Second)))
	} else {
		currentStatus.InGracePeriod = false
		currentStatus.GraceTimeLeft = ""
	}

	if len(history) >= 2 {
		// Calculate rates using last two samples
		duration := history[len(history)-1].Timestamp.Sub(history[len(history)-2].Timestamp).Seconds()
		diskIORate := float64(history[len(history)-1].DiskIO-history[len(history)-2].DiskIO) / duration
		netIORate := float64(history[len(history)-1].NetworkIO-history[len(history)-2].NetworkIO) / duration

		if current.CpuUsage >= cpuThreshold {
			currentStatus.Blockers = append(currentStatus.Blockers,
				fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.CpuUsage, cpuThreshold))
		}
		if nvmlAvailable && current.GpuUsage >= gpuThreshold {
			currentStatus.Blockers = append(currentStatus.Blockers,
				fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.GpuUsage, gpuThreshold))
		}
		if diskIORate >= float64(diskThreshold) {
			currentStatus.Blockers = append(currentStatus.Blockers,
				fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s",
					diskIORate/(1024*1024), float64(diskThreshold)/(1024*1024)))
		}
		if netIORate >= float64(networkThreshold) {
			currentStatus.Blockers = append(currentStatus.Blockers,
				fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s",
					netIORate/(1024*1024), float64(networkThreshold)/(1024*1024)))
		}
	}

	if current.SshConnections > 0 {
		currentStatus.Blockers = append(currentStatus.Blockers,
			fmt.Sprintf("Active SSH connections: %d", current.SshConnections))
	}
	if current.ActiveUsers > 0 {
		currentStatus.Blockers = append(currentStatus.Blockers,
			fmt.Sprintf("Active user sessions: %d", current.ActiveUsers))
	}

	if len(currentStatus.Blockers) == 0 {
		currentStatus.Blockers = append(currentStatus.Blockers, "No blockers - system can sleep")
	}
}

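// getCurrentUsage collects one ResourceUsage sample: average CPU utilization
// across cores, average GPU utilization across NVIDIA devices (when NVML is
// available), cumulative disk and network byte counters, and the current SSH
// connection and user session counts.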
func getCurrentUsage() ResourceUsage {
	usage := ResourceUsage{
		Timestamp:    time.Now(),
		GpuAvailable: nvmlAvailable,
	}

	// Get CPU usage across all cores
	if cpuPercent, err := cpu.Percent(0, true); err == nil && len(cpuPercent) > 0 {
		// Calculate average CPU usage across all cores
		var totalCPU float64
		for _, percent := range cpuPercent {
			totalCPU += percent
		}
		usage.CpuUsage = totalCPU / float64(len(cpuPercent))
	}

	// Get GPU usage across all GPUs if available
	if nvmlAvailable {
		count, ret := nvml.DeviceGetCount()
		if ret == nvml.SUCCESS && count > 0 {
			var totalGPU float64
			var activeGPUs int
			for i := 0; i < count; i++ {
				device, ret := nvml.DeviceGetHandleByIndex(i)
				if ret == nvml.SUCCESS {
					utilization, ret := device.GetUtilizationRates()
					if ret == nvml.SUCCESS {
						totalGPU += float64(utilization.Gpu)
						activeGPUs++
					}
				}
			}
			if activeGPUs > 0 {
				usage.GpuUsage = totalGPU / float64(activeGPUs)
			}
		}
	}

	// Get disk I/O
	if diskStats, err := disk.IOCounters(); err == nil {
		var totalIO uint64
		for _, stat := range diskStats {
			totalIO += stat.ReadBytes + stat.WriteBytes
		}
		usage.DiskIO = totalIO
	}

	// Get network I/O
	if netStats, err := net.IOCounters(false); err == nil && len(netStats) > 0 {
		usage.NetworkIO = netStats[0].BytesSent + netStats[0].BytesRecv
	}

	// Count SSH connections
	if sshCount, err := getSSHConnectionCount(); err == nil {
		usage.SshConnections = sshCount
	}

	// Count active user sessions
	if userCount, err := getActiveUserCount(); err == nil {
		usage.ActiveUsers = userCount
	}

	return usage
}

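// getSSHConnectionCount counts sshd processes that are servicing a connection,
// identified by an "@" in their command line (e.g. "sshd: user@pts/0").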
func getSSHConnectionCount() (int, error) {
	processes, err := process.Processes()
	if err != nil {
		return 0, err
	}

	count := 0
	for _, p := range processes {
		name, err := p.Name()
		if err != nil {
			continue
		}
		if name == "sshd" {
			cmdline, err := p.Cmdline()
			if err != nil {
				continue
			}
			// Only count sshd processes that are handling connections
			// The main sshd process doesn't have "@" in its cmdline
			if strings.Contains(cmdline, "@") {
				count++
			}
		}
	}
	return count, nil
}

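// getActiveUserCount counts logged-in user sessions by running `who -s` and
// counting the non-empty lines of its output.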
func getActiveUserCount() (int, error) {
	out, err := exec.Command("who", "-s").Output()
	if err != nil {
		return 0, err
	}

	// Count non-empty lines
	lines := strings.Split(strings.TrimSpace(string(out)), "\n")
	count := 0
	for _, line := range lines {
		if line != "" {
			count++
		}
	}
	return count, nil
}

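// isSystemIdle reports whether every idle condition held over the recorded
// history: average CPU (and GPU, if monitored) below threshold, disk and
// network I/O rates below threshold, and no SSH connections or user sessions
// in any sample. It always returns false during the post-resume grace period.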
func isSystemIdle(history []ResourceUsage) bool {
	// Don't allow sleep during grace period after resume
	if isInsideResumeGracePeriod() {
		return false
	}

	if len(history) < 2 {
		return false
	}

	var avgCPU, avgGPU float64
	var maxSSHConnections, maxActiveUsers int
	samples := len(history)

	for _, usage := range history {
		avgCPU += usage.CpuUsage
		if nvmlAvailable {
			avgGPU += usage.GpuUsage
		}
		if usage.SshConnections > maxSSHConnections {
			maxSSHConnections = usage.SshConnections
		}
		if usage.ActiveUsers > maxActiveUsers {
			maxActiveUsers = usage.ActiveUsers
		}
	}

	// Calculate I/O rates using first and last samples
	duration := history[samples-1].Timestamp.Sub(history[0].Timestamp).Seconds()
	diskIORate := float64(history[samples-1].DiskIO-history[0].DiskIO) / duration
	netIORate := float64(history[samples-1].NetworkIO-history[0].NetworkIO) / duration

	avgCPU /= float64(samples)
	if nvmlAvailable {
		avgGPU /= float64(samples)
	}

	// Basic checks that always apply
	if avgCPU >= cpuThreshold ||
		diskIORate >= float64(diskThreshold) ||
		netIORate >= float64(networkThreshold) ||
		maxSSHConnections > 0 ||
		maxActiveUsers > 0 {
		return false
	}

	// GPU check only if NVML is available
	if nvmlAvailable && avgGPU >= gpuThreshold {
		return false
	}

	return true
}

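// suspendSystem asks systemd to suspend the machine.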
func suspendSystem() error {
	cmd := exec.Command("systemctl", "suspend")
	return cmd.Run()
}