Add support for Ctrl+C (graceful shutdown on SIGINT/SIGTERM)

This commit is contained in:
Sebastiaan de Schaetzen 2025-03-14 14:17:38 +01:00
parent 11b62aa1f0
commit 8d36e1e245

176
main.go
View File

@ -1,14 +1,17 @@
package main package main
import ( import (
"context"
"encoding/json" "encoding/json"
"fmt" "fmt"
"log" "log"
"net/http" "net/http"
"os" "os"
"os/exec" "os/exec"
"os/signal"
"strings" "strings"
"sync" "sync"
"syscall"
"time" "time"
"github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/NVIDIA/go-nvml/pkg/nvml"
@ -29,13 +32,14 @@ const (
) )
type ResourceUsage struct { type ResourceUsage struct {
timestamp time.Time Timestamp time.Time `json:"timestamp"`
cpuUsage float64 CpuUsage float64 `json:"cpu_usage"`
gpuUsage float64 GpuUsage float64 `json:"gpu_usage"`
diskIO uint64 GpuAvailable bool `json:"gpu_available"`
networkIO uint64 DiskIO uint64 `json:"disk_io"`
sshConnections int NetworkIO uint64 `json:"network_io"`
activeUsers int SshConnections int `json:"ssh_connections"`
ActiveUsers int `json:"active_users"`
} }
type SystemStatus struct { type SystemStatus struct {
@ -46,6 +50,7 @@ type SystemStatus struct {
var ( var (
currentStatus SystemStatus currentStatus SystemStatus
statusMutex sync.RWMutex statusMutex sync.RWMutex
nvmlAvailable bool
) )
func main() { func main() {
@ -58,11 +63,22 @@ func main() {
ret := nvml.Init() ret := nvml.Init()
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
log.Printf("Warning: Could not initialize NVML: %v", ret) log.Printf("Warning: Could not initialize NVML: %v", ret)
} nvmlAvailable = false
} else {
nvmlAvailable = true
defer nvml.Shutdown() defer nvml.Shutdown()
}
// Start HTTP server // Create a context that we'll use to shut down the application
go startHTTPServer() ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// Set up signal handling
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
// Start HTTP server with context
srv := startHTTPServer(ctx)
usageHistory := make([]ResourceUsage, 0) usageHistory := make([]ResourceUsage, 0)
ticker := time.NewTicker(checkInterval) ticker := time.NewTicker(checkInterval)
@ -70,15 +86,29 @@ func main() {
log.Printf("Starting idle monitoring. System will suspend when:\n") log.Printf("Starting idle monitoring. System will suspend when:\n")
log.Printf("- Average CPU usage across all cores < %.1f%%\n", cpuThreshold) log.Printf("- Average CPU usage across all cores < %.1f%%\n", cpuThreshold)
if nvmlAvailable {
log.Printf("- Average GPU usage across all GPUs < %.1f%%\n", gpuThreshold) log.Printf("- Average GPU usage across all GPUs < %.1f%%\n", gpuThreshold)
} else {
log.Printf("- GPU monitoring disabled (NVML initialization failed)")
}
log.Printf("- Disk I/O < %.1f MB/s\n", float64(diskThreshold)/(1024*1024)) log.Printf("- Disk I/O < %.1f MB/s\n", float64(diskThreshold)/(1024*1024))
log.Printf("- Network I/O < %.1f MB/s\n", float64(networkThreshold)/(1024*1024)) log.Printf("- Network I/O < %.1f MB/s\n", float64(networkThreshold)/(1024*1024))
log.Printf("- No active SSH connections\n") log.Printf("- No active SSH connections\n")
log.Printf("- No active user sessions\n") log.Printf("- No active user sessions\n")
log.Printf("Over the last %v\n", monitoringPeriod) log.Printf("Over the last %v\n", monitoringPeriod)
log.Printf("HTTP status endpoint available at http://localhost:%d/status\n", httpPort) log.Printf("HTTP status endpoint available at http://localhost:%d/status\n", httpPort)
log.Printf("Press Ctrl+C to exit\n")
for range ticker.C { mainLoop:
for {
select {
case <-ctx.Done():
break mainLoop
case sig := <-sigChan:
log.Printf("Received signal %v, shutting down...", sig)
cancel()
break mainLoop
case <-ticker.C:
usage := getCurrentUsage() usage := getCurrentUsage()
usageHistory = append(usageHistory, usage) usageHistory = append(usageHistory, usage)
@ -88,7 +118,7 @@ func main() {
// Remove entries older than monitoring period // Remove entries older than monitoring period
cutoff := time.Now().Add(-monitoringPeriod) cutoff := time.Now().Add(-monitoringPeriod)
for i, u := range usageHistory { for i, u := range usageHistory {
if u.timestamp.After(cutoff) { if u.Timestamp.After(cutoff) {
usageHistory = usageHistory[i:] usageHistory = usageHistory[i:]
break break
} }
@ -96,23 +126,45 @@ func main() {
if len(usageHistory) > 0 && isSystemIdle(usageHistory) { if len(usageHistory) > 0 && isSystemIdle(usageHistory) {
log.Printf("System status before suspend:\n") log.Printf("System status before suspend:\n")
log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].cpuUsage) log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].CpuUsage)
log.Printf("- GPU: %.1f%%\n", usageHistory[len(usageHistory)-1].gpuUsage) if nvmlAvailable {
log.Printf("- SSH connections: %d\n", usageHistory[len(usageHistory)-1].sshConnections) log.Printf("- GPU: %.1f%%\n", usageHistory[len(usageHistory)-1].GpuUsage)
log.Printf("- Active users: %d\n", usageHistory[len(usageHistory)-1].activeUsers) }
log.Printf("- SSH connections: %d\n", usageHistory[len(usageHistory)-1].SshConnections)
log.Printf("- Active users: %d\n", usageHistory[len(usageHistory)-1].ActiveUsers)
log.Println("System has been idle for the monitoring period. Suspending...") log.Println("System has been idle for the monitoring period. Suspending...")
if err := suspendSystem(); err != nil { if err := suspendSystem(); err != nil {
log.Printf("Failed to suspend system: %v", err) log.Printf("Failed to suspend system: %v", err)
} }
} }
} }
}
// Graceful shutdown of HTTP server
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
defer shutdownCancel()
if err := srv.Shutdown(shutdownCtx); err != nil {
log.Printf("HTTP server shutdown error: %v", err)
}
log.Println("Goodbye!")
} }
func startHTTPServer() { func startHTTPServer(ctx context.Context) *http.Server {
http.HandleFunc("/status", handleStatus) srv := &http.Server{
if err := http.ListenAndServe(fmt.Sprintf(":%d", httpPort), nil); err != nil { Addr: fmt.Sprintf(":%d", httpPort),
log.Printf("Failed to start HTTP server: %v", err)
} }
http.HandleFunc("/status", handleStatus)
go func() {
if err := srv.ListenAndServe(); err != http.ErrServerClosed {
log.Printf("HTTP server error: %v", err)
}
}()
return srv
} }
func handleStatus(w http.ResponseWriter, r *http.Request) { func handleStatus(w http.ResponseWriter, r *http.Request) {
@ -132,17 +184,17 @@ func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
if len(history) >= 2 { if len(history) >= 2 {
// Calculate rates using last two samples // Calculate rates using last two samples
duration := history[len(history)-1].timestamp.Sub(history[len(history)-2].timestamp).Seconds() duration := history[len(history)-1].Timestamp.Sub(history[len(history)-2].Timestamp).Seconds()
diskIORate := float64(history[len(history)-1].diskIO-history[len(history)-2].diskIO) / duration diskIORate := float64(history[len(history)-1].DiskIO-history[len(history)-2].DiskIO) / duration
netIORate := float64(history[len(history)-1].networkIO-history[len(history)-2].networkIO) / duration netIORate := float64(history[len(history)-1].NetworkIO-history[len(history)-2].NetworkIO) / duration
if current.cpuUsage >= cpuThreshold { if current.CpuUsage >= cpuThreshold {
currentStatus.Blockers = append(currentStatus.Blockers, currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.cpuUsage, cpuThreshold)) fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.CpuUsage, cpuThreshold))
} }
if current.gpuUsage >= gpuThreshold { if nvmlAvailable && current.GpuUsage >= gpuThreshold {
currentStatus.Blockers = append(currentStatus.Blockers, currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.gpuUsage, gpuThreshold)) fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.GpuUsage, gpuThreshold))
} }
if diskIORate >= float64(diskThreshold) { if diskIORate >= float64(diskThreshold) {
currentStatus.Blockers = append(currentStatus.Blockers, currentStatus.Blockers = append(currentStatus.Blockers,
@ -156,13 +208,13 @@ func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
} }
} }
if current.sshConnections > 0 { if current.SshConnections > 0 {
currentStatus.Blockers = append(currentStatus.Blockers, currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("Active SSH connections: %d", current.sshConnections)) fmt.Sprintf("Active SSH connections: %d", current.SshConnections))
} }
if current.activeUsers > 0 { if current.ActiveUsers > 0 {
currentStatus.Blockers = append(currentStatus.Blockers, currentStatus.Blockers = append(currentStatus.Blockers,
fmt.Sprintf("Active user sessions: %d", current.activeUsers)) fmt.Sprintf("Active user sessions: %d", current.ActiveUsers))
} }
if len(currentStatus.Blockers) == 0 { if len(currentStatus.Blockers) == 0 {
@ -172,7 +224,8 @@ func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
func getCurrentUsage() ResourceUsage { func getCurrentUsage() ResourceUsage {
usage := ResourceUsage{ usage := ResourceUsage{
timestamp: time.Now(), Timestamp: time.Now(),
GpuAvailable: nvmlAvailable,
} }
// Get CPU usage across all cores // Get CPU usage across all cores
@ -182,10 +235,11 @@ func getCurrentUsage() ResourceUsage {
for _, percent := range cpuPercent { for _, percent := range cpuPercent {
totalCPU += percent totalCPU += percent
} }
usage.cpuUsage = totalCPU / float64(len(cpuPercent)) usage.CpuUsage = totalCPU / float64(len(cpuPercent))
} }
// Get GPU usage across all GPUs // Get GPU usage across all GPUs if available
if nvmlAvailable {
count, ret := nvml.DeviceGetCount() count, ret := nvml.DeviceGetCount()
if ret == nvml.SUCCESS && count > 0 { if ret == nvml.SUCCESS && count > 0 {
var totalGPU float64 var totalGPU float64
@ -201,7 +255,8 @@ func getCurrentUsage() ResourceUsage {
} }
} }
if activeGPUs > 0 { if activeGPUs > 0 {
usage.gpuUsage = totalGPU / float64(activeGPUs) usage.GpuUsage = totalGPU / float64(activeGPUs)
}
} }
} }
@ -211,22 +266,22 @@ func getCurrentUsage() ResourceUsage {
for _, stat := range diskStats { for _, stat := range diskStats {
totalIO += stat.ReadBytes + stat.WriteBytes totalIO += stat.ReadBytes + stat.WriteBytes
} }
usage.diskIO = totalIO usage.DiskIO = totalIO
} }
// Get network I/O // Get network I/O
if netStats, err := net.IOCounters(false); err == nil && len(netStats) > 0 { if netStats, err := net.IOCounters(false); err == nil && len(netStats) > 0 {
usage.networkIO = netStats[0].BytesSent + netStats[0].BytesRecv usage.NetworkIO = netStats[0].BytesSent + netStats[0].BytesRecv
} }
// Count SSH connections // Count SSH connections
if sshCount, err := getSSHConnectionCount(); err == nil { if sshCount, err := getSSHConnectionCount(); err == nil {
usage.sshConnections = sshCount usage.SshConnections = sshCount
} }
// Count active user sessions // Count active user sessions
if userCount, err := getActiveUserCount(); err == nil { if userCount, err := getActiveUserCount(); err == nil {
usage.activeUsers = userCount usage.ActiveUsers = userCount
} }
return usage return usage
@ -286,30 +341,43 @@ func isSystemIdle(history []ResourceUsage) bool {
samples := len(history) samples := len(history)
for _, usage := range history { for _, usage := range history {
avgCPU += usage.cpuUsage avgCPU += usage.CpuUsage
avgGPU += usage.gpuUsage if nvmlAvailable {
if usage.sshConnections > maxSSHConnections { avgGPU += usage.GpuUsage
maxSSHConnections = usage.sshConnections
} }
if usage.activeUsers > maxActiveUsers { if usage.SshConnections > maxSSHConnections {
maxActiveUsers = usage.activeUsers maxSSHConnections = usage.SshConnections
}
if usage.ActiveUsers > maxActiveUsers {
maxActiveUsers = usage.ActiveUsers
} }
} }
// Calculate I/O rates using first and last samples // Calculate I/O rates using first and last samples
duration := history[samples-1].timestamp.Sub(history[0].timestamp).Seconds() duration := history[samples-1].Timestamp.Sub(history[0].Timestamp).Seconds()
diskIORate := float64(history[samples-1].diskIO-history[0].diskIO) / duration diskIORate := float64(history[samples-1].DiskIO-history[0].DiskIO) / duration
netIORate := float64(history[samples-1].networkIO-history[0].networkIO) / duration netIORate := float64(history[samples-1].NetworkIO-history[0].NetworkIO) / duration
avgCPU /= float64(samples) avgCPU /= float64(samples)
if nvmlAvailable {
avgGPU /= float64(samples) avgGPU /= float64(samples)
}
return avgCPU < cpuThreshold && // Basic checks that always apply
avgGPU < gpuThreshold && if avgCPU >= cpuThreshold ||
diskIORate < float64(diskThreshold) && diskIORate >= float64(diskThreshold) ||
netIORate < float64(networkThreshold) && netIORate >= float64(networkThreshold) ||
maxSSHConnections == 0 && maxSSHConnections > 0 ||
maxActiveUsers == 0 maxActiveUsers > 0 {
return false
}
// GPU check only if NVML is available
if nvmlAvailable && avgGPU >= gpuThreshold {
return false
}
return true
} }
func suspendSystem() error { func suspendSystem() error {