Add support for ctrl+c
This commit is contained in:
parent
11b62aa1f0
commit
8d36e1e245
176
main.go
176
main.go
@ -1,14 +1,17 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/signal"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||
@ -29,13 +32,14 @@ const (
|
||||
)
|
||||
|
||||
type ResourceUsage struct {
|
||||
timestamp time.Time
|
||||
cpuUsage float64
|
||||
gpuUsage float64
|
||||
diskIO uint64
|
||||
networkIO uint64
|
||||
sshConnections int
|
||||
activeUsers int
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
CpuUsage float64 `json:"cpu_usage"`
|
||||
GpuUsage float64 `json:"gpu_usage"`
|
||||
GpuAvailable bool `json:"gpu_available"`
|
||||
DiskIO uint64 `json:"disk_io"`
|
||||
NetworkIO uint64 `json:"network_io"`
|
||||
SshConnections int `json:"ssh_connections"`
|
||||
ActiveUsers int `json:"active_users"`
|
||||
}
|
||||
|
||||
type SystemStatus struct {
|
||||
@ -46,6 +50,7 @@ type SystemStatus struct {
|
||||
// Package-level state shared between the monitoring loop and the HTTP status
// endpoint.
var (
|
||||
// currentStatus is the most recently computed system status;
// updateSystemStatus appends idle blockers to currentStatus.Blockers.
currentStatus SystemStatus
|
||||
// statusMutex presumably guards currentStatus between the monitoring loop
// and the /status handler — NOTE(review): the lock/unlock sites are not
// visible in this view; confirm.
statusMutex sync.RWMutex
|
||||
// nvmlAvailable records whether nvml.Init succeeded in main. It is read when
// sampling GPU usage and when deciding whether GPU load blocks suspend.
nvmlAvailable bool
|
||||
)
|
||||
|
||||
func main() {
|
||||
@ -58,11 +63,22 @@ func main() {
|
||||
ret := nvml.Init()
|
||||
if ret != nvml.SUCCESS {
|
||||
log.Printf("Warning: Could not initialize NVML: %v", ret)
|
||||
}
|
||||
nvmlAvailable = false
|
||||
} else {
|
||||
nvmlAvailable = true
|
||||
defer nvml.Shutdown()
|
||||
}
|
||||
|
||||
// Start HTTP server
|
||||
go startHTTPServer()
|
||||
// Create a context that we'll use to shut down the application
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
// Set up signal handling
|
||||
sigChan := make(chan os.Signal, 1)
|
||||
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
|
||||
|
||||
// Start HTTP server with context
|
||||
srv := startHTTPServer(ctx)
|
||||
|
||||
usageHistory := make([]ResourceUsage, 0)
|
||||
ticker := time.NewTicker(checkInterval)
|
||||
@ -70,15 +86,29 @@ func main() {
|
||||
|
||||
log.Printf("Starting idle monitoring. System will suspend when:\n")
|
||||
log.Printf("- Average CPU usage across all cores < %.1f%%\n", cpuThreshold)
|
||||
if nvmlAvailable {
|
||||
log.Printf("- Average GPU usage across all GPUs < %.1f%%\n", gpuThreshold)
|
||||
} else {
|
||||
log.Printf("- GPU monitoring disabled (NVML initialization failed)")
|
||||
}
|
||||
log.Printf("- Disk I/O < %.1f MB/s\n", float64(diskThreshold)/(1024*1024))
|
||||
log.Printf("- Network I/O < %.1f MB/s\n", float64(networkThreshold)/(1024*1024))
|
||||
log.Printf("- No active SSH connections\n")
|
||||
log.Printf("- No active user sessions\n")
|
||||
log.Printf("Over the last %v\n", monitoringPeriod)
|
||||
log.Printf("HTTP status endpoint available at http://localhost:%d/status\n", httpPort)
|
||||
log.Printf("Press Ctrl+C to exit\n")
|
||||
|
||||
for range ticker.C {
|
||||
mainLoop:
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
break mainLoop
|
||||
case sig := <-sigChan:
|
||||
log.Printf("Received signal %v, shutting down...", sig)
|
||||
cancel()
|
||||
break mainLoop
|
||||
case <-ticker.C:
|
||||
usage := getCurrentUsage()
|
||||
usageHistory = append(usageHistory, usage)
|
||||
|
||||
@ -88,7 +118,7 @@ func main() {
|
||||
// Remove entries older than monitoring period
|
||||
cutoff := time.Now().Add(-monitoringPeriod)
|
||||
for i, u := range usageHistory {
|
||||
if u.timestamp.After(cutoff) {
|
||||
if u.Timestamp.After(cutoff) {
|
||||
usageHistory = usageHistory[i:]
|
||||
break
|
||||
}
|
||||
@ -96,10 +126,12 @@ func main() {
|
||||
|
||||
if len(usageHistory) > 0 && isSystemIdle(usageHistory) {
|
||||
log.Printf("System status before suspend:\n")
|
||||
log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].cpuUsage)
|
||||
log.Printf("- GPU: %.1f%%\n", usageHistory[len(usageHistory)-1].gpuUsage)
|
||||
log.Printf("- SSH connections: %d\n", usageHistory[len(usageHistory)-1].sshConnections)
|
||||
log.Printf("- Active users: %d\n", usageHistory[len(usageHistory)-1].activeUsers)
|
||||
log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].CpuUsage)
|
||||
if nvmlAvailable {
|
||||
log.Printf("- GPU: %.1f%%\n", usageHistory[len(usageHistory)-1].GpuUsage)
|
||||
}
|
||||
log.Printf("- SSH connections: %d\n", usageHistory[len(usageHistory)-1].SshConnections)
|
||||
log.Printf("- Active users: %d\n", usageHistory[len(usageHistory)-1].ActiveUsers)
|
||||
log.Println("System has been idle for the monitoring period. Suspending...")
|
||||
if err := suspendSystem(); err != nil {
|
||||
log.Printf("Failed to suspend system: %v", err)
|
||||
@ -108,11 +140,31 @@ func main() {
|
||||
}
|
||||
}
|
||||
|
||||
func startHTTPServer() {
|
||||
http.HandleFunc("/status", handleStatus)
|
||||
if err := http.ListenAndServe(fmt.Sprintf(":%d", httpPort), nil); err != nil {
|
||||
log.Printf("Failed to start HTTP server: %v", err)
|
||||
// Graceful shutdown of HTTP server
|
||||
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer shutdownCancel()
|
||||
|
||||
if err := srv.Shutdown(shutdownCtx); err != nil {
|
||||
log.Printf("HTTP server shutdown error: %v", err)
|
||||
}
|
||||
|
||||
log.Println("Goodbye!")
|
||||
}
|
||||
|
||||
func startHTTPServer(ctx context.Context) *http.Server {
|
||||
srv := &http.Server{
|
||||
Addr: fmt.Sprintf(":%d", httpPort),
|
||||
}
|
||||
|
||||
http.HandleFunc("/status", handleStatus)
|
||||
|
||||
go func() {
|
||||
if err := srv.ListenAndServe(); err != http.ErrServerClosed {
|
||||
log.Printf("HTTP server error: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
return srv
|
||||
}
|
||||
|
||||
func handleStatus(w http.ResponseWriter, r *http.Request) {
|
||||
@ -132,17 +184,17 @@ func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
|
||||
|
||||
if len(history) >= 2 {
|
||||
// Calculate rates using last two samples
|
||||
duration := history[len(history)-1].timestamp.Sub(history[len(history)-2].timestamp).Seconds()
|
||||
diskIORate := float64(history[len(history)-1].diskIO-history[len(history)-2].diskIO) / duration
|
||||
netIORate := float64(history[len(history)-1].networkIO-history[len(history)-2].networkIO) / duration
|
||||
duration := history[len(history)-1].Timestamp.Sub(history[len(history)-2].Timestamp).Seconds()
|
||||
diskIORate := float64(history[len(history)-1].DiskIO-history[len(history)-2].DiskIO) / duration
|
||||
netIORate := float64(history[len(history)-1].NetworkIO-history[len(history)-2].NetworkIO) / duration
|
||||
|
||||
if current.cpuUsage >= cpuThreshold {
|
||||
if current.CpuUsage >= cpuThreshold {
|
||||
currentStatus.Blockers = append(currentStatus.Blockers,
|
||||
fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.cpuUsage, cpuThreshold))
|
||||
fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.CpuUsage, cpuThreshold))
|
||||
}
|
||||
if current.gpuUsage >= gpuThreshold {
|
||||
if nvmlAvailable && current.GpuUsage >= gpuThreshold {
|
||||
currentStatus.Blockers = append(currentStatus.Blockers,
|
||||
fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.gpuUsage, gpuThreshold))
|
||||
fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.GpuUsage, gpuThreshold))
|
||||
}
|
||||
if diskIORate >= float64(diskThreshold) {
|
||||
currentStatus.Blockers = append(currentStatus.Blockers,
|
||||
@ -156,13 +208,13 @@ func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
|
||||
}
|
||||
}
|
||||
|
||||
if current.sshConnections > 0 {
|
||||
if current.SshConnections > 0 {
|
||||
currentStatus.Blockers = append(currentStatus.Blockers,
|
||||
fmt.Sprintf("Active SSH connections: %d", current.sshConnections))
|
||||
fmt.Sprintf("Active SSH connections: %d", current.SshConnections))
|
||||
}
|
||||
if current.activeUsers > 0 {
|
||||
if current.ActiveUsers > 0 {
|
||||
currentStatus.Blockers = append(currentStatus.Blockers,
|
||||
fmt.Sprintf("Active user sessions: %d", current.activeUsers))
|
||||
fmt.Sprintf("Active user sessions: %d", current.ActiveUsers))
|
||||
}
|
||||
|
||||
if len(currentStatus.Blockers) == 0 {
|
||||
@ -172,7 +224,8 @@ func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
|
||||
|
||||
func getCurrentUsage() ResourceUsage {
|
||||
usage := ResourceUsage{
|
||||
timestamp: time.Now(),
|
||||
Timestamp: time.Now(),
|
||||
GpuAvailable: nvmlAvailable,
|
||||
}
|
||||
|
||||
// Get CPU usage across all cores
|
||||
@ -182,10 +235,11 @@ func getCurrentUsage() ResourceUsage {
|
||||
for _, percent := range cpuPercent {
|
||||
totalCPU += percent
|
||||
}
|
||||
usage.cpuUsage = totalCPU / float64(len(cpuPercent))
|
||||
usage.CpuUsage = totalCPU / float64(len(cpuPercent))
|
||||
}
|
||||
|
||||
// Get GPU usage across all GPUs
|
||||
// Get GPU usage across all GPUs if available
|
||||
if nvmlAvailable {
|
||||
count, ret := nvml.DeviceGetCount()
|
||||
if ret == nvml.SUCCESS && count > 0 {
|
||||
var totalGPU float64
|
||||
@ -201,7 +255,8 @@ func getCurrentUsage() ResourceUsage {
|
||||
}
|
||||
}
|
||||
if activeGPUs > 0 {
|
||||
usage.gpuUsage = totalGPU / float64(activeGPUs)
|
||||
usage.GpuUsage = totalGPU / float64(activeGPUs)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -211,22 +266,22 @@ func getCurrentUsage() ResourceUsage {
|
||||
for _, stat := range diskStats {
|
||||
totalIO += stat.ReadBytes + stat.WriteBytes
|
||||
}
|
||||
usage.diskIO = totalIO
|
||||
usage.DiskIO = totalIO
|
||||
}
|
||||
|
||||
// Get network I/O
|
||||
if netStats, err := net.IOCounters(false); err == nil && len(netStats) > 0 {
|
||||
usage.networkIO = netStats[0].BytesSent + netStats[0].BytesRecv
|
||||
usage.NetworkIO = netStats[0].BytesSent + netStats[0].BytesRecv
|
||||
}
|
||||
|
||||
// Count SSH connections
|
||||
if sshCount, err := getSSHConnectionCount(); err == nil {
|
||||
usage.sshConnections = sshCount
|
||||
usage.SshConnections = sshCount
|
||||
}
|
||||
|
||||
// Count active user sessions
|
||||
if userCount, err := getActiveUserCount(); err == nil {
|
||||
usage.activeUsers = userCount
|
||||
usage.ActiveUsers = userCount
|
||||
}
|
||||
|
||||
return usage
|
||||
@ -286,30 +341,43 @@ func isSystemIdle(history []ResourceUsage) bool {
|
||||
samples := len(history)
|
||||
|
||||
for _, usage := range history {
|
||||
avgCPU += usage.cpuUsage
|
||||
avgGPU += usage.gpuUsage
|
||||
if usage.sshConnections > maxSSHConnections {
|
||||
maxSSHConnections = usage.sshConnections
|
||||
avgCPU += usage.CpuUsage
|
||||
if nvmlAvailable {
|
||||
avgGPU += usage.GpuUsage
|
||||
}
|
||||
if usage.activeUsers > maxActiveUsers {
|
||||
maxActiveUsers = usage.activeUsers
|
||||
if usage.SshConnections > maxSSHConnections {
|
||||
maxSSHConnections = usage.SshConnections
|
||||
}
|
||||
if usage.ActiveUsers > maxActiveUsers {
|
||||
maxActiveUsers = usage.ActiveUsers
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate I/O rates using first and last samples
|
||||
duration := history[samples-1].timestamp.Sub(history[0].timestamp).Seconds()
|
||||
diskIORate := float64(history[samples-1].diskIO-history[0].diskIO) / duration
|
||||
netIORate := float64(history[samples-1].networkIO-history[0].networkIO) / duration
|
||||
duration := history[samples-1].Timestamp.Sub(history[0].Timestamp).Seconds()
|
||||
diskIORate := float64(history[samples-1].DiskIO-history[0].DiskIO) / duration
|
||||
netIORate := float64(history[samples-1].NetworkIO-history[0].NetworkIO) / duration
|
||||
|
||||
avgCPU /= float64(samples)
|
||||
if nvmlAvailable {
|
||||
avgGPU /= float64(samples)
|
||||
}
|
||||
|
||||
return avgCPU < cpuThreshold &&
|
||||
avgGPU < gpuThreshold &&
|
||||
diskIORate < float64(diskThreshold) &&
|
||||
netIORate < float64(networkThreshold) &&
|
||||
maxSSHConnections == 0 &&
|
||||
maxActiveUsers == 0
|
||||
// Basic checks that always apply
|
||||
if avgCPU >= cpuThreshold ||
|
||||
diskIORate >= float64(diskThreshold) ||
|
||||
netIORate >= float64(networkThreshold) ||
|
||||
maxSSHConnections > 0 ||
|
||||
maxActiveUsers > 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
// GPU check only if NVML is available
|
||||
if nvmlAvailable && avgGPU >= gpuThreshold {
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func suspendSystem() error {
|
||||
|
Loading…
x
Reference in New Issue
Block a user