373 lines
9.6 KiB
Go
373 lines
9.6 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"os/exec"
|
|
"os/signal"
|
|
"strings"
|
|
"sync"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
|
"github.com/shirou/gopsutil/v3/cpu"
|
|
"github.com/shirou/gopsutil/v3/disk"
|
|
"github.com/shirou/gopsutil/v3/net"
|
|
"github.com/shirou/gopsutil/v3/process"
|
|
)
|
|
|
|
// Tunables for idle detection and for the HTTP status server.
const (
	// checkInterval is the period of the sampling ticker in main. It is also
	// the divisor used to turn byte-counter deltas into per-second rates.
	checkInterval = 10 * time.Second

	// monitoringPeriod is how far into the future the next suspend is pushed
	// whenever a blocker is seen or the /reset endpoint is hit.
	monitoringPeriod = 3 * time.Minute

	// cpuThreshold and gpuThreshold are utilization percentages; a reading at
	// or above the threshold counts as a blocker.
	cpuThreshold = 20.0
	gpuThreshold = 20.0

	// diskThreshold and networkThreshold are throughput limits in bytes per
	// second (5 MB/s and 1 MB/s); a rate at or above the limit is a blocker.
	diskThreshold    = 5 << 20
	networkThreshold = 1 << 20

	// httpPort is the TCP port the status server listens on.
	httpPort = 8081
)
|
|
|
|
// ResourceUsage is a single sample of the signals used to decide whether the
// machine is idle. It is serialized as the "usage" member of the /status
// response.
type ResourceUsage struct {
	// Timestamp is the time the sample was taken.
	Timestamp time.Time `json:"timestamp"`

	// CpuUsage is the utilization percentage averaged over all CPU cores.
	CpuUsage float64 `json:"cpu_usage"`

	// GpuUsage is the utilization percentage averaged over the GPUs that
	// could be queried; it stays zero when GpuAvailable is false.
	GpuUsage float64 `json:"gpu_usage"`

	// GpuAvailable mirrors whether NVML was initialized at startup.
	GpuAvailable bool `json:"gpu_available"`

	// DiskIO is the combined read+write rate in bytes per second.
	DiskIO float64 `json:"disk_io"`

	// NetworkIO is the combined sent+received rate in bytes per second.
	NetworkIO float64 `json:"network_io"`

	// SshConnections is the number of sshd processes serving a session.
	SshConnections int `json:"ssh_connections"`

	// ActiveUsers is the number of login sessions listed by who(1).
	ActiveUsers int `json:"active_users"`

	// HttpRequests counts /reset calls recorded against this sample.
	HttpRequests int `json:"http_requests"`
}
|
|
|
|
type StatusResponse struct {
|
|
Usage ResourceUsage `json:"usage"`
|
|
Blockers []string `json:"blockers"`
|
|
Timestamp time.Time `json:"timestamp"`
|
|
SleepAt time.Time `json:"sleep_at"`
|
|
}
|
|
|
|
var (
|
|
statusMutex sync.RWMutex
|
|
currentStatus ResourceUsage
|
|
nvmlAvailable bool
|
|
networkBytesIO uint64
|
|
diskBytesIO uint64
|
|
nextSleepTime time.Time
|
|
)
|
|
|
|
func main() {
|
|
// Check if running as root
|
|
if os.Geteuid() != 0 {
|
|
log.Fatal("This program must be run as root")
|
|
}
|
|
|
|
// Initialize NVML for GPU monitoring
|
|
ret := nvml.Init()
|
|
if !errors.Is(ret, nvml.SUCCESS) {
|
|
log.Printf("Warning: Could not initialize NVML: %v", ret)
|
|
nvmlAvailable = false
|
|
} else {
|
|
nvmlAvailable = true
|
|
defer nvml.Shutdown()
|
|
}
|
|
|
|
// Set up signal handling
|
|
sigChan := make(chan os.Signal, 1)
|
|
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
|
|
|
|
// Start HTTP server with context
|
|
srv := startHTTPServer()
|
|
|
|
ticker := time.NewTicker(checkInterval)
|
|
defer ticker.Stop()
|
|
|
|
log.Printf("Starting idle monitoring. System will suspend when:\n")
|
|
log.Printf("- Average CPU usage across all cores < %.1f%%\n", cpuThreshold)
|
|
if nvmlAvailable {
|
|
log.Printf("- Average GPU usage across all GPUs < %.1f%%\n", gpuThreshold)
|
|
} else {
|
|
log.Printf("- GPU monitoring disabled (NVML initialization failed)")
|
|
}
|
|
log.Printf("- Disk I/O < %.1f MB/s\n", float64(diskThreshold)/(1024*1024))
|
|
log.Printf("- Network I/O < %.1f MB/s\n", float64(networkThreshold)/(1024*1024))
|
|
log.Printf("- No active SSH connections\n")
|
|
log.Printf("- No active user sessions\n")
|
|
log.Printf("- Over the last %v\n", monitoringPeriod)
|
|
log.Printf("HTTP status endpoint available at http://localhost:%d/status\n", httpPort)
|
|
log.Printf("Press Ctrl+C to exit\n")
|
|
|
|
mainLoop:
|
|
for {
|
|
select {
|
|
case sig := <-sigChan:
|
|
log.Printf("Received signal %v, shutting down...", sig)
|
|
break mainLoop
|
|
case <-ticker.C:
|
|
updateCurrentUsage()
|
|
blocked, _ := getBlockers()
|
|
if blocked {
|
|
delaySleep()
|
|
}
|
|
|
|
if time.Now().After(nextSleepTime) {
|
|
log.Printf("System status before suspend:\n")
|
|
log.Printf("- CPU: %.1f%%\n", currentStatus.CpuUsage)
|
|
if nvmlAvailable {
|
|
log.Printf("- GPU: %.1f%%\n", currentStatus.GpuUsage)
|
|
}
|
|
log.Printf("- SSH connections: %d\n", currentStatus.SshConnections)
|
|
log.Printf("- Active users: %d\n", currentStatus.ActiveUsers)
|
|
log.Printf("- Disk I/O: %.1f MB/s\n", currentStatus.DiskIO/(1024*1024))
|
|
log.Printf("- Network I/O: %.1f MB/s\n", currentStatus.NetworkIO/(1024*1024))
|
|
log.Println("System has been idle for the monitoring period. Suspending...")
|
|
if err := suspendSystem(); err != nil {
|
|
log.Printf("Failed to suspend system: %v", err)
|
|
}
|
|
nextSleepTime = time.Now().Add(monitoringPeriod)
|
|
log.Printf("Resumed")
|
|
}
|
|
}
|
|
}
|
|
|
|
// Graceful shutdown of HTTP server
|
|
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
|
defer shutdownCancel()
|
|
|
|
if err := srv.Shutdown(shutdownCtx); err != nil {
|
|
log.Printf("HTTP server shutdown error: %v", err)
|
|
}
|
|
|
|
log.Println("Goodbye!")
|
|
}
|
|
|
|
func startHTTPServer() *http.Server {
|
|
srv := &http.Server{
|
|
Addr: fmt.Sprintf(":%d", httpPort),
|
|
}
|
|
|
|
http.HandleFunc("/status", handleStatus)
|
|
http.HandleFunc("/reset", handleReset)
|
|
|
|
go func() {
|
|
if err := srv.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
|
|
log.Printf("HTTP server error: %v", err)
|
|
}
|
|
}()
|
|
|
|
return srv
|
|
}
|
|
|
|
func handleStatus(w http.ResponseWriter, _ *http.Request) {
|
|
statusMutex.RLock()
|
|
defer statusMutex.RUnlock()
|
|
|
|
_, blockers := getBlockersNoLock()
|
|
|
|
response := StatusResponse{
|
|
Usage: currentStatus,
|
|
Blockers: blockers,
|
|
Timestamp: time.Now(),
|
|
SleepAt: nextSleepTime,
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
err := json.NewEncoder(w).Encode(response)
|
|
if err != nil {
|
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
|
}
|
|
}
|
|
|
|
func handleReset(w http.ResponseWriter, _ *http.Request) {
|
|
statusMutex.Lock()
|
|
defer statusMutex.Unlock()
|
|
|
|
delaySleep()
|
|
|
|
currentStatus.HttpRequests++
|
|
nextSleepTime = time.Now().Add(monitoringPeriod)
|
|
w.WriteHeader(http.StatusOK)
|
|
}
|
|
|
|
func getBlockers() (bool, []string) {
|
|
statusMutex.Lock()
|
|
defer statusMutex.Unlock()
|
|
return getBlockersNoLock()
|
|
}
|
|
|
|
func getBlockersNoLock() (bool, []string) {
|
|
blocked := true
|
|
blockers := []string{}
|
|
|
|
if currentStatus.CpuUsage >= cpuThreshold {
|
|
blockers = append(blockers,
|
|
fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", currentStatus.CpuUsage, cpuThreshold))
|
|
}
|
|
if nvmlAvailable && currentStatus.GpuUsage >= gpuThreshold {
|
|
blockers = append(blockers,
|
|
fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", currentStatus.GpuUsage, gpuThreshold))
|
|
}
|
|
if currentStatus.DiskIO >= float64(diskThreshold) {
|
|
blockers = append(blockers,
|
|
fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s",
|
|
currentStatus.DiskIO/(1024*1024), float64(diskThreshold)/(1024*1024)))
|
|
}
|
|
if currentStatus.NetworkIO >= float64(networkThreshold) {
|
|
blockers = append(blockers,
|
|
fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s",
|
|
currentStatus.NetworkIO/(1024*1024), float64(networkThreshold)/(1024*1024)))
|
|
}
|
|
|
|
if currentStatus.SshConnections > 0 {
|
|
blockers = append(blockers,
|
|
fmt.Sprintf("Active SSH connections: %d", currentStatus.SshConnections))
|
|
}
|
|
if currentStatus.ActiveUsers > 0 {
|
|
blockers = append(blockers,
|
|
fmt.Sprintf("Active user sessions: %d", currentStatus.ActiveUsers))
|
|
}
|
|
|
|
if currentStatus.HttpRequests > 0 {
|
|
blockers = append(blockers,
|
|
fmt.Sprintf("HTTP requests received: %d", currentStatus.HttpRequests))
|
|
}
|
|
|
|
if len(blockers) == 0 {
|
|
blockers = append(blockers, "No blockers - system can sleep")
|
|
blocked = false
|
|
}
|
|
return blocked, blockers
|
|
}
|
|
|
|
func updateCurrentUsage() {
|
|
statusMutex.Lock()
|
|
defer statusMutex.Unlock()
|
|
|
|
usage := ResourceUsage{
|
|
Timestamp: time.Now(),
|
|
GpuAvailable: nvmlAvailable,
|
|
}
|
|
|
|
// Get CPU usage across all cores
|
|
if cpuPercent, err := cpu.Percent(0, true); err == nil && len(cpuPercent) > 0 {
|
|
// Calculate average CPU usage across all cores
|
|
var totalCPU float64
|
|
for _, percent := range cpuPercent {
|
|
totalCPU += percent
|
|
}
|
|
usage.CpuUsage = totalCPU / float64(len(cpuPercent))
|
|
}
|
|
|
|
// Get GPU usage across all GPUs if available
|
|
if nvmlAvailable {
|
|
count, ret := nvml.DeviceGetCount()
|
|
if errors.Is(ret, nvml.SUCCESS) && count > 0 {
|
|
var totalGPU float64
|
|
var activeGPUs int
|
|
for i := 0; i < count; i++ {
|
|
device, ret := nvml.DeviceGetHandleByIndex(i)
|
|
if errors.Is(ret, nvml.SUCCESS) {
|
|
utilization, ret := device.GetUtilizationRates()
|
|
if errors.Is(ret, nvml.SUCCESS) {
|
|
totalGPU += float64(utilization.Gpu)
|
|
activeGPUs++
|
|
}
|
|
}
|
|
}
|
|
if activeGPUs > 0 {
|
|
usage.GpuUsage = totalGPU / float64(activeGPUs)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Get disk I/O
|
|
if diskStats, err := disk.IOCounters(); err == nil {
|
|
var totalIO uint64
|
|
for _, stat := range diskStats {
|
|
totalIO += stat.ReadBytes + stat.WriteBytes
|
|
}
|
|
|
|
previousDiskBytesIO := diskBytesIO
|
|
diskBytesIO = totalIO
|
|
diff := diskBytesIO - previousDiskBytesIO
|
|
usage.DiskIO = float64(diff) / checkInterval.Seconds()
|
|
}
|
|
|
|
// Get network I/O
|
|
if netStats, err := net.IOCounters(false); err == nil && len(netStats) > 0 {
|
|
previousNetworkBytesIO := networkBytesIO
|
|
networkBytesIO = netStats[0].BytesSent + netStats[0].BytesRecv
|
|
diff := networkBytesIO - previousNetworkBytesIO
|
|
usage.NetworkIO = float64(diff) / checkInterval.Seconds()
|
|
}
|
|
|
|
// Count SSH connections
|
|
if sshCount, err := getSSHConnectionCount(); err == nil {
|
|
usage.SshConnections = sshCount
|
|
}
|
|
|
|
// Count active user sessions
|
|
if userCount, err := getActiveUserCount(); err == nil {
|
|
usage.ActiveUsers = userCount
|
|
}
|
|
|
|
currentStatus = usage
|
|
}
|
|
|
|
func getSSHConnectionCount() (int, error) {
|
|
processes, err := process.Processes()
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
|
|
count := 0
|
|
for _, p := range processes {
|
|
name, err := p.Name()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if name == "sshd" {
|
|
cmdline, err := p.Cmdline()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
// Only count sshd processes that are handling connections
|
|
// The main sshd process doesn't have "@" in its cmdline
|
|
if strings.Contains(cmdline, "@") {
|
|
count++
|
|
}
|
|
}
|
|
}
|
|
return count, nil
|
|
}
|
|
|
|
// getActiveUserCount runs "who -s" and returns the number of non-empty lines
// in its output, i.e. one per listed login session. If the command cannot be
// run or exits with an error, that error is returned with a count of 0.
func getActiveUserCount() (int, error) {
	raw, err := exec.Command("who", "-s").Output()
	if err != nil {
		return 0, err
	}

	// Trim first so that empty output yields a single empty entry, which the
	// loop below then ignores.
	listing := strings.TrimSpace(string(raw))

	sessions := 0
	for _, entry := range strings.Split(listing, "\n") {
		if len(entry) > 0 {
			sessions++
		}
	}
	return sessions, nil
}
|
|
|
|
// suspendSystem asks systemd to suspend the machine by running
// "systemctl suspend" and returns whatever error running that command yields.
func suspendSystem() error {
	return exec.Command("systemctl", "suspend").Run()
}
|
|
|
|
func delaySleep() {
|
|
nextSleepTime = time.Now().Add(monitoringPeriod)
|
|
}
|