Simplify system
This commit is contained in:
parent
5a27277e7c
commit
93fe9ebed3
231
main.go
231
main.go
@ -3,6 +3,7 @@ package main
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
"net/http"
|
"net/http"
|
||||||
@ -37,27 +38,26 @@ type ResourceUsage struct {
|
|||||||
CpuUsage float64 `json:"cpu_usage"`
|
CpuUsage float64 `json:"cpu_usage"`
|
||||||
GpuUsage float64 `json:"gpu_usage"`
|
GpuUsage float64 `json:"gpu_usage"`
|
||||||
GpuAvailable bool `json:"gpu_available"`
|
GpuAvailable bool `json:"gpu_available"`
|
||||||
DiskIO uint64 `json:"disk_io"`
|
DiskIO float64 `json:"disk_io"`
|
||||||
NetworkIO uint64 `json:"network_io"`
|
NetworkIO float64 `json:"network_io"`
|
||||||
SshConnections int `json:"ssh_connections"`
|
SshConnections int `json:"ssh_connections"`
|
||||||
ActiveUsers int `json:"active_users"`
|
ActiveUsers int `json:"active_users"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type SystemStatus struct {
|
|
||||||
CurrentUsage ResourceUsage `json:"current_usage"`
|
|
||||||
Blockers []string `json:"sleep_blockers"`
|
|
||||||
InGracePeriod bool `json:"in_grace_period,omitempty"`
|
|
||||||
GraceTimeLeft string `json:"grace_time_left,omitempty"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// Package-level monitoring state.
var (
|
var (
|
||||||
currentStatus SystemStatus
|
|
||||||
statusMutex sync.RWMutex
|
// statusMutex is held by handleStatus, updateSystemStatus and
// updateCurrentUsage while they access the shared state below.
statusMutex sync.RWMutex
|
||||||
|
// blockers lists the human-readable reasons that currently prevent a
// suspend; it is rebuilt on every updateSystemStatus call.
blockers []string
|
||||||
|
// currentStatus is the most recent sample: written by updateCurrentUsage
// and served as JSON by handleStatus.
currentStatus ResourceUsage
|
||||||
nvmlAvailable bool
|
// nvmlAvailable records whether nvml.Init succeeded in main; the GPU
// checks are only performed when it is true.
nvmlAvailable bool
|
||||||
lastResumeTime time.Time // Track when the system last resumed from sleep
|
// lastBlockedTime is refreshed whenever updateSystemStatus finds a blocker
// and after a suspend attempt returns; main only suspends once
// monitoringPeriod has elapsed since this instant.
lastBlockedTime time.Time
|
||||||
lastTickTime time.Time // Track when we last processed a tick
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Must terminates the process through log.Fatalf when err is non-nil and does
// nothing otherwise.
//
// NOTE(review): in a statement such as `defer Must(f())`, Go evaluates the
// argument f() at the moment the defer statement executes, not when the
// surrounding function returns; only the Must call itself is deferred. Wrap
// the call in a closure if f has to run at exit.
func Must(err error) {
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Error: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
// Check if running as root
|
// Check if running as root
|
||||||
if os.Geteuid() != 0 {
|
if os.Geteuid() != 0 {
|
||||||
@ -66,24 +66,20 @@ func main() {
|
|||||||
|
|
||||||
// Initialize NVML for GPU monitoring
|
// Initialize NVML for GPU monitoring
|
||||||
ret := nvml.Init()
|
ret := nvml.Init()
|
||||||
if ret != nvml.SUCCESS {
|
if !errors.Is(ret, nvml.SUCCESS) {
|
||||||
log.Printf("Warning: Could not initialize NVML: %v", ret)
|
log.Printf("Warning: Could not initialize NVML: %v", ret)
|
||||||
nvmlAvailable = false
|
nvmlAvailable = false
|
||||||
} else {
|
} else {
|
||||||
nvmlAvailable = true
|
nvmlAvailable = true
|
||||||
defer nvml.Shutdown()
|
defer Must(nvml.Shutdown())
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create a context that we'll use to shut down the application
|
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
// Set up signal handling
|
// Set up signal handling
|
||||||
sigChan := make(chan os.Signal, 1)
|
sigChan := make(chan os.Signal, 1)
|
||||||
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
|
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
|
||||||
|
|
||||||
// Start HTTP server with context
|
// Start HTTP server with context
|
||||||
srv := startHTTPServer(ctx)
|
srv := startHTTPServer()
|
||||||
|
|
||||||
usageHistory := make([]ResourceUsage, 0)
|
usageHistory := make([]ResourceUsage, 0)
|
||||||
ticker := time.NewTicker(checkInterval)
|
ticker := time.NewTicker(checkInterval)
|
||||||
@ -108,42 +104,14 @@ func main() {
|
|||||||
mainLoop:
|
mainLoop:
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
|
||||||
break mainLoop
|
|
||||||
case sig := <-sigChan:
|
case sig := <-sigChan:
|
||||||
log.Printf("Received signal %v, shutting down...", sig)
|
log.Printf("Received signal %v, shutting down...", sig)
|
||||||
cancel()
|
|
||||||
break mainLoop
|
break mainLoop
|
||||||
case <-ticker.C:
|
case <-ticker.C:
|
||||||
now := time.Now()
|
updateCurrentUsage()
|
||||||
|
updateSystemStatus()
|
||||||
|
|
||||||
// Check if we just resumed from sleep
|
if time.Now().Sub(lastBlockedTime) >= monitoringPeriod {
|
||||||
if !lastTickTime.IsZero() {
|
|
||||||
gap := now.Sub(lastTickTime)
|
|
||||||
// If there was a significant gap, probably resumed from sleep
|
|
||||||
if gap > (checkInterval*3) && gap < time.Hour {
|
|
||||||
log.Printf("Detected system resume after gap of %v", gap)
|
|
||||||
lastResumeTime = now
|
|
||||||
}
|
|
||||||
}
|
|
||||||
lastTickTime = now
|
|
||||||
|
|
||||||
usage := getCurrentUsage()
|
|
||||||
usageHistory = append(usageHistory, usage)
|
|
||||||
|
|
||||||
// Update current status
|
|
||||||
updateSystemStatus(usage, usageHistory)
|
|
||||||
|
|
||||||
// Remove entries older than monitoring period
|
|
||||||
cutoff := time.Now().Add(-monitoringPeriod)
|
|
||||||
for i, u := range usageHistory {
|
|
||||||
if u.Timestamp.After(cutoff) {
|
|
||||||
usageHistory = usageHistory[i:]
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(usageHistory) > 0 && isSystemIdle(usageHistory) {
|
|
||||||
log.Printf("System status before suspend:\n")
|
log.Printf("System status before suspend:\n")
|
||||||
log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].CpuUsage)
|
log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].CpuUsage)
|
||||||
if nvmlAvailable {
|
if nvmlAvailable {
|
||||||
@ -155,6 +123,8 @@ mainLoop:
|
|||||||
if err := suspendSystem(); err != nil {
|
if err := suspendSystem(); err != nil {
|
||||||
log.Printf("Failed to suspend system: %v", err)
|
log.Printf("Failed to suspend system: %v", err)
|
||||||
}
|
}
|
||||||
|
lastBlockedTime = time.Now()
|
||||||
|
log.Printf("Resumed")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -170,20 +140,7 @@ mainLoop:
|
|||||||
log.Println("Goodbye!")
|
log.Println("Goodbye!")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Function to check if we're within the resume grace period
|
func startHTTPServer() *http.Server {
|
||||||
func isInsideResumeGracePeriod() bool {
|
|
||||||
return !lastResumeTime.IsZero() && time.Since(lastResumeTime) < resumeGracePeriod
|
|
||||||
}
|
|
||||||
|
|
||||||
// Function to calculate time left in grace period
|
|
||||||
func timeLeftInGracePeriod() time.Duration {
|
|
||||||
if !isInsideResumeGracePeriod() {
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
return resumeGracePeriod - time.Since(lastResumeTime)
|
|
||||||
}
|
|
||||||
|
|
||||||
func startHTTPServer(ctx context.Context) *http.Server {
|
|
||||||
srv := &http.Server{
|
srv := &http.Server{
|
||||||
Addr: fmt.Sprintf(":%d", httpPort),
|
Addr: fmt.Sprintf(":%d", httpPort),
|
||||||
}
|
}
|
||||||
@ -191,7 +148,7 @@ func startHTTPServer(ctx context.Context) *http.Server {
|
|||||||
http.HandleFunc("/status", handleStatus)
|
http.HandleFunc("/status", handleStatus)
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
if err := srv.ListenAndServe(); err != http.ErrServerClosed {
|
if err := srv.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
|
||||||
log.Printf("HTTP server error: %v", err)
|
log.Printf("HTTP server error: %v", err)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
@ -199,74 +156,62 @@ func startHTTPServer(ctx context.Context) *http.Server {
|
|||||||
return srv
|
return srv
|
||||||
}
|
}
|
||||||
|
|
||||||
func handleStatus(w http.ResponseWriter, r *http.Request) {
|
func handleStatus(w http.ResponseWriter, _ *http.Request) {
|
||||||
statusMutex.RLock()
|
statusMutex.RLock()
|
||||||
defer statusMutex.RUnlock()
|
defer statusMutex.RUnlock()
|
||||||
|
|
||||||
w.Header().Set("Content-Type", "application/json")
|
w.Header().Set("Content-Type", "application/json")
|
||||||
json.NewEncoder(w).Encode(currentStatus)
|
err := json.NewEncoder(w).Encode(currentStatus)
|
||||||
|
if err != nil {
|
||||||
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
|
func updateSystemStatus() {
|
||||||
statusMutex.Lock()
|
statusMutex.Lock()
|
||||||
defer statusMutex.Unlock()
|
defer statusMutex.Unlock()
|
||||||
|
|
||||||
currentStatus.CurrentUsage = current
|
blockers = []string{}
|
||||||
currentStatus.Blockers = []string{}
|
|
||||||
|
|
||||||
// Add grace period info to status
|
if currentStatus.CpuUsage >= cpuThreshold {
|
||||||
if isInsideResumeGracePeriod() {
|
blockers = append(blockers,
|
||||||
timeLeft := timeLeftInGracePeriod()
|
fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", currentStatus.CpuUsage, cpuThreshold))
|
||||||
currentStatus.InGracePeriod = true
|
|
||||||
currentStatus.GraceTimeLeft = timeLeft.Round(time.Second).String()
|
|
||||||
currentStatus.Blockers = append(currentStatus.Blockers,
|
|
||||||
fmt.Sprintf("Resume grace period: %v remaining", timeLeft.Round(time.Second)))
|
|
||||||
} else {
|
|
||||||
currentStatus.InGracePeriod = false
|
|
||||||
currentStatus.GraceTimeLeft = ""
|
|
||||||
}
|
}
|
||||||
|
if nvmlAvailable && currentStatus.GpuUsage >= gpuThreshold {
|
||||||
if len(history) >= 2 {
|
blockers = append(blockers,
|
||||||
// Calculate rates using last two samples
|
fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", currentStatus.GpuUsage, gpuThreshold))
|
||||||
duration := history[len(history)-1].Timestamp.Sub(history[len(history)-2].Timestamp).Seconds()
|
|
||||||
diskIORate := float64(history[len(history)-1].DiskIO-history[len(history)-2].DiskIO) / duration
|
|
||||||
netIORate := float64(history[len(history)-1].NetworkIO-history[len(history)-2].NetworkIO) / duration
|
|
||||||
|
|
||||||
if current.CpuUsage >= cpuThreshold {
|
|
||||||
currentStatus.Blockers = append(currentStatus.Blockers,
|
|
||||||
fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.CpuUsage, cpuThreshold))
|
|
||||||
}
|
}
|
||||||
if nvmlAvailable && current.GpuUsage >= gpuThreshold {
|
if currentStatus.DiskIO >= float64(diskThreshold) {
|
||||||
currentStatus.Blockers = append(currentStatus.Blockers,
|
blockers = append(blockers,
|
||||||
fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.GpuUsage, gpuThreshold))
|
|
||||||
}
|
|
||||||
if diskIORate >= float64(diskThreshold) {
|
|
||||||
currentStatus.Blockers = append(currentStatus.Blockers,
|
|
||||||
fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s",
|
fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s",
|
||||||
diskIORate/(1024*1024), float64(diskThreshold)/(1024*1024)))
|
currentStatus.DiskIO/(1024*1024), float64(diskThreshold)/(1024*1024)))
|
||||||
}
|
}
|
||||||
if netIORate >= float64(networkThreshold) {
|
if currentStatus.NetworkIO >= float64(networkThreshold) {
|
||||||
currentStatus.Blockers = append(currentStatus.Blockers,
|
blockers = append(blockers,
|
||||||
fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s",
|
fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s",
|
||||||
netIORate/(1024*1024), float64(networkThreshold)/(1024*1024)))
|
currentStatus.NetworkIO/(1024*1024), float64(networkThreshold)/(1024*1024)))
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if current.SshConnections > 0 {
|
if currentStatus.SshConnections > 0 {
|
||||||
currentStatus.Blockers = append(currentStatus.Blockers,
|
blockers = append(blockers,
|
||||||
fmt.Sprintf("Active SSH connections: %d", current.SshConnections))
|
fmt.Sprintf("Active SSH connections: %d", currentStatus.SshConnections))
|
||||||
}
|
}
|
||||||
if current.ActiveUsers > 0 {
|
if currentStatus.ActiveUsers > 0 {
|
||||||
currentStatus.Blockers = append(currentStatus.Blockers,
|
blockers = append(blockers,
|
||||||
fmt.Sprintf("Active user sessions: %d", current.ActiveUsers))
|
fmt.Sprintf("Active user sessions: %d", currentStatus.ActiveUsers))
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(currentStatus.Blockers) == 0 {
|
if len(blockers) == 0 {
|
||||||
currentStatus.Blockers = append(currentStatus.Blockers, "No blockers - system can sleep")
|
blockers = append(blockers, "No blockers - system can sleep")
|
||||||
|
} else {
|
||||||
|
lastBlockedTime = time.Now()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func getCurrentUsage() ResourceUsage {
|
func updateCurrentUsage() {
|
||||||
|
statusMutex.Lock()
|
||||||
|
defer statusMutex.Unlock()
|
||||||
|
|
||||||
usage := ResourceUsage{
|
usage := ResourceUsage{
|
||||||
Timestamp: time.Now(),
|
Timestamp: time.Now(),
|
||||||
GpuAvailable: nvmlAvailable,
|
GpuAvailable: nvmlAvailable,
|
||||||
@ -285,14 +230,14 @@ func getCurrentUsage() ResourceUsage {
|
|||||||
// Get GPU usage across all GPUs if available
|
// Get GPU usage across all GPUs if available
|
||||||
if nvmlAvailable {
|
if nvmlAvailable {
|
||||||
count, ret := nvml.DeviceGetCount()
|
count, ret := nvml.DeviceGetCount()
|
||||||
if ret == nvml.SUCCESS && count > 0 {
|
if errors.Is(ret, nvml.SUCCESS) && count > 0 {
|
||||||
var totalGPU float64
|
var totalGPU float64
|
||||||
var activeGPUs int
|
var activeGPUs int
|
||||||
for i := 0; i < count; i++ {
|
for i := 0; i < count; i++ {
|
||||||
device, ret := nvml.DeviceGetHandleByIndex(i)
|
device, ret := nvml.DeviceGetHandleByIndex(i)
|
||||||
if ret == nvml.SUCCESS {
|
if errors.Is(ret, nvml.SUCCESS) {
|
||||||
utilization, ret := device.GetUtilizationRates()
|
utilization, ret := device.GetUtilizationRates()
|
||||||
if ret == nvml.SUCCESS {
|
if errors.Is(ret, nvml.SUCCESS) {
|
||||||
totalGPU += float64(utilization.Gpu)
|
totalGPU += float64(utilization.Gpu)
|
||||||
activeGPUs++
|
activeGPUs++
|
||||||
}
|
}
|
||||||
@ -310,12 +255,12 @@ func getCurrentUsage() ResourceUsage {
|
|||||||
for _, stat := range diskStats {
|
for _, stat := range diskStats {
|
||||||
totalIO += stat.ReadBytes + stat.WriteBytes
|
totalIO += stat.ReadBytes + stat.WriteBytes
|
||||||
}
|
}
|
||||||
usage.DiskIO = totalIO
|
usage.DiskIO = float64(totalIO)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get network I/O
|
// Get network I/O
|
||||||
if netStats, err := net.IOCounters(false); err == nil && len(netStats) > 0 {
|
if netStats, err := net.IOCounters(false); err == nil && len(netStats) > 0 {
|
||||||
usage.NetworkIO = netStats[0].BytesSent + netStats[0].BytesRecv
|
usage.NetworkIO = float64(netStats[0].BytesSent + netStats[0].BytesRecv)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Count SSH connections
|
// Count SSH connections
|
||||||
@ -328,7 +273,7 @@ func getCurrentUsage() ResourceUsage {
|
|||||||
usage.ActiveUsers = userCount
|
usage.ActiveUsers = userCount
|
||||||
}
|
}
|
||||||
|
|
||||||
return usage
|
currentStatus = usage
|
||||||
}
|
}
|
||||||
|
|
||||||
func getSSHConnectionCount() (int, error) {
|
func getSSHConnectionCount() (int, error) {
|
||||||
@ -375,60 +320,6 @@ func getActiveUserCount() (int, error) {
|
|||||||
return count, nil
|
return count, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func isSystemIdle(history []ResourceUsage) bool {
|
|
||||||
// Don't allow sleep during grace period after resume
|
|
||||||
if isInsideResumeGracePeriod() {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(history) < 2 {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
var avgCPU, avgGPU float64
|
|
||||||
var maxSSHConnections, maxActiveUsers int
|
|
||||||
samples := len(history)
|
|
||||||
|
|
||||||
for _, usage := range history {
|
|
||||||
avgCPU += usage.CpuUsage
|
|
||||||
if nvmlAvailable {
|
|
||||||
avgGPU += usage.GpuUsage
|
|
||||||
}
|
|
||||||
if usage.SshConnections > maxSSHConnections {
|
|
||||||
maxSSHConnections = usage.SshConnections
|
|
||||||
}
|
|
||||||
if usage.ActiveUsers > maxActiveUsers {
|
|
||||||
maxActiveUsers = usage.ActiveUsers
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Calculate I/O rates using first and last samples
|
|
||||||
duration := history[samples-1].Timestamp.Sub(history[0].Timestamp).Seconds()
|
|
||||||
diskIORate := float64(history[samples-1].DiskIO-history[0].DiskIO) / duration
|
|
||||||
netIORate := float64(history[samples-1].NetworkIO-history[0].NetworkIO) / duration
|
|
||||||
|
|
||||||
avgCPU /= float64(samples)
|
|
||||||
if nvmlAvailable {
|
|
||||||
avgGPU /= float64(samples)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Basic checks that always apply
|
|
||||||
if avgCPU >= cpuThreshold ||
|
|
||||||
diskIORate >= float64(diskThreshold) ||
|
|
||||||
netIORate >= float64(networkThreshold) ||
|
|
||||||
maxSSHConnections > 0 ||
|
|
||||||
maxActiveUsers > 0 {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
// GPU check only if NVML is available
|
|
||||||
if nvmlAvailable && avgGPU >= gpuThreshold {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
func suspendSystem() error {
|
func suspendSystem() error {
|
||||||
cmd := exec.Command("systemctl", "suspend")
|
cmd := exec.Command("systemctl", "suspend")
|
||||||
return cmd.Run()
|
return cmd.Run()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user