Track SSH connections and add endpoint
This commit is contained in:
parent
ff7b0a3a70
commit
ad86ad077a
161
main.go
161
main.go
@ -1,15 +1,22 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||||
"github.com/shirou/gopsutil/v3/cpu"
|
"github.com/shirou/gopsutil/v3/cpu"
|
||||||
"github.com/shirou/gopsutil/v3/disk"
|
"github.com/shirou/gopsutil/v3/disk"
|
||||||
"github.com/shirou/gopsutil/v3/net"
|
"github.com/shirou/gopsutil/v3/net"
|
||||||
|
"github.com/shirou/gopsutil/v3/process"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@ -19,6 +26,7 @@ const (
|
|||||||
gpuThreshold = 20.0 // percentage
|
gpuThreshold = 20.0 // percentage
|
||||||
diskThreshold = 5 * 1024 * 1024 // 5 MB/s
|
diskThreshold = 5 * 1024 * 1024 // 5 MB/s
|
||||||
networkThreshold = 1 * 1024 * 1024 // 1 MB/s
|
networkThreshold = 1 * 1024 * 1024 // 1 MB/s
|
||||||
|
httpPort = 8080
|
||||||
)
|
)
|
||||||
|
|
||||||
type ResourceUsage struct {
|
type ResourceUsage struct {
|
||||||
@ -27,8 +35,20 @@ type ResourceUsage struct {
|
|||||||
gpuUsage float64
|
gpuUsage float64
|
||||||
diskIO uint64
|
diskIO uint64
|
||||||
networkIO uint64
|
networkIO uint64
|
||||||
|
sshConnections int
|
||||||
|
activeUsers int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type SystemStatus struct {
|
||||||
|
CurrentUsage ResourceUsage `json:"current_usage"`
|
||||||
|
Blockers []string `json:"sleep_blockers"`
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
currentStatus SystemStatus
|
||||||
|
statusMutex sync.RWMutex
|
||||||
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
// Check if running as root
|
// Check if running as root
|
||||||
if os.Geteuid() != 0 {
|
if os.Geteuid() != 0 {
|
||||||
@ -42,6 +62,9 @@ func main() {
|
|||||||
}
|
}
|
||||||
defer nvml.Shutdown()
|
defer nvml.Shutdown()
|
||||||
|
|
||||||
|
// Start HTTP server
|
||||||
|
go startHTTPServer()
|
||||||
|
|
||||||
usageHistory := make([]ResourceUsage, 0)
|
usageHistory := make([]ResourceUsage, 0)
|
||||||
ticker := time.NewTicker(checkInterval)
|
ticker := time.NewTicker(checkInterval)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
@ -51,12 +74,18 @@ func main() {
|
|||||||
log.Printf("- Average GPU usage across all GPUs < %.1f%%\n", gpuThreshold)
|
log.Printf("- Average GPU usage across all GPUs < %.1f%%\n", gpuThreshold)
|
||||||
log.Printf("- Disk I/O < %.1f MB/s\n", float64(diskThreshold)/(1024*1024))
|
log.Printf("- Disk I/O < %.1f MB/s\n", float64(diskThreshold)/(1024*1024))
|
||||||
log.Printf("- Network I/O < %.1f MB/s\n", float64(networkThreshold)/(1024*1024))
|
log.Printf("- Network I/O < %.1f MB/s\n", float64(networkThreshold)/(1024*1024))
|
||||||
|
log.Printf("- No active SSH connections\n")
|
||||||
|
log.Printf("- No active user sessions\n")
|
||||||
log.Printf("Over the last %v\n", monitoringPeriod)
|
log.Printf("Over the last %v\n", monitoringPeriod)
|
||||||
|
log.Printf("HTTP status endpoint available at http://localhost:%d/status\n", httpPort)
|
||||||
|
|
||||||
for range ticker.C {
|
for range ticker.C {
|
||||||
usage := getCurrentUsage()
|
usage := getCurrentUsage()
|
||||||
usageHistory = append(usageHistory, usage)
|
usageHistory = append(usageHistory, usage)
|
||||||
|
|
||||||
|
// Update current status
|
||||||
|
updateSystemStatus(usage, usageHistory)
|
||||||
|
|
||||||
// Remove entries older than monitoring period
|
// Remove entries older than monitoring period
|
||||||
cutoff := time.Now().Add(-monitoringPeriod)
|
cutoff := time.Now().Add(-monitoringPeriod)
|
||||||
for i, u := range usageHistory {
|
for i, u := range usageHistory {
|
||||||
@ -67,6 +96,11 @@ func main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if len(usageHistory) > 0 && isSystemIdle(usageHistory) {
|
if len(usageHistory) > 0 && isSystemIdle(usageHistory) {
|
||||||
|
log.Printf("System status before suspend:\n")
|
||||||
|
log.Printf("- CPU: %.1f%%\n", usageHistory[len(usageHistory)-1].cpuUsage)
|
||||||
|
log.Printf("- GPU: %.1f%%\n", usageHistory[len(usageHistory)-1].gpuUsage)
|
||||||
|
log.Printf("- SSH connections: %d\n", usageHistory[len(usageHistory)-1].sshConnections)
|
||||||
|
log.Printf("- Active users: %d\n", usageHistory[len(usageHistory)-1].activeUsers)
|
||||||
log.Println("System has been idle for the monitoring period. Suspending...")
|
log.Println("System has been idle for the monitoring period. Suspending...")
|
||||||
if err := suspendSystem(); err != nil {
|
if err := suspendSystem(); err != nil {
|
||||||
log.Printf("Failed to suspend system: %v", err)
|
log.Printf("Failed to suspend system: %v", err)
|
||||||
@ -75,6 +109,68 @@ func main() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func startHTTPServer() {
|
||||||
|
http.HandleFunc("/status", handleStatus)
|
||||||
|
if err := http.ListenAndServe(fmt.Sprintf(":%d", httpPort), nil); err != nil {
|
||||||
|
log.Printf("Failed to start HTTP server: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func handleStatus(w http.ResponseWriter, r *http.Request) {
|
||||||
|
statusMutex.RLock()
|
||||||
|
defer statusMutex.RUnlock()
|
||||||
|
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
json.NewEncoder(w).Encode(currentStatus)
|
||||||
|
}
|
||||||
|
|
||||||
|
func updateSystemStatus(current ResourceUsage, history []ResourceUsage) {
|
||||||
|
statusMutex.Lock()
|
||||||
|
defer statusMutex.Unlock()
|
||||||
|
|
||||||
|
currentStatus.CurrentUsage = current
|
||||||
|
currentStatus.Blockers = []string{}
|
||||||
|
|
||||||
|
if len(history) >= 2 {
|
||||||
|
// Calculate rates using last two samples
|
||||||
|
duration := history[len(history)-1].timestamp.Sub(history[len(history)-2].timestamp).Seconds()
|
||||||
|
diskIORate := float64(history[len(history)-1].diskIO-history[len(history)-2].diskIO) / duration
|
||||||
|
netIORate := float64(history[len(history)-1].networkIO-history[len(history)-2].networkIO) / duration
|
||||||
|
|
||||||
|
if current.cpuUsage >= cpuThreshold {
|
||||||
|
currentStatus.Blockers = append(currentStatus.Blockers,
|
||||||
|
fmt.Sprintf("CPU usage too high: %.1f%% >= %.1f%%", current.cpuUsage, cpuThreshold))
|
||||||
|
}
|
||||||
|
if current.gpuUsage >= gpuThreshold {
|
||||||
|
currentStatus.Blockers = append(currentStatus.Blockers,
|
||||||
|
fmt.Sprintf("GPU usage too high: %.1f%% >= %.1f%%", current.gpuUsage, gpuThreshold))
|
||||||
|
}
|
||||||
|
if diskIORate >= float64(diskThreshold) {
|
||||||
|
currentStatus.Blockers = append(currentStatus.Blockers,
|
||||||
|
fmt.Sprintf("Disk I/O too high: %.1f MB/s >= %.1f MB/s",
|
||||||
|
diskIORate/(1024*1024), float64(diskThreshold)/(1024*1024)))
|
||||||
|
}
|
||||||
|
if netIORate >= float64(networkThreshold) {
|
||||||
|
currentStatus.Blockers = append(currentStatus.Blockers,
|
||||||
|
fmt.Sprintf("Network I/O too high: %.1f MB/s >= %.1f MB/s",
|
||||||
|
netIORate/(1024*1024), float64(networkThreshold)/(1024*1024)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if current.sshConnections > 0 {
|
||||||
|
currentStatus.Blockers = append(currentStatus.Blockers,
|
||||||
|
fmt.Sprintf("Active SSH connections: %d", current.sshConnections))
|
||||||
|
}
|
||||||
|
if current.activeUsers > 0 {
|
||||||
|
currentStatus.Blockers = append(currentStatus.Blockers,
|
||||||
|
fmt.Sprintf("Active user sessions: %d", current.activeUsers))
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(currentStatus.Blockers) == 0 {
|
||||||
|
currentStatus.Blockers = append(currentStatus.Blockers, "No blockers - system can sleep")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func getCurrentUsage() ResourceUsage {
|
func getCurrentUsage() ResourceUsage {
|
||||||
usage := ResourceUsage{
|
usage := ResourceUsage{
|
||||||
timestamp: time.Now(),
|
timestamp: time.Now(),
|
||||||
@ -124,20 +220,81 @@ func getCurrentUsage() ResourceUsage {
|
|||||||
usage.networkIO = netStats[0].BytesSent + netStats[0].BytesRecv
|
usage.networkIO = netStats[0].BytesSent + netStats[0].BytesRecv
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Count SSH connections
|
||||||
|
if sshCount, err := getSSHConnectionCount(); err == nil {
|
||||||
|
usage.sshConnections = sshCount
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count active user sessions
|
||||||
|
if userCount, err := getActiveUserCount(); err == nil {
|
||||||
|
usage.activeUsers = userCount
|
||||||
|
}
|
||||||
|
|
||||||
return usage
|
return usage
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getSSHConnectionCount() (int, error) {
|
||||||
|
processes, err := process.Processes()
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
count := 0
|
||||||
|
for _, p := range processes {
|
||||||
|
name, err := p.Name()
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if name == "sshd" {
|
||||||
|
cmdline, err := p.Cmdline()
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Only count sshd processes that are handling connections
|
||||||
|
// The main sshd process doesn't have "@" in its cmdline
|
||||||
|
if strings.Contains(cmdline, "@") {
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func getActiveUserCount() (int, error) {
|
||||||
|
out, err := exec.Command("who", "-s").Output()
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count non-empty lines
|
||||||
|
lines := strings.Split(strings.TrimSpace(string(out)), "\n")
|
||||||
|
count := 0
|
||||||
|
for _, line := range lines {
|
||||||
|
if line != "" {
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count, nil
|
||||||
|
}
|
||||||
|
|
||||||
func isSystemIdle(history []ResourceUsage) bool {
|
func isSystemIdle(history []ResourceUsage) bool {
|
||||||
if len(history) < 2 {
|
if len(history) < 2 {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
var avgCPU, avgGPU float64
|
var avgCPU, avgGPU float64
|
||||||
|
var maxSSHConnections, maxActiveUsers int
|
||||||
samples := len(history)
|
samples := len(history)
|
||||||
|
|
||||||
for _, usage := range history {
|
for _, usage := range history {
|
||||||
avgCPU += usage.cpuUsage
|
avgCPU += usage.cpuUsage
|
||||||
avgGPU += usage.gpuUsage
|
avgGPU += usage.gpuUsage
|
||||||
|
if usage.sshConnections > maxSSHConnections {
|
||||||
|
maxSSHConnections = usage.sshConnections
|
||||||
|
}
|
||||||
|
if usage.activeUsers > maxActiveUsers {
|
||||||
|
maxActiveUsers = usage.activeUsers
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate I/O rates using first and last samples
|
// Calculate I/O rates using first and last samples
|
||||||
@ -151,7 +308,9 @@ func isSystemIdle(history []ResourceUsage) bool {
|
|||||||
return avgCPU < cpuThreshold &&
|
return avgCPU < cpuThreshold &&
|
||||||
avgGPU < gpuThreshold &&
|
avgGPU < gpuThreshold &&
|
||||||
diskIORate < float64(diskThreshold) &&
|
diskIORate < float64(diskThreshold) &&
|
||||||
netIORate < float64(networkThreshold)
|
netIORate < float64(networkThreshold) &&
|
||||||
|
maxSSHConnections == 0 &&
|
||||||
|
maxActiveUsers == 0
|
||||||
}
|
}
|
||||||
|
|
||||||
func suspendSystem() error {
|
func suspendSystem() error {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user