ProximileAdmin committed on
Commit
7d5c9a2
·
verified ·
1 Parent(s): d840019

Create remote/gpu_stats_srv.py

Browse files
Files changed (1) hide show
  1. remote/gpu_stats_srv.py +124 -0
remote/gpu_stats_srv.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ GPU Metrics JSON Server
4
+
5
+ This script provides a simple HTTP server that serves NVIDIA GPU metrics in JSON format.
6
+ It runs on the remote machine and is accessed via an SSH tunnel.
7
+ """
8
+
9
+ import json
10
+ import subprocess
11
+ import re
12
+ from flask import Flask, jsonify
13
+ import logging
14
+
15
# Root logging setup: timestamped records at INFO level and above
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Named logger used by the functions in this module
logger = logging.getLogger('gpu_server')

# Flask application object that the @app.route decorators register against
app = Flask(__name__)
23
+
24
def _to_number(raw, cast=float):
    """Convert one nvidia-smi CSV field with ``cast``; return None if it is not numeric."""
    try:
        return cast(raw)
    except (TypeError, ValueError):
        # Non-numeric field (e.g. a "[N/A]" placeholder for an unsupported
        # metric): report it as missing instead of failing the whole snapshot.
        return None


def _current_timestamp():
    """Return the local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    # Function-scope import: keeps this block self-contained without touching
    # the file-level import list.
    from datetime import datetime

    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')


def _parse_gpu_rows(csv_text):
    """Turn the ``--query-gpu`` CSV output into a list of per-GPU dicts."""
    gpus = []
    for line in csv_text.strip().split('\n'):
        values = [v.strip() for v in line.split(',')]
        if len(values) < 10:
            # Blank or truncated row: nothing usable to report.
            continue
        gpus.append({
            'index': int(values[0]),
            'name': values[1],
            'temperature': _to_number(values[2]),
            'gpu_utilization': _to_number(values[3]),
            'memory_utilization': _to_number(values[4]),
            'memory_total': _to_number(values[5]),
            'memory_used': _to_number(values[6]),
            'memory_free': _to_number(values[7]),
            'power_draw': _to_number(values[8]),
            'power_limit': _to_number(values[9]),
        })
    return gpus


def _parse_process_rows(csv_text):
    """Turn the ``--query-compute-apps`` CSV output into a list of process dicts."""
    processes = []
    for line in csv_text.strip().split('\n'):
        parts = line.split(',')
        if len(parts) < 3:
            # Empty output (no compute processes) or a malformed row.
            continue
        processes.append({
            'pid': int(parts[0].strip()),
            # The pid is the first field and the memory figure the last, so
            # any extra commas must belong to the process name in between.
            'name': ','.join(parts[1:-1]).strip(),
            'memory_used': _to_number(parts[-1].strip()),
        })
    return processes


def get_gpu_info():
    """
    Get NVIDIA GPU information and parse it into a structured format.

    Runs ``nvidia-smi`` twice: once for per-GPU metrics and once for the
    compute processes currently using the GPUs.

    Returns:
        dict: On success, ``{'timestamp', 'gpus', 'processes', 'success': True}``.
            Numeric metrics that nvidia-smi reports as non-numeric text are
            returned as ``None``. On failure,
            ``{'timestamp', 'error', 'success': False}``; the exception is
            logged and not re-raised.
    """
    try:
        # Run nvidia-smi to get GPU information
        nvidia_smi_output = subprocess.check_output(
            [
                'nvidia-smi',
                '--query-gpu=index,name,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.used,memory.free,power.draw,power.limit',
                '--format=csv,noheader,nounits'
            ],
            universal_newlines=True
        )

        # Get GPU processes information
        process_output = subprocess.check_output(
            ['nvidia-smi', '--query-compute-apps=pid,process_name,used_memory', '--format=csv,noheader,nounits'],
            universal_newlines=True
        )

        return {
            'timestamp': _current_timestamp(),
            'gpus': _parse_gpu_rows(nvidia_smi_output),
            'processes': _parse_process_rows(process_output),
            'success': True
        }

    except Exception as e:
        # Top-level boundary for the endpoint: report the failure in the
        # payload rather than letting the request crash.
        logger.error("Error getting GPU information: %s", e)
        return {
            # Built in-process so the error path cannot fail on a second
            # subprocess call.
            'timestamp': _current_timestamp(),
            'error': str(e),
            'success': False
        }
93
+
94
@app.route('/gpu/json')
def gpu_json():
    """
    Serve the dictionary produced by get_gpu_info() as a JSON response
    """
    snapshot = get_gpu_info()
    return jsonify(snapshot)
100
+
101
@app.route('/gpu/txt')
def gpu_txt():
    """
    API endpoint for traditional nvidia-smi text output (for backward compatibility)

    Returns:
        tuple: ``(body, status, headers)``. The body is the raw ``nvidia-smi``
            output with status 200, or ``"Error: <message>"`` with status 500
            when the command fails. Both are sent as ``text/plain``.
    """
    # A bare str return would be served with Flask's default text/html type;
    # the body here is plain text, so declare it as such.
    plain_text = {'Content-Type': 'text/plain; charset=utf-8'}
    try:
        # Run nvidia-smi with standard output format
        nvidia_smi_output = subprocess.check_output(['nvidia-smi'], universal_newlines=True)
    except Exception as e:
        logger.error("Error getting nvidia-smi output: %s", e)
        # Signal the failure in the status code, not only in the body text.
        return f"Error: {str(e)}", 500, plain_text
    return nvidia_smi_output, 200, plain_text
113
+
114
@app.route('/health')
def health_check():
    """
    Health check endpoint: always responds with the JSON body {"status": "ok"}
    """
    payload = {'status': 'ok'}
    return jsonify(payload)
120
+
121
if __name__ == '__main__':
    # Note: In production, consider using a proper WSGI server like gunicorn
    # and configure proper authentication/security
    import os

    # Bind address and port can be overridden through the environment; the
    # defaults are the original hard-coded values. When the server is only
    # reached through an SSH tunnel, GPU_STATS_HOST=127.0.0.1 keeps it off
    # external interfaces.
    host = os.environ.get('GPU_STATS_HOST', '0.0.0.0')
    port = int(os.environ.get('GPU_STATS_PORT', '5000'))
    app.run(host=host, port=port)