// veloren_server/metrics.rs

use prometheus::{
    Gauge, GaugeVec, Histogram, HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge,
    IntGaugeVec, Opts, Registry,
};
use std::{
    convert::TryInto,
    error::Error,
    time::{Duration, SystemTime, UNIX_EPOCH},
};
10
11pub struct PhysicsMetrics {
12    pub entity_entity_collision_checks_count: IntCounter,
13    pub entity_entity_collisions_count: IntCounter,
14}
15
16pub struct EcsSystemMetrics {
17    // Gauges give us detailed information for random ticks
18    pub system_start_time: IntGaugeVec,
19    pub system_length_time: IntGaugeVec,
20    pub system_thread_avg: GaugeVec,
21    // Counter will only give us granularity on pool speed (2s?) for actual spike detection we
22    // need the Historgram
23    pub system_length_hist: HistogramVec,
24    pub system_length_count: IntCounterVec,
25}
26
27pub struct PlayerMetrics {
28    pub clients_connected: IntCounter,
29    pub players_connected: IntCounter,
30    pub clients_disconnected: IntCounterVec, // timeout, network_error, gracefully
31}
32
33pub struct NetworkRequestMetrics {
34    pub chunks_request_dropped: IntCounter,
35    pub chunks_served_from_memory: IntCounter,
36    pub chunks_generation_triggered: IntCounter,
37    pub chunks_served_lossy: IntCounter,
38    pub chunks_served_lossless: IntCounter,
39    pub chunks_serialisation_requests: IntCounter,
40    pub chunks_distinct_serialisation_requests: IntCounter,
41}
42
43pub struct ChunkGenMetrics {
44    pub chunks_requested: IntCounter,
45    pub chunks_served: IntCounter,
46    pub chunks_canceled: IntCounter,
47}
48
49pub struct JobMetrics {
50    pub job_queried_hst: HistogramVec,
51    pub job_execution_hst: HistogramVec,
52}
53
54pub struct TickMetrics {
55    pub chonks_count: IntGauge,
56    pub chunks_count: IntGauge,
57    pub chunk_groups_count: IntGauge,
58    pub entity_count: IntGauge,
59    pub tick_time: IntGaugeVec,
60    /// Timing of some subsections of `State::tick`.
61    pub state_tick_time: IntGaugeVec,
62    pub tick_time_hist: Histogram,
63    pub build_info: IntGauge,
64    pub start_time: IntGauge,
65    pub time_of_day: Gauge,
66    pub light_count: IntGauge,
67}
68
69pub struct ServerEventMetrics {
70    pub event_count: IntCounterVec,
71}
72
73pub struct QueryServerMetrics {
74    pub received_packets: IntCounter,
75    pub dropped_packets: IntCounter,
76    pub invalid_packets: IntCounter,
77    pub proccessing_errors: IntCounter,
78    pub info_requests: IntCounter,
79    pub init_requests: IntCounter,
80    pub sent_responses: IntCounter,
81    pub failed_responses: IntCounter,
82    pub timed_out_responses: IntCounter,
83    pub ratelimited: IntCounter,
84}
85
86impl PhysicsMetrics {
87    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
88        let entity_entity_collision_checks_count = IntCounter::with_opts(Opts::new(
89            "entity_entity_collision_checks_count",
90            "shows the number of collision checks",
91        ))?;
92        let entity_entity_collisions_count = IntCounter::with_opts(Opts::new(
93            "entity_entity_collisions_count",
94            "shows the number of actual collisions detected",
95        ))?;
96
97        registry.register(Box::new(entity_entity_collision_checks_count.clone()))?;
98        registry.register(Box::new(entity_entity_collisions_count.clone()))?;
99
100        Ok(Self {
101            entity_entity_collision_checks_count,
102            entity_entity_collisions_count,
103        })
104    }
105}
106
107impl EcsSystemMetrics {
108    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
109        let bucket = vec![
110            Duration::from_micros(1).as_secs_f64(),
111            Duration::from_micros(10).as_secs_f64(),
112            Duration::from_micros(100).as_secs_f64(),
113            Duration::from_micros(200).as_secs_f64(),
114            Duration::from_micros(400).as_secs_f64(),
115            Duration::from_millis(2).as_secs_f64(),
116            Duration::from_millis(5).as_secs_f64(),
117            Duration::from_millis(10).as_secs_f64(),
118            Duration::from_millis(20).as_secs_f64(),
119            Duration::from_millis(30).as_secs_f64(),
120            Duration::from_millis(50).as_secs_f64(),
121            Duration::from_millis(100).as_secs_f64(),
122        ];
123        let system_length_hist = HistogramVec::new(
124            HistogramOpts::new(
125                "system_length_hist",
126                "shows the detailed time in ns inside each ECS system as histogram",
127            )
128            .buckets(bucket),
129            &["system"],
130        )?;
131        let system_length_count = IntCounterVec::new(
132            Opts::new(
133                "system_length_count",
134                "shows the detailed time in ns inside each ECS system",
135            ),
136            &["system"],
137        )?;
138        let system_start_time = IntGaugeVec::new(
139            Opts::new(
140                "system_start_time",
141                "start relative to tick start in ns required per ECS system",
142            ),
143            &["system"],
144        )?;
145        let system_length_time = IntGaugeVec::new(
146            Opts::new("system_length_time", "time in ns required per ECS system"),
147            &["system"],
148        )?;
149        let system_thread_avg = GaugeVec::new(
150            Opts::new(
151                "system_thread_avg",
152                "average threads used by the ECS system",
153            ),
154            &["system"],
155        )?;
156
157        registry.register(Box::new(system_start_time.clone()))?;
158        registry.register(Box::new(system_length_time.clone()))?;
159        registry.register(Box::new(system_thread_avg.clone()))?;
160        registry.register(Box::new(system_length_hist.clone()))?;
161        registry.register(Box::new(system_length_count.clone()))?;
162
163        Ok(Self {
164            system_start_time,
165            system_length_time,
166            system_thread_avg,
167            system_length_hist,
168            system_length_count,
169        })
170    }
171}
172
173impl PlayerMetrics {
174    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
175        let clients_connected = IntCounter::with_opts(Opts::new(
176            "clients_connected",
177            "shows the number of clients joined to the server",
178        ))?;
179        let players_connected = IntCounter::with_opts(Opts::new(
180            "players_connected",
181            "shows the number of players joined to the server. A player is a client, that \
182             registers itself. Bots are not players (but clients)",
183        ))?;
184        let clients_disconnected = IntCounterVec::new(
185            Opts::new(
186                "clients_disconnected",
187                "shows the number of clients disconnected from the server and the reason",
188            ),
189            &["reason"],
190        )?;
191
192        registry.register(Box::new(clients_connected.clone()))?;
193        registry.register(Box::new(players_connected.clone()))?;
194        registry.register(Box::new(clients_disconnected.clone()))?;
195
196        Ok(Self {
197            clients_connected,
198            players_connected,
199            clients_disconnected,
200        })
201    }
202}
203
204impl NetworkRequestMetrics {
205    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
206        let chunks_request_dropped = IntCounter::with_opts(Opts::new(
207            "chunks_request_dropped",
208            "number of all chunk request dropped, e.g because the player was to far away",
209        ))?;
210        let chunks_served_from_memory = IntCounter::with_opts(Opts::new(
211            "chunks_served_from_memory",
212            "number of all requested chunks already generated and could be served out of cache",
213        ))?;
214        let chunks_generation_triggered = IntCounter::with_opts(Opts::new(
215            "chunks_generation_triggered",
216            "number of all chunks that were requested and needs to be generated",
217        ))?;
218        let chunks_served_lossy = IntCounter::with_opts(Opts::new(
219            "chunks_served_lossy",
220            "number of chunks that were sent with lossy compression requested",
221        ))?;
222        let chunks_served_lossless = IntCounter::with_opts(Opts::new(
223            "chunks_served_lossless",
224            "number of chunks that were sent with lossless compression requested",
225        ))?;
226        let chunks_serialisation_requests = IntCounter::with_opts(Opts::new(
227            "chunks_serialisation_requests",
228            "number of requests for the sys chunk_serialisation",
229        ))?;
230        let chunks_distinct_serialisation_requests = IntCounter::with_opts(Opts::new(
231            "chunks_distinct_serialisation_requests",
232            "number of distinct chunks in requests for the sys chunk_serialisation",
233        ))?;
234
235        registry.register(Box::new(chunks_request_dropped.clone()))?;
236        registry.register(Box::new(chunks_served_from_memory.clone()))?;
237        registry.register(Box::new(chunks_generation_triggered.clone()))?;
238        registry.register(Box::new(chunks_served_lossy.clone()))?;
239        registry.register(Box::new(chunks_served_lossless.clone()))?;
240        registry.register(Box::new(chunks_serialisation_requests.clone()))?;
241        registry.register(Box::new(chunks_distinct_serialisation_requests.clone()))?;
242
243        Ok(Self {
244            chunks_request_dropped,
245            chunks_served_from_memory,
246            chunks_generation_triggered,
247            chunks_served_lossy,
248            chunks_served_lossless,
249            chunks_serialisation_requests,
250            chunks_distinct_serialisation_requests,
251        })
252    }
253}
254
255impl ChunkGenMetrics {
256    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
257        let chunks_requested = IntCounter::with_opts(Opts::new(
258            "chunks_requested",
259            "number of all chunks requested on the server",
260        ))?;
261        let chunks_served = IntCounter::with_opts(Opts::new(
262            "chunks_served",
263            "number of all requested chunks already served on the server",
264        ))?;
265        let chunks_canceled = IntCounter::with_opts(Opts::new(
266            "chunks_canceled",
267            "number of all canceled chunks on the server",
268        ))?;
269
270        registry.register(Box::new(chunks_requested.clone()))?;
271        registry.register(Box::new(chunks_served.clone()))?;
272        registry.register(Box::new(chunks_canceled.clone()))?;
273
274        Ok(Self {
275            chunks_requested,
276            chunks_served,
277            chunks_canceled,
278        })
279    }
280}
281
282impl JobMetrics {
283    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
284        let bucket = vec![
285            Duration::from_micros(100).as_secs_f64(),
286            Duration::from_millis(2).as_secs_f64(),
287            Duration::from_millis(100).as_secs_f64(),
288        ];
289
290        let job_queried_hst = HistogramVec::new(
291            HistogramOpts::new(
292                "job_queried_hst",
293                "shows the detailed time each job name took from query till it started to execute \
294                 as histogram",
295            )
296            .buckets(bucket),
297            &["name"],
298        )?;
299
300        let bucket = vec![
301            Duration::from_millis(5).as_secs_f64(),
302            Duration::from_millis(20).as_secs_f64(),
303            Duration::from_millis(50).as_secs_f64(),
304            Duration::from_millis(100).as_secs_f64(),
305            Duration::from_millis(200).as_secs_f64(),
306            Duration::from_millis(500).as_secs_f64(),
307            Duration::from_millis(1000).as_secs_f64(),
308            Duration::from_millis(10000).as_secs_f64(),
309        ];
310
311        let job_execution_hst = HistogramVec::new(
312            HistogramOpts::new(
313                "job_execution_hst",
314                "shows the detailed time each job name took from start of execution until it \
315                 finished as histogram",
316            )
317            .buckets(bucket),
318            &["name"],
319        )?;
320
321        registry.register(Box::new(job_queried_hst.clone()))?;
322        registry.register(Box::new(job_execution_hst.clone()))?;
323
324        Ok(Self {
325            job_queried_hst,
326            job_execution_hst,
327        })
328    }
329}
330
331impl TickMetrics {
332    pub fn new(registry: &Registry) -> Result<Self, Box<dyn Error>> {
333        let chonks_count = IntGauge::with_opts(Opts::new(
334            "chonks_count",
335            "number of all chonks currently active on the server",
336        ))?;
337        let chunks_count = IntGauge::with_opts(Opts::new(
338            "chunks_count",
339            "number of all chunks currently active on the server",
340        ))?;
341        let chunk_groups_count = IntGauge::with_opts(Opts::new(
342            "chunk_groups_count",
343            "number of 4×4×4 groups currently allocated by chunks on the server",
344        ))?;
345        let entity_count = IntGauge::with_opts(Opts::new(
346            "entity_count",
347            "number of all entities currently active on the server",
348        ))?;
349        let opts = Opts::new("veloren_build_info", "Build information")
350            .const_label("hash", *common::util::GIT_HASH)
351            .const_label("version", "");
352        let build_info = IntGauge::with_opts(opts)?;
353        let start_time = IntGauge::with_opts(Opts::new(
354            "veloren_start_time",
355            "start time of the server in seconds since EPOCH",
356        ))?;
357        let time_of_day =
358            Gauge::with_opts(Opts::new("time_of_day", "ingame time in ingame-seconds"))?;
359        let light_count = IntGauge::with_opts(Opts::new(
360            "light_count",
361            "number of all lights currently active on the server",
362        ))?;
363        let tick_time = IntGaugeVec::new(
364            Opts::new("tick_time", "time in ns required for a tick of the server"),
365            &["period"],
366        )?;
367        let state_tick_time = IntGaugeVec::new(
368            Opts::new(
369                "state_tick_time",
370                "time in ns for some subsections of State::tick",
371            ),
372            &["period"],
373        )?;
374        // 33.33ms is the ideal tick time. So we have hight detail around it.
375        // 300/700 are to detect high I/O blocks
376        let bucket = vec![
377            Duration::from_millis(8).as_secs_f64(),
378            Duration::from_millis(16).as_secs_f64(),
379            Duration::from_millis(24).as_secs_f64(),
380            Duration::from_millis(30).as_secs_f64(),
381            Duration::from_millis(33).as_secs_f64(),
382            Duration::from_millis(37).as_secs_f64(),
383            Duration::from_millis(45).as_secs_f64(),
384            Duration::from_millis(60).as_secs_f64(),
385            Duration::from_millis(100).as_secs_f64(),
386            Duration::from_millis(300).as_secs_f64(),
387            Duration::from_millis(700).as_secs_f64(),
388        ];
389        let tick_time_hist = Histogram::with_opts(
390            HistogramOpts::new(
391                "tick_time_hist",
392                "shows the detailed time in ns spend for the whole tick as histogram",
393            )
394            .buckets(bucket),
395        )?;
396
397        let since_the_epoch = SystemTime::now()
398            .duration_since(UNIX_EPOCH)
399            .expect("Time went backwards");
400        start_time.set(since_the_epoch.as_secs().try_into()?);
401
402        registry.register(Box::new(chonks_count.clone()))?;
403        registry.register(Box::new(chunks_count.clone()))?;
404        registry.register(Box::new(chunk_groups_count.clone()))?;
405        registry.register(Box::new(entity_count.clone()))?;
406        registry.register(Box::new(build_info.clone()))?;
407        registry.register(Box::new(start_time.clone()))?;
408        registry.register(Box::new(time_of_day.clone()))?;
409        registry.register(Box::new(light_count.clone()))?;
410        registry.register(Box::new(tick_time.clone()))?;
411        registry.register(Box::new(state_tick_time.clone()))?;
412        registry.register(Box::new(tick_time_hist.clone()))?;
413
414        Ok(Self {
415            chonks_count,
416            chunks_count,
417            chunk_groups_count,
418            entity_count,
419            tick_time,
420            state_tick_time,
421            tick_time_hist,
422            build_info,
423            start_time,
424            time_of_day,
425            light_count,
426        })
427    }
428}
429
430impl ServerEventMetrics {
431    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
432        let event_count = IntCounterVec::new(
433            Opts::new("event_count", "number of ServerEvents handled"),
434            &["event"],
435        )?;
436        registry.register(Box::new(event_count.clone()))?;
437
438        Ok(Self { event_count })
439    }
440}
441
442impl QueryServerMetrics {
443    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
444        let received_packets = IntCounter::with_opts(Opts::new(
445            "query_server::received_packets",
446            "Total amount of received packets by the query server",
447        ))?;
448        let dropped_packets = IntCounter::with_opts(Opts::new(
449            "query_server::dropped_packets",
450            "Amount of dropped packets received by the query server (too short or invalid header)",
451        ))?;
452        let invalid_packets = IntCounter::with_opts(Opts::new(
453            "query_server::invalid_packets",
454            "Amount of unparseable packets received by the query server",
455        ))?;
456        let proccessing_errors = IntCounter::with_opts(Opts::new(
457            "query_server::proccessing_errors",
458            "Amount of errors that occured while processing a query server request",
459        ))?;
460        let info_requests = IntCounter::with_opts(Opts::new(
461            "query_server::info_requests",
462            "Amount of server info requests received by the query server",
463        ))?;
464        let init_requests = IntCounter::with_opts(Opts::new(
465            "query_server::ping_requests",
466            "Amount of init requests received by the query server",
467        ))?;
468        let sent_responses = IntCounter::with_opts(Opts::new(
469            "query_server::sent_responses",
470            "Amount of responses sent by the query server",
471        ))?;
472        let failed_responses = IntCounter::with_opts(Opts::new(
473            "query_server::failed_responses",
474            "Amount of responses which failed to be sent by the query server",
475        ))?;
476        let timed_out_responses = IntCounter::with_opts(Opts::new(
477            "query_server::timed_out_responses",
478            "Amount of responses which timed out",
479        ))?;
480        let ratelimited = IntCounter::with_opts(Opts::new(
481            "query_server::ratelimited",
482            "Ratelimited requests to the query server",
483        ))?;
484
485        registry.register(Box::new(received_packets.clone()))?;
486        registry.register(Box::new(dropped_packets.clone()))?;
487        registry.register(Box::new(invalid_packets.clone()))?;
488        registry.register(Box::new(proccessing_errors.clone()))?;
489        registry.register(Box::new(info_requests.clone()))?;
490        registry.register(Box::new(init_requests.clone()))?;
491        registry.register(Box::new(sent_responses.clone()))?;
492        registry.register(Box::new(failed_responses.clone()))?;
493        registry.register(Box::new(timed_out_responses.clone()))?;
494        registry.register(Box::new(ratelimited.clone()))?;
495
496        Ok(Self {
497            received_packets,
498            dropped_packets,
499            invalid_packets,
500            proccessing_errors,
501            info_requests,
502            init_requests,
503            sent_responses,
504            failed_responses,
505            timed_out_responses,
506            ratelimited,
507        })
508    }
509
510    pub fn apply(
511        &self,
512        veloren_query_server::server::Metrics {
513            received_packets,
514            dropped_packets,
515            invalid_packets,
516            proccessing_errors,
517            info_requests,
518            init_requests,
519            sent_responses,
520            failed_responses,
521            timed_out_responses,
522            ratelimited,
523        }: veloren_query_server::server::Metrics,
524    ) {
525        self.received_packets.inc_by(received_packets as u64);
526        self.dropped_packets.inc_by(dropped_packets as u64);
527        self.invalid_packets.inc_by(invalid_packets as u64);
528        self.proccessing_errors.inc_by(proccessing_errors as u64);
529        self.info_requests.inc_by(info_requests as u64);
530        self.init_requests.inc_by(init_requests as u64);
531        self.sent_responses.inc_by(sent_responses as u64);
532        self.failed_responses.inc_by(failed_responses as u64);
533        self.timed_out_responses.inc_by(timed_out_responses as u64);
534        self.ratelimited.inc_by(ratelimited as u64);
535    }
536}