Skip to main content

veloren_server/
metrics.rs

1use prometheus::{
2    Gauge, GaugeVec, Histogram, HistogramOpts, HistogramVec, IntCounter, IntCounterVec, IntGauge,
3    IntGaugeVec, Opts, Registry,
4};
5use std::{
6    convert::TryInto,
7    error::Error,
8    time::{Duration, SystemTime, UNIX_EPOCH},
9};
10
/// Prometheus counters for entity-vs-entity collision handling.
///
/// Built and registered by `PhysicsMetrics::new`.
pub struct PhysicsMetrics {
    /// Number of collision checks (exported as `entity_entity_collision_checks_count`).
    pub entity_entity_collision_checks_count: IntCounter,
    /// Number of actual collisions detected (exported as `entity_entity_collisions_count`).
    pub entity_entity_collisions_count: IntCounter,
}
15
/// Timing metrics for ECS systems; every vector is keyed by a single `system` label.
///
/// Built and registered by `EcsSystemMetrics::new`.
pub struct EcsSystemMetrics {
    // Gauges give us detailed information for random ticks
    /// Start of the system relative to the tick start, in ns.
    pub system_start_time: IntGaugeVec,
    /// Time required by the system, in ns.
    pub system_length_time: IntGaugeVec,
    /// Average number of threads used by the system.
    pub system_thread_avg: GaugeVec,
    // Counter will only give us granularity on pool speed (2s?; presumably the interval at
    // which the metrics are polled — TODO confirm). For actual spike detection we need the
    // Histogram.
    /// Histogram of the time spent inside each system.
    pub system_length_hist: HistogramVec,
    /// Counter of the time in ns spent inside each system.
    pub system_length_count: IntCounterVec,
}
26
/// Connection counters for clients and players.
///
/// Built and registered by `PlayerMetrics::new`.
pub struct PlayerMetrics {
    /// Number of clients that joined the server.
    pub clients_connected: IntCounter,
    /// Number of players that joined the server. A player is a client that registers itself;
    /// bots are clients but not players.
    pub players_connected: IntCounter,
    /// Number of clients disconnected from the server, keyed by a `reason` label.
    pub clients_disconnected: IntCounterVec, // timeout, network_error, gracefully
}
32
/// Counters describing how chunk requests coming in over the network were handled.
///
/// Built and registered by `NetworkRequestMetrics::new`.
pub struct NetworkRequestMetrics {
    /// Chunk requests that were dropped, e.g. because the player was too far away.
    pub chunks_request_dropped: IntCounter,
    /// Requested chunks that were already generated and could be served out of cache.
    pub chunks_served_from_memory: IntCounter,
    /// Requested chunks that need to be generated.
    pub chunks_generation_triggered: IntCounter,
    /// Chunks sent with lossy compression requested.
    pub chunks_served_lossy: IntCounter,
    /// Chunks sent with lossless compression requested.
    pub chunks_served_lossless: IntCounter,
    /// Requests for the `chunk_serialisation` system.
    pub chunks_serialisation_requests: IntCounter,
    /// Distinct chunks in requests for the `chunk_serialisation` system.
    pub chunks_distinct_serialisation_requests: IntCounter,
}
42
/// Counters for chunks requested, served and canceled on the server.
///
/// Built and registered by `ChunkGenMetrics::new`.
pub struct ChunkGenMetrics {
    /// Number of all chunks requested on the server.
    pub chunks_requested: IntCounter,
    /// Number of all requested chunks already served on the server.
    pub chunks_served: IntCounter,
    /// Number of all canceled chunks on the server.
    pub chunks_canceled: IntCounter,
}
48
/// Histograms describing job latency; both are keyed by a single `name` label.
///
/// Built and registered by `JobMetrics::new`.
pub struct JobMetrics {
    /// Time each job took from being queried until it started to execute.
    pub job_queried_hst: HistogramVec,
    /// Time each job took from the start of execution until it finished.
    pub job_execution_hst: HistogramVec,
}
53
/// Gauges and a histogram describing the server state and timing per tick.
///
/// Built and registered by `TickMetrics::new`.
pub struct TickMetrics {
    /// Number of all chonks currently active on the server.
    pub chonks_count: IntGauge,
    /// Number of all chunks currently active on the server.
    pub chunks_count: IntGauge,
    /// Number of 4×4×4 groups currently allocated by chunks on the server.
    pub chunk_groups_count: IntGauge,
    /// Number of all entities currently active on the server.
    pub entity_count: IntGauge,
    /// Time in ns required for a tick of the server, keyed by a `period` label.
    pub tick_time: IntGaugeVec,
    /// Timing of some subsections of `State::tick`.
    pub state_tick_time: IntGaugeVec,
    /// Histogram of the time spent for the whole tick.
    pub tick_time_hist: Histogram,
    /// Build information; the git hash (and the git tag, when non-empty) are attached as
    /// constant labels.
    pub build_info: IntGauge,
    /// Start time of the server in seconds since the UNIX epoch.
    pub start_time: IntGauge,
    /// Ingame time in ingame-seconds.
    pub time_of_day: Gauge,
    /// Number of all lights currently active on the server.
    pub light_count: IntGauge,
}
68
/// Counter of handled server events.
///
/// Built and registered by `ServerEventMetrics::new`.
pub struct ServerEventMetrics {
    /// Number of `ServerEvent`s handled, keyed by an `event` label.
    pub event_count: IntCounterVec,
}
72
/// Counters for entity kills.
///
/// Built and registered by `GameplayMetrics::new`.
pub struct GameplayMetrics {
    /// Entity kills quantized to the nearest segment of x and y coordinates; labels are
    /// `body_type`, `x` and `y`.
    pub entity_kills_by_location: IntCounterVec,
    /// Entity kills by body type and weapon; labels are `body_type` and `weapon`.
    pub entity_kills_by_type: IntCounterVec,
}
77
/// Counters mirroring the statistics reported by the query server.
///
/// Built and registered by `QueryServerMetrics::new`; incremented through
/// `QueryServerMetrics::apply`.
pub struct QueryServerMetrics {
    /// Total amount of packets received by the query server.
    pub received_packets: IntCounter,
    /// Packets dropped by the query server (too short or invalid header).
    pub dropped_packets: IntCounter,
    /// Unparseable packets received by the query server.
    pub invalid_packets: IntCounter,
    /// Errors that occurred while processing a query server request.
    // The spelling matches the field of the same name in `veloren_query_server::server::Metrics`.
    pub proccessing_errors: IntCounter,
    /// Server info requests received by the query server.
    pub info_requests: IntCounter,
    /// Init requests received by the query server.
    pub init_requests: IntCounter,
    /// Responses sent by the query server.
    pub sent_responses: IntCounter,
    /// Responses which failed to be sent by the query server.
    pub failed_responses: IntCounter,
    /// Responses which timed out.
    pub timed_out_responses: IntCounter,
    /// Ratelimited requests to the query server.
    pub ratelimited: IntCounter,
}
90
91impl PhysicsMetrics {
92    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
93        let entity_entity_collision_checks_count = IntCounter::with_opts(Opts::new(
94            "entity_entity_collision_checks_count",
95            "shows the number of collision checks",
96        ))?;
97        let entity_entity_collisions_count = IntCounter::with_opts(Opts::new(
98            "entity_entity_collisions_count",
99            "shows the number of actual collisions detected",
100        ))?;
101
102        registry.register(Box::new(entity_entity_collision_checks_count.clone()))?;
103        registry.register(Box::new(entity_entity_collisions_count.clone()))?;
104
105        Ok(Self {
106            entity_entity_collision_checks_count,
107            entity_entity_collisions_count,
108        })
109    }
110}
111
112impl EcsSystemMetrics {
113    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
114        let bucket = vec![
115            Duration::from_micros(1).as_secs_f64(),
116            Duration::from_micros(10).as_secs_f64(),
117            Duration::from_micros(100).as_secs_f64(),
118            Duration::from_micros(200).as_secs_f64(),
119            Duration::from_micros(400).as_secs_f64(),
120            Duration::from_millis(2).as_secs_f64(),
121            Duration::from_millis(5).as_secs_f64(),
122            Duration::from_millis(10).as_secs_f64(),
123            Duration::from_millis(20).as_secs_f64(),
124            Duration::from_millis(30).as_secs_f64(),
125            Duration::from_millis(50).as_secs_f64(),
126            Duration::from_millis(100).as_secs_f64(),
127        ];
128        let system_length_hist = HistogramVec::new(
129            HistogramOpts::new(
130                "system_length_hist",
131                "shows the detailed time in ns inside each ECS system as histogram",
132            )
133            .buckets(bucket),
134            &["system"],
135        )?;
136        let system_length_count = IntCounterVec::new(
137            Opts::new(
138                "system_length_count",
139                "shows the detailed time in ns inside each ECS system",
140            ),
141            &["system"],
142        )?;
143        let system_start_time = IntGaugeVec::new(
144            Opts::new(
145                "system_start_time",
146                "start relative to tick start in ns required per ECS system",
147            ),
148            &["system"],
149        )?;
150        let system_length_time = IntGaugeVec::new(
151            Opts::new("system_length_time", "time in ns required per ECS system"),
152            &["system"],
153        )?;
154        let system_thread_avg = GaugeVec::new(
155            Opts::new(
156                "system_thread_avg",
157                "average threads used by the ECS system",
158            ),
159            &["system"],
160        )?;
161
162        registry.register(Box::new(system_start_time.clone()))?;
163        registry.register(Box::new(system_length_time.clone()))?;
164        registry.register(Box::new(system_thread_avg.clone()))?;
165        registry.register(Box::new(system_length_hist.clone()))?;
166        registry.register(Box::new(system_length_count.clone()))?;
167
168        Ok(Self {
169            system_start_time,
170            system_length_time,
171            system_thread_avg,
172            system_length_hist,
173            system_length_count,
174        })
175    }
176}
177
178impl PlayerMetrics {
179    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
180        let clients_connected = IntCounter::with_opts(Opts::new(
181            "clients_connected",
182            "shows the number of clients joined to the server",
183        ))?;
184        let players_connected = IntCounter::with_opts(Opts::new(
185            "players_connected",
186            "shows the number of players joined to the server. A player is a client, that \
187             registers itself. Bots are not players (but clients)",
188        ))?;
189        let clients_disconnected = IntCounterVec::new(
190            Opts::new(
191                "clients_disconnected",
192                "shows the number of clients disconnected from the server and the reason",
193            ),
194            &["reason"],
195        )?;
196
197        registry.register(Box::new(clients_connected.clone()))?;
198        registry.register(Box::new(players_connected.clone()))?;
199        registry.register(Box::new(clients_disconnected.clone()))?;
200
201        Ok(Self {
202            clients_connected,
203            players_connected,
204            clients_disconnected,
205        })
206    }
207}
208
209impl NetworkRequestMetrics {
210    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
211        let chunks_request_dropped = IntCounter::with_opts(Opts::new(
212            "chunks_request_dropped",
213            "number of all chunk request dropped, e.g because the player was to far away",
214        ))?;
215        let chunks_served_from_memory = IntCounter::with_opts(Opts::new(
216            "chunks_served_from_memory",
217            "number of all requested chunks already generated and could be served out of cache",
218        ))?;
219        let chunks_generation_triggered = IntCounter::with_opts(Opts::new(
220            "chunks_generation_triggered",
221            "number of all chunks that were requested and needs to be generated",
222        ))?;
223        let chunks_served_lossy = IntCounter::with_opts(Opts::new(
224            "chunks_served_lossy",
225            "number of chunks that were sent with lossy compression requested",
226        ))?;
227        let chunks_served_lossless = IntCounter::with_opts(Opts::new(
228            "chunks_served_lossless",
229            "number of chunks that were sent with lossless compression requested",
230        ))?;
231        let chunks_serialisation_requests = IntCounter::with_opts(Opts::new(
232            "chunks_serialisation_requests",
233            "number of requests for the sys chunk_serialisation",
234        ))?;
235        let chunks_distinct_serialisation_requests = IntCounter::with_opts(Opts::new(
236            "chunks_distinct_serialisation_requests",
237            "number of distinct chunks in requests for the sys chunk_serialisation",
238        ))?;
239
240        registry.register(Box::new(chunks_request_dropped.clone()))?;
241        registry.register(Box::new(chunks_served_from_memory.clone()))?;
242        registry.register(Box::new(chunks_generation_triggered.clone()))?;
243        registry.register(Box::new(chunks_served_lossy.clone()))?;
244        registry.register(Box::new(chunks_served_lossless.clone()))?;
245        registry.register(Box::new(chunks_serialisation_requests.clone()))?;
246        registry.register(Box::new(chunks_distinct_serialisation_requests.clone()))?;
247
248        Ok(Self {
249            chunks_request_dropped,
250            chunks_served_from_memory,
251            chunks_generation_triggered,
252            chunks_served_lossy,
253            chunks_served_lossless,
254            chunks_serialisation_requests,
255            chunks_distinct_serialisation_requests,
256        })
257    }
258}
259
260impl ChunkGenMetrics {
261    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
262        let chunks_requested = IntCounter::with_opts(Opts::new(
263            "chunks_requested",
264            "number of all chunks requested on the server",
265        ))?;
266        let chunks_served = IntCounter::with_opts(Opts::new(
267            "chunks_served",
268            "number of all requested chunks already served on the server",
269        ))?;
270        let chunks_canceled = IntCounter::with_opts(Opts::new(
271            "chunks_canceled",
272            "number of all canceled chunks on the server",
273        ))?;
274
275        registry.register(Box::new(chunks_requested.clone()))?;
276        registry.register(Box::new(chunks_served.clone()))?;
277        registry.register(Box::new(chunks_canceled.clone()))?;
278
279        Ok(Self {
280            chunks_requested,
281            chunks_served,
282            chunks_canceled,
283        })
284    }
285}
286
287impl JobMetrics {
288    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
289        let bucket = vec![
290            Duration::from_micros(100).as_secs_f64(),
291            Duration::from_millis(2).as_secs_f64(),
292            Duration::from_millis(100).as_secs_f64(),
293        ];
294
295        let job_queried_hst = HistogramVec::new(
296            HistogramOpts::new(
297                "job_queried_hst",
298                "shows the detailed time each job name took from query till it started to execute \
299                 as histogram",
300            )
301            .buckets(bucket),
302            &["name"],
303        )?;
304
305        let bucket = vec![
306            Duration::from_millis(5).as_secs_f64(),
307            Duration::from_millis(20).as_secs_f64(),
308            Duration::from_millis(50).as_secs_f64(),
309            Duration::from_millis(100).as_secs_f64(),
310            Duration::from_millis(200).as_secs_f64(),
311            Duration::from_millis(500).as_secs_f64(),
312            Duration::from_millis(1000).as_secs_f64(),
313            Duration::from_millis(10000).as_secs_f64(),
314        ];
315
316        let job_execution_hst = HistogramVec::new(
317            HistogramOpts::new(
318                "job_execution_hst",
319                "shows the detailed time each job name took from start of execution until it \
320                 finished as histogram",
321            )
322            .buckets(bucket),
323            &["name"],
324        )?;
325
326        registry.register(Box::new(job_queried_hst.clone()))?;
327        registry.register(Box::new(job_execution_hst.clone()))?;
328
329        Ok(Self {
330            job_queried_hst,
331            job_execution_hst,
332        })
333    }
334}
335
336impl TickMetrics {
337    pub fn new(registry: &Registry) -> Result<Self, Box<dyn Error>> {
338        let chonks_count = IntGauge::with_opts(Opts::new(
339            "chonks_count",
340            "number of all chonks currently active on the server",
341        ))?;
342        let chunks_count = IntGauge::with_opts(Opts::new(
343            "chunks_count",
344            "number of all chunks currently active on the server",
345        ))?;
346        let chunk_groups_count = IntGauge::with_opts(Opts::new(
347            "chunk_groups_count",
348            "number of 4×4×4 groups currently allocated by chunks on the server",
349        ))?;
350        let entity_count = IntGauge::with_opts(Opts::new(
351            "entity_count",
352            "number of all entities currently active on the server",
353        ))?;
354        let mut opts = Opts::new("veloren_build_info", "Build information")
355            .const_label("hash", format!("{:x}", *common::util::GIT_HASH));
356        if !common::util::GIT_TAG.is_empty() {
357            opts = opts.const_label("tag", *common::util::GIT_TAG);
358        }
359        let build_info = IntGauge::with_opts(opts)?;
360        let start_time = IntGauge::with_opts(Opts::new(
361            "veloren_start_time",
362            "start time of the server in seconds since EPOCH",
363        ))?;
364        let time_of_day =
365            Gauge::with_opts(Opts::new("time_of_day", "ingame time in ingame-seconds"))?;
366        let light_count = IntGauge::with_opts(Opts::new(
367            "light_count",
368            "number of all lights currently active on the server",
369        ))?;
370        let tick_time = IntGaugeVec::new(
371            Opts::new("tick_time", "time in ns required for a tick of the server"),
372            &["period"],
373        )?;
374        let state_tick_time = IntGaugeVec::new(
375            Opts::new(
376                "state_tick_time",
377                "time in ns for some subsections of State::tick",
378            ),
379            &["period"],
380        )?;
381        // 33.33ms is the ideal tick time. So we have hight detail around it.
382        // 300/700 are to detect high I/O blocks
383        let bucket = vec![
384            Duration::from_millis(8).as_secs_f64(),
385            Duration::from_millis(16).as_secs_f64(),
386            Duration::from_millis(24).as_secs_f64(),
387            Duration::from_millis(30).as_secs_f64(),
388            Duration::from_millis(33).as_secs_f64(),
389            Duration::from_millis(37).as_secs_f64(),
390            Duration::from_millis(45).as_secs_f64(),
391            Duration::from_millis(60).as_secs_f64(),
392            Duration::from_millis(100).as_secs_f64(),
393            Duration::from_millis(300).as_secs_f64(),
394            Duration::from_millis(700).as_secs_f64(),
395        ];
396        let tick_time_hist = Histogram::with_opts(
397            HistogramOpts::new(
398                "tick_time_hist",
399                "shows the detailed time in ns spend for the whole tick as histogram",
400            )
401            .buckets(bucket),
402        )?;
403
404        let since_the_epoch = SystemTime::now()
405            .duration_since(UNIX_EPOCH)
406            .expect("Time went backwards");
407        start_time.set(since_the_epoch.as_secs().try_into()?);
408
409        registry.register(Box::new(chonks_count.clone()))?;
410        registry.register(Box::new(chunks_count.clone()))?;
411        registry.register(Box::new(chunk_groups_count.clone()))?;
412        registry.register(Box::new(entity_count.clone()))?;
413        registry.register(Box::new(build_info.clone()))?;
414        registry.register(Box::new(start_time.clone()))?;
415        registry.register(Box::new(time_of_day.clone()))?;
416        registry.register(Box::new(light_count.clone()))?;
417        registry.register(Box::new(tick_time.clone()))?;
418        registry.register(Box::new(state_tick_time.clone()))?;
419        registry.register(Box::new(tick_time_hist.clone()))?;
420
421        Ok(Self {
422            chonks_count,
423            chunks_count,
424            chunk_groups_count,
425            entity_count,
426            tick_time,
427            state_tick_time,
428            tick_time_hist,
429            build_info,
430            start_time,
431            time_of_day,
432            light_count,
433        })
434    }
435}
436
437impl ServerEventMetrics {
438    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
439        let event_count = IntCounterVec::new(
440            Opts::new("event_count", "number of ServerEvents handled"),
441            &["event"],
442        )?;
443        registry.register(Box::new(event_count.clone()))?;
444
445        Ok(Self { event_count })
446    }
447}
448
449impl GameplayMetrics {
450    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
451        let entity_kills_by_location = IntCounterVec::new(
452            Opts::new(
453                "entity_kills_by_location",
454                "Entity kills quantized to nearest segment of x and y coordinates",
455            ),
456            &["body_type", "x", "y"],
457        )?;
458        let entity_kills_by_type = IntCounterVec::new(
459            Opts::new(
460                "entity_kills_by_type",
461                "Entity kills by body type and weapon",
462            ),
463            &["body_type", "weapon"],
464        )?;
465
466        registry.register(Box::new(entity_kills_by_location.clone()))?;
467        registry.register(Box::new(entity_kills_by_type.clone()))?;
468
469        Ok(Self {
470            entity_kills_by_location,
471            entity_kills_by_type,
472        })
473    }
474}
475
476impl QueryServerMetrics {
477    pub fn new(registry: &Registry) -> Result<Self, prometheus::Error> {
478        let received_packets = IntCounter::with_opts(Opts::new(
479            "query_server::received_packets",
480            "Total amount of received packets by the query server",
481        ))?;
482        let dropped_packets = IntCounter::with_opts(Opts::new(
483            "query_server::dropped_packets",
484            "Amount of dropped packets received by the query server (too short or invalid header)",
485        ))?;
486        let invalid_packets = IntCounter::with_opts(Opts::new(
487            "query_server::invalid_packets",
488            "Amount of unparseable packets received by the query server",
489        ))?;
490        let proccessing_errors = IntCounter::with_opts(Opts::new(
491            "query_server::proccessing_errors",
492            "Amount of errors that occured while processing a query server request",
493        ))?;
494        let info_requests = IntCounter::with_opts(Opts::new(
495            "query_server::info_requests",
496            "Amount of server info requests received by the query server",
497        ))?;
498        let init_requests = IntCounter::with_opts(Opts::new(
499            "query_server::ping_requests",
500            "Amount of init requests received by the query server",
501        ))?;
502        let sent_responses = IntCounter::with_opts(Opts::new(
503            "query_server::sent_responses",
504            "Amount of responses sent by the query server",
505        ))?;
506        let failed_responses = IntCounter::with_opts(Opts::new(
507            "query_server::failed_responses",
508            "Amount of responses which failed to be sent by the query server",
509        ))?;
510        let timed_out_responses = IntCounter::with_opts(Opts::new(
511            "query_server::timed_out_responses",
512            "Amount of responses which timed out",
513        ))?;
514        let ratelimited = IntCounter::with_opts(Opts::new(
515            "query_server::ratelimited",
516            "Ratelimited requests to the query server",
517        ))?;
518
519        registry.register(Box::new(received_packets.clone()))?;
520        registry.register(Box::new(dropped_packets.clone()))?;
521        registry.register(Box::new(invalid_packets.clone()))?;
522        registry.register(Box::new(proccessing_errors.clone()))?;
523        registry.register(Box::new(info_requests.clone()))?;
524        registry.register(Box::new(init_requests.clone()))?;
525        registry.register(Box::new(sent_responses.clone()))?;
526        registry.register(Box::new(failed_responses.clone()))?;
527        registry.register(Box::new(timed_out_responses.clone()))?;
528        registry.register(Box::new(ratelimited.clone()))?;
529
530        Ok(Self {
531            received_packets,
532            dropped_packets,
533            invalid_packets,
534            proccessing_errors,
535            info_requests,
536            init_requests,
537            sent_responses,
538            failed_responses,
539            timed_out_responses,
540            ratelimited,
541        })
542    }
543
544    pub fn apply(
545        &self,
546        veloren_query_server::server::Metrics {
547            received_packets,
548            dropped_packets,
549            invalid_packets,
550            proccessing_errors,
551            info_requests,
552            init_requests,
553            sent_responses,
554            failed_responses,
555            timed_out_responses,
556            ratelimited,
557        }: veloren_query_server::server::Metrics,
558    ) {
559        self.received_packets.inc_by(received_packets as u64);
560        self.dropped_packets.inc_by(dropped_packets as u64);
561        self.invalid_packets.inc_by(invalid_packets as u64);
562        self.proccessing_errors.inc_by(proccessing_errors as u64);
563        self.info_requests.inc_by(info_requests as u64);
564        self.init_requests.inc_by(init_requests as u64);
565        self.sent_responses.inc_by(sent_responses as u64);
566        self.failed_responses.inc_by(failed_responses as u64);
567        self.timed_out_responses.inc_by(timed_out_responses as u64);
568        self.ratelimited.inc_by(ratelimited as u64);
569    }
570}