
Commit cbe9ebe

scottgerring, cijothomas, and utpilla authored

Track dropped spans and logs due to full buffer (#2357)

Co-authored-by: Cijo Thomas <[email protected]>
Co-authored-by: Utkarsh Umesan Pillai <[email protected]>

1 parent 195dea8 · commit cbe9ebe

2 files changed: +66 −14 lines

opentelemetry-sdk/src/logs/log_processor.rs (+32 −7)

```diff
@@ -13,7 +13,7 @@ use futures_util::{
 use opentelemetry::logs::Severity;
 use opentelemetry::{otel_debug, otel_error, otel_warn, InstrumentationScope};
 
-use std::sync::atomic::AtomicBool;
+use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::{cmp::min, env, sync::Mutex};
 use std::{
     fmt::{self, Debug, Formatter},
@@ -154,6 +154,12 @@ impl LogProcessor for SimpleLogProcessor {
 /// them at a pre-configured interval.
 pub struct BatchLogProcessor<R: RuntimeChannel> {
     message_sender: R::Sender<BatchMessage>,
+
+    // Track dropped logs - we'll log this at shutdown
+    dropped_logs_count: AtomicUsize,
+
+    // Track the maximum queue size that was configured for this processor
+    max_queue_size: usize,
 }
 
 impl<R: RuntimeChannel> Debug for BatchLogProcessor<R> {
@@ -172,11 +178,13 @@ impl<R: RuntimeChannel> LogProcessor for BatchLogProcessor<R> {
         )));
 
         // TODO - Implement throttling to prevent error flooding when the queue is full or closed.
-        if let Err(err) = result {
-            otel_error!(
-                name: "BatchLogProcessor.Export.Error",
-                error = format!("{}", err)
-            );
+        if result.is_err() {
+            // Increment the dropped-logs count. The first time we have to
+            // drop a log, emit a warning.
+            if self.dropped_logs_count.fetch_add(1, Ordering::Relaxed) == 0 {
+                otel_warn!(name: "BatchLogProcessor.LogDroppingStarted",
+                    message = "BatchLogProcessor dropped a LogRecord due to queue full/internal errors. No further log will be emitted for subsequent drops until Shutdown, at which point a log with the exact count of total logs dropped will be emitted.");
+            }
         }
     }
 
@@ -192,6 +200,17 @@ impl<R: RuntimeChannel> LogProcessor for BatchLogProcessor<R> {
     }
 
     fn shutdown(&self) -> LogResult<()> {
+        let dropped_logs = self.dropped_logs_count.load(Ordering::Relaxed);
+        let max_queue_size = self.max_queue_size;
+        if dropped_logs > 0 {
+            otel_warn!(
+                name: "BatchLogProcessor.LogsDropped",
+                dropped_logs_count = dropped_logs,
+                max_queue_size = max_queue_size,
+                message = "Logs were dropped due to the queue being full or another error. The count is the total number of log records dropped over the lifetime of this BatchLogProcessor. Consider increasing the queue size and/or decreasing the delay between intervals."
+            );
+        }
+
         let (res_sender, res_receiver) = oneshot::channel();
         self.message_sender
             .try_send(BatchMessage::Shutdown(res_sender))
@@ -215,6 +234,7 @@ impl<R: RuntimeChannel> BatchLogProcessor<R> {
         let (message_sender, message_receiver) =
             runtime.batch_message_channel(config.max_queue_size);
         let inner_runtime = runtime.clone();
+        let max_queue_size = config.max_queue_size;
 
         // Spawn worker process via user-defined spawn function.
         runtime.spawn(Box::pin(async move {
@@ -296,8 +316,13 @@ impl<R: RuntimeChannel> BatchLogProcessor<R> {
             }
         }));
+
        // Return batch processor with link to worker
-        BatchLogProcessor { message_sender }
+        BatchLogProcessor {
+            message_sender,
+            dropped_logs_count: AtomicUsize::new(0),
+            max_queue_size,
+        }
     }
 
     /// Create a new batch processor builder
```
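Both processors rely on the same bookkeeping trick: `fetch_add` returns the counter's previous value, so exactly one caller, the first to drop, observes 0 and emits the one-time warning; the exact lifetime total is then reported once at shutdown. Below is a minimal, self-contained sketch of that pattern; `DropCounter` is a hypothetical stand-in for the processor's fields (not an SDK type), and plain `eprintln!` stands in for `otel_warn!`:

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

// Hypothetical stand-in for the processor's dropped-record bookkeeping;
// not a type from the opentelemetry-sdk crate.
struct DropCounter {
    dropped: AtomicUsize,
    max_queue_size: usize,
}

impl DropCounter {
    fn new(max_queue_size: usize) -> Self {
        Self {
            dropped: AtomicUsize::new(0),
            max_queue_size,
        }
    }

    // Called on every failed try_send. fetch_add returns the previous
    // value, so only the very first drop observes 0 and warns.
    fn record_drop(&self) {
        if self.dropped.fetch_add(1, Ordering::Relaxed) == 0 {
            eprintln!("warn: started dropping records (queue full)");
        }
    }

    // Called once at shutdown: report the exact lifetime total, if any.
    fn report(&self) {
        let total = self.dropped.load(Ordering::Relaxed);
        if total > 0 {
            eprintln!(
                "warn: dropped {} records in total (max_queue_size = {})",
                total, self.max_queue_size
            );
        }
    }
}

fn main() {
    let counter = DropCounter::new(2048);
    for _ in 0..5 {
        counter.record_drop(); // warns only on the first call
    }
    counter.report(); // reports a total of 5
}
```

Run as written, only the first `record_drop` call warns, and `report` prints the exact total of 5, mirroring the behaviour the diff adds to `emit` and `shutdown`.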

opentelemetry-sdk/src/trace/span_processor.rs (+34 −7)

```diff
@@ -45,12 +45,13 @@ use futures_util::{
     stream::{self, FusedStream, FuturesUnordered},
     StreamExt as _,
 };
-use opentelemetry::{otel_debug, otel_error};
+use opentelemetry::{otel_debug, otel_error, otel_warn};
 use opentelemetry::{
     trace::{TraceError, TraceResult},
     Context,
 };
 use std::cmp::min;
+use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex};
 use std::{env, fmt, str::FromStr, time::Duration};
 
@@ -227,6 +228,12 @@ impl SpanProcessor for SimpleSpanProcessor {
 /// [`async-std`]: https://async.rs
 pub struct BatchSpanProcessor<R: RuntimeChannel> {
     message_sender: R::Sender<BatchMessage>,
+
+    // Track dropped spans
+    dropped_spans_count: AtomicUsize,
+
+    // Track the maximum queue size that was configured for this processor
+    max_queue_size: usize,
 }
 
 impl<R: RuntimeChannel> fmt::Debug for BatchSpanProcessor<R> {
@@ -249,11 +256,14 @@ impl<R: RuntimeChannel> SpanProcessor for BatchSpanProcessor<R> {
 
         let result = self.message_sender.try_send(BatchMessage::ExportSpan(span));
 
-        if let Err(err) = result {
-            otel_debug!(
-                name: "BatchSpanProcessor.OnEnd.ExportQueueingFailed",
-                reason = format!("{:?}", TraceError::Other(err.into()))
-            );
+        // The queue is full and we can't buffer the span.
+        if result.is_err() {
+            // Increment the dropped-spans count. The first time we have to
+            // drop a span, emit a warning.
+            if self.dropped_spans_count.fetch_add(1, Ordering::Relaxed) == 0 {
+                otel_warn!(name: "BatchSpanProcessor.SpanDroppingStarted",
+                    message = "BatchSpanProcessor dropped a span due to queue full/internal errors. No further log will be emitted for subsequent drops until Shutdown, at which point a log with the exact count of total spans dropped will be emitted.");
+            }
         }
     }
 
@@ -269,6 +279,17 @@ impl<R: RuntimeChannel> SpanProcessor for BatchSpanProcessor<R> {
     }
 
     fn shutdown(&self) -> TraceResult<()> {
+        let dropped_spans = self.dropped_spans_count.load(Ordering::Relaxed);
+        let max_queue_size = self.max_queue_size;
+        if dropped_spans > 0 {
+            otel_warn!(
+                name: "BatchSpanProcessor.Shutdown",
+                dropped_spans = dropped_spans,
+                max_queue_size = max_queue_size,
+                message = "Spans were dropped due to a full or closed queue. The count is the total number of spans dropped over the lifetime of this BatchSpanProcessor. Consider increasing the queue size and/or decreasing the delay between intervals."
+            );
+        }
+
         let (res_sender, res_receiver) = oneshot::channel();
         self.message_sender
             .try_send(BatchMessage::Shutdown(res_sender))
@@ -469,6 +490,8 @@ impl<R: RuntimeChannel> BatchSpanProcessor<R> {
         let (message_sender, message_receiver) =
             runtime.batch_message_channel(config.max_queue_size);
 
+        let max_queue_size = config.max_queue_size;
+
         let inner_runtime = runtime.clone();
         // Spawn worker process via user-defined spawn function.
         runtime.spawn(Box::pin(async move {
@@ -493,7 +516,11 @@ impl<R: RuntimeChannel> BatchSpanProcessor<R> {
         }));
 
         // Return batch processor with link to worker
-        BatchSpanProcessor { message_sender }
+        BatchSpanProcessor {
+            message_sender,
+            dropped_spans_count: AtomicUsize::new(0),
+            max_queue_size,
+        }
     }
 
     /// Create a new batch processor builder
```
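Both counters use `Ordering::Relaxed`, which is sufficient here because the count is a standalone statistic: no other memory is synchronized through it, and atomicity alone guarantees that concurrent `on_end`/`emit` calls never lose an increment. A small standalone demonstration (not part of this PR) that the count stays exact under contention:

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread;

fn main() {
    // Shared counter, analogous to BatchSpanProcessor::dropped_spans_count.
    let dropped = Arc::new(AtomicUsize::new(0));

    // Simulate many threads hitting a full queue concurrently.
    let handles: Vec<_> = (0..8)
        .map(|_| {
            let dropped = Arc::clone(&dropped);
            thread::spawn(move || {
                for _ in 0..10_000 {
                    // Relaxed is fine: we only need the count itself to be
                    // exact, not to order any surrounding reads or writes.
                    dropped.fetch_add(1, Ordering::Relaxed);
                }
            })
        })
        .collect();

    for h in handles {
        h.join().unwrap();
    }

    // Always exactly 80_000, despite the relaxed ordering.
    assert_eq!(dropped.load(Ordering::Relaxed), 80_000);
    println!("total dropped: {}", dropped.load(Ordering::Relaxed));
}
```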
