langfuse
""".. include:: ../README.md"""

from langfuse.batch_evaluation import (
    BatchEvaluationResult,
    BatchEvaluationResumeToken,
    CompositeEvaluatorFunction,
    EvaluatorInputs,
    EvaluatorStats,
    MapperFunction,
)
from langfuse.experiment import Evaluation, RegressionError, RunnerContext

from ._client import client as _client_module
from ._client.attributes import LangfuseOtelSpanAttributes
from ._client.constants import ObservationTypeLiteral
from ._client.get_client import get_client
from ._client.observe import observe
from ._client.propagation import propagate_attributes
from ._client.span import (
    LangfuseAgent,
    LangfuseChain,
    LangfuseEmbedding,
    LangfuseEvaluator,
    LangfuseEvent,
    LangfuseGeneration,
    LangfuseGuardrail,
    LangfuseRetriever,
    LangfuseSpan,
    LangfuseTool,
)
from ._version import __version__
from .span_filter import (
    KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES,
    is_default_export_span,
    is_genai_span,
    is_known_llm_instrumentor,
    is_langfuse_span,
)
from .types import (
    MaskOtelSpansFunction,
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanData,
    OtelSpanIdentifier,
    OtelSpanPatch,
)

# Re-export the client class at package level. It is bound through the module
# object (`_client_module`) rather than imported by name.
Langfuse = _client_module.Langfuse

# Public API of the package; this list controls `from langfuse import *`.
__all__ = [
    # Client entry points and decorators.
    "Langfuse",
    "get_client",
    "observe",
    "propagate_attributes",
    # Observation types and span attribute constants.
    "ObservationTypeLiteral",
    "LangfuseSpan",
    "LangfuseGeneration",
    "LangfuseEvent",
    "LangfuseOtelSpanAttributes",
    "LangfuseAgent",
    "LangfuseTool",
    "LangfuseChain",
    "LangfuseEmbedding",
    "LangfuseEvaluator",
    "LangfuseRetriever",
    "LangfuseGuardrail",
    # Experiment and batch-evaluation types.
    "Evaluation",
    "EvaluatorInputs",
    "MapperFunction",
    "CompositeEvaluatorFunction",
    "EvaluatorStats",
    "BatchEvaluationResumeToken",
    "BatchEvaluationResult",
    "RunnerContext",
    "RegressionError",
    "__version__",
    # Span export filters.
    "is_default_export_span",
    "is_langfuse_span",
    "is_genai_span",
    "is_known_llm_instrumentor",
    "KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES",
    # Types for the `mask_otel_spans` export hook.
    "MaskOtelSpansFunction",
    "MaskOtelSpansParams",
    "MaskOtelSpansResult",
    "OtelSpanData",
    "OtelSpanIdentifier",
    "OtelSpanPatch",
    # NOTE(review): "experiment" and "api" are not bound by name in the imports
    # above -- presumably they are submodules resolved on star-import; confirm.
    "experiment",
    "api",
]
class Langfuse:
    """Main client for Langfuse tracing and platform features.

    This class provides an interface for creating and managing traces, spans,
    and generations in Langfuse as well as interacting with the Langfuse API.

    The client features a thread-safe singleton pattern for each unique public API key,
    ensuring consistent trace context propagation across your application. It implements
    efficient batching of spans with configurable flush settings and includes background
    thread management for media uploads and score ingestion.

    Configuration is flexible through either direct parameters or environment variables,
    with graceful fallbacks and runtime configuration updates.

    Attributes:
        api: Synchronous API client for Langfuse backend communication
        async_api: Asynchronous API client for Langfuse backend communication
        _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components

    Parameters:
        public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
        secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
        base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
        host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
        timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
        httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
        debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
        tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable. Tracing is active only when this argument is truthy and the environment variable is not "false".
        flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
        flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
        environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
        release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
        media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
        sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable. An explicitly passed value, including 0.0, takes precedence over the environment variable.
        mask (Optional[MaskFunction]): Function to mask sensitive data synchronously when Langfuse SDK attributes are created. This applies only to data set through Langfuse SDK APIs such as `start_observation()`, `update()`, and `set_trace_io()`.
        mask_otel_spans (Optional[MaskOtelSpansFunction]): Synchronous export-stage hook for masking raw OpenTelemetry span attributes before this Langfuse client sends them to Langfuse. Use this for spans created by third-party OpenTelemetry instrumentations, or when you need to inspect final span attributes after export filtering and Langfuse media handling. It does not modify spans already exported through other OpenTelemetry exporters.

            The hook receives one OpenTelemetry export batch. A batch is not guaranteed to contain a complete trace, request, or Langfuse observation tree. The hook usually runs on the OpenTelemetry batch span processor worker thread; during `flush()` and shutdown it may run on the caller thread. Keep it synchronous, deterministic, and fast.

            Return `None` to leave the batch unchanged. Return `MaskOtelSpansResult` with `OtelSpanPatch` values to delete or replace attributes on selected spans. If the hook raises or returns an invalid batch result, Langfuse drops the whole export batch. If one returned span patch is invalid, Langfuse drops only that span from the Langfuse export.

            Example:
                ```python
                from typing import Optional

                from langfuse import Langfuse
                from langfuse.types import (
                    MaskOtelSpansParams,
                    MaskOtelSpansResult,
                    OtelSpanPatch,
                )

                def mask_otel_spans(
                    *, params: MaskOtelSpansParams
                ) -> Optional[MaskOtelSpansResult]:
                    patches = {}

                    for identifier, span in params.spans.items():
                        if "gen_ai.prompt.0.content" in span.attributes:
                            patches[identifier] = OtelSpanPatch(
                                delete_attributes=("gen_ai.prompt.0.content",),
                                set_attributes={"masking.applied": True},
                            )

                    return MaskOtelSpansResult(span_patches=patches)

                langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
                ```
        blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior:
            ```python
            from langfuse.span_filter import is_default_export_span
            blocked = {"sqlite", "requests"}

            should_export_span = lambda span: (
                is_default_export_span(span)
                and (
                    span.instrumentation_scope is None
                    or span.instrumentation_scope.name not in blocked
                )
            )
            ```
        should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with `gen_ai.*` attributes, and known LLM instrumentation scopes).
        additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If `span_exporter` is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
        tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
        span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire `base_url`, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include `x-langfuse-ingestion-version=4` on the exporter to enable real time processing of exported spans.

    Example:
        ```python
        from langfuse import Langfuse

        # Initialize the client (reads from env vars if not provided)
        langfuse = Langfuse(
            public_key="your-public-key",
            secret_key="your-secret-key",
            base_url="https://cloud.langfuse.com",  # Optional, default shown
        )

        # Create a trace span
        with langfuse.start_as_current_observation(name="process-query") as span:
            # Your application code here

            # Create a nested generation span for an LLM call
            with span.start_as_current_generation(
                name="generate-response",
                model="gpt-4",
                input={"query": "Tell me about AI"},
                model_parameters={"temperature": 0.7, "max_tokens": 500}
            ) as generation:
                # Generate response here
                response = "AI is a field of computer science..."

                generation.update(
                    output=response,
                    usage_details={"prompt_tokens": 10, "completion_tokens": 50},
                    cost_details={"total_cost": 0.0023}
                )

                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
                generation.score(name="relevance", value=0.95, data_type="NUMERIC")
        ```
    """

    # Class-level defaults so these attributes exist even when __init__ returns
    # early because credentials are missing.
    _resources: Optional[LangfuseResourceManager] = None
    _mask: Optional[MaskFunction] = None
    _otel_tracer: otel_trace_api.Tracer

    def __init__(
        self,
        *,
        public_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        base_url: Optional[str] = None,
        host: Optional[str] = None,
        timeout: Optional[int] = None,
        httpx_client: Optional[httpx.Client] = None,
        debug: bool = False,
        tracing_enabled: Optional[bool] = True,
        flush_at: Optional[int] = None,
        flush_interval: Optional[float] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        media_upload_thread_count: Optional[int] = None,
        sample_rate: Optional[float] = None,
        mask: Optional[MaskFunction] = None,
        mask_otel_spans: Optional[MaskOtelSpansFunction] = None,
        blocked_instrumentation_scopes: Optional[List[str]] = None,
        should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
        additional_headers: Optional[Dict[str, str]] = None,
        tracer_provider: Optional[TracerProvider] = None,
        span_exporter: Optional[SpanExporter] = None,
    ):
        """Resolve configuration from arguments and environment, then set up resources.

        See the class docstring for the meaning of each parameter.

        If no public or secret key can be resolved, a warning is logged, a no-op
        tracer is installed, and initialization stops before the resource manager
        and API clients are created.

        Raises:
            ValueError: If the resolved sample rate is outside the range 0.0 to 1.0.
        """
        # Precedence: explicit base_url, LANGFUSE_BASE_URL, deprecated host
        # argument, LANGFUSE_HOST, and finally the cloud default.
        self._base_url = (
            base_url
            or os.environ.get(LANGFUSE_BASE_URL)
            or host
            or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
        )
        self._environment = environment or cast(
            str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
        )
        self._release = (
            release
            or os.environ.get(LANGFUSE_RELEASE, None)
            or get_common_release_envs()
        )
        self._project_id: Optional[str] = None

        # Only fall back to the environment when no value was passed. A plain
        # `or` would discard an explicit 0.0 (a valid rate that samples nothing)
        # and silently replace it with the environment value or 1.0.
        if sample_rate is None:
            sample_rate = float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
        if not 0.0 <= sample_rate <= 1.0:
            raise ValueError(
                f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
            )

        timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))

        # Both the argument and the environment variable must allow tracing.
        # Note that tracing_enabled=None is falsy and therefore disables tracing.
        self._tracing_enabled = (
            tracing_enabled
            and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
        )
        if not self._tracing_enabled:
            langfuse_logger.info(
                "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
            )

        debug = (
            debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
        )
        if debug:
            logging.basicConfig(
                format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            )
            langfuse_logger.setLevel(logging.DEBUG)

        # Missing credentials disable the client instead of raising. On these
        # early returns `api` and `async_api` are never assigned.
        public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
        if public_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
                "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
            )
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
        if secret_key is None:
            langfuse_logger.warning(
                "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
                "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
            )
            self._otel_tracer = otel_trace_api.NoOpTracer()
            return

        # Only a warning: initialization continues even when the OTel SDK is disabled.
        if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
            langfuse_logger.warning(
                "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
            )

        if blocked_instrumentation_scopes is not None:
            # stacklevel=2 attributes the warning to the code constructing the client.
            warnings.warn(
                "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
                "Use `should_export_span` instead. Example: "
                "from langfuse.span_filter import is_default_export_span; "
                'blocked={"scope"}; should_export_span=lambda span: '
                "is_default_export_span(span) and (span.instrumentation_scope is None or "
                "span.instrumentation_scope.name not in blocked).",
                DeprecationWarning,
                stacklevel=2,
            )

        # Initialize api and tracer if requirements are met
        # NOTE(review): `release` is forwarded as passed by the caller, not the
        # env-resolved `self._release` -- presumably the resource manager does its
        # own fallback; confirm.
        self._resources = LangfuseResourceManager(
            public_key=public_key,
            secret_key=secret_key,
            base_url=self._base_url,
            timeout=timeout,
            environment=self._environment,
            release=release,
            flush_at=flush_at,
            flush_interval=flush_interval,
            httpx_client=httpx_client,
            media_upload_thread_count=media_upload_thread_count,
            sample_rate=sample_rate,
            mask=mask,
            mask_otel_spans=mask_otel_spans,
            tracing_enabled=self._tracing_enabled,
            blocked_instrumentation_scopes=blocked_instrumentation_scopes,
            should_export_span=should_export_span,
            additional_headers=additional_headers,
            tracer_provider=tracer_provider,
            span_exporter=span_exporter,
        )
        self._mask = self._resources.mask

        # Use the real tracer only when tracing is enabled and one is available.
        self._otel_tracer = (
            self._resources.tracer
            if self._tracing_enabled and self._resources.tracer is not None
            else otel_trace_api.NoOpTracer()
        )
        self.api = self._resources.api
        self.async_api = self._resources.async_api

    # Typing-only overload: as_type="generation" accepts the model-related
    # arguments and narrows the return type to LangfuseGeneration.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["generation"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseGeneration: ...

    # Typing-only overload: as_type="span" (the default) returns LangfuseSpan.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["span"] = "span",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseSpan: ...

    # Typing-only overload: as_type="agent" returns LangfuseAgent.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["agent"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseAgent: ...
    # Typing-only overload: as_type="tool" narrows the return type to LangfuseTool.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["tool"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseTool: ...

    # Typing-only overload: as_type="chain" narrows the return type to LangfuseChain.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseChain: ...

    # Typing-only overload: as_type="retriever" narrows the return type to LangfuseRetriever.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseRetriever: ...

    # Typing-only overload: as_type="evaluator" narrows the return type to LangfuseEvaluator.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseEvaluator: ...

    # Typing-only overload: as_type="embedding" accepts the same model-related
    # arguments as "generation" and narrows the return type to LangfuseEmbedding.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ) -> LangfuseEmbedding: ...

    # Typing-only overload: as_type="guardrail" narrows the return type to LangfuseGuardrail.
    @overload
    def start_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ) -> LangfuseGuardrail: ...
552 553 def start_observation( 554 self, 555 *, 556 trace_context: Optional[TraceContext] = None, 557 name: str, 558 as_type: ObservationTypeLiteralNoEvent = "span", 559 input: Optional[Any] = None, 560 output: Optional[Any] = None, 561 metadata: Optional[Any] = None, 562 version: Optional[str] = None, 563 level: Optional[SpanLevel] = None, 564 status_message: Optional[str] = None, 565 completion_start_time: Optional[datetime] = None, 566 model: Optional[str] = None, 567 model_parameters: Optional[Dict[str, MapValue]] = None, 568 usage_details: Optional[Dict[str, int]] = None, 569 cost_details: Optional[Dict[str, float]] = None, 570 prompt: Optional[PromptClient] = None, 571 ) -> Union[ 572 LangfuseSpan, 573 LangfuseGeneration, 574 LangfuseAgent, 575 LangfuseTool, 576 LangfuseChain, 577 LangfuseRetriever, 578 LangfuseEvaluator, 579 LangfuseEmbedding, 580 LangfuseGuardrail, 581 ]: 582 """Create a new observation of the specified type. 583 584 This method creates a new observation but does not set it as the current span in the 585 context. To create and use an observation within a context, use start_as_current_observation(). 
586 587 Args: 588 trace_context: Optional context for connecting to an existing trace 589 name: Name of the observation 590 as_type: Type of observation to create (defaults to "span") 591 input: Input data for the operation 592 output: Output data from the operation 593 metadata: Additional metadata to associate with the observation 594 version: Version identifier for the code or component 595 level: Importance level of the observation 596 status_message: Optional status message for the observation 597 completion_start_time: When the model started generating (for generation types) 598 model: Name/identifier of the AI model used (for generation types) 599 model_parameters: Parameters used for the model (for generation types) 600 usage_details: Token usage information (for generation types) 601 cost_details: Cost information (for generation types) 602 prompt: Associated prompt template (for generation types) 603 604 Returns: 605 An observation object of the appropriate type that must be ended with .end() 606 """ 607 if trace_context: 608 trace_id = trace_context.get("trace_id", None) 609 parent_span_id = trace_context.get("parent_span_id", None) 610 611 if trace_id: 612 remote_parent_span = self._create_remote_parent_span( 613 trace_id=trace_id, parent_span_id=parent_span_id 614 ) 615 616 with otel_trace_api.use_span( 617 cast(otel_trace_api.Span, remote_parent_span) 618 ): 619 otel_span = self._otel_tracer.start_span(name=name) 620 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 621 622 return self._create_observation_from_otel_span( 623 otel_span=otel_span, 624 as_type=as_type, 625 input=input, 626 output=output, 627 metadata=metadata, 628 version=version, 629 level=level, 630 status_message=status_message, 631 completion_start_time=completion_start_time, 632 model=model, 633 model_parameters=model_parameters, 634 usage_details=usage_details, 635 cost_details=cost_details, 636 prompt=prompt, 637 ) 638 639 otel_span = 
self._otel_tracer.start_span(name=name) 640 641 return self._create_observation_from_otel_span( 642 otel_span=otel_span, 643 as_type=as_type, 644 input=input, 645 output=output, 646 metadata=metadata, 647 version=version, 648 level=level, 649 status_message=status_message, 650 completion_start_time=completion_start_time, 651 model=model, 652 model_parameters=model_parameters, 653 usage_details=usage_details, 654 cost_details=cost_details, 655 prompt=prompt, 656 ) 657 658 def _create_observation_from_otel_span( 659 self, 660 *, 661 otel_span: otel_trace_api.Span, 662 as_type: ObservationTypeLiteralNoEvent, 663 input: Optional[Any] = None, 664 output: Optional[Any] = None, 665 metadata: Optional[Any] = None, 666 version: Optional[str] = None, 667 level: Optional[SpanLevel] = None, 668 status_message: Optional[str] = None, 669 completion_start_time: Optional[datetime] = None, 670 model: Optional[str] = None, 671 model_parameters: Optional[Dict[str, MapValue]] = None, 672 usage_details: Optional[Dict[str, int]] = None, 673 cost_details: Optional[Dict[str, float]] = None, 674 prompt: Optional[PromptClient] = None, 675 ) -> Union[ 676 LangfuseSpan, 677 LangfuseGeneration, 678 LangfuseAgent, 679 LangfuseTool, 680 LangfuseChain, 681 LangfuseRetriever, 682 LangfuseEvaluator, 683 LangfuseEmbedding, 684 LangfuseGuardrail, 685 ]: 686 """Create the appropriate observation type from an OTEL span.""" 687 if as_type in get_observation_types_list(ObservationTypeGenerationLike): 688 observation_class = self._get_span_class(as_type) 689 # Type ignore to prevent overloads of internal _get_span_class function, 690 # issue is that LangfuseEvent could be returned and that classes have diff. 
args 691 return observation_class( # type: ignore[return-value,call-arg] 692 otel_span=otel_span, 693 langfuse_client=self, 694 environment=self._environment, 695 release=self._release, 696 input=input, 697 output=output, 698 metadata=metadata, 699 version=version, 700 level=level, 701 status_message=status_message, 702 completion_start_time=completion_start_time, 703 model=model, 704 model_parameters=model_parameters, 705 usage_details=usage_details, 706 cost_details=cost_details, 707 prompt=prompt, 708 ) 709 else: 710 # For other types (e.g. span, guardrail), create appropriate class without generation properties 711 observation_class = self._get_span_class(as_type) 712 # Type ignore to prevent overloads of internal _get_span_class function, 713 # issue is that LangfuseEvent could be returned and that classes have diff. args 714 return observation_class( # type: ignore[return-value,call-arg] 715 otel_span=otel_span, 716 langfuse_client=self, 717 environment=self._environment, 718 release=self._release, 719 input=input, 720 output=output, 721 metadata=metadata, 722 version=version, 723 level=level, 724 status_message=status_message, 725 ) 726 # span._observation_type = as_type 727 # span._otel_span.set_attribute("langfuse.observation.type", as_type) 728 # return span 729 730 @overload 731 def start_as_current_observation( 732 self, 733 *, 734 trace_context: Optional[TraceContext] = None, 735 name: str, 736 as_type: Literal["generation"], 737 input: Optional[Any] = None, 738 output: Optional[Any] = None, 739 metadata: Optional[Any] = None, 740 version: Optional[str] = None, 741 level: Optional[SpanLevel] = None, 742 status_message: Optional[str] = None, 743 completion_start_time: Optional[datetime] = None, 744 model: Optional[str] = None, 745 model_parameters: Optional[Dict[str, MapValue]] = None, 746 usage_details: Optional[Dict[str, int]] = None, 747 cost_details: Optional[Dict[str, float]] = None, 748 prompt: Optional[PromptClient] = None, 749 end_on_exit: 
Optional[bool] = None, 750 ) -> _AgnosticContextManager[LangfuseGeneration]: ... 751 752 @overload 753 def start_as_current_observation( 754 self, 755 *, 756 trace_context: Optional[TraceContext] = None, 757 name: str, 758 as_type: Literal["span"] = "span", 759 input: Optional[Any] = None, 760 output: Optional[Any] = None, 761 metadata: Optional[Any] = None, 762 version: Optional[str] = None, 763 level: Optional[SpanLevel] = None, 764 status_message: Optional[str] = None, 765 end_on_exit: Optional[bool] = None, 766 ) -> _AgnosticContextManager[LangfuseSpan]: ... 767 768 @overload 769 def start_as_current_observation( 770 self, 771 *, 772 trace_context: Optional[TraceContext] = None, 773 name: str, 774 as_type: Literal["agent"], 775 input: Optional[Any] = None, 776 output: Optional[Any] = None, 777 metadata: Optional[Any] = None, 778 version: Optional[str] = None, 779 level: Optional[SpanLevel] = None, 780 status_message: Optional[str] = None, 781 end_on_exit: Optional[bool] = None, 782 ) -> _AgnosticContextManager[LangfuseAgent]: ... 783 784 @overload 785 def start_as_current_observation( 786 self, 787 *, 788 trace_context: Optional[TraceContext] = None, 789 name: str, 790 as_type: Literal["tool"], 791 input: Optional[Any] = None, 792 output: Optional[Any] = None, 793 metadata: Optional[Any] = None, 794 version: Optional[str] = None, 795 level: Optional[SpanLevel] = None, 796 status_message: Optional[str] = None, 797 end_on_exit: Optional[bool] = None, 798 ) -> _AgnosticContextManager[LangfuseTool]: ... 
    # Typing-only overload: as_type="chain" yields a context manager over LangfuseChain.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["chain"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseChain]: ...

    # Typing-only overload: as_type="retriever" yields a context manager over LangfuseRetriever.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["retriever"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseRetriever]: ...

    # Typing-only overload: as_type="evaluator" yields a context manager over LangfuseEvaluator.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["evaluator"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEvaluator]: ...

    # Typing-only overload: as_type="embedding" accepts the model-related
    # arguments and yields a context manager over LangfuseEmbedding.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["embedding"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseEmbedding]: ...

    # Typing-only overload: as_type="guardrail" yields a context manager over LangfuseGuardrail.
    @overload
    def start_as_current_observation(
        self,
        *,
        trace_context: Optional[TraceContext] = None,
        name: str,
        as_type: Literal["guardrail"],
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        end_on_exit: Optional[bool] = None,
    ) -> _AgnosticContextManager[LangfuseGuardrail]: ...
885 886 def start_as_current_observation( 887 self, 888 *, 889 trace_context: Optional[TraceContext] = None, 890 name: str, 891 as_type: ObservationTypeLiteralNoEvent = "span", 892 input: Optional[Any] = None, 893 output: Optional[Any] = None, 894 metadata: Optional[Any] = None, 895 version: Optional[str] = None, 896 level: Optional[SpanLevel] = None, 897 status_message: Optional[str] = None, 898 completion_start_time: Optional[datetime] = None, 899 model: Optional[str] = None, 900 model_parameters: Optional[Dict[str, MapValue]] = None, 901 usage_details: Optional[Dict[str, int]] = None, 902 cost_details: Optional[Dict[str, float]] = None, 903 prompt: Optional[PromptClient] = None, 904 end_on_exit: Optional[bool] = None, 905 ) -> Union[ 906 _AgnosticContextManager[LangfuseGeneration], 907 _AgnosticContextManager[LangfuseSpan], 908 _AgnosticContextManager[LangfuseAgent], 909 _AgnosticContextManager[LangfuseTool], 910 _AgnosticContextManager[LangfuseChain], 911 _AgnosticContextManager[LangfuseRetriever], 912 _AgnosticContextManager[LangfuseEvaluator], 913 _AgnosticContextManager[LangfuseEmbedding], 914 _AgnosticContextManager[LangfuseGuardrail], 915 ]: 916 """Create a new observation and set it as the current span in a context manager. 917 918 This method creates a new observation of the specified type and sets it as the 919 current span within a context manager. Use this method with a 'with' statement to 920 automatically handle the observation lifecycle within a code block. 921 922 The created observation will be the child of the current span in the context. 
923 924 Args: 925 trace_context: Optional context for connecting to an existing trace 926 name: Name of the observation (e.g., function or operation name) 927 as_type: Type of observation to create (defaults to "span") 928 input: Input data for the operation (can be any JSON-serializable object) 929 output: Output data from the operation (can be any JSON-serializable object) 930 metadata: Additional metadata to associate with the observation 931 version: Version identifier for the code or component 932 level: Importance level of the observation (info, warning, error) 933 status_message: Optional status message for the observation 934 end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks. 935 936 The following parameters are available when as_type is: "generation" or "embedding". 937 completion_start_time: When the model started generating the response 938 model: Name/identifier of the AI model used (e.g., "gpt-4") 939 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 940 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 941 cost_details: Cost information for the model call 942 prompt: Associated prompt template from Langfuse prompt management 943 944 Returns: 945 A context manager that yields the appropriate observation type based on as_type 946 947 Example: 948 ```python 949 # Create a span 950 with langfuse.start_as_current_observation(name="process-query", as_type="span") as span: 951 # Do work 952 result = process_data() 953 span.update(output=result) 954 955 # Create a child span automatically 956 with span.start_as_current_observation(name="sub-operation") as child_span: 957 # Do sub-operation work 958 child_span.update(output="sub-result") 959 960 # Create a tool observation 961 with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool: 962 # Do tool work 963 
results = search_web(query) 964 tool.update(output=results) 965 966 # Create a generation observation 967 with langfuse.start_as_current_observation( 968 name="answer-generation", 969 as_type="generation", 970 model="gpt-4" 971 ) as generation: 972 # Generate answer 973 response = llm.generate(...) 974 generation.update(output=response) 975 ``` 976 """ 977 if as_type in get_observation_types_list(ObservationTypeGenerationLike): 978 if trace_context: 979 trace_id = trace_context.get("trace_id", None) 980 parent_span_id = trace_context.get("parent_span_id", None) 981 982 if trace_id: 983 remote_parent_span = self._create_remote_parent_span( 984 trace_id=trace_id, parent_span_id=parent_span_id 985 ) 986 987 return cast( 988 Union[ 989 _AgnosticContextManager[LangfuseGeneration], 990 _AgnosticContextManager[LangfuseEmbedding], 991 ], 992 self._create_span_with_parent_context( 993 as_type=as_type, 994 name=name, 995 remote_parent_span=remote_parent_span, 996 parent=None, 997 end_on_exit=end_on_exit, 998 input=input, 999 output=output, 1000 metadata=metadata, 1001 version=version, 1002 level=level, 1003 status_message=status_message, 1004 completion_start_time=completion_start_time, 1005 model=model, 1006 model_parameters=model_parameters, 1007 usage_details=usage_details, 1008 cost_details=cost_details, 1009 prompt=prompt, 1010 ), 1011 ) 1012 1013 return cast( 1014 Union[ 1015 _AgnosticContextManager[LangfuseGeneration], 1016 _AgnosticContextManager[LangfuseEmbedding], 1017 ], 1018 self._start_as_current_otel_span_with_processed_media( 1019 as_type=as_type, 1020 name=name, 1021 end_on_exit=end_on_exit, 1022 input=input, 1023 output=output, 1024 metadata=metadata, 1025 version=version, 1026 level=level, 1027 status_message=status_message, 1028 completion_start_time=completion_start_time, 1029 model=model, 1030 model_parameters=model_parameters, 1031 usage_details=usage_details, 1032 cost_details=cost_details, 1033 prompt=prompt, 1034 ), 1035 ) 1036 1037 if as_type in 
get_observation_types_list(ObservationTypeSpanLike): 1038 if trace_context: 1039 trace_id = trace_context.get("trace_id", None) 1040 parent_span_id = trace_context.get("parent_span_id", None) 1041 1042 if trace_id: 1043 remote_parent_span = self._create_remote_parent_span( 1044 trace_id=trace_id, parent_span_id=parent_span_id 1045 ) 1046 1047 return cast( 1048 Union[ 1049 _AgnosticContextManager[LangfuseSpan], 1050 _AgnosticContextManager[LangfuseAgent], 1051 _AgnosticContextManager[LangfuseTool], 1052 _AgnosticContextManager[LangfuseChain], 1053 _AgnosticContextManager[LangfuseRetriever], 1054 _AgnosticContextManager[LangfuseEvaluator], 1055 _AgnosticContextManager[LangfuseGuardrail], 1056 ], 1057 self._create_span_with_parent_context( 1058 as_type=as_type, 1059 name=name, 1060 remote_parent_span=remote_parent_span, 1061 parent=None, 1062 end_on_exit=end_on_exit, 1063 input=input, 1064 output=output, 1065 metadata=metadata, 1066 version=version, 1067 level=level, 1068 status_message=status_message, 1069 ), 1070 ) 1071 1072 return cast( 1073 Union[ 1074 _AgnosticContextManager[LangfuseSpan], 1075 _AgnosticContextManager[LangfuseAgent], 1076 _AgnosticContextManager[LangfuseTool], 1077 _AgnosticContextManager[LangfuseChain], 1078 _AgnosticContextManager[LangfuseRetriever], 1079 _AgnosticContextManager[LangfuseEvaluator], 1080 _AgnosticContextManager[LangfuseGuardrail], 1081 ], 1082 self._start_as_current_otel_span_with_processed_media( 1083 as_type=as_type, 1084 name=name, 1085 end_on_exit=end_on_exit, 1086 input=input, 1087 output=output, 1088 metadata=metadata, 1089 version=version, 1090 level=level, 1091 status_message=status_message, 1092 ), 1093 ) 1094 1095 # This should never be reached since all valid types are handled above 1096 langfuse_logger.warning( 1097 f"Unknown observation type: {as_type}, falling back to span" 1098 ) 1099 return self._start_as_current_otel_span_with_processed_media( 1100 as_type="span", 1101 name=name, 1102 end_on_exit=end_on_exit, 
1103 input=input, 1104 output=output, 1105 metadata=metadata, 1106 version=version, 1107 level=level, 1108 status_message=status_message, 1109 ) 1110 1111 def _get_span_class( 1112 self, 1113 as_type: str, 1114 ) -> Union[ 1115 Type[LangfuseAgent], 1116 Type[LangfuseTool], 1117 Type[LangfuseChain], 1118 Type[LangfuseRetriever], 1119 Type[LangfuseEvaluator], 1120 Type[LangfuseEmbedding], 1121 Type[LangfuseGuardrail], 1122 Type[LangfuseGeneration], 1123 Type[LangfuseEvent], 1124 Type[LangfuseSpan], 1125 ]: 1126 """Get the appropriate span class based on as_type.""" 1127 normalized_type = as_type.lower() 1128 1129 if normalized_type == "agent": 1130 return LangfuseAgent 1131 elif normalized_type == "tool": 1132 return LangfuseTool 1133 elif normalized_type == "chain": 1134 return LangfuseChain 1135 elif normalized_type == "retriever": 1136 return LangfuseRetriever 1137 elif normalized_type == "evaluator": 1138 return LangfuseEvaluator 1139 elif normalized_type == "embedding": 1140 return LangfuseEmbedding 1141 elif normalized_type == "guardrail": 1142 return LangfuseGuardrail 1143 elif normalized_type == "generation": 1144 return LangfuseGeneration 1145 elif normalized_type == "event": 1146 return LangfuseEvent 1147 elif normalized_type == "span": 1148 return LangfuseSpan 1149 else: 1150 return LangfuseSpan 1151 1152 @staticmethod 1153 def _get_observation_type_from_otel_span(otel_span: otel_trace_api.Span) -> str: 1154 if not otel_span.is_recording(): 1155 return "span" 1156 1157 attributes = getattr(otel_span, "attributes", None) 1158 if attributes is None or not hasattr(attributes, "get"): 1159 return "span" 1160 1161 observation_type = attributes.get( 1162 LangfuseOtelSpanAttributes.OBSERVATION_TYPE, "span" 1163 ) 1164 1165 return observation_type if isinstance(observation_type, str) else "span" 1166 1167 @_agnosticcontextmanager 1168 def _create_span_with_parent_context( 1169 self, 1170 *, 1171 name: str, 1172 parent: Optional[otel_trace_api.Span] = None, 1173 
remote_parent_span: Optional[otel_trace_api.Span] = None, 1174 as_type: ObservationTypeLiteralNoEvent, 1175 end_on_exit: Optional[bool] = None, 1176 input: Optional[Any] = None, 1177 output: Optional[Any] = None, 1178 metadata: Optional[Any] = None, 1179 version: Optional[str] = None, 1180 level: Optional[SpanLevel] = None, 1181 status_message: Optional[str] = None, 1182 completion_start_time: Optional[datetime] = None, 1183 model: Optional[str] = None, 1184 model_parameters: Optional[Dict[str, MapValue]] = None, 1185 usage_details: Optional[Dict[str, int]] = None, 1186 cost_details: Optional[Dict[str, float]] = None, 1187 prompt: Optional[PromptClient] = None, 1188 ) -> Any: 1189 parent_span = parent or cast(otel_trace_api.Span, remote_parent_span) 1190 1191 with otel_trace_api.use_span(parent_span): 1192 with self._start_as_current_otel_span_with_processed_media( 1193 name=name, 1194 as_type=as_type, 1195 end_on_exit=end_on_exit, 1196 input=input, 1197 output=output, 1198 metadata=metadata, 1199 version=version, 1200 level=level, 1201 status_message=status_message, 1202 completion_start_time=completion_start_time, 1203 model=model, 1204 model_parameters=model_parameters, 1205 usage_details=usage_details, 1206 cost_details=cost_details, 1207 prompt=prompt, 1208 ) as langfuse_span: 1209 if remote_parent_span is not None: 1210 langfuse_span._otel_span.set_attribute( 1211 LangfuseOtelSpanAttributes.AS_ROOT, True 1212 ) 1213 1214 yield langfuse_span 1215 1216 @_agnosticcontextmanager 1217 def _start_as_current_otel_span_with_processed_media( 1218 self, 1219 *, 1220 name: str, 1221 as_type: Optional[ObservationTypeLiteralNoEvent] = None, 1222 end_on_exit: Optional[bool] = None, 1223 input: Optional[Any] = None, 1224 output: Optional[Any] = None, 1225 metadata: Optional[Any] = None, 1226 version: Optional[str] = None, 1227 level: Optional[SpanLevel] = None, 1228 status_message: Optional[str] = None, 1229 completion_start_time: Optional[datetime] = None, 1230 model: 
Optional[str] = None, 1231 model_parameters: Optional[Dict[str, MapValue]] = None, 1232 usage_details: Optional[Dict[str, int]] = None, 1233 cost_details: Optional[Dict[str, float]] = None, 1234 prompt: Optional[PromptClient] = None, 1235 ) -> Any: 1236 with self._otel_tracer.start_as_current_span( 1237 name=name, 1238 end_on_exit=end_on_exit if end_on_exit is not None else True, 1239 ) as otel_span: 1240 baggage_token = None 1241 1242 if otel_span.is_recording(): 1243 context_with_app_root_claim = _set_langfuse_trace_id_in_baggage( 1244 trace_id=self._get_otel_trace_id(otel_span), 1245 context=otel_context_api.get_current(), 1246 ) 1247 baggage_token = otel_context_api.attach(context_with_app_root_claim) 1248 1249 span_class = self._get_span_class( 1250 as_type or "generation" 1251 ) # default was "generation" 1252 1253 try: 1254 common_args = { 1255 "otel_span": otel_span, 1256 "langfuse_client": self, 1257 "environment": self._environment, 1258 "release": self._release, 1259 "input": input, 1260 "output": output, 1261 "metadata": metadata, 1262 "version": version, 1263 "level": level, 1264 "status_message": status_message, 1265 } 1266 1267 if span_class in [ 1268 LangfuseGeneration, 1269 LangfuseEmbedding, 1270 ]: 1271 common_args.update( 1272 { 1273 "completion_start_time": completion_start_time, 1274 "model": model, 1275 "model_parameters": model_parameters, 1276 "usage_details": usage_details, 1277 "cost_details": cost_details, 1278 "prompt": prompt, 1279 } 1280 ) 1281 # For span-like types (span, agent, tool, chain, retriever, evaluator, guardrail), no generation properties needed 1282 1283 yield span_class(**common_args) # type: ignore[arg-type] 1284 1285 finally: 1286 if baggage_token is not None: 1287 _detach_context_token_safely(baggage_token) 1288 1289 def _get_current_otel_span(self) -> Optional[otel_trace_api.Span]: 1290 current_span = otel_trace_api.get_current_span() 1291 1292 if current_span is otel_trace_api.INVALID_SPAN: 1293 
langfuse_logger.warning( 1294 "Context error: No active span in current context. Operations that depend on an active span will be skipped. " 1295 "Ensure spans are created with start_as_current_observation() or that you're operating within an active span context." 1296 ) 1297 return None 1298 1299 return current_span 1300 1301 def update_current_generation( 1302 self, 1303 *, 1304 name: Optional[str] = None, 1305 input: Optional[Any] = None, 1306 output: Optional[Any] = None, 1307 metadata: Optional[Any] = None, 1308 version: Optional[str] = None, 1309 level: Optional[SpanLevel] = None, 1310 status_message: Optional[str] = None, 1311 completion_start_time: Optional[datetime] = None, 1312 model: Optional[str] = None, 1313 model_parameters: Optional[Dict[str, MapValue]] = None, 1314 usage_details: Optional[Dict[str, int]] = None, 1315 cost_details: Optional[Dict[str, float]] = None, 1316 prompt: Optional[PromptClient] = None, 1317 ) -> None: 1318 """Update the current active generation span with new information. 1319 1320 This method updates the current generation span in the active context with 1321 additional information. It's useful for adding output, usage stats, or other 1322 details that become available during or after model generation. 
1323 1324 Args: 1325 name: The generation name 1326 input: Updated input data for the model 1327 output: Output from the model (e.g., completions) 1328 metadata: Additional metadata to associate with the generation 1329 version: Version identifier for the model or component 1330 level: Importance level of the generation (info, warning, error) 1331 status_message: Optional status message for the generation 1332 completion_start_time: When the model started generating the response 1333 model: Name/identifier of the AI model used (e.g., "gpt-4") 1334 model_parameters: Parameters used for the model (e.g., temperature, max_tokens) 1335 usage_details: Token usage information (e.g., prompt_tokens, completion_tokens) 1336 cost_details: Cost information for the model call 1337 prompt: Associated prompt template from Langfuse prompt management 1338 1339 Example: 1340 ```python 1341 with langfuse.start_as_current_generation(name="answer-query") as generation: 1342 # Initial setup and API call 1343 response = llm.generate(...) 1344 1345 # Update with results that weren't available at creation time 1346 langfuse.update_current_generation( 1347 output=response.text, 1348 usage_details={ 1349 "prompt_tokens": response.usage.prompt_tokens, 1350 "completion_tokens": response.usage.completion_tokens 1351 } 1352 ) 1353 ``` 1354 """ 1355 if not self._tracing_enabled: 1356 langfuse_logger.debug( 1357 "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode." 
1358 ) 1359 return 1360 1361 current_otel_span = self._get_current_otel_span() 1362 1363 if current_otel_span is not None: 1364 generation = LangfuseGeneration( 1365 otel_span=current_otel_span, langfuse_client=self 1366 ) 1367 1368 if name: 1369 current_otel_span.update_name(name) 1370 1371 generation.update( 1372 input=input, 1373 output=output, 1374 metadata=metadata, 1375 version=version, 1376 level=level, 1377 status_message=status_message, 1378 completion_start_time=completion_start_time, 1379 model=model, 1380 model_parameters=model_parameters, 1381 usage_details=usage_details, 1382 cost_details=cost_details, 1383 prompt=prompt, 1384 ) 1385 1386 def update_current_span( 1387 self, 1388 *, 1389 name: Optional[str] = None, 1390 input: Optional[Any] = None, 1391 output: Optional[Any] = None, 1392 metadata: Optional[Any] = None, 1393 version: Optional[str] = None, 1394 level: Optional[SpanLevel] = None, 1395 status_message: Optional[str] = None, 1396 ) -> None: 1397 """Update the current active span with new information. 1398 1399 This method updates the current span in the active context with 1400 additional information. It's useful for adding outputs or metadata 1401 that become available during execution. 
1402 1403 Args: 1404 name: The span name 1405 input: Updated input data for the operation 1406 output: Output data from the operation 1407 metadata: Additional metadata to associate with the span 1408 version: Version identifier for the code or component 1409 level: Importance level of the span (info, warning, error) 1410 status_message: Optional status message for the span 1411 1412 Example: 1413 ```python 1414 with langfuse.start_as_current_observation(name="process-data") as span: 1415 # Initial processing 1416 result = process_first_part() 1417 1418 # Update with intermediate results 1419 langfuse.update_current_span(metadata={"intermediate_result": result}) 1420 1421 # Continue processing 1422 final_result = process_second_part(result) 1423 1424 # Final update 1425 langfuse.update_current_span(output=final_result) 1426 ``` 1427 """ 1428 if not self._tracing_enabled: 1429 langfuse_logger.debug( 1430 "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode." 1431 ) 1432 return 1433 1434 current_otel_span = self._get_current_otel_span() 1435 1436 if current_otel_span is not None: 1437 span_class = self._get_span_class( 1438 self._get_observation_type_from_otel_span(current_otel_span) 1439 ) 1440 span = span_class( 1441 otel_span=current_otel_span, 1442 langfuse_client=self, 1443 environment=self._environment, 1444 release=self._release, 1445 ) 1446 1447 if name: 1448 current_otel_span.update_name(name) 1449 1450 span.update( 1451 input=input, 1452 output=output, 1453 metadata=metadata, 1454 version=version, 1455 level=level, 1456 status_message=status_message, 1457 ) 1458 1459 @deprecated( 1460 "Trace-level input/output is deprecated. " 1461 "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. " 1462 "This method will be removed in a future major version." 
1463 ) 1464 def set_current_trace_io( 1465 self, 1466 *, 1467 input: Optional[Any] = None, 1468 output: Optional[Any] = None, 1469 ) -> None: 1470 """Set trace-level input and output for the current span's trace. 1471 1472 .. deprecated:: 1473 This is a legacy method for backward compatibility with Langfuse platform 1474 features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge 1475 evaluators). It will be removed in a future major version. 1476 1477 For setting other trace attributes (user_id, session_id, metadata, tags, version), 1478 use :meth:`propagate_attributes` instead. 1479 1480 Args: 1481 input: Input data to associate with the trace. 1482 output: Output data to associate with the trace. 1483 """ 1484 if not self._tracing_enabled: 1485 langfuse_logger.debug( 1486 "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode." 1487 ) 1488 return 1489 1490 current_otel_span = self._get_current_otel_span() 1491 1492 if current_otel_span is not None and current_otel_span.is_recording(): 1493 span_class = self._get_span_class( 1494 self._get_observation_type_from_otel_span(current_otel_span) 1495 ) 1496 span = span_class( 1497 otel_span=current_otel_span, 1498 langfuse_client=self, 1499 environment=self._environment, 1500 release=self._release, 1501 ) 1502 1503 span.set_trace_io( 1504 input=input, 1505 output=output, 1506 ) 1507 1508 def set_current_trace_as_public(self) -> None: 1509 """Make the current trace publicly accessible via its URL. 1510 1511 When a trace is published, anyone with the trace link can view the full trace 1512 without needing to be logged in to Langfuse. This action cannot be undone 1513 programmatically - once published, the entire trace becomes public. 1514 1515 This is a convenience method that publishes the trace from the currently 1516 active span context. 
Use this when you want to make a trace public from 1517 within a traced function without needing direct access to the span object. 1518 """ 1519 if not self._tracing_enabled: 1520 langfuse_logger.debug( 1521 "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode." 1522 ) 1523 return 1524 1525 current_otel_span = self._get_current_otel_span() 1526 1527 if current_otel_span is not None and current_otel_span.is_recording(): 1528 span_class = self._get_span_class( 1529 self._get_observation_type_from_otel_span(current_otel_span) 1530 ) 1531 span = span_class( 1532 otel_span=current_otel_span, 1533 langfuse_client=self, 1534 environment=self._environment, 1535 ) 1536 1537 span.set_trace_as_public() 1538 1539 def create_event( 1540 self, 1541 *, 1542 trace_context: Optional[TraceContext] = None, 1543 name: str, 1544 input: Optional[Any] = None, 1545 output: Optional[Any] = None, 1546 metadata: Optional[Any] = None, 1547 version: Optional[str] = None, 1548 level: Optional[SpanLevel] = None, 1549 status_message: Optional[str] = None, 1550 ) -> LangfuseEvent: 1551 """Create a new Langfuse observation of type 'EVENT'. 1552 1553 The created Langfuse Event observation will be the child of the current span in the context. 
1554 1555 Args: 1556 trace_context: Optional context for connecting to an existing trace 1557 name: Name of the span (e.g., function or operation name) 1558 input: Input data for the operation (can be any JSON-serializable object) 1559 output: Output data from the operation (can be any JSON-serializable object) 1560 metadata: Additional metadata to associate with the span 1561 version: Version identifier for the code or component 1562 level: Importance level of the span (info, warning, error) 1563 status_message: Optional status message for the span 1564 1565 Returns: 1566 The Langfuse Event object 1567 1568 Example: 1569 ```python 1570 event = langfuse.create_event(name="process-event") 1571 ``` 1572 """ 1573 timestamp = time_ns() 1574 1575 if trace_context: 1576 trace_id = trace_context.get("trace_id", None) 1577 parent_span_id = trace_context.get("parent_span_id", None) 1578 1579 if trace_id: 1580 remote_parent_span = self._create_remote_parent_span( 1581 trace_id=trace_id, parent_span_id=parent_span_id 1582 ) 1583 1584 with otel_trace_api.use_span( 1585 cast(otel_trace_api.Span, remote_parent_span) 1586 ): 1587 otel_span = self._otel_tracer.start_span( 1588 name=name, start_time=timestamp 1589 ) 1590 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 1591 1592 return cast( 1593 LangfuseEvent, 1594 LangfuseEvent( 1595 otel_span=otel_span, 1596 langfuse_client=self, 1597 environment=self._environment, 1598 release=self._release, 1599 input=input, 1600 output=output, 1601 metadata=metadata, 1602 version=version, 1603 level=level, 1604 status_message=status_message, 1605 ).end(end_time=timestamp), 1606 ) 1607 1608 otel_span = self._otel_tracer.start_span(name=name, start_time=timestamp) 1609 1610 return cast( 1611 LangfuseEvent, 1612 LangfuseEvent( 1613 otel_span=otel_span, 1614 langfuse_client=self, 1615 environment=self._environment, 1616 release=self._release, 1617 input=input, 1618 output=output, 1619 metadata=metadata, 1620 version=version, 1621 
level=level, 1622 status_message=status_message, 1623 ).end(end_time=timestamp), 1624 ) 1625 1626 def _create_remote_parent_span( 1627 self, *, trace_id: str, parent_span_id: Optional[str] 1628 ) -> Any: 1629 if not self._is_valid_trace_id(trace_id): 1630 langfuse_logger.warning( 1631 f"Passed trace ID '{trace_id}' is not a valid 32 lowercase hex char Langfuse trace id. Ignoring trace ID." 1632 ) 1633 1634 if parent_span_id and not self._is_valid_span_id(parent_span_id): 1635 langfuse_logger.warning( 1636 f"Passed span ID '{parent_span_id}' is not a valid 16 lowercase hex char Langfuse span id. Ignoring parent span ID." 1637 ) 1638 1639 int_trace_id = int(trace_id, 16) 1640 int_parent_span_id = ( 1641 int(parent_span_id, 16) 1642 if parent_span_id 1643 else RandomIdGenerator().generate_span_id() 1644 ) 1645 1646 span_context = otel_trace_api.SpanContext( 1647 trace_id=int_trace_id, 1648 span_id=int_parent_span_id, 1649 trace_flags=otel_trace_api.TraceFlags(0x01), # mark span as sampled 1650 is_remote=False, 1651 ) 1652 1653 return otel_trace_api.NonRecordingSpan(span_context) 1654 1655 def _is_valid_trace_id(self, trace_id: str) -> bool: 1656 pattern = r"^[0-9a-f]{32}$" 1657 1658 return bool(re.match(pattern, trace_id)) 1659 1660 def _is_valid_span_id(self, span_id: str) -> bool: 1661 pattern = r"^[0-9a-f]{16}$" 1662 1663 return bool(re.match(pattern, span_id)) 1664 1665 def _create_observation_id(self, *, seed: Optional[str] = None) -> str: 1666 """Create a unique observation ID for use with Langfuse. 1667 1668 This method generates a unique observation ID (span ID in OpenTelemetry terms) 1669 for use with various Langfuse APIs. It can either generate a random ID or 1670 create a deterministic ID based on a seed string. 1671 1672 Observation IDs must be 16 lowercase hexadecimal characters, representing 8 bytes. 1673 This method ensures the generated ID meets this requirement. 
If you need to 1674 correlate an external ID with a Langfuse observation ID, use the external ID as 1675 the seed to get a valid, deterministic observation ID. 1676 1677 Args: 1678 seed: Optional string to use as a seed for deterministic ID generation. 1679 If provided, the same seed will always produce the same ID. 1680 If not provided, a random ID will be generated. 1681 1682 Returns: 1683 A 16-character lowercase hexadecimal string representing the observation ID. 1684 1685 Example: 1686 ```python 1687 # Generate a random observation ID 1688 obs_id = langfuse.create_observation_id() 1689 1690 # Generate a deterministic ID based on a seed 1691 user_obs_id = langfuse.create_observation_id(seed="user-123-feedback") 1692 1693 # Correlate an external item ID with a Langfuse observation ID 1694 item_id = "item-789012" 1695 correlated_obs_id = langfuse.create_observation_id(seed=item_id) 1696 1697 # Use the ID with Langfuse APIs 1698 langfuse.create_score( 1699 name="relevance", 1700 value=0.95, 1701 trace_id=trace_id, 1702 observation_id=obs_id 1703 ) 1704 ``` 1705 """ 1706 if not seed: 1707 span_id_int = RandomIdGenerator().generate_span_id() 1708 1709 return self._format_otel_span_id(span_id_int) 1710 1711 return sha256(seed.encode("utf-8")).digest()[:8].hex() 1712 1713 @staticmethod 1714 def create_trace_id(*, seed: Optional[str] = None) -> str: 1715 """Create a unique trace ID for use with Langfuse. 1716 1717 This method generates a unique trace ID for use with various Langfuse APIs. 1718 It can either generate a random ID or create a deterministic ID based on 1719 a seed string. 1720 1721 Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. 1722 This method ensures the generated ID meets this requirement. If you need to 1723 correlate an external ID with a Langfuse trace ID, use the external ID as the 1724 seed to get a valid, deterministic Langfuse trace ID. 
1725 1726 Args: 1727 seed: Optional string to use as a seed for deterministic ID generation. 1728 If provided, the same seed will always produce the same ID. 1729 If not provided, a random ID will be generated. 1730 1731 Returns: 1732 A 32-character lowercase hexadecimal string representing the Langfuse trace ID. 1733 1734 Example: 1735 ```python 1736 # Generate a random trace ID 1737 trace_id = langfuse.create_trace_id() 1738 1739 # Generate a deterministic ID based on a seed 1740 session_trace_id = langfuse.create_trace_id(seed="session-456") 1741 1742 # Correlate an external ID with a Langfuse trace ID 1743 external_id = "external-system-123456" 1744 correlated_trace_id = langfuse.create_trace_id(seed=external_id) 1745 1746 # Use the ID with trace context 1747 with langfuse.start_as_current_observation( 1748 name="process-request", 1749 trace_context={"trace_id": trace_id} 1750 ) as span: 1751 # Operation will be part of the specific trace 1752 pass 1753 ``` 1754 """ 1755 if not seed: 1756 trace_id_int = RandomIdGenerator().generate_trace_id() 1757 1758 return Langfuse._format_otel_trace_id(trace_id_int) 1759 1760 return sha256(seed.encode("utf-8")).digest()[:16].hex() 1761 1762 def _get_otel_trace_id(self, otel_span: otel_trace_api.Span) -> str: 1763 span_context = otel_span.get_span_context() 1764 1765 return self._format_otel_trace_id(span_context.trace_id) 1766 1767 def _get_otel_span_id(self, otel_span: otel_trace_api.Span) -> str: 1768 span_context = otel_span.get_span_context() 1769 1770 return self._format_otel_span_id(span_context.span_id) 1771 1772 @staticmethod 1773 def _format_otel_span_id(span_id_int: int) -> str: 1774 """Format an integer span ID to a 16-character lowercase hex string. 1775 1776 Internal method to convert an OpenTelemetry integer span ID to the standard 1777 W3C Trace Context format (16-character lowercase hex string). 
1778 1779 Args: 1780 span_id_int: 64-bit integer representing a span ID 1781 1782 Returns: 1783 A 16-character lowercase hexadecimal string 1784 """ 1785 return format(span_id_int, "016x") 1786 1787 @staticmethod 1788 def _format_otel_trace_id(trace_id_int: int) -> str: 1789 """Format an integer trace ID to a 32-character lowercase hex string. 1790 1791 Internal method to convert an OpenTelemetry integer trace ID to the standard 1792 W3C Trace Context format (32-character lowercase hex string). 1793 1794 Args: 1795 trace_id_int: 128-bit integer representing a trace ID 1796 1797 Returns: 1798 A 32-character lowercase hexadecimal string 1799 """ 1800 return format(trace_id_int, "032x") 1801 1802 @overload 1803 def create_score( 1804 self, 1805 *, 1806 name: str, 1807 value: float, 1808 session_id: Optional[str] = None, 1809 dataset_run_id: Optional[str] = None, 1810 trace_id: Optional[str] = None, 1811 observation_id: Optional[str] = None, 1812 score_id: Optional[str] = None, 1813 data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None, 1814 comment: Optional[str] = None, 1815 config_id: Optional[str] = None, 1816 metadata: Optional[Any] = None, 1817 timestamp: Optional[datetime] = None, 1818 ) -> None: ... 1819 1820 @overload 1821 def create_score( 1822 self, 1823 *, 1824 name: str, 1825 value: str, 1826 session_id: Optional[str] = None, 1827 dataset_run_id: Optional[str] = None, 1828 trace_id: Optional[str] = None, 1829 score_id: Optional[str] = None, 1830 observation_id: Optional[str] = None, 1831 data_type: Optional[ 1832 Literal["CATEGORICAL", "TEXT", "CORRECTION"] 1833 ] = "CATEGORICAL", 1834 comment: Optional[str] = None, 1835 config_id: Optional[str] = None, 1836 metadata: Optional[Any] = None, 1837 timestamp: Optional[datetime] = None, 1838 ) -> None: ... 
def create_score(
    self,
    *,
    name: str,
    value: Union[float, str],
    session_id: Optional[str] = None,
    dataset_run_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    observation_id: Optional[str] = None,
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
    timestamp: Optional[datetime] = None,
) -> None:
    """Create a score for a trace, observation, session, or dataset run.

    Scores can be used to track quality metrics, user feedback, or automated
    evaluations. The score is wrapped in a ``score-create`` event and handed to
    the client's resource manager for delivery.

    The call is a no-op when tracing is disabled. Any error raised while
    building or enqueueing the event is logged and swallowed, so this method
    never raises to the caller.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
        session_id: ID of the Langfuse session to associate the score with
        dataset_run_id: ID of the Langfuse dataset run to associate the score with
        trace_id: ID of the Langfuse trace to associate the score with
        observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score
        timestamp: Optional timestamp for the score (defaults to the value returned by ``_get_timestamp()``)

    Example:
        ```python
        # Create a numeric score for accuracy
        langfuse.create_score(
            name="accuracy",
            value=0.92,
            trace_id="abcdef1234567890abcdef1234567890",
            data_type="NUMERIC",
            comment="High accuracy with minor irrelevant details"
        )

        # Create a categorical score for sentiment
        langfuse.create_score(
            name="sentiment",
            value="positive",
            trace_id="abcdef1234567890abcdef1234567890",
            observation_id="abcdef1234567890",
            data_type="CATEGORICAL"
        )
        ```
    """
    if not self._tracing_enabled:
        return

    score_id = score_id or self._create_observation_id()

    try:
        new_body = ScoreBody(
            id=score_id,
            sessionId=session_id,
            datasetRunId=dataset_run_id,
            traceId=trace_id,
            observationId=observation_id,
            name=name,
            value=value,
            dataType=data_type,  # type: ignore
            comment=comment,
            configId=config_id,
            environment=self._environment,
            metadata=metadata,
        )

        event = {
            "id": self.create_trace_id(),
            "type": "score-create",
            "timestamp": timestamp or _get_timestamp(),
            "body": new_body,
        }

        if self._resources is not None:
            # Force the score to be in sample if it was for a legacy trace ID,
            # i.e. non-32 hexchar. Scores without any trace ID (session or
            # dataset-run scores) are always forced into the sample as well.
            force_sample = (
                not self._is_valid_trace_id(trace_id) if trace_id else True
            )

            self._resources.add_score_task(
                event,
                force_sample=force_sample,
            )

    except Exception as e:
        langfuse_logger.exception(
            f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
        )


def _create_trace_tags_via_ingestion(
    self,
    *,
    trace_id: str,
    tags: List[str],
) -> None:
    """Private helper to enqueue trace tag updates via ingestion API events.

    Builds a ``trace-create`` event carrying only the trace ID and its tags and
    hands it to the resource manager. Does nothing when tracing is disabled or
    when there are no tags to send. Errors are logged and swallowed.

    Args:
        trace_id: ID of the trace whose tags should be updated.
        tags: Tags to attach to the trace.
    """
    if not self._tracing_enabled:
        return

    # Truthiness check: skips an empty list and also tolerates a missing value.
    if not tags:
        return

    try:
        new_body = TraceBody(
            id=trace_id,
            tags=tags,
        )

        event = {
            "id": self.create_trace_id(),
            "type": "trace-create",
            "timestamp": _get_timestamp(),
            "body": new_body,
        }

        if self._resources is not None:
            self._resources.add_trace_task(event)
    except Exception as e:
        langfuse_logger.exception(
            f"Error updating trace tags: Failed to process trace update event for trace_id={trace_id}. Error: {e}"
        )


@overload
def score_current_span(
    self,
    *,
    name: str,
    value: float,
    score_id: Optional[str] = None,
    data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...


@overload
def score_current_span(
    self,
    *,
    name: str,
    value: str,
    score_id: Optional[str] = None,
    data_type: Optional[
        Literal["CATEGORICAL", "TEXT", "CORRECTION"]
    ] = "CATEGORICAL",
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...
def score_current_span(
    self,
    *,
    name: str,
    value: Union[float, str],
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None:
    """Attach a score to the span that is currently active in the context.

    The trace ID and span ID are read from the active span, so the caller does
    not need to know them. When no span is active, nothing is scored.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score

    Example:
        ```python
        with langfuse.start_as_current_generation(name="answer-query") as generation:
            # Generate answer
            response = generate_answer(...)
            generation.update(output=response)

            # Score the generation
            langfuse.score_current_span(
                name="relevance",
                value=0.85,
                data_type="NUMERIC",
                comment="Mostly relevant but contains some tangential information",
                metadata={"model": "gpt-4", "prompt_version": "v2"}
            )
        ```
    """
    active_span = self._get_current_otel_span()

    # Guard clause: without an active span there is nothing to attach to.
    if active_span is None:
        return

    trace_id = self._get_otel_trace_id(active_span)
    observation_id = self._get_otel_span_id(active_span)

    langfuse_logger.info(
        f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
    )

    # The casts only narrow the union types for the type checker; the values
    # are forwarded unchanged.
    narrowed_value = cast(str, value)
    narrowed_data_type = cast(
        Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type
    )

    self.create_score(
        trace_id=trace_id,
        observation_id=observation_id,
        name=name,
        value=narrowed_value,
        score_id=score_id,
        data_type=narrowed_data_type,
        comment=comment,
        config_id=config_id,
        metadata=metadata,
    )


@overload
def score_current_trace(
    self,
    *,
    name: str,
    value: float,
    score_id: Optional[str] = None,
    data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...


@overload
def score_current_trace(
    self,
    *,
    name: str,
    value: str,
    score_id: Optional[str] = None,
    data_type: Optional[
        Literal["CATEGORICAL", "TEXT", "CORRECTION"]
    ] = "CATEGORICAL",
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None: ...
def score_current_trace(
    self,
    *,
    name: str,
    value: Union[float, str],
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None:
    """Create a score for the current trace.

    This method scores the trace of the currently active span. Unlike score_current_span,
    this method associates the score with the entire trace rather than a specific span.
    It's useful for scoring overall performance or quality of the entire operation.
    When no span is active, nothing is scored.

    Args:
        name: Name of the score (e.g., "user_satisfaction", "overall_quality")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-user-request") as span:
            # Process request
            result = process_complete_request()
            span.update(output=result)

            # Score the overall trace
            langfuse.score_current_trace(
                name="overall_quality",
                value=0.95,
                data_type="NUMERIC",
                comment="High quality end-to-end response",
                metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
            )
        ```
    """
    current_span = self._get_current_otel_span()

    if current_span is not None:
        trace_id = self._get_otel_trace_id(current_span)

        langfuse_logger.info(
            f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
        )

        # No observation_id is passed, so the score targets the whole trace.
        self.create_score(
            trace_id=trace_id,
            name=name,
            value=cast(str, value),
            score_id=score_id,
            data_type=cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type),
            comment=comment,
            config_id=config_id,
            metadata=metadata,
        )


def flush(self) -> None:
    """Force flush all pending spans and events to the Langfuse API.

    This method manually flushes any pending spans, scores, and other events to the
    Langfuse API. It's useful in scenarios where you want to ensure all data is sent
    before proceeding, without waiting for the automatic flush interval.
    Does nothing when the client has no resource manager.

    Example:
        ```python
        # Record some spans and scores
        with langfuse.start_as_current_observation(name="operation") as span:
            # Do work...
            pass

        # Ensure all data is sent to Langfuse before proceeding
        langfuse.flush()

        # Continue with other work
        ```
    """
    if self._resources is not None:
        self._resources.flush()


def shutdown(self) -> None:
    """Shut down the Langfuse client and flush all pending data.

    This method cleanly shuts down the Langfuse client, ensuring all pending data
    is flushed to the API and all background threads are properly terminated.

    It's important to call this method when your application is shutting down to
    prevent data loss and resource leaks. For most applications, using the client
    as a context manager or relying on the automatic shutdown via atexit is sufficient.
    Does nothing when the client has no resource manager.

    Example:
        ```python
        # Initialize Langfuse
        langfuse = Langfuse(public_key="...", secret_key="...")

        # Use Langfuse throughout your application
        # ...

        # When application is shutting down
        langfuse.shutdown()
        ```
    """
    if self._resources is not None:
        self._resources.shutdown()


def get_current_trace_id(self) -> Optional[str]:
    """Get the trace ID of the current active span.

    This method retrieves the trace ID from the currently active span in the context.
    It can be used to get the trace ID for referencing in logs, external systems,
    or for creating related operations.

    Returns:
        The current trace ID as a 32-character lowercase hexadecimal string,
        or None if there is no active span or tracing is disabled.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-request") as span:
            # Get the current trace ID for reference
            trace_id = langfuse.get_current_trace_id()

            # Use it for external correlation
            log.info(f"Processing request with trace_id: {trace_id}")

            # Or pass to another system
            external_system.process(data, trace_id=trace_id)
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
        )
        return None

    current_otel_span = self._get_current_otel_span()

    return self._get_otel_trace_id(current_otel_span) if current_otel_span else None


def get_current_observation_id(self) -> Optional[str]:
    """Get the observation ID (span ID) of the current active span.

    This method retrieves the observation ID from the currently active span in the context.
    It can be used to get the observation ID for referencing in logs, external systems,
    or for creating scores or other related operations.

    Returns:
        The current observation ID as a 16-character lowercase hexadecimal string,
        or None if there is no active span or tracing is disabled.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-user-query") as span:
            # Get the current observation ID
            observation_id = langfuse.get_current_observation_id()

            # Store it for later reference
            cache.set(f"query_{query_id}_observation", observation_id)

            # Process the query...
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
        )
        return None

    current_otel_span = self._get_current_otel_span()

    return self._get_otel_span_id(current_otel_span) if current_otel_span else None


def _get_project_id(self) -> Optional[str]:
    """Fetch and return the current project id.

    The id is looked up through ``self.api.projects.get()`` on first use and
    cached on the instance, so later calls do not hit the API again. A failed
    lookup is not cached and is retried on the next call.

    Returns:
        The project id, or None if no project id is found for the api keys.
    """
    if not self._project_id:
        proj = self.api.projects.get()
        if not proj.data or not proj.data[0].id:
            return None

        self._project_id = proj.data[0].id

    return self._project_id


def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
    """Get the URL to view a trace in the Langfuse UI.

    This method generates a URL that links directly to a trace in the Langfuse UI.
    It's useful for providing links in logs, notifications, or debugging tools.

    Args:
        trace_id: Optional trace ID to generate a URL for. If not provided,
                 the trace ID of the current active span will be used.

    Returns:
        A URL string pointing to the trace in the Langfuse UI,
        or None if the project ID couldn't be retrieved or no trace ID is available.

    Example:
        ```python
        # Get URL for the current trace
        with langfuse.start_as_current_observation(name="process-request") as span:
            trace_url = langfuse.get_trace_url()
            log.info(f"Processing trace: {trace_url}")

        # Get URL for a specific trace
        specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
        send_notification(f"Review needed for trace: {specific_trace_url}")
        ```
    """
    final_trace_id = trace_id or self.get_current_trace_id()
    if not final_trace_id:
        # Return before the project lookup, which may call the API.
        return None

    project_id = self._get_project_id()
    if not project_id:
        return None

    return f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"


def get_dataset(
    self,
    name: str,
    *,
    fetch_items_page_size: Optional[int] = 50,
    version: Optional[datetime] = None,
) -> "DatasetClient":
    """Fetch a dataset by its name.

    Args:
        name (str): The name of the dataset to fetch.
        fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
        version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC).
            If provided, returns the state of items at the specified UTC timestamp.
            If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.

    Returns:
        DatasetClient: The dataset with the given name.

    Raises:
        Error: API errors are passed to ``handle_fern_exception`` and then re-raised.
    """
    try:
        langfuse_logger.debug(f"Getting datasets {name}")
        dataset = self.api.datasets.get(dataset_name=self._url_encode(name))

        dataset_items = []
        page = 1

        # Page through the items until the reported last page has been read.
        while True:
            new_items = self.api.dataset_items.list(
                dataset_name=self._url_encode(name, is_url_param=True),
                page=page,
                limit=fetch_items_page_size,
                version=version,
            )
            dataset_items.extend(new_items.data)

            if new_items.meta.total_pages <= page:
                break

            page += 1

        return DatasetClient(
            dataset=dataset,
            items=dataset_items,
            version=version,
            langfuse_client=self,
        )

    except Error as e:
        handle_fern_exception(e)
        raise


def get_dataset_run(
    self, *, dataset_name: str, run_name: str
) -> DatasetRunWithItems:
    """Fetch a dataset run by dataset name and run name.

    Args:
        dataset_name (str): The name of the dataset.
        run_name (str): The name of the run.

    Returns:
        DatasetRunWithItems: The dataset run with its items.

    Raises:
        Error: API errors are passed to ``handle_fern_exception`` and then re-raised.
    """
    try:
        return cast(
            DatasetRunWithItems,
            self.api.datasets.get_run(
                dataset_name=self._url_encode(dataset_name),
                run_name=self._url_encode(run_name),
                request_options=None,
            ),
        )
    except Error as e:
        handle_fern_exception(e)
        raise


def get_dataset_runs(
    self,
    *,
    dataset_name: str,
    page: Optional[int] = None,
    limit: Optional[int] = None,
) -> PaginatedDatasetRuns:
    """Fetch all runs for a dataset.

    Args:
        dataset_name (str): The name of the dataset.
        page (Optional[int]): Page number, starts at 1.
        limit (Optional[int]): Limit of items per page.

    Returns:
        PaginatedDatasetRuns: Paginated list of dataset runs.

    Raises:
        Error: API errors are passed to ``handle_fern_exception`` and then re-raised.
    """
    try:
        return cast(
            PaginatedDatasetRuns,
            self.api.datasets.get_runs(
                dataset_name=self._url_encode(dataset_name),
                page=page,
                limit=limit,
                request_options=None,
            ),
        )
    except Error as e:
        handle_fern_exception(e)
        raise


def delete_dataset_run(
    self, *, dataset_name: str, run_name: str
) -> DeleteDatasetRunResponse:
    """Delete a dataset run and all its run items. This action is irreversible.

    Args:
        dataset_name (str): The name of the dataset.
        run_name (str): The name of the run.

    Returns:
        DeleteDatasetRunResponse: Confirmation of deletion.

    Raises:
        Error: API errors are passed to ``handle_fern_exception`` and then re-raised.
    """
    try:
        return cast(
            DeleteDatasetRunResponse,
            self.api.datasets.delete_run(
                dataset_name=self._url_encode(dataset_name),
                run_name=self._url_encode(run_name),
                request_options=None,
            ),
        )
    except Error as e:
        handle_fern_exception(e)
        raise


def run_experiment(
    self,
    *,
    name: str,
    run_name: Optional[str] = None,
    description: Optional[str] = None,
    data: ExperimentData,
    task: TaskFunction,
    evaluators: Optional[List[EvaluatorFunction]] = None,
    composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
    run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
    max_concurrency: int = 50,
    metadata: Optional[Dict[str, str]] = None,
    _dataset_version: Optional[datetime] = None,
) -> ExperimentResult:
    """Run an experiment on a dataset with automatic tracing and evaluation.

    This method executes a task function on each item in the provided dataset,
    automatically traces all executions with Langfuse for observability, runs
    item-level and run-level evaluators on the outputs, and returns comprehensive
    results with evaluation metrics.

    The experiment system provides:
    - Automatic tracing of all task executions
    - Concurrent processing with configurable limits
    - Comprehensive error handling that isolates failures
    - Integration with Langfuse datasets for experiment tracking
    - Flexible evaluation framework supporting both sync and async evaluators

    Args:
        name: Human-readable name for the experiment. Used for identification
            in the Langfuse UI.
        run_name: Optional exact name for the experiment run. If provided, this will be
            used as the exact dataset run name if the `data` contains Langfuse dataset items.
            If not provided, this will default to the experiment name appended with an ISO timestamp.
        description: Optional description explaining the experiment's purpose,
            methodology, or expected outcomes.
        data: Array of data items to process. Can be either:
            - List of dict-like items with 'input', 'expected_output', 'metadata' keys
            - List of Langfuse DatasetItem objects from dataset.items
        task: Function that processes each data item and returns output.
            Must accept 'item' as keyword argument and can return sync or async results.
            The task function signature should be: task(*, item, **kwargs) -> Any
        evaluators: List of functions to evaluate each item's output individually.
            Each evaluator receives input, output, expected_output, and metadata.
            Can return single Evaluation dict or list of Evaluation dicts.
            Omitting it (or passing None) is the same as passing an empty list.
        composite_evaluator: Optional function that creates composite scores from item-level evaluations.
            Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
            plus the list of evaluations from item-level evaluators. Useful for weighted averages,
            pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
        run_evaluators: List of functions to evaluate the entire experiment run.
            Each run evaluator receives all item_results and can compute aggregate metrics.
            Useful for calculating averages, distributions, or cross-item comparisons.
            Omitting it (or passing None) is the same as passing an empty list.
        max_concurrency: Maximum number of concurrent task executions (default: 50).
            Controls the number of items processed simultaneously. Adjust based on
            API rate limits and system resources.
        metadata: Optional metadata dictionary to attach to all experiment traces.
            This metadata will be included in every trace created during the experiment.
            If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
        _dataset_version: Internal. Forwarded unchanged to the per-item processing as
            the dataset version; not intended to be set by callers directly.

    Returns:
        ExperimentResult containing:
        - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
        - item_results: List of results for each processed item with outputs and evaluations
        - run_evaluations: List of aggregate evaluation results for the entire run
        - experiment_id: Stable identifier for the experiment run across all items
        - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
        - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)

    Raises:
        ValueError: If required parameters are missing or invalid
        Exception: If experiment setup fails (individual item failures are handled gracefully)

    Examples:
        Basic experiment with local data:
        ```python
        def summarize_text(*, item, **kwargs):
            return f"Summary: {item['input'][:50]}..."

        def length_evaluator(*, input, output, expected_output=None, **kwargs):
            return {
                "name": "output_length",
                "value": len(output),
                "comment": f"Output contains {len(output)} characters"
            }

        result = langfuse.run_experiment(
            name="Text Summarization Test",
            description="Evaluate summarization quality and length",
            data=[
                {"input": "Long article text...", "expected_output": "Expected summary"},
                {"input": "Another article...", "expected_output": "Another summary"}
            ],
            task=summarize_text,
            evaluators=[length_evaluator]
        )

        print(f"Processed {len(result.item_results)} items")
        for item_result in result.item_results:
            print(f"Input: {item_result.item['input']}")
            print(f"Output: {item_result.output}")
            print(f"Evaluations: {item_result.evaluations}")
        ```

        Advanced experiment with async task and multiple evaluators:
        ```python
        async def llm_task(*, item, **kwargs):
            # Simulate async LLM call
            response = await openai_client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": item["input"]}]
            )
            return response.choices[0].message.content

        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
            if expected_output and expected_output.lower() in output.lower():
                return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
            return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

        def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
            # Simulate toxicity check
            toxicity_score = check_toxicity(output)  # Your toxicity checker
            return {
                "name": "toxicity",
                "value": toxicity_score,
                "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
            }

        def average_accuracy(*, item_results, **kwargs):
            accuracies = [
                evaluation.value for result in item_results
                for evaluation in result.evaluations
                if evaluation.name == "accuracy"
            ]
            return {
                "name": "average_accuracy",
                "value": sum(accuracies) / len(accuracies) if accuracies else 0,
                "comment": f"Average accuracy across {len(accuracies)} items"
            }

        result = langfuse.run_experiment(
            name="LLM Safety and Accuracy Test",
            description="Evaluate model accuracy and safety across diverse prompts",
            data=test_dataset,  # Your dataset items
            task=llm_task,
            evaluators=[accuracy_evaluator, toxicity_evaluator],
            run_evaluators=[average_accuracy],
            max_concurrency=5,  # Limit concurrent API calls
            metadata={"model": "gpt-4", "temperature": 0.7}
        )
        ```

        Using with Langfuse datasets:
        ```python
        # Get dataset from Langfuse
        dataset = langfuse.get_dataset("my-eval-dataset")

        result = dataset.run_experiment(
            name="Production Model Evaluation",
            description="Monthly evaluation of production model performance",
            task=my_production_task,
            evaluators=[accuracy_evaluator, latency_evaluator]
        )

        # Results automatically linked to dataset in Langfuse UI
        print(f"View results: {result.dataset_run_url}")
        ```

    Note:
        - Task and evaluator functions can be either synchronous or asynchronous
        - Individual item failures are logged but don't stop the experiment
        - All executions are automatically traced and visible in Langfuse UI
        - When using Langfuse datasets, results are automatically linked for easy comparison
        - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
        - Async execution is handled automatically with smart event loop detection
    """
    return cast(
        ExperimentResult,
        run_async_safely(
            self._run_experiment_async(
                name=name,
                run_name=self._create_experiment_run_name(
                    name=name, run_name=run_name
                ),
                description=description,
                data=data,
                task=task,
                # None (the default) and an empty list both mean "no evaluators".
                evaluators=evaluators or [],
                composite_evaluator=composite_evaluator,
                run_evaluators=run_evaluators or [],
                max_concurrency=max_concurrency,
                metadata=metadata,
                dataset_version=_dataset_version,
            ),
        ),
    )


async def _run_experiment_async(
    self,
    *,
    name: str,
    run_name: str,
    description: Optional[str],
    data: ExperimentData,
    task: TaskFunction,
    evaluators: List[EvaluatorFunction],
    composite_evaluator: Optional[CompositeEvaluatorFunction],
    run_evaluators: List[RunEvaluatorFunction],
    max_concurrency: int,
    metadata: Optional[Dict[str, Any]] = None,
    dataset_version: Optional[datetime] = None,
) -> ExperimentResult:
    """Async implementation behind ``run_experiment``.

    Processes every item concurrently (bounded by ``max_concurrency``), drops
    and logs items that failed, runs the run-level evaluators on the remaining
    results, stores their evaluations as dataset-run scores when a dataset run
    exists, flushes the client and returns the assembled result.

    Args:
        name: Experiment name.
        run_name: Final run name (already resolved by the caller).
        description: Optional experiment description.
        data: Items to process.
        task: Task executed for each item.
        evaluators: Item-level evaluators.
        composite_evaluator: Optional composite item-level evaluator.
        run_evaluators: Run-level evaluators, called with ``item_results``.
        max_concurrency: Maximum number of items processed at the same time.
        metadata: Optional metadata forwarded to each item.
        dataset_version: Optional dataset version forwarded to each item.

    Returns:
        The ExperimentResult for this run. ``experiment_id`` is the dataset run
        id when one exists, otherwise an id generated once for the whole run.
    """
    langfuse_logger.debug(
        f"Starting experiment '{name}' run '{run_name}' with {len(data)} items"
    )

    shared_fallback_experiment_id = self._create_observation_id()

    # Set up concurrency control
    semaphore = asyncio.Semaphore(max_concurrency)

    # Process all items
    async def process_item(item: ExperimentItem) -> ExperimentItemResult:
        async with semaphore:
            return await self._process_experiment_item(
                item,
                task,
                evaluators,
                composite_evaluator,
                shared_fallback_experiment_id,
                name,
                run_name,
                description,
                metadata,
                dataset_version,
            )

    # Run all items concurrently
    tasks = [process_item(item) for item in data]
    item_results = await asyncio.gather(*tasks, return_exceptions=True)

    # Filter out any exceptions and log errors. BaseException is checked so
    # that results such as CancelledError are logged instead of being dropped
    # silently.
    valid_results: List[ExperimentItemResult] = []
    for i, result in enumerate(item_results):
        if isinstance(result, BaseException):
            langfuse_logger.error(f"Item {i} failed: {result}")
        elif isinstance(result, ExperimentItemResult):
            valid_results.append(result)  # type: ignore

    # Run experiment-level evaluators
    run_evaluations: List[Evaluation] = []
    for run_evaluator in run_evaluators:
        try:
            evaluations = await _run_evaluator(
                run_evaluator, item_results=valid_results
            )
            run_evaluations.extend(evaluations)
        except Exception as e:
            langfuse_logger.error(f"Run evaluator failed: {e}")

    # The first item result that carries a dataset run id identifies the run
    dataset_run_id = next(
        (
            result.dataset_run_id
            for result in valid_results
            if result.dataset_run_id
        ),
        None,
    )

    # Generate dataset run URL if applicable
    dataset_run_url = None
    if dataset_run_id and data:
        try:
            # Check if the first item has dataset_id (for DatasetItem objects)
            dataset_id = getattr(data[0], "dataset_id", None)

            if dataset_id:
                project_id = self._get_project_id()

                if project_id:
                    dataset_run_url = f"{self._base_url}/project/{project_id}/datasets/{dataset_id}/runs/{dataset_run_id}"

        except Exception as e:
            # URL generation is optional: keep going, but leave a trace of why
            # the URL is missing.
            langfuse_logger.debug(f"Could not generate dataset run URL: {e}")

    # Store run-level evaluations as scores (only possible for dataset runs)
    if dataset_run_id:
        for evaluation in run_evaluations:
            try:
                self.create_score(
                    dataset_run_id=dataset_run_id,
                    name=evaluation.name or "<unknown>",
                    value=evaluation.value,  # type: ignore
                    comment=evaluation.comment,
                    metadata=evaluation.metadata,
                    data_type=evaluation.data_type,  # type: ignore
                    config_id=evaluation.config_id,
                )

            except Exception as e:
                langfuse_logger.error(f"Failed to store run evaluation: {e}")

    # Flush scores and traces
    self.flush()

    return ExperimentResult(
        name=name,
        run_name=run_name,
        description=description,
        item_results=valid_results,
        run_evaluations=run_evaluations,
        experiment_id=dataset_run_id or shared_fallback_experiment_id,
        dataset_run_id=dataset_run_id,
        dataset_run_url=dataset_run_url,
    )
2761 name=name, 2762 run_name=run_name, 2763 description=description, 2764 item_results=valid_results, 2765 run_evaluations=run_evaluations, 2766 experiment_id=dataset_run_id or shared_fallback_experiment_id, 2767 dataset_run_id=dataset_run_id, 2768 dataset_run_url=dataset_run_url, 2769 ) 2770 2771 async def _process_experiment_item( 2772 self, 2773 item: ExperimentItem, 2774 task: Callable, 2775 evaluators: List[Callable], 2776 composite_evaluator: Optional[CompositeEvaluatorFunction], 2777 fallback_experiment_id: str, 2778 experiment_name: str, 2779 experiment_run_name: str, 2780 experiment_description: Optional[str], 2781 experiment_metadata: Optional[Dict[str, Any]] = None, 2782 dataset_version: Optional[datetime] = None, 2783 ) -> ExperimentItemResult: 2784 span_name = "experiment-item-run" 2785 2786 with self.start_as_current_observation(name=span_name) as span: 2787 try: 2788 input_data = ( 2789 item.get("input") 2790 if isinstance(item, dict) 2791 else getattr(item, "input", None) 2792 ) 2793 2794 if input_data is None: 2795 raise ValueError("Experiment Item is missing input. 
Skipping item.") 2796 2797 expected_output = ( 2798 item.get("expected_output") 2799 if isinstance(item, dict) 2800 else getattr(item, "expected_output", None) 2801 ) 2802 2803 item_metadata = ( 2804 item.get("metadata") 2805 if isinstance(item, dict) 2806 else getattr(item, "metadata", None) 2807 ) 2808 2809 final_observation_metadata = { 2810 "experiment_name": experiment_name, 2811 "experiment_run_name": experiment_run_name, 2812 **(experiment_metadata or {}), 2813 } 2814 2815 trace_id = span.trace_id 2816 dataset_id = None 2817 dataset_item_id = None 2818 dataset_run_id = None 2819 2820 # Link to dataset run if this is a dataset item 2821 if hasattr(item, "id") and hasattr(item, "dataset_id"): 2822 try: 2823 # Use sync API to avoid event loop issues when run_async_safely 2824 # creates multiple event loops across different threads 2825 dataset_run_item = await asyncio.to_thread( 2826 self.api.dataset_run_items.create, 2827 run_name=experiment_run_name, 2828 run_description=experiment_description, 2829 metadata=experiment_metadata, 2830 dataset_item_id=item.id, # type: ignore 2831 trace_id=trace_id, 2832 observation_id=span.id, 2833 dataset_version=dataset_version, 2834 ) 2835 2836 dataset_run_id = dataset_run_item.dataset_run_id 2837 2838 except Exception as e: 2839 langfuse_logger.error(f"Failed to create dataset run item: {e}") 2840 2841 if ( 2842 not isinstance(item, dict) 2843 and hasattr(item, "dataset_id") 2844 and hasattr(item, "id") 2845 ): 2846 dataset_id = item.dataset_id 2847 dataset_item_id = item.id 2848 2849 final_observation_metadata.update( 2850 {"dataset_id": dataset_id, "dataset_item_id": dataset_item_id} 2851 ) 2852 2853 if isinstance(item_metadata, dict): 2854 final_observation_metadata.update(item_metadata) 2855 2856 experiment_id = dataset_run_id or fallback_experiment_id 2857 experiment_item_id = ( 2858 dataset_item_id or get_sha256_hash_hex(_serialize(input_data))[:16] 2859 ) 2860 span._otel_span.set_attributes( 2861 { 2862 k: v 2863 for 
k, v in { 2864 LangfuseOtelSpanAttributes.ENVIRONMENT: LANGFUSE_SDK_EXPERIMENT_ENVIRONMENT, 2865 LangfuseOtelSpanAttributes.EXPERIMENT_DESCRIPTION: experiment_description, 2866 LangfuseOtelSpanAttributes.EXPERIMENT_ITEM_EXPECTED_OUTPUT: _serialize( 2867 expected_output 2868 ), 2869 }.items() 2870 if v is not None 2871 } 2872 ) 2873 2874 propagated_experiment_attributes = PropagatedExperimentAttributes( 2875 experiment_id=experiment_id, 2876 experiment_name=experiment_run_name, 2877 experiment_metadata=_flatten_and_serialize_metadata_values( 2878 experiment_metadata 2879 ), 2880 experiment_dataset_id=dataset_id, 2881 experiment_item_id=experiment_item_id, 2882 experiment_item_metadata=_flatten_and_serialize_metadata_values( 2883 item_metadata if isinstance(item_metadata, dict) else None 2884 ), 2885 experiment_item_root_observation_id=span.id, 2886 ) 2887 2888 with _propagate_attributes(experiment=propagated_experiment_attributes): 2889 output = await _run_task(task, item) 2890 2891 span.update( 2892 input=input_data, 2893 output=output, 2894 metadata=final_observation_metadata, 2895 ) 2896 2897 except Exception as e: 2898 span.update( 2899 output=f"Error: {str(e)}", level="ERROR", status_message=str(e) 2900 ) 2901 raise e 2902 2903 # Run evaluators 2904 evaluations = [] 2905 2906 for evaluator in evaluators: 2907 try: 2908 eval_metadata: Optional[Dict[str, Any]] = None 2909 2910 if isinstance(item, dict): 2911 eval_metadata = item.get("metadata") 2912 elif hasattr(item, "metadata"): 2913 eval_metadata = item.metadata 2914 2915 with _propagate_attributes( 2916 experiment=propagated_experiment_attributes 2917 ): 2918 eval_results = await _run_evaluator( 2919 evaluator, 2920 input=input_data, 2921 output=output, 2922 expected_output=expected_output, 2923 metadata=eval_metadata, 2924 ) 2925 evaluations.extend(eval_results) 2926 2927 # Store evaluations as scores 2928 for evaluation in eval_results: 2929 self.create_score( 2930 trace_id=trace_id, 2931 
observation_id=span.id, 2932 name=evaluation.name, 2933 value=evaluation.value, # type: ignore 2934 comment=evaluation.comment, 2935 metadata=evaluation.metadata, 2936 config_id=evaluation.config_id, 2937 data_type=evaluation.data_type, # type: ignore 2938 ) 2939 2940 except Exception as e: 2941 langfuse_logger.error(f"Evaluator failed: {e}") 2942 2943 # Run composite evaluator if provided and we have evaluations 2944 if composite_evaluator and evaluations: 2945 try: 2946 composite_eval_metadata: Optional[Dict[str, Any]] = None 2947 if isinstance(item, dict): 2948 composite_eval_metadata = item.get("metadata") 2949 elif hasattr(item, "metadata"): 2950 composite_eval_metadata = item.metadata 2951 2952 with _propagate_attributes( 2953 experiment=propagated_experiment_attributes 2954 ): 2955 result = composite_evaluator( 2956 input=input_data, 2957 output=output, 2958 expected_output=expected_output, 2959 metadata=composite_eval_metadata, 2960 evaluations=evaluations, 2961 ) 2962 2963 # Handle async composite evaluators 2964 if asyncio.iscoroutine(result): 2965 result = await result 2966 2967 # Normalize to list 2968 composite_evals: List[Evaluation] = [] 2969 if isinstance(result, (dict, Evaluation)): 2970 composite_evals = [result] # type: ignore 2971 elif isinstance(result, list): 2972 composite_evals = result # type: ignore 2973 2974 # Store composite evaluations as scores and add to evaluations list 2975 for composite_evaluation in composite_evals: 2976 self.create_score( 2977 trace_id=trace_id, 2978 observation_id=span.id, 2979 name=composite_evaluation.name, 2980 value=composite_evaluation.value, # type: ignore 2981 comment=composite_evaluation.comment, 2982 metadata=composite_evaluation.metadata, 2983 config_id=composite_evaluation.config_id, 2984 data_type=composite_evaluation.data_type, # type: ignore 2985 ) 2986 evaluations.append(composite_evaluation) 2987 2988 except Exception as e: 2989 langfuse_logger.error(f"Composite evaluator failed: {e}") 2990 2991 
return ExperimentItemResult( 2992 item=item, 2993 output=output, 2994 evaluations=evaluations, 2995 trace_id=trace_id, 2996 dataset_run_id=dataset_run_id, 2997 ) 2998 2999 def _create_experiment_run_name( 3000 self, *, name: Optional[str] = None, run_name: Optional[str] = None 3001 ) -> str: 3002 if run_name: 3003 return run_name 3004 3005 iso_timestamp = _get_timestamp().isoformat().replace("+00:00", "Z") 3006 3007 return f"{name} - {iso_timestamp}" 3008 3009 def run_batched_evaluation( 3010 self, 3011 *, 3012 scope: Literal["traces", "observations"], 3013 mapper: MapperFunction, 3014 filter: Optional[str] = None, 3015 fetch_batch_size: int = 50, 3016 fetch_trace_fields: Optional[str] = None, 3017 max_items: Optional[int] = None, 3018 max_retries: int = 3, 3019 evaluators: List[EvaluatorFunction], 3020 composite_evaluator: Optional[CompositeEvaluatorFunction] = None, 3021 max_concurrency: int = 5, 3022 metadata: Optional[Dict[str, Any]] = None, 3023 _add_observation_scores_to_trace: bool = False, 3024 _additional_trace_tags: Optional[List[str]] = None, 3025 resume_from: Optional[BatchEvaluationResumeToken] = None, 3026 verbose: bool = False, 3027 ) -> BatchEvaluationResult: 3028 """Fetch traces or observations and run evaluations on each item. 3029 3030 This method provides a powerful way to evaluate existing data in Langfuse at scale. 3031 It fetches items based on filters, transforms them using a mapper function, runs 3032 evaluators on each item, and creates scores that are linked back to the original 3033 entities. This is ideal for: 3034 3035 - Running evaluations on production traces after deployment 3036 - Backtesting new evaluation metrics on historical data 3037 - Batch scoring of observations for quality monitoring 3038 - Periodic evaluation runs on recent data 3039 3040 The method uses a streaming/pipeline approach to process items in batches, making 3041 it memory-efficient for large datasets. 
It includes comprehensive error handling, 3042 retry logic, and resume capability for long-running evaluations. 3043 3044 Args: 3045 scope: The type of items to evaluate. Must be one of: 3046 - "traces": Evaluate complete traces with all their observations 3047 - "observations": Evaluate individual observations (spans, generations, events) 3048 mapper: Function that transforms API response objects into evaluator inputs. 3049 Receives a trace/observation object and returns an EvaluatorInputs 3050 instance with input, output, expected_output, and metadata fields. 3051 Can be sync or async. 3052 evaluators: List of evaluation functions to run on each item. Each evaluator 3053 receives the mapped inputs and returns Evaluation object(s). Evaluator 3054 failures are logged but don't stop the batch evaluation. 3055 filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples: 3056 - '{"tags": ["production"]}' 3057 - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' 3058 Default: None (fetches all items). 3059 fetch_batch_size: Number of items to fetch per API call and hold in memory. 3060 Larger values may be faster but use more memory. Default: 50. 3061 fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'. 3062 max_items: Maximum total number of items to process. If None, processes all 3063 items matching the filter. Useful for testing or limiting evaluation runs. 3064 Default: None (process all). 3065 max_concurrency: Maximum number of items to evaluate concurrently. Controls 3066 parallelism and resource usage. Default: 5. 
3067 composite_evaluator: Optional function that creates a composite score from 3068 item-level evaluations. Receives the original item and its evaluations, 3069 returns a single Evaluation. Useful for weighted averages or combined metrics. 3070 Default: None. 3071 metadata: Optional metadata dict to add to all created scores. Useful for 3072 tracking evaluation runs, versions, or other context. Default: None. 3073 max_retries: Maximum number of retry attempts for failed batch fetches. 3074 Uses exponential backoff (1s, 2s, 4s). Default: 3. 3075 verbose: If True, logs progress information to console. Useful for monitoring 3076 long-running evaluations. Default: False. 3077 resume_from: Optional resume token from a previous incomplete run. Allows 3078 continuing evaluation after interruption or failure. Default: None. 3079 3080 3081 Returns: 3082 BatchEvaluationResult containing: 3083 - total_items_fetched: Number of items fetched from API 3084 - total_items_processed: Number of items successfully evaluated 3085 - total_items_failed: Number of items that failed evaluation 3086 - total_scores_created: Scores created by item-level evaluators 3087 - total_composite_scores_created: Scores created by composite evaluator 3088 - total_evaluations_failed: Individual evaluator failures 3089 - evaluator_stats: Per-evaluator statistics (success rate, scores created) 3090 - resume_token: Token for resuming if incomplete (None if completed) 3091 - completed: True if all items processed 3092 - duration_seconds: Total execution time 3093 - failed_item_ids: IDs of items that failed 3094 - error_summary: Error types and counts 3095 - has_more_items: True if max_items reached but more exist 3096 3097 Raises: 3098 ValueError: If invalid scope is provided. 
3099 3100 Examples: 3101 Basic trace evaluation: 3102 ```python 3103 from langfuse import Langfuse, EvaluatorInputs, Evaluation 3104 3105 client = Langfuse() 3106 3107 # Define mapper to extract fields from traces 3108 def trace_mapper(trace): 3109 return EvaluatorInputs( 3110 input=trace.input, 3111 output=trace.output, 3112 expected_output=None, 3113 metadata={"trace_id": trace.id} 3114 ) 3115 3116 # Define evaluator 3117 def length_evaluator(*, input, output, expected_output, metadata): 3118 return Evaluation( 3119 name="output_length", 3120 value=len(output) if output else 0 3121 ) 3122 3123 # Run batch evaluation 3124 result = client.run_batched_evaluation( 3125 scope="traces", 3126 mapper=trace_mapper, 3127 evaluators=[length_evaluator], 3128 filter='{"tags": ["production"]}', 3129 max_items=1000, 3130 verbose=True 3131 ) 3132 3133 print(f"Processed {result.total_items_processed} traces") 3134 print(f"Created {result.total_scores_created} scores") 3135 ``` 3136 3137 Evaluation with composite scorer: 3138 ```python 3139 def accuracy_evaluator(*, input, output, expected_output, metadata): 3140 # ... evaluation logic 3141 return Evaluation(name="accuracy", value=0.85) 3142 3143 def relevance_evaluator(*, input, output, expected_output, metadata): 3144 # ... 
evaluation logic 3145 return Evaluation(name="relevance", value=0.92) 3146 3147 def composite_evaluator(*, item, evaluations): 3148 # Weighted average of evaluations 3149 weights = {"accuracy": 0.6, "relevance": 0.4} 3150 total = sum( 3151 e.value * weights.get(e.name, 0) 3152 for e in evaluations 3153 if isinstance(e.value, (int, float)) 3154 ) 3155 return Evaluation( 3156 name="composite_score", 3157 value=total, 3158 comment=f"Weighted average of {len(evaluations)} metrics" 3159 ) 3160 3161 result = client.run_batched_evaluation( 3162 scope="traces", 3163 mapper=trace_mapper, 3164 evaluators=[accuracy_evaluator, relevance_evaluator], 3165 composite_evaluator=composite_evaluator, 3166 filter='{"user_id": "important_user"}', 3167 verbose=True 3168 ) 3169 ``` 3170 3171 Handling incomplete runs with resume: 3172 ```python 3173 # Initial run that may fail or timeout 3174 result = client.run_batched_evaluation( 3175 scope="observations", 3176 mapper=obs_mapper, 3177 evaluators=[my_evaluator], 3178 max_items=10000, 3179 verbose=True 3180 ) 3181 3182 # Check if incomplete 3183 if not result.completed and result.resume_token: 3184 print(f"Processed {result.resume_token.items_processed} items before interruption") 3185 3186 # Resume from where it left off 3187 result = client.run_batched_evaluation( 3188 scope="observations", 3189 mapper=obs_mapper, 3190 evaluators=[my_evaluator], 3191 resume_from=result.resume_token, 3192 verbose=True 3193 ) 3194 3195 print(f"Total items processed: {result.total_items_processed}") 3196 ``` 3197 3198 Monitoring evaluator performance: 3199 ```python 3200 result = client.run_batched_evaluation(...) 
def auth_check(self) -> bool:
    """Check if the provided credentials (public and secret key) are valid.

    The check lists the projects visible to the configured keys via
    ``self.api.projects.get()`` and treats an empty list as a failure.

    Returns:
        True if at least one project was returned for the credentials.
        False if the lookup raised an ``AttributeError``, which is logged as
        the client not being properly initialized.

    Raises:
        Exception: If no projects were found for the provided credentials.
        Error: API errors are passed to ``handle_fern_exception`` and then
            re-raised.

    Note:
        This method is blocking. It is discouraged to use it in production code.
    """
    try:
        projects = self.api.projects.get()
        project_count = len(projects.data)

        # Validate before logging: previously a "successful" debug line was
        # emitted even when zero projects were found and the check failed.
        if project_count == 0:
            raise Exception(
                "Auth check failed, no project found for the keys provided."
            )

        langfuse_logger.debug(
            f"Auth check successful, found {project_count} projects"
        )
        return True

    except AttributeError as e:
        langfuse_logger.warning(
            f"Auth check failed: Client not properly initialized. Error: {e}"
        )
        return False

    except Error as e:
        handle_fern_exception(e)
        raise e
3327 3328 Args: 3329 dataset_name: Name of the dataset in which the dataset item should be created. 3330 input: Input data. Defaults to None. Can contain any dict, list or scalar. 3331 expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar. 3332 metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar. 3333 source_trace_id: Id of the source trace. Defaults to None. 3334 source_observation_id: Id of the source observation. Defaults to None. 3335 status: Status of the dataset item. Defaults to ACTIVE for newly created items. 3336 id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets. 3337 3338 Returns: 3339 DatasetItem: The created dataset item as returned by the Langfuse API. 3340 3341 Example: 3342 ```python 3343 from langfuse import Langfuse 3344 3345 langfuse = Langfuse() 3346 3347 # Uploading items to the Langfuse dataset named "capital_cities" 3348 langfuse.create_dataset_item( 3349 dataset_name="capital_cities", 3350 input={"input": {"country": "Italy"}}, 3351 expected_output={"expected_output": "Rome"}, 3352 metadata={"foo": "bar"} 3353 ) 3354 ``` 3355 """ 3356 try: 3357 langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}") 3358 3359 result = self.api.dataset_items.create( 3360 dataset_name=dataset_name, 3361 input=input, 3362 expected_output=expected_output, 3363 metadata=metadata, 3364 source_trace_id=source_trace_id, 3365 source_observation_id=source_observation_id, 3366 status=status, 3367 id=id, 3368 ) 3369 3370 return cast(DatasetItem, result) 3371 except Error as e: 3372 handle_fern_exception(e) 3373 raise e 3374 3375 def resolve_media_references( 3376 self, 3377 *, 3378 obj: Any, 3379 resolve_with: Literal["base64_data_uri"], 3380 max_depth: int = 10, 3381 content_fetch_timeout_seconds: int = 5, 3382 ) -> Any: 3383 """Replace media reference strings 
in an object with base64 data URIs. 3384 3385 This method recursively traverses an object (up to max_depth) looking for media reference strings 3386 in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using 3387 the provided Langfuse client and replaces the reference string with a base64 data URI. 3388 3389 If fetching media content fails for a reference string, a warning is logged and the reference 3390 string is left unchanged. 3391 3392 Args: 3393 obj: The object to process. Can be a primitive value, array, or nested object. 3394 If the object has a __dict__ attribute, a dict will be returned instead of the original object type. 3395 resolve_with: The representation of the media content to replace the media reference string with. 3396 Currently only "base64_data_uri" is supported. 3397 max_depth: int: The maximum depth to traverse the object. Default is 10. 3398 content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5. 3399 3400 Returns: 3401 A deep copy of the input object with all media references replaced with base64 data URIs where possible. 3402 If the input object has a __dict__ attribute, a dict will be returned instead of the original object type. 3403 3404 Example: 3405 obj = { 3406 "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", 3407 "nested": { 3408 "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" 3409 } 3410 } 3411 3412 result = await LangfuseMedia.resolve_media_references(obj, langfuse_client) 3413 3414 # Result: 3415 # { 3416 # "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...", 3417 # "nested": { 3418 # "pdf": "data:application/pdf;base64,JVBERi0xLjcK..." 
def get_prompt(
    self,
    name: str,
    *,
    version: Optional[int] = None,
    label: Optional[str] = None,
    type: Literal["chat", "text"] = "text",
    cache_ttl_seconds: Optional[int] = None,
    fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
    max_retries: Optional[int] = None,
    fetch_timeout_seconds: Optional[int] = None,
) -> PromptClient:
    """Get a prompt.

    This method first looks the prompt up in the local prompt cache:

    - If there is no cached entry, or caching is disabled (`cache_ttl_seconds=0`), the prompt is
      fetched from the server synchronously and the cache is updated. If that fetch fails and a
      `fallback` was provided, a fallback prompt client is returned instead of raising.
    - If the cached entry is expired, the stale prompt is returned immediately and a refresh is
      scheduled in the background. If scheduling the refresh fails, a warning is logged and the
      stale prompt is still returned.
    - Otherwise the cached prompt is returned.

    Args:
        name (str): The name of the prompt to retrieve.

    Keyword Args:
        version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
        label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
        cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a
        keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
        type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text". Also selects which
        fallback client (text or chat) is built when `fallback` is used.
        fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt content (a string for "text", a list of chat messages for "chat") to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. An empty fallback is treated as not provided. Defaults to None.
        max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. Values are clamped to the range 0 to 4.
        NOTE(review): the fetch helper in this file retries with `backoff.constant`; confirm the intended retry schedule before documenting it as exponential.
        fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt (forwarded as the request's `timeout_in_seconds`). Defaults to the default timeout set on the SDK, which is 5 seconds per default.

    Returns:
        The prompt object retrieved from the cache or directly fetched if not cached or caching is disabled, of type
        - TextPromptClient, if type argument is 'text'.
        - ChatPromptClient, if type argument is 'chat'.

    Raises:
        Error: If the SDK resources were not initialized.
        ValueError: If both `version` and `label` are given, or if `name` is empty.
        Exception: Propagates any exception raised while fetching a prompt that is not in the cache
        (or when caching is disabled), unless a `fallback` is provided.
    """
    if self._resources is None:
        raise Error(
            "SDK is not correctly initialized. Check the init logs for more details."
        )
    if version is not None and label is not None:
        raise ValueError("Cannot specify both version and label at the same time.")

    if not name:
        raise ValueError("Prompt name cannot be empty.")

    cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
    # Clamp the caller-supplied retry count (default 2, at most 4).
    bounded_max_retries = self._get_bounded_max_retries(
        max_retries, default_max_retries=2, max_retries_upper_bound=4
    )

    langfuse_logger.debug(f"Getting prompt '{cache_key}'")
    cached_prompt = self._resources.prompt_cache.get(cache_key)

    # Cache miss, or caller explicitly disabled caching: fetch synchronously.
    if cached_prompt is None or cache_ttl_seconds == 0:
        langfuse_logger.debug(
            f"Prompt '{cache_key}' not found in cache or caching disabled."
        )
        try:
            return self._fetch_prompt_and_update_cache(
                name,
                version=version,
                label=label,
                ttl_seconds=cache_ttl_seconds,
                max_retries=bounded_max_retries,
                fetch_timeout_seconds=fetch_timeout_seconds,
            )
        except Exception as e:
            if fallback:
                langfuse_logger.warning(
                    f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
                )

                # Synthesize a prompt payload from the fallback content; version 0
                # is used when the caller did not pin a version.
                fallback_client_args: Dict[str, Any] = {
                    "name": name,
                    "prompt": fallback,
                    "type": type,
                    "version": version or 0,
                    "config": {},
                    "labels": [label] if label else [],
                    "tags": [],
                }

                if type == "text":
                    return TextPromptClient(
                        prompt=Prompt_Text(**fallback_client_args),
                        is_fallback=True,
                    )

                if type == "chat":
                    return ChatPromptClient(
                        prompt=Prompt_Chat(**fallback_client_args),
                        is_fallback=True,
                    )

            # No usable fallback: surface the original fetch error.
            raise e

    if cached_prompt.is_expired():
        langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
        try:
            # refresh prompt in background thread, refresh_prompt deduplicates tasks
            langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")

            def refresh_task() -> None:
                self._fetch_prompt_and_update_cache(
                    name,
                    version=version,
                    label=label,
                    ttl_seconds=cache_ttl_seconds,
                    max_retries=bounded_max_retries,
                    fetch_timeout_seconds=fetch_timeout_seconds,
                )

            self._resources.prompt_cache.add_refresh_prompt_task_if_current(
                cache_key,
                cached_prompt,
                refresh_task,
            )
            langfuse_logger.debug(
                f"Returning stale prompt '{cache_key}' from cache."
            )
            # return stale prompt
            return cached_prompt.value

        except Exception as e:
            langfuse_logger.warning(
                f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
            )
            # creation of refresh prompt task failed, return stale prompt
            return cached_prompt.value

    # Fresh cache hit.
    return cached_prompt.value
3637 return prompt 3638 3639 except NotFoundError as not_found_error: 3640 langfuse_logger.warning( 3641 f"Prompt '{cache_key}' not found during refresh, evicting from cache." 3642 ) 3643 if self._resources is not None: 3644 self._resources.prompt_cache.delete(cache_key) 3645 raise not_found_error 3646 3647 except Exception as e: 3648 langfuse_logger.error( 3649 f"Error while fetching prompt '{cache_key}': {str(e)}" 3650 ) 3651 raise e 3652 3653 def _get_bounded_max_retries( 3654 self, 3655 max_retries: Optional[int], 3656 *, 3657 default_max_retries: int = 2, 3658 max_retries_upper_bound: int = 4, 3659 ) -> int: 3660 if max_retries is None: 3661 return default_max_retries 3662 3663 bounded_max_retries = min( 3664 max(max_retries, 0), 3665 max_retries_upper_bound, 3666 ) 3667 3668 return bounded_max_retries 3669 3670 @overload 3671 def create_prompt( 3672 self, 3673 *, 3674 name: str, 3675 prompt: List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]], 3676 labels: List[str] = [], 3677 tags: Optional[List[str]] = None, 3678 type: Optional[Literal["chat"]], 3679 config: Optional[Any] = None, 3680 commit_message: Optional[str] = None, 3681 ) -> ChatPromptClient: ... 3682 3683 @overload 3684 def create_prompt( 3685 self, 3686 *, 3687 name: str, 3688 prompt: str, 3689 labels: List[str] = [], 3690 tags: Optional[List[str]] = None, 3691 type: Optional[Literal["text"]] = "text", 3692 config: Optional[Any] = None, 3693 commit_message: Optional[str] = None, 3694 ) -> TextPromptClient: ... 3695 3696 def create_prompt( 3697 self, 3698 *, 3699 name: str, 3700 prompt: Union[ 3701 str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]] 3702 ], 3703 labels: List[str] = [], 3704 tags: Optional[List[str]] = None, 3705 type: Optional[Literal["chat", "text"]] = "text", 3706 config: Optional[Any] = None, 3707 commit_message: Optional[str] = None, 3708 ) -> PromptClient: 3709 """Create a new prompt in Langfuse. 
3710 3711 Keyword Args: 3712 name : The name of the prompt to be created. 3713 prompt : The content of the prompt to be created. 3714 is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead. 3715 labels: The labels of the prompt. Defaults to None. To create a default-served prompt, add the 'production' label. 3716 tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt. 3717 config: Additional structured data to be saved with the prompt. Defaults to None. 3718 type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text". 3719 commit_message: Optional string describing the change. 3720 3721 Returns: 3722 TextPromptClient: The prompt if type argument is 'text'. 3723 ChatPromptClient: The prompt if type argument is 'chat'. 3724 """ 3725 try: 3726 langfuse_logger.debug(f"Creating prompt {name=}, {labels=}") 3727 3728 if type == "chat": 3729 if not isinstance(prompt, list): 3730 raise ValueError( 3731 "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes." 
def update_prompt(
    self,
    *,
    name: str,
    version: int,
    new_labels: List[str] = [],
) -> Any:
    """Update the labels of an existing prompt version in Langfuse.

    After a successful update, the Langfuse SDK prompt cache is invalidated
    for all prompts with the specified name.

    Args:
        name (str): The name of the prompt to update.
        version (int): The version number of the prompt to update.
        new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].

    Returns:
        Prompt: The updated prompt from the Langfuse API.

    Raises:
        Error: API errors are passed to ``handle_fern_exception`` and then
            re-raised; the prompt cache is left untouched in that case.
    """
    # NOTE: the shared default list is only read and forwarded, never mutated.
    try:
        updated_prompt = self.api.prompt_version.update(
            # The name is URL-encoded before being sent to the API.
            name=self._url_encode(name),
            version=version,
            new_labels=new_labels,
        )
    except Error as e:
        # Same error-reporting pattern as the other API-calling methods.
        handle_fern_exception(e)
        raise e

    if self._resources is not None:
        # Cache entries are invalidated by the raw (un-encoded) prompt name.
        self._resources.prompt_cache.invalidate(name)

    return updated_prompt
Main client for Langfuse tracing and platform features.
This class provides an interface for creating and managing traces, spans, and generations in Langfuse as well as interacting with the Langfuse API.
The client features a thread-safe singleton pattern for each unique public API key, ensuring consistent trace context propagation across your application. It implements efficient batching of spans with configurable flush settings and includes background thread management for media uploads and score ingestion.
Configuration is flexible through either direct parameters or environment variables, with graceful fallbacks and runtime configuration updates.
Attributes:
- api: Synchronous API client for Langfuse backend communication
- async_api: Asynchronous API client for Langfuse backend communication
- _otel_tracer: Internal LangfuseTracer instance managing OpenTelemetry components
Arguments:
- public_key (Optional[str]): Your Langfuse public API key. Can also be set via LANGFUSE_PUBLIC_KEY environment variable.
- secret_key (Optional[str]): Your Langfuse secret API key. Can also be set via LANGFUSE_SECRET_KEY environment variable.
- base_url (Optional[str]): The Langfuse API base URL. Defaults to "https://cloud.langfuse.com". Can also be set via LANGFUSE_BASE_URL environment variable.
- host (Optional[str]): Deprecated. Use base_url instead. The Langfuse API host URL. Defaults to "https://cloud.langfuse.com".
- timeout (Optional[int]): Timeout in seconds for API requests. Defaults to 5 seconds.
- httpx_client (Optional[httpx.Client]): Custom httpx client for making non-tracing HTTP requests. If not provided, a default client will be created.
- debug (bool): Enable debug logging. Defaults to False. Can also be set via LANGFUSE_DEBUG environment variable.
- tracing_enabled (Optional[bool]): Enable or disable tracing. Defaults to True. Can also be set via LANGFUSE_TRACING_ENABLED environment variable.
- flush_at (Optional[int]): Number of spans to batch before sending to the API. Defaults to 512. Can also be set via LANGFUSE_FLUSH_AT environment variable.
- flush_interval (Optional[float]): Time in seconds between batch flushes. Defaults to 5 seconds. Can also be set via LANGFUSE_FLUSH_INTERVAL environment variable.
- environment (Optional[str]): Environment name for tracing. Default is 'default'. Can also be set via LANGFUSE_TRACING_ENVIRONMENT environment variable. Can be any lowercase alphanumeric string with hyphens and underscores that does not start with 'langfuse'.
- release (Optional[str]): Release version/hash of your application. Used for grouping analytics by release.
- media_upload_thread_count (Optional[int]): Number of background threads for handling media uploads. Defaults to 1. Can also be set via LANGFUSE_MEDIA_UPLOAD_THREAD_COUNT environment variable.
- sample_rate (Optional[float]): Sampling rate for traces (0.0 to 1.0). Defaults to 1.0 (100% of traces are sampled). Can also be set via LANGFUSE_SAMPLE_RATE environment variable.
- mask (Optional[MaskFunction]): Function to mask sensitive data synchronously when Langfuse SDK attributes are created. This applies only to data set through Langfuse SDK APIs such as
`start_observation()`, `update()`, and `set_trace_io()`.
- mask_otel_spans (Optional[MaskOtelSpansFunction]): Synchronous export-stage hook for masking raw OpenTelemetry span attributes before this Langfuse client sends them to Langfuse. Use this for spans created by third-party OpenTelemetry instrumentations, or when you need to inspect final span attributes after export filtering and Langfuse media handling. It does not modify spans already exported through other OpenTelemetry exporters.
The hook receives one OpenTelemetry export batch. A batch is not guaranteed to contain a complete trace, request, or Langfuse observation tree. The hook usually runs on the OpenTelemetry batch span processor worker thread; during
`flush()` and shutdown it may run on the caller thread. Keep it synchronous, deterministic, and fast. Return
`None` to leave the batch unchanged. Return `MaskOtelSpansResult` with `OtelSpanPatch` values to delete or replace attributes on selected spans. If the hook raises or returns an invalid batch result, Langfuse drops the whole export batch. If one returned span patch is invalid, Langfuse drops only that span from the Langfuse export. Example:
```python
from typing import Optional

from langfuse import Langfuse
from langfuse.types import (
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanPatch,
)


def mask_otel_spans(
    *, params: MaskOtelSpansParams
) -> Optional[MaskOtelSpansResult]:
    patches = {}

    for identifier, span in params.spans.items():
        if "gen_ai.prompt.0.content" in span.attributes:
            patches[identifier] = OtelSpanPatch(
                delete_attributes=("gen_ai.prompt.0.content",),
                set_attributes={"masking.applied": True},
            )

    return MaskOtelSpansResult(span_patches=patches)


langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
```
- blocked_instrumentation_scopes (Optional[List[str]]): Deprecated. Use `should_export_span` instead. Equivalent behavior:
```python
from langfuse.span_filter import is_default_export_span

blocked = {"sqlite", "requests"}
should_export_span = lambda span: (
    is_default_export_span(span)
    and (
        span.instrumentation_scope is None
        or span.instrumentation_scope.name not in blocked
    )
)
```
- should_export_span (Optional[Callable[[ReadableSpan], bool]]): Callback to decide whether to export a span. If omitted, Langfuse uses the default filter (Langfuse SDK spans, spans with
`gen_ai.*` attributes, and known LLM instrumentation scopes).
- additional_headers (Optional[Dict[str, str]]): Additional headers to include in all API requests and in the default OTLPSpanExporter requests. These headers will be merged with default headers. Note: If httpx_client is provided, additional_headers must be set directly on your custom httpx_client as well. If
`span_exporter` is provided, these headers are not wired into that exporter and must be configured on the exporter instance directly.
- tracer_provider (Optional[TracerProvider]): OpenTelemetry TracerProvider to use for Langfuse. This can be useful to set to have disconnected tracing between Langfuse and other OpenTelemetry-span emitting libraries. Note: To track active spans, the context is still shared between TracerProviders. This may lead to broken trace trees.
- span_exporter (Optional[SpanExporter]): Custom OpenTelemetry span exporter for the Langfuse span processor. If omitted, Langfuse creates an OTLPSpanExporter pointed at the Langfuse OTLP endpoint. If provided, Langfuse does not wire
`base_url`, exporter headers, exporter auth, or exporter timeout into it. Configure endpoint, headers, and timeout on the exporter instance directly. If you are sending spans to Langfuse v4 or using Langfuse Cloud Fast Preview, include `x-langfuse-ingestion-version=4` on the exporter to enable real time processing of exported spans.
Example:
```python
from langfuse.otel import Langfuse

# Initialize the client (reads from env vars if not provided)
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    host="https://cloud.langfuse.com",  # Optional, default shown
)

# Create a trace span
with langfuse.start_as_current_observation(name="process-query") as span:
    # Your application code here

    # Create a nested generation span for an LLM call
    with span.start_as_current_generation(
        name="generate-response",
        model="gpt-4",
        input={"query": "Tell me about AI"},
        model_parameters={"temperature": 0.7, "max_tokens": 500}
    ) as generation:
        # Generate response here
        response = "AI is a field of computer science..."

        generation.update(
            output=response,
            usage_details={"prompt_tokens": 10, "completion_tokens": 50},
            cost_details={"total_cost": 0.0023}
        )

        # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
        generation.score(name="relevance", value=0.95, data_type="NUMERIC")
```
def __init__(
    self,
    *,
    public_key: Optional[str] = None,
    secret_key: Optional[str] = None,
    base_url: Optional[str] = None,
    host: Optional[str] = None,
    timeout: Optional[int] = None,
    httpx_client: Optional[httpx.Client] = None,
    debug: bool = False,
    tracing_enabled: Optional[bool] = True,
    flush_at: Optional[int] = None,
    flush_interval: Optional[float] = None,
    environment: Optional[str] = None,
    release: Optional[str] = None,
    media_upload_thread_count: Optional[int] = None,
    sample_rate: Optional[float] = None,
    mask: Optional[MaskFunction] = None,
    mask_otel_spans: Optional[MaskOtelSpansFunction] = None,
    blocked_instrumentation_scopes: Optional[List[str]] = None,
    should_export_span: Optional[Callable[[ReadableSpan], bool]] = None,
    additional_headers: Optional[Dict[str, str]] = None,
    tracer_provider: Optional[TracerProvider] = None,
    span_exporter: Optional[SpanExporter] = None,
):
    """Resolve the configuration and wire up the API clients and tracer.

    Every setting is keyword-only. Explicit arguments win over environment
    variables, which win over built-in defaults:

    - Base URL: ``base_url``, then ``LANGFUSE_BASE_URL``, then ``host``,
      then ``LANGFUSE_HOST``, then ``"https://cloud.langfuse.com"``.
    - Release: ``release``, then ``LANGFUSE_RELEASE``, then
      ``get_common_release_envs()``.
    - Sample rate: ``sample_rate`` when it is not ``None`` (an explicit
      ``0.0`` is honoured), otherwise ``LANGFUSE_SAMPLE_RATE``, otherwise
      ``1.0``.
    - Timeout: ``timeout``, then ``LANGFUSE_TIMEOUT``, then ``5``.
    - Tracing is enabled only when ``tracing_enabled`` is truthy *and*
      ``LANGFUSE_TRACING_ENABLED`` is not ``"false"``.

    If no public or secret key can be resolved, a warning is logged, a
    no-op tracer is installed and initialisation stops early: the resource
    manager and the ``api`` / ``async_api`` attributes are not assigned by
    this method in that case.

    Raises:
        ValueError: If the resolved sample rate is outside ``[0.0, 1.0]``.
    """
    self._base_url = (
        base_url
        or os.environ.get(LANGFUSE_BASE_URL)
        or host
        or os.environ.get(LANGFUSE_HOST, "https://cloud.langfuse.com")
    )
    self._environment = environment or cast(
        str, os.environ.get(LANGFUSE_TRACING_ENVIRONMENT)
    )
    self._release = (
        release
        or os.environ.get(LANGFUSE_RELEASE, None)
        or get_common_release_envs()
    )
    self._project_id: Optional[str] = None

    # Only fall back to the environment when no value was passed. A plain
    # `sample_rate or ...` would treat an explicit 0.0 (sample nothing) as
    # "unset" and silently sample everything instead.
    if sample_rate is None:
        sample_rate = float(os.environ.get(LANGFUSE_SAMPLE_RATE, 1.0))
    if not 0.0 <= sample_rate <= 1.0:
        raise ValueError(
            f"Sample rate must be between 0.0 and 1.0, got {sample_rate}"
        )

    timeout = timeout or int(os.environ.get(LANGFUSE_TIMEOUT, 5))

    self._tracing_enabled = (
        tracing_enabled
        and os.environ.get(LANGFUSE_TRACING_ENABLED, "true").lower() != "false"
    )
    if not self._tracing_enabled:
        langfuse_logger.info(
            "Configuration: Langfuse tracing is explicitly disabled. No data will be sent to the Langfuse API."
        )

    debug = (
        debug if debug else (os.getenv(LANGFUSE_DEBUG, "false").lower() == "true")
    )
    if debug:
        logging.basicConfig(
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )
        langfuse_logger.setLevel(logging.DEBUG)

    # Without credentials the client degrades to a no-op tracer instead of
    # raising, so an unconfigured application keeps running.
    public_key = public_key or os.environ.get(LANGFUSE_PUBLIC_KEY)
    if public_key is None:
        langfuse_logger.warning(
            "Authentication error: Langfuse client initialized without public_key. Client will be disabled. "
            "Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. "
        )
        self._otel_tracer = otel_trace_api.NoOpTracer()
        return

    secret_key = secret_key or os.environ.get(LANGFUSE_SECRET_KEY)
    if secret_key is None:
        langfuse_logger.warning(
            "Authentication error: Langfuse client initialized without secret_key. Client will be disabled. "
            "Provide a secret_key parameter or set LANGFUSE_SECRET_KEY environment variable. "
        )
        self._otel_tracer = otel_trace_api.NoOpTracer()
        return

    if os.environ.get("OTEL_SDK_DISABLED", "false").lower() == "true":
        langfuse_logger.warning(
            "OTEL_SDK_DISABLED is set. Langfuse tracing will be disabled and no traces will appear in the UI."
        )

    if blocked_instrumentation_scopes is not None:
        warnings.warn(
            "`blocked_instrumentation_scopes` is deprecated and will be removed in a future release. "
            "Use `should_export_span` instead. Example: "
            "from langfuse.span_filter import is_default_export_span; "
            'blocked={"scope"}; should_export_span=lambda span: '
            "is_default_export_span(span) and (span.instrumentation_scope is None or "
            "span.instrumentation_scope.name not in blocked).",
            DeprecationWarning,
            stacklevel=2,
        )

    # Initialize api and tracer if requirements are met
    # NOTE(review): the raw `release` argument is forwarded here, not the
    # env-resolved `self._release` — confirm this is intentional.
    self._resources = LangfuseResourceManager(
        public_key=public_key,
        secret_key=secret_key,
        base_url=self._base_url,
        timeout=timeout,
        environment=self._environment,
        release=release,
        flush_at=flush_at,
        flush_interval=flush_interval,
        httpx_client=httpx_client,
        media_upload_thread_count=media_upload_thread_count,
        sample_rate=sample_rate,
        mask=mask,
        mask_otel_spans=mask_otel_spans,
        tracing_enabled=self._tracing_enabled,
        blocked_instrumentation_scopes=blocked_instrumentation_scopes,
        should_export_span=should_export_span,
        additional_headers=additional_headers,
        tracer_provider=tracer_provider,
        span_exporter=span_exporter,
    )
    self._mask = self._resources.mask

    # Fall back to a no-op tracer when tracing is off or the resource
    # manager did not provide a tracer.
    self._otel_tracer = (
        self._resources.tracer
        if self._tracing_enabled and self._resources.tracer is not None
        else otel_trace_api.NoOpTracer()
    )
    self.api = self._resources.api
    self.async_api = self._resources.async_api
553 def start_observation( 554 self, 555 *, 556 trace_context: Optional[TraceContext] = None, 557 name: str, 558 as_type: ObservationTypeLiteralNoEvent = "span", 559 input: Optional[Any] = None, 560 output: Optional[Any] = None, 561 metadata: Optional[Any] = None, 562 version: Optional[str] = None, 563 level: Optional[SpanLevel] = None, 564 status_message: Optional[str] = None, 565 completion_start_time: Optional[datetime] = None, 566 model: Optional[str] = None, 567 model_parameters: Optional[Dict[str, MapValue]] = None, 568 usage_details: Optional[Dict[str, int]] = None, 569 cost_details: Optional[Dict[str, float]] = None, 570 prompt: Optional[PromptClient] = None, 571 ) -> Union[ 572 LangfuseSpan, 573 LangfuseGeneration, 574 LangfuseAgent, 575 LangfuseTool, 576 LangfuseChain, 577 LangfuseRetriever, 578 LangfuseEvaluator, 579 LangfuseEmbedding, 580 LangfuseGuardrail, 581 ]: 582 """Create a new observation of the specified type. 583 584 This method creates a new observation but does not set it as the current span in the 585 context. To create and use an observation within a context, use start_as_current_observation(). 
586 587 Args: 588 trace_context: Optional context for connecting to an existing trace. It only takes effect when it contains a truthy "trace_id": the span is then started under a remote parent span built from "trace_id" and "parent_span_id" and is marked with the AS_ROOT attribute. Otherwise the span is started directly from the tracer. 589 name: Name of the observation 590 as_type: Type of observation to create (defaults to "span") 591 input: Input data for the operation 592 output: Output data from the operation 593 metadata: Additional metadata to associate with the observation 594 version: Version identifier for the code or component 595 level: Importance level of the observation 596 status_message: Optional status message for the observation 597 completion_start_time: When the model started generating (for generation types) 598 model: Name/identifier of the AI model used (for generation types) 599 model_parameters: Parameters used for the model (for generation types) 600 usage_details: Token usage information (for generation types) 601 cost_details: Cost information (for generation types) 602 prompt: Associated prompt template (for generation types) 603 604 Returns: 605 An observation object of the appropriate type that must be ended with .end() 606 """ 607 if trace_context: 608 trace_id = trace_context.get("trace_id", None) 609 parent_span_id = trace_context.get("parent_span_id", None) 610 611 if trace_id: 612 remote_parent_span = self._create_remote_parent_span( 613 trace_id=trace_id, parent_span_id=parent_span_id 614 ) 615 616 with otel_trace_api.use_span( 617 cast(otel_trace_api.Span, remote_parent_span) 618 ): 619 otel_span = self._otel_tracer.start_span(name=name) 620 otel_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True) 621 622 return self._create_observation_from_otel_span( 623 otel_span=otel_span, 624 as_type=as_type, 625 input=input, 626 output=output, 627 metadata=metadata, 628 version=version, 629 level=level, 630 status_message=status_message, 631 completion_start_time=completion_start_time, 632 model=model, 633 model_parameters=model_parameters, 634 usage_details=usage_details, 635 cost_details=cost_details, 636 prompt=prompt, 637 ) 638 # No usable trace_id was supplied: start the span without a remote parent. 639 otel_span = 
self._otel_tracer.start_span(name=name) 640 641 return self._create_observation_from_otel_span( 642 otel_span=otel_span, 643 as_type=as_type, 644 input=input, 645 output=output, 646 metadata=metadata, 647 version=version, 648 level=level, 649 status_message=status_message, 650 completion_start_time=completion_start_time, 651 model=model, 652 model_parameters=model_parameters, 653 usage_details=usage_details, 654 cost_details=cost_details, 655 prompt=prompt, 656 )
Create a new observation of the specified type.
This method creates a new observation but does not set it as the current span in the context. To create and use an observation within a context, use start_as_current_observation().
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation
- status_message: Optional status message for the observation
- completion_start_time: When the model started generating (for generation types)
- model: Name/identifier of the AI model used (for generation types)
- model_parameters: Parameters used for the model (for generation types)
- usage_details: Token usage information (for generation types)
- cost_details: Cost information (for generation types)
- prompt: Associated prompt template (for generation types)
Returns:
An observation object of the appropriate type that must be ended with .end()
def start_as_current_observation(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    as_type: ObservationTypeLiteralNoEvent = "span",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
    end_on_exit: Optional[bool] = None,
) -> Union[
    _AgnosticContextManager[LangfuseGeneration],
    _AgnosticContextManager[LangfuseSpan],
    _AgnosticContextManager[LangfuseAgent],
    _AgnosticContextManager[LangfuseTool],
    _AgnosticContextManager[LangfuseChain],
    _AgnosticContextManager[LangfuseRetriever],
    _AgnosticContextManager[LangfuseEvaluator],
    _AgnosticContextManager[LangfuseEmbedding],
    _AgnosticContextManager[LangfuseGuardrail],
]:
    """Open an observation as the current span for the duration of a ``with`` block.

    The observation becomes the active span inside the block, so anything
    created there is nested under it. Unless a usable ``trace_context`` is
    given, the observation is a child of whatever span is current.

    Args:
        trace_context: Optional link to an existing trace. It is only used
            when it contains a truthy ``trace_id``; the observation is then
            attached to a remote parent span built from ``trace_id`` and
            ``parent_span_id``.
        name: Name of the observation (e.g., function or operation name)
        as_type: Type of observation to create (defaults to "span"). An
            unrecognised value logs a warning and falls back to "span".
        input: Input data for the operation (can be any JSON-serializable object)
        output: Output data from the operation (can be any JSON-serializable object)
        metadata: Additional metadata to associate with the observation
        version: Version identifier for the code or component
        level: Importance level of the observation (info, warning, error)
        status_message: Optional status message for the observation
        end_on_exit (default: True): Whether to end the span automatically when
            leaving the context manager. If False, the span must be manually
            ended to avoid memory leaks.

        The remaining parameters are only forwarded for generation-like types
        ("generation" or "embedding"); they are ignored for all other types.
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management

    Returns:
        A context manager that yields the appropriate observation type based on as_type

    Example:
        ```python
        # Create a span
        with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
            # Do work
            result = process_data()
            span.update(output=result)

            # Create a child span automatically
            with span.start_as_current_observation(name="sub-operation") as child_span:
                # Do sub-operation work
                child_span.update(output="sub-result")

        # Create a generation observation
        with langfuse.start_as_current_observation(
            name="answer-generation",
            as_type="generation",
            model="gpt-4"
        ) as generation:
            # Generate answer
            response = llm.generate(...)
            generation.update(output=response)
        ```
    """
    generation_like = as_type in get_observation_types_list(
        ObservationTypeGenerationLike
    )

    # Reject unknown types first. This is not expected to happen for valid
    # callers; note that trace_context is deliberately not consulted here.
    if not generation_like and as_type not in get_observation_types_list(
        ObservationTypeSpanLike
    ):
        langfuse_logger.warning(
            f"Unknown observation type: {as_type}, falling back to span"
        )
        return self._start_as_current_otel_span_with_processed_media(
            as_type="span",
            name=name,
            end_on_exit=end_on_exit,
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
        )

    # Resolve the optional remote parent once for both type families.
    attach_to_remote_parent = False
    remote_parent_span: Any = None
    if trace_context:
        trace_id = trace_context.get("trace_id", None)
        parent_span_id = trace_context.get("parent_span_id", None)

        if trace_id:
            remote_parent_span = self._create_remote_parent_span(
                trace_id=trace_id, parent_span_id=parent_span_id
            )
            attach_to_remote_parent = True

    if generation_like:
        if attach_to_remote_parent:
            return cast(
                Union[
                    _AgnosticContextManager[LangfuseGeneration],
                    _AgnosticContextManager[LangfuseEmbedding],
                ],
                self._create_span_with_parent_context(
                    as_type=as_type,
                    name=name,
                    remote_parent_span=remote_parent_span,
                    parent=None,
                    end_on_exit=end_on_exit,
                    input=input,
                    output=output,
                    metadata=metadata,
                    version=version,
                    level=level,
                    status_message=status_message,
                    completion_start_time=completion_start_time,
                    model=model,
                    model_parameters=model_parameters,
                    usage_details=usage_details,
                    cost_details=cost_details,
                    prompt=prompt,
                ),
            )

        return cast(
            Union[
                _AgnosticContextManager[LangfuseGeneration],
                _AgnosticContextManager[LangfuseEmbedding],
            ],
            self._start_as_current_otel_span_with_processed_media(
                as_type=as_type,
                name=name,
                end_on_exit=end_on_exit,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
                completion_start_time=completion_start_time,
                model=model,
                model_parameters=model_parameters,
                usage_details=usage_details,
                cost_details=cost_details,
                prompt=prompt,
            ),
        )

    # Span-like types: the generation-only fields are not forwarded.
    if attach_to_remote_parent:
        return cast(
            Union[
                _AgnosticContextManager[LangfuseSpan],
                _AgnosticContextManager[LangfuseAgent],
                _AgnosticContextManager[LangfuseTool],
                _AgnosticContextManager[LangfuseChain],
                _AgnosticContextManager[LangfuseRetriever],
                _AgnosticContextManager[LangfuseEvaluator],
                _AgnosticContextManager[LangfuseGuardrail],
            ],
            self._create_span_with_parent_context(
                as_type=as_type,
                name=name,
                remote_parent_span=remote_parent_span,
                parent=None,
                end_on_exit=end_on_exit,
                input=input,
                output=output,
                metadata=metadata,
                version=version,
                level=level,
                status_message=status_message,
            ),
        )

    return cast(
        Union[
            _AgnosticContextManager[LangfuseSpan],
            _AgnosticContextManager[LangfuseAgent],
            _AgnosticContextManager[LangfuseTool],
            _AgnosticContextManager[LangfuseChain],
            _AgnosticContextManager[LangfuseRetriever],
            _AgnosticContextManager[LangfuseEvaluator],
            _AgnosticContextManager[LangfuseGuardrail],
        ],
        self._start_as_current_otel_span_with_processed_media(
            as_type=as_type,
            name=name,
            end_on_exit=end_on_exit,
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
        ),
    )
Create a new observation and set it as the current span in a context manager.
This method creates a new observation of the specified type and sets it as the current span within a context manager. Use this method with a 'with' statement to automatically handle the observation lifecycle within a code block.
The created observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the observation (e.g., function or operation name)
- as_type: Type of observation to create (defaults to "span")
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the observation
- version: Version identifier for the code or component
- level: Importance level of the observation (info, warning, error)
- status_message: Optional status message for the observation
- end_on_exit (default: True): Whether to end the span automatically when leaving the context manager. If False, the span must be manually ended to avoid memory leaks.
- The following parameters are available when as_type is: "generation" or "embedding".
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Returns:
A context manager that yields the appropriate observation type based on as_type
Example:
```python
# Create a span
with langfuse.start_as_current_observation(name="process-query", as_type="span") as span:
    # Do work
    result = process_data()
    span.update(output=result)

    # Create a child span automatically
    with span.start_as_current_observation(name="sub-operation") as child_span:
        # Do sub-operation work
        child_span.update(output="sub-result")

# Create a tool observation
with langfuse.start_as_current_observation(name="web-search", as_type="tool") as tool:
    # Do tool work
    results = search_web(query)
    tool.update(output=results)

# Create a generation observation
with langfuse.start_as_current_observation(
    name="answer-generation",
    as_type="generation",
    model="gpt-4"
) as generation:
    # Generate answer
    response = llm.generate(...)
    generation.update(output=response)
```
def update_current_generation(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
) -> None:
    """Attach new information to the generation that is currently active.

    Wraps the current OpenTelemetry span as a generation and forwards the
    given fields to it. Handy for recording output, usage or cost once the
    model call has finished. The call does nothing when tracing is disabled
    or when there is no current span.

    Args:
        name: The generation name; when truthy, the active span is renamed
        input: Updated input data for the model
        output: Output from the model (e.g., completions)
        metadata: Additional metadata to associate with the generation
        version: Version identifier for the model or component
        level: Importance level of the generation (info, warning, error)
        status_message: Optional status message for the generation
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management

    Example:
        ```python
        with langfuse.start_as_current_observation(
            name="answer-query", as_type="generation"
        ) as generation:
            # Initial setup and API call
            response = llm.generate(...)

            # Update with results that weren't available at creation time
            langfuse.update_current_generation(
                output=response.text,
                usage_details={
                    "prompt_tokens": response.usage.prompt_tokens,
                    "completion_tokens": response.usage.completion_tokens
                }
            )
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: update_current_generation - Tracing is disabled or client is in no-op mode."
        )
        return

    active_span = self._get_current_otel_span()
    if active_span is None:
        # Nothing is active, so there is nothing to update.
        return

    wrapped_generation = LangfuseGeneration(
        otel_span=active_span, langfuse_client=self
    )

    if name:
        active_span.update_name(name)

    wrapped_generation.update(
        input=input,
        output=output,
        metadata=metadata,
        version=version,
        level=level,
        status_message=status_message,
        completion_start_time=completion_start_time,
        model=model,
        model_parameters=model_parameters,
        usage_details=usage_details,
        cost_details=cost_details,
        prompt=prompt,
    )
Update the current active generation span with new information.
This method updates the current generation span in the active context with additional information. It's useful for adding output, usage stats, or other details that become available during or after model generation.
Arguments:
- name: The generation name
- input: Updated input data for the model
- output: Output from the model (e.g., completions)
- metadata: Additional metadata to associate with the generation
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
Example:
```python
with langfuse.start_as_current_generation(name="answer-query") as generation:
    # Initial setup and API call
    response = llm.generate(...)

    # Update with results that weren't available at creation time
    langfuse.update_current_generation(
        output=response.text,
        usage_details={
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens
        }
    )
```
def update_current_span(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
) -> None:
    """Apply new attributes to the span that is currently active.

    Use this to attach outputs or metadata that only become known while the
    operation is running. The call does nothing when tracing is disabled or
    when there is no active span in the context.

    Args:
        name: New name for the span; the span is only renamed when this is truthy
        input: Updated input data for the operation
        output: Output data from the operation
        metadata: Additional metadata to associate with the span
        version: Version identifier for the code or component
        level: Importance level of the span (info, warning, error)
        status_message: Optional status message for the span

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-data") as span:
            result = process_first_part()

            # Attach an intermediate result
            langfuse.update_current_span(metadata={"intermediate_result": result})

            final_result = process_second_part(result)

            # Attach the final output
            langfuse.update_current_span(output=final_result)
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: update_current_span - Tracing is disabled or client is in no-op mode."
        )
        return

    active_span = self._get_current_otel_span()
    if active_span is None:
        return

    # Wrap the raw OTel span in the Langfuse class matching its observation type.
    wrapper_class = self._get_span_class(
        self._get_observation_type_from_otel_span(active_span)
    )
    wrapper = wrapper_class(
        otel_span=active_span,
        langfuse_client=self,
        environment=self._environment,
        release=self._release,
    )

    if name:
        active_span.update_name(name)

    wrapper.update(
        input=input,
        output=output,
        metadata=metadata,
        version=version,
        level=level,
        status_message=status_message,
    )
Update the current active span with new information.
This method updates the current span in the active context with additional information. It's useful for adding outputs or metadata that become available during execution.
Arguments:
- name: The span name
- input: Updated input data for the operation
- output: Output data from the operation
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Example:
with langfuse.start_as_current_observation(name="process-data") as span: # Initial processing result = process_first_part() # Update with intermediate results langfuse.update_current_span(metadata={"intermediate_result": result}) # Continue processing final_result = process_second_part(result) # Final update langfuse.update_current_span(output=final_result)
@deprecated(
    "Trace-level input/output is deprecated. "
    "For trace attributes (user_id, session_id, tags, etc.), use propagate_attributes() instead. "
    "This method will be removed in a future major version."
)
def set_current_trace_io(
    self,
    *,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
) -> None:
    """Attach trace-level input and output to the trace of the current span.

    .. deprecated::
        Legacy method kept for backward compatibility with Langfuse platform
        features that still rely on trace-level input/output (e.g., legacy
        LLM-as-a-judge evaluators). It will be removed in a future major version.

        For other trace attributes (user_id, session_id, metadata, tags, version),
        use :meth:`propagate_attributes` instead.

    The call does nothing when tracing is disabled, when there is no active
    span, or when the active span is no longer recording.

    Args:
        input: Input data to associate with the trace.
        output: Output data to associate with the trace.
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: set_current_trace_io - Tracing is disabled or client is in no-op mode."
        )
        return

    active_span = self._get_current_otel_span()
    if active_span is None or not active_span.is_recording():
        return

    # Wrap the raw OTel span in the Langfuse class matching its observation type.
    wrapper_class = self._get_span_class(
        self._get_observation_type_from_otel_span(active_span)
    )
    wrapper = wrapper_class(
        otel_span=active_span,
        langfuse_client=self,
        environment=self._environment,
        release=self._release,
    )
    wrapper.set_trace_io(input=input, output=output)
Set trace-level input and output for the current span's trace.
Deprecated: This is a legacy method for backward compatibility with Langfuse platform features that still rely on trace-level input/output (e.g., legacy LLM-as-a-judge evaluators). It will be removed in a future major version.
For setting other trace attributes (user_id, session_id, metadata, tags, version),
use propagate_attributes() instead.
Arguments:
- input: Input data to associate with the trace.
- output: Output data to associate with the trace.
def set_current_trace_as_public(self) -> None:
    """Make the current trace publicly accessible via its URL.

    When a trace is published, anyone with the trace link can view the full trace
    without needing to be logged in to Langfuse. This action cannot be undone
    programmatically - once published, the entire trace becomes public.

    This is a convenience method that publishes the trace from the currently
    active span context. Use this when you want to make a trace public from
    within a traced function without needing direct access to the span object.

    The call does nothing when tracing is disabled, when there is no active
    span, or when the active span is no longer recording.
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: set_current_trace_as_public - Tracing is disabled or client is in no-op mode."
        )
        return

    current_otel_span = self._get_current_otel_span()
    if current_otel_span is None or not current_otel_span.is_recording():
        return

    span_class = self._get_span_class(
        self._get_observation_type_from_otel_span(current_otel_span)
    )
    # Pass the client's release as well as its environment, so the wrapper is
    # built the same way as in update_current_span / set_current_trace_io.
    span = span_class(
        otel_span=current_otel_span,
        langfuse_client=self,
        environment=self._environment,
        release=self._release,
    )

    span.set_trace_as_public()
Make the current trace publicly accessible via its URL.
When a trace is published, anyone with the trace link can view the full trace without needing to be logged in to Langfuse. This action cannot be undone programmatically - once published, the entire trace becomes public.
This is a convenience method that publishes the trace from the currently active span context. Use this when you want to make a trace public from within a traced function without needing direct access to the span object.
def create_event(
    self,
    *,
    trace_context: Optional[TraceContext] = None,
    name: str,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
) -> LangfuseEvent:
    """Create a new Langfuse observation of type 'EVENT'.

    The event is started and ended with the same timestamp. When
    ``trace_context`` carries a ``trace_id``, the event is attached to that
    trace (optionally under ``parent_span_id``); otherwise it becomes a child
    of the current span in the context.

    Args:
        trace_context: Optional context for connecting to an existing trace
        name: Name of the span (e.g., function or operation name)
        input: Input data for the operation (can be any JSON-serializable object)
        output: Output data from the operation (can be any JSON-serializable object)
        metadata: Additional metadata to associate with the span
        version: Version identifier for the code or component
        level: Importance level of the span (info, warning, error)
        status_message: Optional status message for the span

    Returns:
        The Langfuse Event object

    Example:
        ```python
        event = langfuse.create_event(name="process-event")
        ```
    """
    timestamp = time_ns()

    def _wrap_and_end(raw_span: Any) -> LangfuseEvent:
        # Build the event wrapper and close it immediately at the start timestamp.
        event = LangfuseEvent(
            otel_span=raw_span,
            langfuse_client=self,
            environment=self._environment,
            release=self._release,
            input=input,
            output=output,
            metadata=metadata,
            version=version,
            level=level,
            status_message=status_message,
        )
        return cast(LangfuseEvent, event.end(end_time=timestamp))

    if trace_context:
        trace_id = trace_context.get("trace_id", None)
        parent_span_id = trace_context.get("parent_span_id", None)

        if trace_id:
            remote_parent_span = self._create_remote_parent_span(
                trace_id=trace_id, parent_span_id=parent_span_id
            )

            # Start the event span while the remote parent is the current span.
            with otel_trace_api.use_span(
                cast(otel_trace_api.Span, remote_parent_span)
            ):
                linked_span = self._otel_tracer.start_span(
                    name=name, start_time=timestamp
                )
                linked_span.set_attribute(LangfuseOtelSpanAttributes.AS_ROOT, True)

                return _wrap_and_end(linked_span)

    # No usable trace context: parent is whatever span is current in the context.
    return _wrap_and_end(
        self._otel_tracer.start_span(name=name, start_time=timestamp)
    )
Create a new Langfuse observation of type 'EVENT'.
The created Langfuse Event observation will be the child of the current span in the context.
Arguments:
- trace_context: Optional context for connecting to an existing trace
- name: Name of the span (e.g., function or operation name)
- input: Input data for the operation (can be any JSON-serializable object)
- output: Output data from the operation (can be any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
Returns:
The Langfuse Event object
Example:
event = langfuse.create_event(name="process-event")
1713 @staticmethod 1714 def create_trace_id(*, seed: Optional[str] = None) -> str: 1715 """Create a unique trace ID for use with Langfuse. 1716 1717 This method generates a unique trace ID for use with various Langfuse APIs. 1718 It can either generate a random ID or create a deterministic ID based on 1719 a seed string. 1720 1721 Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. 1722 This method ensures the generated ID meets this requirement. If you need to 1723 correlate an external ID with a Langfuse trace ID, use the external ID as the 1724 seed to get a valid, deterministic Langfuse trace ID. 1725 1726 Args: 1727 seed: Optional string to use as a seed for deterministic ID generation. 1728 If provided, the same seed will always produce the same ID. 1729 If not provided, a random ID will be generated. 1730 1731 Returns: 1732 A 32-character lowercase hexadecimal string representing the Langfuse trace ID. 1733 1734 Example: 1735 ```python 1736 # Generate a random trace ID 1737 trace_id = langfuse.create_trace_id() 1738 1739 # Generate a deterministic ID based on a seed 1740 session_trace_id = langfuse.create_trace_id(seed="session-456") 1741 1742 # Correlate an external ID with a Langfuse trace ID 1743 external_id = "external-system-123456" 1744 correlated_trace_id = langfuse.create_trace_id(seed=external_id) 1745 1746 # Use the ID with trace context 1747 with langfuse.start_as_current_observation( 1748 name="process-request", 1749 trace_context={"trace_id": trace_id} 1750 ) as span: 1751 # Operation will be part of the specific trace 1752 pass 1753 ``` 1754 """ 1755 if not seed: 1756 trace_id_int = RandomIdGenerator().generate_trace_id() 1757 1758 return Langfuse._format_otel_trace_id(trace_id_int) 1759 1760 return sha256(seed.encode("utf-8")).digest()[:16].hex()
Create a unique trace ID for use with Langfuse.
This method generates a unique trace ID for use with various Langfuse APIs. It can either generate a random ID or create a deterministic ID based on a seed string.
Trace IDs must be 32 lowercase hexadecimal characters, representing 16 bytes. This method ensures the generated ID meets this requirement. If you need to correlate an external ID with a Langfuse trace ID, use the external ID as the seed to get a valid, deterministic Langfuse trace ID.
Arguments:
- seed: Optional string to use as a seed for deterministic ID generation. If provided, the same seed will always produce the same ID. If not provided, a random ID will be generated.
Returns:
A 32-character lowercase hexadecimal string representing the Langfuse trace ID.
Example:
# Generate a random trace ID trace_id = langfuse.create_trace_id() # Generate a deterministic ID based on a seed session_trace_id = langfuse.create_trace_id(seed="session-456") # Correlate an external ID with a Langfuse trace ID external_id = "external-system-123456" correlated_trace_id = langfuse.create_trace_id(seed=external_id) # Use the ID with trace context with langfuse.start_as_current_observation( name="process-request", trace_context={"trace_id": trace_id} ) as span: # Operation will be part of the specific trace pass
def create_score(
    self,
    *,
    name: str,
    value: Union[float, str],
    session_id: Optional[str] = None,
    dataset_run_id: Optional[str] = None,
    trace_id: Optional[str] = None,
    observation_id: Optional[str] = None,
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
    timestamp: Optional[datetime] = None,
) -> None:
    """Create a score for a specific trace or observation.

    Scores can be used to track quality metrics, user feedback, or automated
    evaluations. The call is a silent no-op when tracing is disabled. Errors
    raised while building or enqueueing the score are logged, not propagated.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
        session_id: ID of the Langfuse session to associate the score with
        dataset_run_id: ID of the Langfuse dataset run to associate the score with
        trace_id: ID of the Langfuse trace to associate the score with
        observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score
        timestamp: Optional timestamp for the score (defaults to current UTC time)

    Example:
        ```python
        # Create a numeric score for accuracy
        langfuse.create_score(
            name="accuracy",
            value=0.92,
            trace_id="abcdef1234567890abcdef1234567890",
            data_type="NUMERIC",
            comment="High accuracy with minor irrelevant details"
        )

        # Create a categorical score for sentiment
        langfuse.create_score(
            name="sentiment",
            value="positive",
            trace_id="abcdef1234567890abcdef1234567890",
            observation_id="abcdef1234567890",
            data_type="CATEGORICAL"
        )
        ```
    """
    if not self._tracing_enabled:
        return

    resolved_score_id = score_id or self._create_observation_id()

    try:
        score_body = ScoreBody(
            id=resolved_score_id,
            sessionId=session_id,
            datasetRunId=dataset_run_id,
            traceId=trace_id,
            observationId=observation_id,
            name=name,
            value=value,
            dataType=data_type,  # type: ignore
            comment=comment,
            configId=config_id,
            environment=self._environment,
            metadata=metadata,
        )

        score_event = {
            "id": self.create_trace_id(),
            "type": "score-create",
            "timestamp": timestamp or _get_timestamp(),
            "body": score_body,
        }

        if self._resources is None:
            return

        # Force the score to be in sample if it was for a legacy trace ID,
        # i.e. non-32 hexchar, or if no trace ID was given at all.
        force_sample = not trace_id or not self._is_valid_trace_id(trace_id)

        self._resources.add_score_task(score_event, force_sample=force_sample)

    except Exception as e:
        langfuse_logger.exception(
            f"Error creating score: Failed to process score event for trace_id={trace_id}, name={name}. Error: {e}"
        )
Create a score for a specific trace or observation.
This method creates a score for evaluating a Langfuse trace or observation. Scores can be used to track quality metrics, user feedback, or automated evaluations.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
- session_id: ID of the Langfuse session to associate the score with
- dataset_run_id: ID of the Langfuse dataset run to associate the score with
- trace_id: ID of the Langfuse trace to associate the score with
- observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
- timestamp: Optional timestamp for the score (defaults to current UTC time)
Example:
# Create a numeric score for accuracy langfuse.create_score( name="accuracy", value=0.92, trace_id="abcdef1234567890abcdef1234567890", data_type="NUMERIC", comment="High accuracy with minor irrelevant details" ) # Create a categorical score for sentiment langfuse.create_score( name="sentiment", value="positive", trace_id="abcdef1234567890abcdef1234567890", observation_id="abcdef1234567890", data_type="CATEGORICAL" )
def score_current_span(
    self,
    *,
    name: str,
    value: Union[float, str],
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None:
    """Create a score for the current active span.

    Reads the trace ID and span ID from the span that is active in the context
    and forwards them to ``create_score``, so the caller does not need to know
    either ID. The call does nothing when there is no active span.

    Args:
        name: Name of the score (e.g., "relevance", "accuracy")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score

    Example:
        ```python
        with langfuse.start_as_current_generation(name="answer-query") as generation:
            response = generate_answer(...)
            generation.update(output=response)

            # Score the generation
            langfuse.score_current_span(
                name="relevance",
                value=0.85,
                data_type="NUMERIC",
                comment="Mostly relevant but contains some tangential information",
                metadata={"model": "gpt-4", "prompt_version": "v2"}
            )
        ```
    """
    active_span = self._get_current_otel_span()
    if active_span is None:
        return

    trace_id = self._get_otel_trace_id(active_span)
    observation_id = self._get_otel_span_id(active_span)

    langfuse_logger.info(
        f"Score: Creating score name='{name}' value={value} for current span ({observation_id}) in trace {trace_id}"
    )

    # The casts only satisfy the type checker; values are passed through unchanged.
    typed_value = cast(str, value)
    typed_data_type = cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type)

    self.create_score(
        trace_id=trace_id,
        observation_id=observation_id,
        name=name,
        value=typed_value,
        score_id=score_id,
        data_type=typed_data_type,
        comment=comment,
        config_id=config_id,
        metadata=metadata,
    )
Create a score for the current active span.
This method scores the currently active span in the context. It's a convenient way to score the current operation without needing to know its trace and span IDs.
Arguments:
- name: Name of the score (e.g., "relevance", "accuracy")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_generation(name="answer-query") as generation: # Generate answer response = generate_answer(...) generation.update(output=response) # Score the generation langfuse.score_current_span( name="relevance", value=0.85, data_type="NUMERIC", comment="Mostly relevant but contains some tangential information", metadata={"model": "gpt-4", "prompt_version": "v2"} )
def score_current_trace(
    self,
    *,
    name: str,
    value: Union[float, str],
    score_id: Optional[str] = None,
    data_type: Optional[ScoreDataType] = None,
    comment: Optional[str] = None,
    config_id: Optional[str] = None,
    metadata: Optional[Any] = None,
) -> None:
    """Create a score for the trace of the current active span.

    Unlike ``score_current_span``, the score is associated with the entire
    trace rather than a specific span, which suits scoring the overall quality
    of an operation. The call does nothing when there is no active span.

    Args:
        name: Name of the score (e.g., "user_satisfaction", "overall_quality")
        value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
        score_id: Optional custom ID for the score (auto-generated if not provided)
        data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
        comment: Optional comment or explanation for the score
        config_id: Optional ID of a score config defined in Langfuse
        metadata: Optional metadata to be attached to the score

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-user-request") as span:
            result = process_complete_request()
            span.update(output=result)

            # Score the overall trace
            langfuse.score_current_trace(
                name="overall_quality",
                value=0.95,
                data_type="NUMERIC",
                comment="High quality end-to-end response",
                metadata={"evaluator": "gpt-4", "criteria": "comprehensive"}
            )
        ```
    """
    active_span = self._get_current_otel_span()
    if active_span is None:
        return

    trace_id = self._get_otel_trace_id(active_span)

    langfuse_logger.info(
        f"Score: Creating score name='{name}' value={value} for entire trace {trace_id}"
    )

    # The casts only satisfy the type checker; values are passed through unchanged.
    typed_value = cast(str, value)
    typed_data_type = cast(Literal["CATEGORICAL", "TEXT", "CORRECTION"], data_type)

    self.create_score(
        trace_id=trace_id,
        name=name,
        value=typed_value,
        score_id=score_id,
        data_type=typed_data_type,
        comment=comment,
        config_id=config_id,
        metadata=metadata,
    )
Create a score for the current trace.
This method scores the trace of the currently active span. Unlike score_current_span, this method associates the score with the entire trace rather than a specific span. It's useful for scoring overall performance or quality of the entire operation.
Arguments:
- name: Name of the score (e.g., "user_satisfaction", "overall_quality")
- value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT/CORRECTION)
- score_id: Optional custom ID for the score (auto-generated if not provided)
- data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, or CORRECTION)
- comment: Optional comment or explanation for the score
- config_id: Optional ID of a score config defined in Langfuse
- metadata: Optional metadata to be attached to the score
Example:
with langfuse.start_as_current_observation(name="process-user-request") as span: # Process request result = process_complete_request() span.update(output=result) # Score the overall trace langfuse.score_current_trace( name="overall_quality", value=0.95, data_type="NUMERIC", comment="High quality end-to-end response", metadata={"evaluator": "gpt-4", "criteria": "comprehensive"} )
def flush(self) -> None:
    """Force flush all pending spans and events to the Langfuse API.

    Useful when you want to ensure all data is sent before proceeding, without
    waiting for the automatic flush interval. The call does nothing when the
    client has no resource manager attached.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="operation") as span:
            # Do work...
            pass

        # Ensure all data is sent to Langfuse before proceeding
        langfuse.flush()
        ```
    """
    resources = self._resources
    if resources is None:
        return

    resources.flush()
Force flush all pending spans and events to the Langfuse API.
This method manually flushes any pending spans, scores, and other events to the Langfuse API. It's useful in scenarios where you want to ensure all data is sent before proceeding, without waiting for the automatic flush interval.
Example:
# Record some spans and scores with langfuse.start_as_current_observation(name="operation") as span: # Do work... pass # Ensure all data is sent to Langfuse before proceeding langfuse.flush() # Continue with other work
def shutdown(self) -> None:
    """Shut down the Langfuse client and flush all pending data.

    Call this when your application is shutting down to prevent data loss and
    resource leaks. For most applications, using the client as a context
    manager or relying on the automatic shutdown via atexit is sufficient.
    The call does nothing when the client has no resource manager attached.

    Example:
        ```python
        langfuse = Langfuse(public_key="...", secret_key="...")

        # Use Langfuse throughout your application
        # ...

        # When application is shutting down
        langfuse.shutdown()
        ```
    """
    resources = self._resources
    if resources is None:
        return

    resources.shutdown()
Shut down the Langfuse client and flush all pending data.
This method cleanly shuts down the Langfuse client, ensuring all pending data is flushed to the API and all background threads are properly terminated.
It's important to call this method when your application is shutting down to prevent data loss and resource leaks. For most applications, using the client as a context manager or relying on the automatic shutdown via atexit is sufficient.
Example:
# Initialize Langfuse langfuse = Langfuse(public_key="...", secret_key="...") # Use Langfuse throughout your application # ... # When application is shutting down langfuse.shutdown()
def get_current_trace_id(self) -> Optional[str]:
    """Get the trace ID of the current active span.

    The ID can be used for referencing the trace in logs or external systems,
    or for creating related operations.

    Returns:
        The current trace ID as a 32-character lowercase hexadecimal string,
        or None if tracing is disabled or there is no active span.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-request") as span:
            trace_id = langfuse.get_current_trace_id()

            # Use it for external correlation
            log.info(f"Processing request with trace_id: {trace_id}")
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: get_current_trace_id - Tracing is disabled or client is in no-op mode."
        )
        return None

    active_span = self._get_current_otel_span()
    if not active_span:
        return None

    return self._get_otel_trace_id(active_span)
Get the trace ID of the current active span.
This method retrieves the trace ID from the currently active span in the context. It can be used to get the trace ID for referencing in logs, external systems, or for creating related operations.
Returns:
The current trace ID as a 32-character lowercase hexadecimal string, or None if there is no active span.
Example:
with langfuse.start_as_current_observation(name="process-request") as span: # Get the current trace ID for reference trace_id = langfuse.get_current_trace_id() # Use it for external correlation log.info(f"Processing request with trace_id: {trace_id}") # Or pass to another system external_system.process(data, trace_id=trace_id)
def get_current_observation_id(self) -> Optional[str]:
    """Get the observation ID (span ID) of the current active span.

    The ID can be used for referencing the observation in logs or external
    systems, or for creating scores or other related operations.

    Returns:
        The current observation ID as a 16-character lowercase hexadecimal string,
        or None if tracing is disabled or there is no active span.

    Example:
        ```python
        with langfuse.start_as_current_observation(name="process-user-query") as span:
            observation_id = langfuse.get_current_observation_id()

            # Store it for later reference
            cache.set(f"query_{query_id}_observation", observation_id)
        ```
    """
    if not self._tracing_enabled:
        langfuse_logger.debug(
            "Operation skipped: get_current_observation_id - Tracing is disabled or client is in no-op mode."
        )
        return None

    active_span = self._get_current_otel_span()
    if not active_span:
        return None

    return self._get_otel_span_id(active_span)
Get the observation ID (span ID) of the current active span.
This method retrieves the observation ID from the currently active span in the context. It can be used to get the observation ID for referencing in logs, external systems, or for creating scores or other related operations.
Returns:
The current observation ID as a 16-character lowercase hexadecimal string, or None if there is no active span.
Example:
with langfuse.start_as_current_observation(name="process-user-query") as span: # Get the current observation ID observation_id = langfuse.get_current_observation_id() # Store it for later reference cache.set(f"query_{query_id}_observation", observation_id) # Process the query...
def get_trace_url(self, *, trace_id: Optional[str] = None) -> Optional[str]:
    """Get the URL to view a trace in the Langfuse UI.

    Useful for providing links in logs, notifications, or debugging tools.

    Args:
        trace_id: Optional trace ID to generate a URL for. If not provided,
            the trace ID of the current active span will be used.

    Returns:
        A URL string pointing to the trace in the Langfuse UI,
        or None if the project ID couldn't be retrieved or no trace ID is available.

    Example:
        ```python
        # Get URL for the current trace
        with langfuse.start_as_current_observation(name="process-request") as span:
            trace_url = langfuse.get_trace_url()
            log.info(f"Processing trace: {trace_url}")

        # Get URL for a specific trace
        specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
        send_notification(f"Review needed for trace: {specific_trace_url}")
        ```
    """
    final_trace_id = trace_id or self.get_current_trace_id()
    if not final_trace_id:
        # Skip the project lookup when there is nothing to link to.
        return None

    project_id = self._get_project_id()
    if not project_id:
        return None

    return f"{self._base_url}/project/{project_id}/traces/{final_trace_id}"
Get the URL to view a trace in the Langfuse UI.
This method generates a URL that links directly to a trace in the Langfuse UI. It's useful for providing links in logs, notifications, or debugging tools.
Arguments:
- trace_id: Optional trace ID to generate a URL for. If not provided, the trace ID of the current active span will be used.
Returns:
A URL string pointing to the trace in the Langfuse UI, or None if the project ID couldn't be retrieved or no trace ID is available.
Example:
```python
# Get URL for the current trace
with langfuse.start_as_current_observation(name="process-request") as span:
    trace_url = langfuse.get_trace_url()
    log.info(f"Processing trace: {trace_url}")

# Get URL for a specific trace
specific_trace_url = langfuse.get_trace_url(trace_id="1234567890abcdef1234567890abcdef")
send_notification(f"Review needed for trace: {specific_trace_url}")
```
def get_dataset(
    self,
    name: str,
    *,
    fetch_items_page_size: Optional[int] = 50,
    version: Optional[datetime] = None,
) -> "DatasetClient":
    """Load a dataset and all of its items by dataset name.

    The dataset record is fetched first; its items are then pulled page by
    page until the API reports that the last page has been reached.

    Args:
        name (str): Name of the dataset to load.
        fetch_items_page_size (Optional[int]): Page size used while fetching
            the dataset items. Defaults to 50.
        version (Optional[datetime]): Point in time (UTC) at which to read the
            dataset items. When given, the items are returned as they existed
            at that timestamp; when omitted, the latest version is returned.
            Must be a timezone-aware datetime object in UTC.

    Returns:
        DatasetClient: The dataset with the given name, including its items.

    Raises:
        Error: API errors are passed to `handle_fern_exception` and re-raised.
    """
    try:
        langfuse_logger.debug(f"Getting datasets {name}")
        dataset = self.api.datasets.get(dataset_name=self._url_encode(name))

        collected_items = []
        current_page = 1
        has_more = True

        while has_more:
            batch = self.api.dataset_items.list(
                dataset_name=self._url_encode(name, is_url_param=True),
                page=current_page,
                limit=fetch_items_page_size,
                version=version,
            )
            collected_items.extend(batch.data)

            # Stop once the page just fetched is the last one reported.
            has_more = batch.meta.total_pages > current_page
            current_page += 1

        return DatasetClient(
            dataset=dataset,
            items=collected_items,
            version=version,
            langfuse_client=self,
        )

    except Error as exc:
        handle_fern_exception(exc)
        raise exc
Fetch a dataset by its name.
Arguments:
- name (str): The name of the dataset to fetch.
- fetch_items_page_size (Optional[int]): All items of the dataset will be fetched in chunks of this size. Defaults to 50.
- version (Optional[datetime]): Retrieve dataset items as they existed at this specific point in time (UTC). If provided, returns the state of items at the specified UTC timestamp. If not provided, returns the latest version. Must be a timezone-aware datetime object in UTC.
Returns:
DatasetClient: The dataset with the given name.
def get_dataset_run(
    self, *, dataset_name: str, run_name: str
) -> DatasetRunWithItems:
    """Look up a single dataset run, identified by dataset name and run name.

    Args:
        dataset_name (str): Name of the dataset the run belongs to.
        run_name (str): Name of the run to fetch.

    Returns:
        DatasetRunWithItems: The dataset run together with its items.

    Raises:
        Error: API errors are passed to `handle_fern_exception` and re-raised.
    """
    try:
        encoded_dataset_name = self._url_encode(dataset_name)
        encoded_run_name = self._url_encode(run_name)
        dataset_run = self.api.datasets.get_run(
            dataset_name=encoded_dataset_name,
            run_name=encoded_run_name,
            request_options=None,
        )
    except Error as exc:
        handle_fern_exception(exc)
        raise exc

    return cast(DatasetRunWithItems, dataset_run)
Fetch a dataset run by dataset name and run name.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DatasetRunWithItems: The dataset run with its items.
def get_dataset_runs(
    self,
    *,
    dataset_name: str,
    page: Optional[int] = None,
    limit: Optional[int] = None,
) -> PaginatedDatasetRuns:
    """List the runs of a dataset, one page at a time.

    Args:
        dataset_name (str): Name of the dataset whose runs are listed.
        page (Optional[int]): Page number, starts at 1.
        limit (Optional[int]): Limit of items per page.

    Returns:
        PaginatedDatasetRuns: Paginated list of dataset runs.

    Raises:
        Error: API errors are passed to `handle_fern_exception` and re-raised.
    """
    try:
        encoded_dataset_name = self._url_encode(dataset_name)
        runs_page = self.api.datasets.get_runs(
            dataset_name=encoded_dataset_name,
            page=page,
            limit=limit,
            request_options=None,
        )
    except Error as exc:
        handle_fern_exception(exc)
        raise exc

    return cast(PaginatedDatasetRuns, runs_page)
Fetch all runs for a dataset.
Arguments:
- dataset_name (str): The name of the dataset.
- page (Optional[int]): Page number, starts at 1.
- limit (Optional[int]): Limit of items per page.
Returns:
PaginatedDatasetRuns: Paginated list of dataset runs.
def delete_dataset_run(
    self, *, dataset_name: str, run_name: str
) -> DeleteDatasetRunResponse:
    """Remove a dataset run together with all of its run items.

    This action is irreversible.

    Args:
        dataset_name (str): Name of the dataset the run belongs to.
        run_name (str): Name of the run to delete.

    Returns:
        DeleteDatasetRunResponse: Confirmation of deletion.

    Raises:
        Error: API errors are passed to `handle_fern_exception` and re-raised.
    """
    try:
        encoded_dataset_name = self._url_encode(dataset_name)
        encoded_run_name = self._url_encode(run_name)
        deletion = self.api.datasets.delete_run(
            dataset_name=encoded_dataset_name,
            run_name=encoded_run_name,
            request_options=None,
        )
    except Error as exc:
        handle_fern_exception(exc)
        raise exc

    return cast(DeleteDatasetRunResponse, deletion)
Delete a dataset run and all its run items. This action is irreversible.
Arguments:
- dataset_name (str): The name of the dataset.
- run_name (str): The name of the run.
Returns:
DeleteDatasetRunResponse: Confirmation of deletion.
def run_experiment(
    self,
    *,
    name: str,
    run_name: Optional[str] = None,
    description: Optional[str] = None,
    data: ExperimentData,
    task: TaskFunction,
    evaluators: Optional[List[EvaluatorFunction]] = None,
    composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
    run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
    max_concurrency: int = 50,
    metadata: Optional[Dict[str, str]] = None,
    _dataset_version: Optional[datetime] = None,
) -> ExperimentResult:
    """Run an experiment on a dataset with automatic tracing and evaluation.

    This method executes a task function on each item in the provided dataset,
    automatically traces all executions with Langfuse for observability, runs
    item-level and run-level evaluators on the outputs, and returns comprehensive
    results with evaluation metrics.

    The experiment system provides:
    - Automatic tracing of all task executions
    - Concurrent processing with configurable limits
    - Comprehensive error handling that isolates failures
    - Integration with Langfuse datasets for experiment tracking
    - Flexible evaluation framework supporting both sync and async evaluators

    Args:
        name: Human-readable name for the experiment. Used for identification
            in the Langfuse UI.
        run_name: Optional exact name for the experiment run. If provided, this will be
            used as the exact dataset run name if the `data` contains Langfuse dataset items.
            If not provided, this will default to the experiment name appended with an ISO timestamp.
        description: Optional description explaining the experiment's purpose,
            methodology, or expected outcomes.
        data: Array of data items to process. Can be either:
            - List of dict-like items with 'input', 'expected_output', 'metadata' keys
            - List of Langfuse DatasetItem objects from dataset.items
        task: Function that processes each data item and returns output.
            Must accept 'item' as keyword argument and can return sync or async results.
            The task function signature should be: task(*, item, **kwargs) -> Any
        evaluators: List of functions to evaluate each item's output individually.
            Each evaluator receives input, output, expected_output, and metadata.
            Can return single Evaluation dict or list of Evaluation dicts.
            Defaults to None, which is treated as an empty list.
        composite_evaluator: Optional function that creates composite scores from item-level evaluations.
            Receives the same inputs as item-level evaluators (input, output, expected_output, metadata)
            plus the list of evaluations from item-level evaluators. Useful for weighted averages,
            pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
        run_evaluators: List of functions to evaluate the entire experiment run.
            Each run evaluator receives all item_results and can compute aggregate metrics.
            Useful for calculating averages, distributions, or cross-item comparisons.
            Defaults to None, which is treated as an empty list.
        max_concurrency: Maximum number of concurrent task executions (default: 50).
            Controls the number of items processed simultaneously. Adjust based on
            API rate limits and system resources.
        metadata: Optional metadata dictionary to attach to all experiment traces.
            This metadata will be included in every trace created during the experiment.
            If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
        _dataset_version: Private parameter, forwarded unchanged as `dataset_version`
            to the async implementation. Not intended for direct use.

    Returns:
        ExperimentResult containing:
        - run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
        - item_results: List of results for each processed item with outputs and evaluations
        - run_evaluations: List of aggregate evaluation results for the entire run
        - experiment_id: Stable identifier for the experiment run across all items
        - dataset_run_id: ID of the dataset run (if using Langfuse datasets)
        - dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)

    Raises:
        ValueError: If required parameters are missing or invalid
        Exception: If experiment setup fails (individual item failures are handled gracefully)

    Examples:
        Basic experiment with local data:
        ```python
        def summarize_text(*, item, **kwargs):
            return f"Summary: {item['input'][:50]}..."

        def length_evaluator(*, input, output, expected_output=None, **kwargs):
            return {
                "name": "output_length",
                "value": len(output),
                "comment": f"Output contains {len(output)} characters"
            }

        result = langfuse.run_experiment(
            name="Text Summarization Test",
            description="Evaluate summarization quality and length",
            data=[
                {"input": "Long article text...", "expected_output": "Expected summary"},
                {"input": "Another article...", "expected_output": "Another summary"}
            ],
            task=summarize_text,
            evaluators=[length_evaluator]
        )

        print(f"Processed {len(result.item_results)} items")
        for item_result in result.item_results:
            print(f"Input: {item_result.item['input']}")
            print(f"Output: {item_result.output}")
            print(f"Evaluations: {item_result.evaluations}")
        ```

        Advanced experiment with async task and multiple evaluators:
        ```python
        async def llm_task(*, item, **kwargs):
            # Simulate async LLM call
            response = await openai_client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": item["input"]}]
            )
            return response.choices[0].message.content

        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
            if expected_output and expected_output.lower() in output.lower():
                return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
            return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

        def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
            # Simulate toxicity check
            toxicity_score = check_toxicity(output)  # Your toxicity checker
            return {
                "name": "toxicity",
                "value": toxicity_score,
                "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
            }

        def average_accuracy(*, item_results, **kwargs):
            accuracies = [
                evaluation.value for result in item_results
                for evaluation in result.evaluations
                if evaluation.name == "accuracy"
            ]
            return {
                "name": "average_accuracy",
                "value": sum(accuracies) / len(accuracies) if accuracies else 0,
                "comment": f"Average accuracy across {len(accuracies)} items"
            }

        result = langfuse.run_experiment(
            name="LLM Safety and Accuracy Test",
            description="Evaluate model accuracy and safety across diverse prompts",
            data=test_dataset,  # Your dataset items
            task=llm_task,
            evaluators=[accuracy_evaluator, toxicity_evaluator],
            run_evaluators=[average_accuracy],
            max_concurrency=5,  # Limit concurrent API calls
            metadata={"model": "gpt-4", "temperature": 0.7}
        )
        ```

        Using with Langfuse datasets:
        ```python
        # Get dataset from Langfuse
        dataset = langfuse.get_dataset("my-eval-dataset")

        result = dataset.run_experiment(
            name="Production Model Evaluation",
            description="Monthly evaluation of production model performance",
            task=my_production_task,
            evaluators=[accuracy_evaluator, latency_evaluator]
        )

        # Results automatically linked to dataset in Langfuse UI
        print(f"View results: {result.dataset_run_url}")
        ```

    Note:
        - Task and evaluator functions can be either synchronous or asynchronous
        - Individual item failures are logged but don't stop the experiment
        - All executions are automatically traced and visible in Langfuse UI
        - When using Langfuse datasets, results are automatically linked for easy comparison
        - This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
        - Async execution is handled automatically with smart event loop detection
    """
    # Resolve the run name up front so the async implementation always
    # receives a concrete value.
    resolved_run_name = self._create_experiment_run_name(
        name=name, run_name=run_name
    )

    return cast(
        ExperimentResult,
        run_async_safely(
            self._run_experiment_async(
                name=name,
                run_name=resolved_run_name,
                description=description,
                data=data,
                task=task,
                # None (the default) and empty lists both become a fresh list.
                evaluators=evaluators or [],
                composite_evaluator=composite_evaluator,
                run_evaluators=run_evaluators or [],
                max_concurrency=max_concurrency,
                metadata=metadata,
                dataset_version=_dataset_version,
            ),
        ),
    )
Run an experiment on a dataset with automatic tracing and evaluation.
This method executes a task function on each item in the provided dataset, automatically traces all executions with Langfuse for observability, runs item-level and run-level evaluators on the outputs, and returns comprehensive results with evaluation metrics.
The experiment system provides:
- Automatic tracing of all task executions
- Concurrent processing with configurable limits
- Comprehensive error handling that isolates failures
- Integration with Langfuse datasets for experiment tracking
- Flexible evaluation framework supporting both sync and async evaluators
Arguments:
- name: Human-readable name for the experiment. Used for identification in the Langfuse UI.
- run_name: Optional exact name for the experiment run. If provided, this will be used as the exact dataset run name if the `data` contains Langfuse dataset items. If not provided, this will default to the experiment name appended with an ISO timestamp.
- description: Optional description explaining the experiment's purpose, methodology, or expected outcomes.
- data: Array of data items to process. Can be either:
- List of dict-like items with 'input', 'expected_output', 'metadata' keys
- List of Langfuse DatasetItem objects from dataset.items
- task: Function that processes each data item and returns output. Must accept 'item' as keyword argument and can return sync or async results. The task function signature should be: task(*, item, **kwargs) -> Any
- evaluators: List of functions to evaluate each item's output individually. Each evaluator receives input, output, expected_output, and metadata. Can return single Evaluation dict or list of Evaluation dicts.
- composite_evaluator: Optional function that creates composite scores from item-level evaluations. Receives the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations from item-level evaluators. Useful for weighted averages, pass/fail decisions based on multiple criteria, or custom scoring logic combining multiple metrics.
- run_evaluators: List of functions to evaluate the entire experiment run. Each run evaluator receives all item_results and can compute aggregate metrics. Useful for calculating averages, distributions, or cross-item comparisons.
- max_concurrency: Maximum number of concurrent task executions (default: 50). Controls the number of items processed simultaneously. Adjust based on API rate limits and system resources.
- metadata: Optional metadata dictionary to attach to all experiment traces. This metadata will be included in every trace created during the experiment. If `data` are Langfuse dataset items, the metadata will be attached to the dataset run, too.
Returns:
ExperimentResult containing:
- run_name: The experiment run name. This is equal to the dataset run name if experiment was on Langfuse dataset.
- item_results: List of results for each processed item with outputs and evaluations
- run_evaluations: List of aggregate evaluation results for the entire run
- experiment_id: Stable identifier for the experiment run across all items
- dataset_run_id: ID of the dataset run (if using Langfuse datasets)
- dataset_run_url: Direct URL to view results in Langfuse UI (if applicable)
Raises:
- ValueError: If required parameters are missing or invalid
- Exception: If experiment setup fails (individual item failures are handled gracefully)
Examples:
Basic experiment with local data:
```python
def summarize_text(*, item, **kwargs):
    return f"Summary: {item['input'][:50]}..."

def length_evaluator(*, input, output, expected_output=None, **kwargs):
    return {
        "name": "output_length",
        "value": len(output),
        "comment": f"Output contains {len(output)} characters"
    }

result = langfuse.run_experiment(
    name="Text Summarization Test",
    description="Evaluate summarization quality and length",
    data=[
        {"input": "Long article text...", "expected_output": "Expected summary"},
        {"input": "Another article...", "expected_output": "Another summary"}
    ],
    task=summarize_text,
    evaluators=[length_evaluator]
)

print(f"Processed {len(result.item_results)} items")
for item_result in result.item_results:
    print(f"Input: {item_result.item['input']}")
    print(f"Output: {item_result.output}")
    print(f"Evaluations: {item_result.evaluations}")
```
Advanced experiment with async task and multiple evaluators:
```python
async def llm_task(*, item, **kwargs):
    # Simulate async LLM call
    response = await openai_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": item["input"]}]
    )
    return response.choices[0].message.content

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if expected_output and expected_output.lower() in output.lower():
        return {"name": "accuracy", "value": 1.0, "comment": "Correct answer"}
    return {"name": "accuracy", "value": 0.0, "comment": "Incorrect answer"}

def toxicity_evaluator(*, input, output, expected_output=None, **kwargs):
    # Simulate toxicity check
    toxicity_score = check_toxicity(output)  # Your toxicity checker
    return {
        "name": "toxicity",
        "value": toxicity_score,
        "comment": f"Toxicity level: {'high' if toxicity_score > 0.7 else 'low'}"
    }

def average_accuracy(*, item_results, **kwargs):
    accuracies = [
        eval.value for result in item_results
        for eval in result.evaluations
        if eval.name == "accuracy"
    ]
    return {
        "name": "average_accuracy",
        "value": sum(accuracies) / len(accuracies) if accuracies else 0,
        "comment": f"Average accuracy across {len(accuracies)} items"
    }

result = langfuse.run_experiment(
    name="LLM Safety and Accuracy Test",
    description="Evaluate model accuracy and safety across diverse prompts",
    data=test_dataset,  # Your dataset items
    task=llm_task,
    evaluators=[accuracy_evaluator, toxicity_evaluator],
    run_evaluators=[average_accuracy],
    max_concurrency=5,  # Limit concurrent API calls
    metadata={"model": "gpt-4", "temperature": 0.7}
)
```
Using with Langfuse datasets:
```python
# Get dataset from Langfuse
dataset = langfuse.get_dataset("my-eval-dataset")

result = dataset.run_experiment(
    name="Production Model Evaluation",
    description="Monthly evaluation of production model performance",
    task=my_production_task,
    evaluators=[accuracy_evaluator, latency_evaluator]
)

# Results automatically linked to dataset in Langfuse UI
print(f"View results: {result['dataset_run_url']}")
```
Note:
- Task and evaluator functions can be either synchronous or asynchronous
- Individual item failures are logged but don't stop the experiment
- All executions are automatically traced and visible in Langfuse UI
- When using Langfuse datasets, results are automatically linked for easy comparison
- This method works in both sync and async contexts (Jupyter notebooks, web apps, etc.)
- Async execution is handled automatically with smart event loop detection
def run_batched_evaluation(
    self,
    *,
    scope: Literal["traces", "observations"],
    mapper: MapperFunction,
    filter: Optional[str] = None,
    fetch_batch_size: int = 50,
    fetch_trace_fields: Optional[str] = None,
    max_items: Optional[int] = None,
    max_retries: int = 3,
    evaluators: List[EvaluatorFunction],
    composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
    max_concurrency: int = 5,
    metadata: Optional[Dict[str, Any]] = None,
    _add_observation_scores_to_trace: bool = False,
    _additional_trace_tags: Optional[List[str]] = None,
    resume_from: Optional[BatchEvaluationResumeToken] = None,
    verbose: bool = False,
) -> BatchEvaluationResult:
    """Evaluate existing traces or observations in batches and score them.

    Items matching the filter are fetched from Langfuse, converted into
    evaluator inputs by the mapper, run through every evaluator, and the
    resulting scores are linked back to the original entities. Typical uses:

    - Running evaluations on production traces after deployment
    - Backtesting new evaluation metrics on historical data
    - Batch scoring of observations for quality monitoring
    - Periodic evaluation runs on recent data

    Items are processed in batches using a streaming/pipeline approach, which
    keeps memory usage bounded for large datasets. Error handling, retry logic,
    and a resume capability are included for long-running evaluations.

    Args:
        scope: The type of items to evaluate. Must be one of:
            - "traces": Evaluate complete traces with all their observations
            - "observations": Evaluate individual observations (spans, generations, events)
        mapper: Function that transforms API response objects into evaluator inputs.
            Receives a trace/observation object and returns an EvaluatorInputs
            instance with input, output, expected_output, and metadata fields.
            Can be sync or async.
        filter: Optional JSON filter string for querying items (same format as
            the Langfuse API). Examples:
            - '{"tags": ["production"]}'
            - '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}'
            Default: None (fetches all items).
        fetch_batch_size: Number of items to fetch per API call and hold in memory.
            Larger values may be faster but use more memory. Default: 50.
        fetch_trace_fields: Comma-separated list of fields to include when fetching
            traces. Available field groups: 'core' (always included), 'io' (input,
            output, metadata), 'scores', 'observations', 'metrics'. If not specified,
            all fields are returned. Example: 'core,scores,metrics'. Note: Excluded
            'observations' or 'scores' fields return empty arrays; excluded 'metrics'
            returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
        max_items: Maximum total number of items to process. If None, processes all
            items matching the filter. Useful for testing or limiting evaluation runs.
            Default: None (process all).
        max_retries: Maximum number of retry attempts for failed batch fetches.
            Uses exponential backoff (1s, 2s, 4s). Default: 3.
        evaluators: List of evaluation functions to run on each item. Each evaluator
            receives the mapped inputs and returns Evaluation object(s). Evaluator
            failures are logged but don't stop the batch evaluation.
        composite_evaluator: Optional function that creates a composite score from
            item-level evaluations. Receives the original item and its evaluations,
            returns a single Evaluation. Useful for weighted averages or combined
            metrics. Default: None.
        max_concurrency: Maximum number of items to evaluate concurrently. Controls
            parallelism and resource usage. Default: 5.
        metadata: Optional metadata dict to add to all created scores. Useful for
            tracking evaluation runs, versions, or other context. Default: None.
        _add_observation_scores_to_trace: Private flag, forwarded unchanged to the
            batch evaluation runner. Default: False.
            NOTE(review): exact effect is defined by the runner - confirm there.
        _additional_trace_tags: Private option, forwarded unchanged to the batch
            evaluation runner. Default: None.
            NOTE(review): exact effect is defined by the runner - confirm there.
        resume_from: Optional resume token from a previous incomplete run. Allows
            continuing evaluation after interruption or failure. Default: None.
        verbose: If True, logs progress information to console. Useful for
            monitoring long-running evaluations. Default: False.

    Returns:
        BatchEvaluationResult containing:
        - total_items_fetched: Number of items fetched from API
        - total_items_processed: Number of items successfully evaluated
        - total_items_failed: Number of items that failed evaluation
        - total_scores_created: Scores created by item-level evaluators
        - total_composite_scores_created: Scores created by composite evaluator
        - total_evaluations_failed: Individual evaluator failures
        - evaluator_stats: Per-evaluator statistics (success rate, scores created)
        - resume_token: Token for resuming if incomplete (None if completed)
        - completed: True if all items processed
        - duration_seconds: Total execution time
        - failed_item_ids: IDs of items that failed
        - error_summary: Error types and counts
        - has_more_items: True if max_items reached but more exist

    Raises:
        ValueError: If invalid scope is provided.

    Examples:
        Basic trace evaluation:
        ```python
        from langfuse import Langfuse, EvaluatorInputs, Evaluation

        client = Langfuse()

        # Define mapper to extract fields from traces
        def trace_mapper(trace):
            return EvaluatorInputs(
                input=trace.input,
                output=trace.output,
                expected_output=None,
                metadata={"trace_id": trace.id}
            )

        # Define evaluator
        def length_evaluator(*, input, output, expected_output, metadata):
            return Evaluation(
                name="output_length",
                value=len(output) if output else 0
            )

        # Run batch evaluation
        result = client.run_batched_evaluation(
            scope="traces",
            mapper=trace_mapper,
            evaluators=[length_evaluator],
            filter='{"tags": ["production"]}',
            max_items=1000,
            verbose=True
        )

        print(f"Processed {result.total_items_processed} traces")
        print(f"Created {result.total_scores_created} scores")
        ```

        Evaluation with composite scorer:
        ```python
        def accuracy_evaluator(*, input, output, expected_output, metadata):
            # ... evaluation logic
            return Evaluation(name="accuracy", value=0.85)

        def relevance_evaluator(*, input, output, expected_output, metadata):
            # ... evaluation logic
            return Evaluation(name="relevance", value=0.92)

        def composite_evaluator(*, item, evaluations):
            # Weighted average of evaluations
            weights = {"accuracy": 0.6, "relevance": 0.4}
            total = sum(
                e.value * weights.get(e.name, 0)
                for e in evaluations
                if isinstance(e.value, (int, float))
            )
            return Evaluation(
                name="composite_score",
                value=total,
                comment=f"Weighted average of {len(evaluations)} metrics"
            )

        result = client.run_batched_evaluation(
            scope="traces",
            mapper=trace_mapper,
            evaluators=[accuracy_evaluator, relevance_evaluator],
            composite_evaluator=composite_evaluator,
            filter='{"user_id": "important_user"}',
            verbose=True
        )
        ```

        Handling incomplete runs with resume:
        ```python
        # Initial run that may fail or timeout
        result = client.run_batched_evaluation(
            scope="observations",
            mapper=obs_mapper,
            evaluators=[my_evaluator],
            max_items=10000,
            verbose=True
        )

        # Check if incomplete
        if not result.completed and result.resume_token:
            print(f"Processed {result.resume_token.items_processed} items before interruption")

            # Resume from where it left off
            result = client.run_batched_evaluation(
                scope="observations",
                mapper=obs_mapper,
                evaluators=[my_evaluator],
                resume_from=result.resume_token,
                verbose=True
            )

        print(f"Total items processed: {result.total_items_processed}")
        ```

        Monitoring evaluator performance:
        ```python
        result = client.run_batched_evaluation(...)

        for stats in result.evaluator_stats:
            success_rate = stats.successful_runs / stats.total_runs
            print(f"{stats.name}:")
            print(f"  Success rate: {success_rate:.1%}")
            print(f"  Scores created: {stats.total_scores_created}")

            if stats.failed_runs > 0:
                print(f"  ⚠️  Failed {stats.failed_runs} times")
        ```

    Note:
        - Evaluator failures are logged but don't stop the batch evaluation
        - Individual item failures are tracked but don't stop processing
        - Fetch failures are retried with exponential backoff
        - All scores are automatically flushed to Langfuse at the end
        - The resume mechanism uses timestamp-based filtering to avoid duplicates
    """
    batch_runner = BatchEvaluationRunner(self)

    # Build the coroutine first, then drive it to completion from whichever
    # context (sync or async) the caller is in.
    evaluation_coro = batch_runner.run_async(
        scope=scope,
        mapper=mapper,
        evaluators=evaluators,
        filter=filter,
        fetch_batch_size=fetch_batch_size,
        fetch_trace_fields=fetch_trace_fields,
        max_items=max_items,
        max_concurrency=max_concurrency,
        composite_evaluator=composite_evaluator,
        metadata=metadata,
        _add_observation_scores_to_trace=_add_observation_scores_to_trace,
        _additional_trace_tags=_additional_trace_tags,
        max_retries=max_retries,
        verbose=verbose,
        resume_from=resume_from,
    )
    outcome = run_async_safely(evaluation_coro)

    return cast(BatchEvaluationResult, outcome)
Fetch traces or observations and run evaluations on each item.
This method provides a powerful way to evaluate existing data in Langfuse at scale. It fetches items based on filters, transforms them using a mapper function, runs evaluators on each item, and creates scores that are linked back to the original entities. This is ideal for:
- Running evaluations on production traces after deployment
- Backtesting new evaluation metrics on historical data
- Batch scoring of observations for quality monitoring
- Periodic evaluation runs on recent data
The method uses a streaming/pipeline approach to process items in batches, making it memory-efficient for large datasets. It includes comprehensive error handling, retry logic, and resume capability for long-running evaluations.
Arguments:
- scope: The type of items to evaluate. Must be one of:
- "traces": Evaluate complete traces with all their observations
- "observations": Evaluate individual observations (spans, generations, events)
- mapper: Function that transforms API response objects into evaluator inputs. Receives a trace/observation object and returns an EvaluatorInputs instance with input, output, expected_output, and metadata fields. Can be sync or async.
- evaluators: List of evaluation functions to run on each item. Each evaluator receives the mapped inputs and returns Evaluation object(s). Evaluator failures are logged but don't stop the batch evaluation.
- filter: Optional JSON filter string for querying items (same format as Langfuse API). Examples:
- '{"tags": ["production"]}'
- '{"user_id": "user123", "timestamp": {"operator": ">", "value": "2024-01-01"}}' Default: None (fetches all items).
- fetch_batch_size: Number of items to fetch per API call and hold in memory. Larger values may be faster but use more memory. Default: 50.
- fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'.
- max_items: Maximum total number of items to process. If None, processes all items matching the filter. Useful for testing or limiting evaluation runs. Default: None (process all).
- max_concurrency: Maximum number of items to evaluate concurrently. Controls parallelism and resource usage. Default: 5.
- composite_evaluator: Optional function that creates a composite score from item-level evaluations. Receives the original item and its evaluations, returns a single Evaluation. Useful for weighted averages or combined metrics. Default: None.
- metadata: Optional metadata dict to add to all created scores. Useful for tracking evaluation runs, versions, or other context. Default: None.
- max_retries: Maximum number of retry attempts for failed batch fetches. Uses exponential backoff (1s, 2s, 4s). Default: 3.
- verbose: If True, logs progress information to console. Useful for monitoring long-running evaluations. Default: False.
- resume_from: Optional resume token from a previous incomplete run. Allows continuing evaluation after interruption or failure. Default: None.
Returns:
BatchEvaluationResult containing: - total_items_fetched: Number of items fetched from API - total_items_processed: Number of items successfully evaluated - total_items_failed: Number of items that failed evaluation - total_scores_created: Scores created by item-level evaluators - total_composite_scores_created: Scores created by composite evaluator - total_evaluations_failed: Individual evaluator failures - evaluator_stats: Per-evaluator statistics (success rate, scores created) - resume_token: Token for resuming if incomplete (None if completed) - completed: True if all items processed - duration_seconds: Total execution time - failed_item_ids: IDs of items that failed - error_summary: Error types and counts - has_more_items: True if max_items reached but more exist
Raises:
- ValueError: If invalid scope is provided.
Examples:
Basic trace evaluation:
from langfuse import Langfuse, EvaluatorInputs, Evaluation client = Langfuse() # Define mapper to extract fields from traces def trace_mapper(trace): return EvaluatorInputs( input=trace.input, output=trace.output, expected_output=None, metadata={"trace_id": trace.id} ) # Define evaluator def length_evaluator(*, input, output, expected_output, metadata): return Evaluation( name="output_length", value=len(output) if output else 0 ) # Run batch evaluation result = client.run_batched_evaluation( scope="traces", mapper=trace_mapper, evaluators=[length_evaluator], filter='{"tags": ["production"]}', max_items=1000, verbose=True ) print(f"Processed {result.total_items_processed} traces") print(f"Created {result.total_scores_created} scores")Evaluation with composite scorer:
def accuracy_evaluator(*, input, output, expected_output, metadata): # ... evaluation logic return Evaluation(name="accuracy", value=0.85) def relevance_evaluator(*, input, output, expected_output, metadata): # ... evaluation logic return Evaluation(name="relevance", value=0.92) def composite_evaluator(*, item, evaluations): # Weighted average of evaluations weights = {"accuracy": 0.6, "relevance": 0.4} total = sum( e.value * weights.get(e.name, 0) for e in evaluations if isinstance(e.value, (int, float)) ) return Evaluation( name="composite_score", value=total, comment=f"Weighted average of {len(evaluations)} metrics" ) result = client.run_batched_evaluation( scope="traces", mapper=trace_mapper, evaluators=[accuracy_evaluator, relevance_evaluator], composite_evaluator=composite_evaluator, filter='{"user_id": "important_user"}', verbose=True )Handling incomplete runs with resume:
# Initial run that may fail or timeout result = client.run_batched_evaluation( scope="observations", mapper=obs_mapper, evaluators=[my_evaluator], max_items=10000, verbose=True ) # Check if incomplete if not result.completed and result.resume_token: print(f"Processed {result.resume_token.items_processed} items before interruption") # Resume from where it left off result = client.run_batched_evaluation( scope="observations", mapper=obs_mapper, evaluators=[my_evaluator], resume_from=result.resume_token, verbose=True ) print(f"Total items processed: {result.total_items_processed}")Monitoring evaluator performance:
result = client.run_batched_evaluation(...) for stats in result.evaluator_stats: success_rate = stats.successful_runs / stats.total_runs print(f"{stats.name}:") print(f" Success rate: {success_rate:.1%}") print(f" Scores created: {stats.total_scores_created}") if stats.failed_runs > 0: print(f" ⚠️ Failed {stats.failed_runs} times")
Note:
- Evaluator failures are logged but don't stop the batch evaluation
- Individual item failures are tracked but don't stop processing
- Fetch failures are retried with exponential backoff
- All scores are automatically flushed to Langfuse at the end
- The resume mechanism uses timestamp-based filtering to avoid duplicates
def auth_check(self) -> bool:
    """Verify that the configured public and secret key are accepted by Langfuse.

    Requests the projects visible to the credentials through the API client
    and treats an empty result as a failed check.

    Returns:
        bool: True when at least one project is returned. False when the
            lookup raises an AttributeError, which is logged as the client
            not being properly initialized.

    Raises:
        Exception: If no projects were found for the provided credentials.
        Error: Re-raised after being passed to ``handle_fern_exception``.

    Note:
        This method is blocking. It is discouraged to use it in production code.
    """
    try:
        projects = self.api.projects.get()
        project_count = len(projects.data)
        langfuse_logger.debug(
            f"Auth check successful, found {project_count} projects"
        )
        if project_count == 0:
            # Valid transport, but the keys do not map to any project.
            raise Exception(
                "Auth check failed, no project found for the keys provided."
            )
    except AttributeError as e:
        # Raised when the client (or its API wrapper) is missing attributes.
        langfuse_logger.warning(
            f"Auth check failed: Client not properly initialized. Error: {e}"
        )
        return False
    except Error as e:
        handle_fern_exception(e)
        raise e

    return True
Check if the provided credentials (public and secret key) are valid.
Raises:
- Exception: If no projects were found for the provided credentials.
Note:
This method is blocking. It is discouraged to use it in production code.
def create_dataset(
    self,
    *,
    name: str,
    description: Optional[str] = None,
    metadata: Optional[Any] = None,
    input_schema: Optional[Any] = None,
    expected_output_schema: Optional[Any] = None,
) -> Dataset:
    """Create a new dataset on Langfuse.

    Args:
        name: Name of the dataset to create.
        description: Description of the dataset. Defaults to None.
        metadata: Additional metadata. Defaults to None.
        input_schema: JSON Schema for validating dataset item inputs. When set,
            all new items will be validated against this schema.
        expected_output_schema: JSON Schema for validating dataset item expected
            outputs. When set, all new items will be validated against this schema.

    Returns:
        Dataset: The created dataset as returned by the Langfuse API.

    Raises:
        Error: Re-raised after being passed to ``handle_fern_exception``.
    """
    payload = {
        "name": name,
        "description": description,
        "metadata": metadata,
        "input_schema": input_schema,
        "expected_output_schema": expected_output_schema,
    }

    try:
        langfuse_logger.debug(f"Creating datasets {name}")
        created = self.api.datasets.create(**payload)
    except Error as e:
        handle_fern_exception(e)
        raise e

    return cast(Dataset, created)
Create a dataset with the given name on Langfuse.
Arguments:
- name: Name of the dataset to create.
- description: Description of the dataset. Defaults to None.
- metadata: Additional metadata. Defaults to None.
- input_schema: JSON Schema for validating dataset item inputs. When set, all new items will be validated against this schema.
- expected_output_schema: JSON Schema for validating dataset item expected outputs. When set, all new items will be validated against this schema.
Returns:
Dataset: The created dataset as returned by the Langfuse API.
def create_dataset_item(
    self,
    *,
    dataset_name: str,
    input: Optional[Any] = None,
    expected_output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    source_trace_id: Optional[str] = None,
    source_observation_id: Optional[str] = None,
    status: Optional[DatasetStatus] = None,
    id: Optional[str] = None,
) -> DatasetItem:
    """Add an item to a dataset.

    Upserts if an item with id already exists.

    Args:
        dataset_name: Name of the dataset in which the dataset item should be created.
        input: Input data. Defaults to None. Can contain any dict, list or scalar.
        expected_output: Expected output data. Defaults to None. Can contain any
            dict, list or scalar.
        metadata: Additional metadata. Defaults to None. Can contain any dict,
            list or scalar.
        source_trace_id: Id of the source trace. Defaults to None.
        source_observation_id: Id of the source observation. Defaults to None.
        status: Status of the dataset item. Defaults to ACTIVE for newly created items.
        id: Id of the dataset item. Defaults to None. Provide your own id if you
            want to dedupe dataset items. Id needs to be globally unique and
            cannot be reused across datasets.

    Returns:
        DatasetItem: The created dataset item as returned by the Langfuse API.

    Raises:
        Error: Re-raised after being passed to ``handle_fern_exception``.

    Example:
        ```python
        from langfuse import Langfuse

        langfuse = Langfuse()

        # Uploading items to the Langfuse dataset named "capital_cities"
        langfuse.create_dataset_item(
            dataset_name="capital_cities",
            input={"input": {"country": "Italy"}},
            expected_output={"expected_output": "Rome"},
            metadata={"foo": "bar"}
        )
        ```
    """
    item_fields = {
        "dataset_name": dataset_name,
        "input": input,
        "expected_output": expected_output,
        "metadata": metadata,
        "source_trace_id": source_trace_id,
        "source_observation_id": source_observation_id,
        "status": status,
        "id": id,
    }

    try:
        langfuse_logger.debug(f"Creating dataset item for dataset {dataset_name}")
        created_item = self.api.dataset_items.create(**item_fields)
    except Error as e:
        handle_fern_exception(e)
        raise e

    return cast(DatasetItem, created_item)
Create a dataset item.
Upserts if an item with id already exists.
Arguments:
- dataset_name: Name of the dataset in which the dataset item should be created.
- input: Input data. Defaults to None. Can contain any dict, list or scalar.
- expected_output: Expected output data. Defaults to None. Can contain any dict, list or scalar.
- metadata: Additional metadata. Defaults to None. Can contain any dict, list or scalar.
- source_trace_id: Id of the source trace. Defaults to None.
- source_observation_id: Id of the source observation. Defaults to None.
- status: Status of the dataset item. Defaults to ACTIVE for newly created items.
- id: Id of the dataset item. Defaults to None. Provide your own id if you want to dedupe dataset items. Id needs to be globally unique and cannot be reused across datasets.
Returns:
DatasetItem: The created dataset item as returned by the Langfuse API.
Example:
from langfuse import Langfuse langfuse = Langfuse() # Uploading items to the Langfuse dataset named "capital_cities" langfuse.create_dataset_item( dataset_name="capital_cities", input={"input": {"country": "Italy"}}, expected_output={"expected_output": "Rome"}, metadata={"foo": "bar"} )
def resolve_media_references(
    self,
    *,
    obj: Any,
    resolve_with: Literal["base64_data_uri"],
    max_depth: int = 10,
    content_fetch_timeout_seconds: int = 5,
) -> Any:
    """Replace media reference strings in an object with base64 data URIs.

    Thin wrapper that delegates to ``LangfuseMedia.resolve_media_references``
    with this client passed as ``langfuse_client``. The object is traversed
    recursively (up to ``max_depth``) looking for media reference strings in
    the format "@@@langfuseMedia:...@@@". When found, the media content is
    fetched synchronously and the reference string is replaced with a base64
    data URI.

    If fetching media content fails for a reference string, a warning is logged
    and the reference string is left unchanged.

    Args:
        obj: The object to process. Can be a primitive value, array, or nested
            object. If the object has a __dict__ attribute, a dict will be
            returned instead of the original object type.
        resolve_with: The representation of the media content to replace the
            media reference string with. Currently only "base64_data_uri" is
            supported.
        max_depth: The maximum depth to traverse the object. Default is 10.
        content_fetch_timeout_seconds: The timeout in seconds for fetching
            media content. Default is 5.

    Returns:
        A deep copy of the input object with all media references replaced with
        base64 data URIs where possible. If the input object has a __dict__
        attribute, a dict will be returned instead of the original object type.

    Example:
        ```python
        obj = {
            "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@",
            "nested": {
                "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@"
            }
        }

        # All arguments are keyword-only and the call is synchronous.
        result = langfuse.resolve_media_references(
            obj=obj,
            resolve_with="base64_data_uri",
        )

        # Result:
        # {
        #     "image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
        #     "nested": {
        #         "pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
        #     }
        # }
        ```
    """
    return LangfuseMedia.resolve_media_references(
        langfuse_client=self,
        obj=obj,
        resolve_with=resolve_with,
        max_depth=max_depth,
        content_fetch_timeout_seconds=content_fetch_timeout_seconds,
    )
Replace media reference strings in an object with base64 data URIs.
This method recursively traverses an object (up to max_depth) looking for media reference strings in the format "@@@langfuseMedia:...@@@". When found, it (synchronously) fetches the actual media content using the provided Langfuse client and replaces the reference string with a base64 data URI.
If fetching media content fails for a reference string, a warning is logged and the reference string is left unchanged.
Arguments:
- obj: The object to process. Can be a primitive value, array, or nested object. If the object has a __dict__ attribute, a dict will be returned instead of the original object type.
- resolve_with: The representation of the media content to replace the media reference string with. Currently only "base64_data_uri" is supported.
- max_depth: int: The maximum depth to traverse the object. Default is 10.
- content_fetch_timeout_seconds: int: The timeout in seconds for fetching media content. Default is 5.
Returns:
A deep copy of the input object with all media references replaced with base64 data URIs where possible. If the input object has a __dict__ attribute, a dict will be returned instead of the original object type.
Example:
obj = { "image": "@@@langfuseMedia:type=image/jpeg|id=123|source=bytes@@@", "nested": { "pdf": "@@@langfuseMedia:type=application/pdf|id=456|source=bytes@@@" } }
result = langfuse.resolve_media_references(obj=obj, resolve_with="base64_data_uri")
Result:
{
"image": "data:image/jpeg;base64,/9j/4AAQSkZJRg...",
"nested": {
"pdf": "data:application/pdf;base64,JVBERi0xLjcK..."
}
}
def get_prompt(
    self,
    name: str,
    *,
    version: Optional[int] = None,
    label: Optional[str] = None,
    type: Literal["chat", "text"] = "text",
    cache_ttl_seconds: Optional[int] = None,
    fallback: Union[Optional[List[ChatMessageDict]], Optional[str]] = None,
    max_retries: Optional[int] = None,
    fetch_timeout_seconds: Optional[int] = None,
) -> PromptClient:
    """Get a prompt.

    The prompt is looked up in the local prompt cache first:

    - Not cached, or ``cache_ttl_seconds`` is 0: the prompt is fetched and the
      cache is updated. If that fetch fails and a ``fallback`` was given, a
      fallback prompt client is returned instead of raising.
    - Cached but expired: a refresh task is registered with the prompt cache
      and the stale cached prompt is returned immediately. If registering the
      refresh fails, a warning is logged and the stale prompt is still returned.
    - Cached and fresh: the cached prompt is returned.

    Args:
        name (str): The name of the prompt to retrieve.

    Keyword Args:
        version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
        label (Optional[str]): The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
        cache_ttl_seconds (Optional[int]): Time-to-live in seconds for caching the prompt. Must be specified as a
            keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
        type (Literal["chat", "text"]): The type of the prompt to retrieve. Defaults to "text".
        fallback (Union[Optional[List[ChatMessageDict]], Optional[str]]): The prompt to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
        max_retries (Optional[int]): The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
        fetch_timeout_seconds (Optional[int]): The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.

    Returns:
        The prompt object retrieved from the cache or directly fetched if not cached or expired of type
        - TextPromptClient, if type argument is 'text'.
        - ChatPromptClient, if type argument is 'chat'.

    Raises:
        Error: If the SDK resources were not initialized.
        ValueError: If both version and label are given, or if name is empty.
        Exception: Propagates any exception raised while fetching a prompt that is not
            cached (or when caching is disabled), unless a fallback was provided.
    """
    if self._resources is None:
        raise Error(
            "SDK is not correctly initialized. Check the init logs for more details."
        )
    if version is not None and label is not None:
        raise ValueError("Cannot specify both version and label at the same time.")

    if not name:
        raise ValueError("Prompt name cannot be empty.")

    cache_key = PromptCache.generate_cache_key(name, version=version, label=label)
    # Caller-supplied retries are bounded: default 2, upper bound 4.
    bounded_max_retries = self._get_bounded_max_retries(
        max_retries, default_max_retries=2, max_retries_upper_bound=4
    )

    langfuse_logger.debug(f"Getting prompt '{cache_key}'")
    cached_prompt = self._resources.prompt_cache.get(cache_key)

    # A TTL of 0 bypasses the cache even when an entry exists.
    if cached_prompt is None or cache_ttl_seconds == 0:
        langfuse_logger.debug(
            f"Prompt '{cache_key}' not found in cache or caching disabled."
        )
        try:
            return self._fetch_prompt_and_update_cache(
                name,
                version=version,
                label=label,
                ttl_seconds=cache_ttl_seconds,
                max_retries=bounded_max_retries,
                fetch_timeout_seconds=fetch_timeout_seconds,
            )
        except Exception as e:
            # A falsy fallback (None, "" or []) counts as "no fallback".
            if fallback:
                langfuse_logger.warning(
                    f"Returning fallback prompt for '{cache_key}' due to fetch error: {e}"
                )

                fallback_client_args: Dict[str, Any] = {
                    "name": name,
                    "prompt": fallback,
                    "type": type,
                    "version": version or 0,
                    "config": {},
                    "labels": [label] if label else [],
                    "tags": [],
                }

                if type == "text":
                    return TextPromptClient(
                        prompt=Prompt_Text(**fallback_client_args),
                        is_fallback=True,
                    )

                if type == "chat":
                    return ChatPromptClient(
                        prompt=Prompt_Chat(**fallback_client_args),
                        is_fallback=True,
                    )

            # No usable fallback (or an unrecognized type): surface the fetch error.
            raise e

    if cached_prompt.is_expired():
        langfuse_logger.debug(f"Stale prompt '{cache_key}' found in cache.")
        try:
            # refresh prompt in background thread, refresh_prompt deduplicates tasks
            langfuse_logger.debug(f"Refreshing prompt '{cache_key}' in background.")

            def refresh_task() -> None:
                self._fetch_prompt_and_update_cache(
                    name,
                    version=version,
                    label=label,
                    ttl_seconds=cache_ttl_seconds,
                    max_retries=bounded_max_retries,
                    fetch_timeout_seconds=fetch_timeout_seconds,
                )

            self._resources.prompt_cache.add_refresh_prompt_task_if_current(
                cache_key,
                cached_prompt,
                refresh_task,
            )
            langfuse_logger.debug(
                f"Returning stale prompt '{cache_key}' from cache."
            )
            # return stale prompt
            return cached_prompt.value

        except Exception as e:
            langfuse_logger.warning(
                f"Error when refreshing cached prompt '{cache_key}', returning cached version. Error: {e}"
            )
            # creation of refresh prompt task failed, return stale prompt
            return cached_prompt.value

    return cached_prompt.value
Get a prompt.
This method attempts to fetch the requested prompt from the local cache. If the prompt is not found in the cache or if the cached prompt has expired, it will try to fetch the prompt from the server again and update the cache. If fetching the new prompt fails, and there is an expired prompt in the cache, it will return the expired prompt as a fallback.
Arguments:
- name (str): The name of the prompt to retrieve.
Keyword Args:
- version (Optional[int]): The version of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
- label: Optional[str]: The label of the prompt to retrieve. If no label and version is specified, the `production` label is returned. Specify either version or label, not both.
- cache_ttl_seconds: Optional[int]: Time-to-live in seconds for caching the prompt. Must be specified as a keyword argument. If not set, defaults to 60 seconds. Disables caching if set to 0.
- type: Literal["chat", "text"]: The type of the prompt to retrieve. Defaults to "text".
- fallback: Union[Optional[List[ChatMessageDict]], Optional[str]]: The prompt string to return if fetching the prompt fails. Important on the first call where no cached prompt is available. Follows Langfuse prompt formatting with double curly braces for variables. Defaults to None.
- max_retries: Optional[int]: The maximum number of retries in case of API/network errors. Defaults to 2. The maximum value is 4. Retries have an exponential backoff with a maximum delay of 10 seconds.
- fetch_timeout_seconds: Optional[int]: The timeout in seconds for fetching the prompt. Defaults to the default timeout set on the SDK, which is 5 seconds per default.
Returns:
The prompt object retrieved from the cache or directly fetched if not cached or expired of type
- TextPromptClient, if type argument is 'text'.
- ChatPromptClient, if type argument is 'chat'.
Raises:
- Exception: Propagates any exceptions raised during the fetching of a new prompt, unless there is an expired prompt in the cache, in which case it logs a warning and returns the expired prompt.
def create_prompt(
    self,
    *,
    name: str,
    prompt: Union[
        str, List[Union[ChatMessageDict, ChatMessageWithPlaceholdersDict]]
    ],
    labels: List[str] = [],
    tags: Optional[List[str]] = None,
    type: Optional[Literal["chat", "text"]] = "text",
    config: Optional[Any] = None,
    commit_message: Optional[str] = None,
) -> PromptClient:
    """Create a new prompt in Langfuse.

    After the prompt has been created, the SDK prompt cache is invalidated for
    the given name (when SDK resources are available).

    Keyword Args:
        name: The name of the prompt to be created.
        prompt: The content of the prompt to be created. Must be a list of chat
            messages when type is "chat", and a string otherwise.
        labels: The labels of the prompt. Defaults to an empty list. To create a
            default-served prompt, add the 'production' label.
        tags: The tags of the prompt. Defaults to None. Will be applied to all
            versions of the prompt.
        type: The type of the prompt to be created. "chat" vs. "text". Defaults
            to "text". Any value other than "chat" is handled as a text prompt.
        config: Additional structured data to be saved with the prompt. Defaults
            to None, in which case an empty dict is sent.
        commit_message: Optional string describing the change.

    Returns:
        TextPromptClient: The prompt if type argument is 'text'.
        ChatPromptClient: The prompt if type argument is 'chat'.

    Raises:
        ValueError: If type is "chat" and prompt is not a list, or if a text
            prompt is requested and prompt is not a string.
        Error: Re-raised after being passed to ``handle_fern_exception``.
    """
    try:
        langfuse_logger.debug(f"Creating prompt {name=}, {labels=}")

        is_chat = type == "chat"
        request: Union[CreateChatPromptRequest, CreateTextPromptRequest]

        if is_chat:
            if not isinstance(prompt, list):
                raise ValueError(
                    "For 'chat' type, 'prompt' must be a list of chat messages with role and content attributes."
                )
            request = CreateChatPromptRequest(
                name=name,
                prompt=cast(Any, prompt),
                labels=labels,
                tags=tags,
                config=config or {},
                commit_message=commit_message,
                type=CreateChatPromptType.CHAT,
            )
        else:
            if not isinstance(prompt, str):
                raise ValueError("For 'text' type, 'prompt' must be a string.")
            request = CreateTextPromptRequest(
                name=name,
                prompt=prompt,
                labels=labels,
                tags=tags,
                config=config or {},
                commit_message=commit_message,
            )

        server_prompt = self.api.prompts.create(request=request)

        # Drop cached versions so the next get_prompt call sees the new one.
        if self._resources is not None:
            self._resources.prompt_cache.invalidate(name)

        if is_chat:
            return ChatPromptClient(prompt=cast(Prompt_Chat, server_prompt))

        return TextPromptClient(prompt=cast(Prompt_Text, server_prompt))

    except Error as e:
        handle_fern_exception(e)
        raise e
Create a new prompt in Langfuse.
Keyword Args:
- name : The name of the prompt to be created.
- prompt : The content of the prompt to be created.
- is_active [DEPRECATED] : A flag indicating whether the prompt is active or not. This is deprecated and will be removed in a future release. Please use the 'production' label instead.
- labels: The labels of the prompt. Defaults to an empty list. To create a default-served prompt, add the 'production' label.
- tags: The tags of the prompt. Defaults to None. Will be applied to all versions of the prompt.
- config: Additional structured data to be saved with the prompt. Defaults to None.
- type: The type of the prompt to be created. "chat" vs. "text". Defaults to "text".
- commit_message: Optional string describing the change.
Returns:
TextPromptClient: The prompt if type argument is 'text'. ChatPromptClient: The prompt if type argument is 'chat'.
def update_prompt(
    self,
    *,
    name: str,
    version: int,
    new_labels: List[str] = [],
) -> Any:
    """Update the labels of an existing prompt version in Langfuse.

    The Langfuse SDK prompt cache is invalidated for all prompts with the
    specified name once the update call has returned.

    Args:
        name (str): The name of the prompt to update. It is URL-encoded before
            being sent to the API.
        version (int): The version number of the prompt to update.
        new_labels (List[str], optional): New labels to assign to the prompt
            version. Labels are unique across versions. The "latest" label is
            reserved and managed by Langfuse. Defaults to [].

    Returns:
        Prompt: The updated prompt from the Langfuse API.
    """
    encoded_name = self._url_encode(name)
    response = self.api.prompt_version.update(
        name=encoded_name,
        version=version,
        new_labels=new_labels,
    )

    resources = self._resources
    if resources is not None:
        # The cache is keyed by the raw (unencoded) prompt name.
        resources.prompt_cache.invalidate(name)

    return response
Update an existing prompt version in Langfuse. The Langfuse SDK prompt cache is invalidated for all prompts with the specified name.
Arguments:
- name (str): The name of the prompt to update.
- version (int): The version number of the prompt to update.
- new_labels (List[str], optional): New labels to assign to the prompt version. Labels are unique across versions. The "latest" label is reserved and managed by Langfuse. Defaults to [].
Returns:
Prompt: The updated prompt from the Langfuse API.
def clear_prompt_cache(self) -> None:
    """Remove every cached prompt from the prompt cache.

    Use this to force a complete refresh of all cached prompts, for example
    after major updates or when the latest versions must be fetched from the
    server. Does nothing when the SDK resources are not initialized.
    """
    resources = self._resources
    if resources is None:
        return

    resources.prompt_cache.clear()
Clear the entire prompt cache, removing all cached prompts.
This method is useful when you want to force a complete refresh of all cached prompts, for example after major updates or when you need to ensure the latest versions are fetched from the server.
def get_client(*, public_key: Optional[str] = None) -> Langfuse:
    """Return the Langfuse client to use for the current context.

    The lookup runs while holding ``LangfuseResourceManager._lock`` and works on
    the resource manager's registered instances. When no ``public_key`` is
    passed, the key stored in the current execution context is used if present.

    Behavior:
        - Key resolved and registered: returns a client built from that instance
        - Key resolved but not registered: logs a warning, returns a disabled client
        - No key, no instances: creates a default client
        - No key, exactly one instance: returns a client built from that instance
        - No key, several instances: logs a warning, returns a disabled client

    Args:
        public_key (Optional[str]): Project identifier
            - With key: Returns client for that project
            - Without key: Returns single client or disabled client if multiple exist

    Returns:
        Langfuse: Client instance in one of three states:
            1. Client for specified public_key
            2. Default client for single-project setup
            3. Disabled client when multiple projects exist without key

    Security:
        Disables tracing when multiple projects exist without explicit key to prevent
        cross-project data leakage. Multi-project setups are experimental.

    Example:
        ```python
        # Single project
        client = get_client()  # Default client

        # In multi-project usage:
        client_a = get_client(public_key="project_a_key")  # Returns project A's client
        client_b = get_client(public_key="project_b_key")  # Returns project B's client

        # Without specific key in multi-project setup:
        client = get_client()  # Returns disabled client for safety
        ```
    """
    with LangfuseResourceManager._lock:
        registry = LangfuseResourceManager._instances

        # No explicit key: fall back to the one bound to the execution context.
        public_key = public_key or _current_public_key.get(None)

        if public_key:
            registered = registry.get(public_key, None)

            if registered is None:
                # Unknown key: the matching client was never initialized.
                langfuse_logger.warning(
                    f"No Langfuse client with public key {public_key} has been initialized. Skipping tracing for decorated function."
                )
                return Langfuse(
                    tracing_enabled=False, public_key="fake", secret_key="fake"
                )

            return _create_client_from_instance(registered, public_key)

        instance_count = len(registry)

        if instance_count == 0:
            # Nothing initialized yet, so create the default instance.
            return Langfuse()

        if instance_count == 1:
            # A single client is unambiguous. Reuse the credentials bound to it,
            # which matters when it was created via constructor arguments.
            sole_instance = next(iter(registry.values()))
            return _create_client_from_instance(sole_instance)

        # Several clients and no key: disable tracing rather than guess a project.
        langfuse_logger.warning(
            "No 'langfuse_public_key' passed to decorated function, but multiple langfuse clients are instantiated in current process. Skipping tracing for this function to avoid cross-project leakage."
        )
        return Langfuse(tracing_enabled=False, public_key="fake", secret_key="fake")
Get or create a Langfuse client instance.
Returns an existing Langfuse client or creates a new one if none exists. In multi-project setups, providing a public_key is required. Multi-project support is experimental - see Langfuse docs.
Behavior:
- Single project: Returns existing client or creates new one
- Multi-project: Requires public_key to return specific client
- No public_key in multi-project: Returns disabled client to prevent data leakage
The function uses a singleton pattern per public_key to conserve resources and maintain state.
Arguments:
- public_key (Optional[str]): Project identifier
- With key: Returns client for that project
- Without key: Returns single client or disabled client if multiple exist
Returns:
Langfuse: Client instance in one of three states: 1. Client for specified public_key 2. Default client for single-project setup 3. Disabled client when multiple projects exist without key
Security:
Disables tracing when multiple projects exist without explicit key to prevent cross-project data leakage. Multi-project setups are experimental.
Example:
```python
# Single project
client = get_client()  # Default client

# In multi-project usage:
client_a = get_client(public_key="project_a_key")  # Returns project A's client
client_b = get_client(public_key="project_b_key")  # Returns project B's client

# Without specific key in multi-project setup:
client = get_client()  # Returns disabled client for safety
```
def observe(
    self,
    func: Optional[F] = None,
    *,
    name: Optional[str] = None,
    as_type: Optional[ObservationTypeLiteralNoEvent] = None,
    capture_input: Optional[bool] = None,
    capture_output: Optional[bool] = None,
    transform_to_string: Optional[Callable[[Iterable], str]] = None,
) -> Union[F, Callable[[F], F]]:
    """Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.

    This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates
    spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator
    handles both synchronous and asynchronous functions, preserving function signatures and type hints.

    Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application,
    enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.

    Args:
        func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
        name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
        as_type (Optional[Literal]): Set the observation type. Supported values:
                "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail".
                Observation types are highlighted in the Langfuse UI for filtering and visualization.
                The types "generation" and "embedding" create a span on which additional attributes such as model metrics
                can be set. An unsupported value is logged as a warning and replaced by "span".
        capture_input (Optional[bool]): Whether to record the function's input. When None, the value of the
                LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED environment variable decides; capture is on unless
                that variable is "false" or "0" (case-insensitive).
        capture_output (Optional[bool]): Whether to record the function's output. Falls back to the same
                environment variable as ``capture_input`` when None.
        transform_to_string (Optional[Callable[[Iterable], str]]): Forwarded unchanged to the sync/async wrapper.
                How the wrapper uses it is not visible in this method.

    Returns:
        Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
        When called with parentheses (``func`` is None) the decorator itself is returned instead.

    Example:
        For general function tracing with automatic naming:
        ```python
        @observe()
        def process_user_request(user_id, query):
            # Function is automatically traced with name "process_user_request"
            return get_response(query)
        ```

        For language model generation tracking:
        ```python
        @observe(name="answer-generation", as_type="generation")
        async def generate_answer(query):
            # Creates a generation-type span with extended LLM metrics
            response = await openai.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": query}]
            )
            return response.choices[0].message.content
        ```

        For trace context propagation between functions:
        ```python
        @observe()
        def main_process():
            # Parent span is created
            return sub_process()  # Child span automatically connected to parent

        @observe()
        def sub_process():
            # Automatically becomes a child span of main_process
            return "result"
        ```

    Raises:
        Exception: Propagates any exceptions from the wrapped function after logging them in the trace.

    Notes:
        - The decorator preserves the original function's signature, docstring, and return type.
        - Proper parent-child relationships between spans are automatically maintained.
        - Special keyword arguments can be passed to control tracing:
            - langfuse_trace_id: Explicitly set the trace ID for this function call
            - langfuse_parent_observation_id: Explicitly set the parent span ID
            - langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
        - For async functions, the decorator returns an async function wrapper.
        - For sync functions, the decorator returns a synchronous wrapper.
    """
    # Imported locally so this method does not depend on a module-level import.
    # `asyncio.iscoroutinefunction` is deprecated since Python 3.14 in favour of
    # the `inspect` equivalent used below.
    import inspect

    valid_types = set(get_observation_types_list(ObservationTypeLiteralNoEvent))
    if as_type is not None and as_type not in valid_types:
        logger.warning(
            f"Invalid as_type '{as_type}'. Valid types are: {', '.join(sorted(valid_types))}. Defaulting to 'span'."
        )
        as_type = "span"

    # Process-wide default for I/O capture; explicit arguments override it.
    function_io_capture_enabled = os.environ.get(
        LANGFUSE_OBSERVE_DECORATOR_IO_CAPTURE_ENABLED, "True"
    ).lower() not in ("false", "0")

    should_capture_input = (
        capture_input if capture_input is not None else function_io_capture_enabled
    )
    should_capture_output = (
        capture_output if capture_output is not None else function_io_capture_enabled
    )

    def decorator(func: F) -> F:
        # Coroutine functions need the async wrapper so the span stays open
        # until the awaited result is available.
        wrap = (
            self._async_observe
            if inspect.iscoroutinefunction(func)
            else self._sync_observe
        )
        return wrap(
            func,
            name=name,
            as_type=as_type,
            capture_input=should_capture_input,
            capture_output=should_capture_output,
            transform_to_string=transform_to_string,
        )

    # Handle the decorator with or without parentheses:
    # - @observe   -> Python passes the function directly, so decorate it now.
    # - @observe() -> func is None, so return the decorator for Python to apply
    #                 to the function in the next step.
    if func is None:
        return decorator
    return decorator(func)
Wrap a function to create and manage Langfuse tracing around its execution, supporting both synchronous and asynchronous functions.
This decorator provides seamless integration of Langfuse observability into your codebase. It automatically creates spans or generations around function execution, capturing timing, inputs/outputs, and error states. The decorator intelligently handles both synchronous and asynchronous functions, preserving function signatures and type hints.
Using OpenTelemetry's distributed tracing system, it maintains proper trace context propagation throughout your application, enabling you to see hierarchical traces of function calls with detailed performance metrics and function-specific details.
Arguments:
- func (Optional[Callable]): The function to decorate. When used with parentheses @observe(), this will be None.
- name (Optional[str]): Custom name for the created trace or span. If not provided, the function name is used.
- as_type (Optional[Literal]): Set the observation type. Supported values: "generation", "span", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail". Observation types are highlighted in the Langfuse UI for filtering and visualization. The types "generation" and "embedding" create a span on which additional attributes such as model metrics can be set.
Returns:
Callable: A wrapped version of the original function that automatically creates and manages Langfuse spans.
Example:
For general function tracing with automatic naming:
```python
@observe()
def process_user_request(user_id, query):
    # Function is automatically traced with name "process_user_request"
    return get_response(query)
```
For language model generation tracking:
```python
@observe(name="answer-generation", as_type="generation")
async def generate_answer(query):
    # Creates a generation-type span with extended LLM metrics
    response = await openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}]
    )
    return response.choices[0].message.content
```
For trace context propagation between functions:
```python
@observe()
def main_process():
    # Parent span is created
    return sub_process()  # Child span automatically connected to parent

@observe()
def sub_process():
    # Automatically becomes a child span of main_process
    return "result"
```
Raises:
- Exception: Propagates any exceptions from the wrapped function after logging them in the trace.
Notes:
- The decorator preserves the original function's signature, docstring, and return type.
- Proper parent-child relationships between spans are automatically maintained.
- Special keyword arguments can be passed to control tracing:
- langfuse_trace_id: Explicitly set the trace ID for this function call
- langfuse_parent_observation_id: Explicitly set the parent span ID
- langfuse_public_key: Use a specific Langfuse project (when multiple clients exist)
- For async functions, the decorator returns an async function wrapper.
- For sync functions, the decorator returns a synchronous wrapper.
def propagate_attributes(
    *,
    user_id: Optional[str] = None,
    session_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    version: Optional[str] = None,
    tags: Optional[List[str]] = None,
    trace_name: Optional[str] = None,
    as_baggage: bool = False,
) -> _AgnosticContextManager[Any]:
    """Propagate trace-level attributes to all spans created within this context.

    Entering the returned context manager sets the given attributes on the
    currently active span AND on every child span created while the context is
    open. It is the recommended way to attach trace-level dimensions such as
    user_id, session_id and metadata consistently to all observations of a trace.

    **IMPORTANT**: Enter this context as early as possible in your trace/workflow.
    Only the currently active span and spans created afterwards receive the
    attributes. Pre-existing spans are NOT updated retroactively.

    **Why this matters**: Langfuse aggregation queries (e.g., total cost by user_id,
    filtering by session_id) only include observations that carry the attribute.
    Calling `propagate_attributes` late means earlier spans are missing from
    aggregations for that attribute.

    Args:
        user_id: User identifier to associate with all spans in this context.
            Must be US-ASCII string, ≤200 characters. Use this to track which user
            generated each trace and enable e.g. per-user cost/performance analysis.
        session_id: Session identifier to associate with all spans in this context.
            Must be US-ASCII string, ≤200 characters. Use this to group related traces
            within a user session (e.g., a conversation thread, multi-turn interaction).
        metadata: Additional key-value metadata to propagate to all spans.
            - Keys must be US-ASCII strings
            - Values are coerced to strings
            - Coerced values must be ≤200 characters
            - Use for dimensions like internal correlating identifiers
            - AVOID: large payloads or sensitive data
        version: Version identifier for parts of your application that are
            independently versioned, e.g. agents
        tags: List of tags to categorize the group of observations
        trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters.
            Use this to set a consistent trace name for all spans created within this context.
        as_baggage: If True, propagates attributes using OpenTelemetry baggage for
            cross-process/service propagation. **Security warning**: When enabled,
            attribute values are added to HTTP headers on ALL outbound requests.
            Only enable if values are safe to transmit via HTTP headers and you need
            cross-service tracing. Default: False.

    Returns:
        Context manager that propagates attributes to all child spans.

    Example:
        Basic usage with user and session tracking:

        ```python
        from langfuse import Langfuse

        langfuse = Langfuse()

        # Set attributes early in the trace
        with langfuse.start_as_current_observation(name="user_workflow") as span:
            with langfuse.propagate_attributes(
                user_id="user_123",
                session_id="session_abc",
                metadata={"experiment": "variant_a", "environment": "production"}
            ):
                # All spans created here will have user_id, session_id, and metadata
                with langfuse.start_observation(name="llm_call") as llm_span:
                    # This span inherits: user_id, session_id, experiment, environment
                    ...

                with langfuse.start_generation(name="completion") as gen:
                    # This span also inherits all attributes
                    ...
        ```

        Late propagation (anti-pattern):

        ```python
        with langfuse.start_as_current_observation(name="workflow") as span:
            # These spans WON'T have user_id
            early_span = langfuse.start_observation(name="early_work")
            early_span.end()

            # Set attributes in the middle
            with langfuse.propagate_attributes(user_id="user_123"):
                # Only spans created AFTER this point will have user_id
                late_span = langfuse.start_observation(name="late_work")
                late_span.end()

            # Result: Aggregations by user_id will miss "early_work" span
        ```

        Cross-service propagation with baggage (advanced):

        ```python
        # Service A - originating service
        with langfuse.start_as_current_observation(name="api_request"):
            with langfuse.propagate_attributes(
                user_id="user_123",
                session_id="session_abc",
                as_baggage=True  # Propagate via HTTP headers
            ):
                # Make HTTP request to Service B
                response = requests.get("https://service-b.example.com/api")
                # user_id and session_id are now in HTTP headers

        # Service B - downstream service
        # OpenTelemetry will automatically extract baggage from HTTP headers
        # and propagate to spans in Service B
        ```

    Note:
        - **Validation**: Attribute values (user_id, session_id, version, tags,
          trace_name) must be strings ≤200 characters. Metadata values are
          coerced to strings before the 200 character limit is applied. Invalid
          values will be dropped with a warning logged.
        - **OpenTelemetry**: This uses OpenTelemetry context propagation under the hood,
          making it compatible with other OTel-instrumented libraries.

    Raises:
        No exceptions are raised. Invalid values are logged as warnings and dropped.
    """
    # Public entry point only: all work is delegated to the private implementation.
    return _propagate_attributes(
        as_baggage=as_baggage,
        trace_name=trace_name,
        tags=tags,
        version=version,
        metadata=metadata,
        session_id=session_id,
        user_id=user_id,
    )
Propagate trace-level attributes to all spans created within this context.
This context manager sets attributes on the currently active span AND automatically propagates them to all new child spans created within the context. This is the recommended way to set trace-level attributes like user_id, session_id, and metadata dimensions that should be consistently applied across all observations in a trace.
IMPORTANT: Call this as early as possible within your trace/workflow. Only the currently active span and spans created after entering this context will have these attributes. Pre-existing spans will NOT be retroactively updated.
Why this matters: Langfuse aggregation queries (e.g., total cost by user_id,
filtering by session_id) only include observations that have the attribute set.
If you call propagate_attributes late in your workflow, earlier spans won't be
included in aggregations for that attribute.
Arguments:
- user_id: User identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to track which user generated each trace and enable e.g. per-user cost/performance analysis.
- session_id: Session identifier to associate with all spans in this context. Must be US-ASCII string, ≤200 characters. Use this to group related traces within a user session (e.g., a conversation thread, multi-turn interaction).
- metadata: Additional key-value metadata to propagate to all spans.
- Keys must be US-ASCII strings
- Values are coerced to strings
- Coerced values must be ≤200 characters
- Use for dimensions like internal correlating identifiers
- AVOID: large payloads or sensitive data
- version: Version identifier for parts of your application that are independently versioned, e.g. agents
- tags: List of tags to categorize the group of observations
- trace_name: Name to assign to the trace. Must be US-ASCII string, ≤200 characters. Use this to set a consistent trace name for all spans created within this context.
- as_baggage: If True, propagates attributes using OpenTelemetry baggage for cross-process/service propagation. Security warning: When enabled, attribute values are added to HTTP headers on ALL outbound requests. Only enable if values are safe to transmit via HTTP headers and you need cross-service tracing. Default: False.
Returns:
Context manager that propagates attributes to all child spans.
Example:
Basic usage with user and session tracking:
```python
from langfuse import Langfuse

langfuse = Langfuse()

# Set attributes early in the trace
with langfuse.start_as_current_observation(name="user_workflow") as span:
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        metadata={"experiment": "variant_a", "environment": "production"}
    ):
        # All spans created here will have user_id, session_id, and metadata
        with langfuse.start_observation(name="llm_call") as llm_span:
            # This span inherits: user_id, session_id, experiment, environment
            ...

        with langfuse.start_generation(name="completion") as gen:
            # This span also inherits all attributes
            ...
```
Late propagation (anti-pattern):
```python
with langfuse.start_as_current_observation(name="workflow") as span:
    # These spans WON'T have user_id
    early_span = langfuse.start_observation(name="early_work")
    early_span.end()

    # Set attributes in the middle
    with langfuse.propagate_attributes(user_id="user_123"):
        # Only spans created AFTER this point will have user_id
        late_span = langfuse.start_observation(name="late_work")
        late_span.end()

    # Result: Aggregations by user_id will miss "early_work" span
```
Cross-service propagation with baggage (advanced):
```python
# Service A - originating service
with langfuse.start_as_current_observation(name="api_request"):
    with langfuse.propagate_attributes(
        user_id="user_123",
        session_id="session_abc",
        as_baggage=True  # Propagate via HTTP headers
    ):
        # Make HTTP request to Service B
        response = requests.get("https://service-b.example.com/api")
        # user_id and session_id are now in HTTP headers

# Service B - downstream service
# OpenTelemetry will automatically extract baggage from HTTP headers
# and propagate to spans in Service B
```
Note:
- Validation: Attribute values (user_id, session_id, version, tags, trace_name) must be strings ≤200 characters. Metadata values are coerced to strings before the 200 character limit is applied. Invalid values will be dropped with a warning logged.
- OpenTelemetry: This uses OpenTelemetry context propagation under the hood, making it compatible with other OTel-instrumented libraries.
Raises:
- No exceptions are raised. Invalid values are logged as warnings and dropped.
class LangfuseSpan(LangfuseObservationWrapper):
    """General-purpose Langfuse span for tracing arbitrary operations.

    A thin specialization of LangfuseObservationWrapper that fixes the
    observation type to ``"span"``. Prefer a more specific observation type
    where one fits, as that gives better observability and insights.
    """

    def __init__(
        self,
        *,
        otel_span: otel_trace_api.Span,
        langfuse_client: "Langfuse",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ):
        """Create a span-typed observation around an OpenTelemetry span.

        Every argument is forwarded unchanged to the base wrapper.

        Args:
            otel_span: The OpenTelemetry span to wrap
            langfuse_client: Reference to the parent Langfuse client
            input: Input data for the span (any JSON-serializable object)
            output: Output data from the span (any JSON-serializable object)
            metadata: Additional metadata to associate with the span
            environment: The tracing environment
            release: Release identifier for the application
            version: Version identifier for the code or component
            level: Importance level of the span (info, warning, error)
            status_message: Optional status message for the span
        """
        super().__init__(
            as_type="span",  # the only value this subclass contributes
            otel_span=otel_span,
            langfuse_client=langfuse_client,
            environment=environment,
            release=release,
            version=version,
            input=input,
            output=output,
            metadata=metadata,
            level=level,
            status_message=status_message,
        )
Standard span implementation for general operations in Langfuse.
This class represents a general-purpose span that can be used to trace any operation in your application. It extends the base LangfuseObservationWrapper with specific methods for creating child spans, generations, and updating span-specific attributes. If possible, use a more specific type for better observability and insights.
def __init__(
    self,
    *,
    otel_span: otel_trace_api.Span,
    langfuse_client: "Langfuse",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    environment: Optional[str] = None,
    release: Optional[str] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
):
    """Create a span-typed observation around an OpenTelemetry span.

    Every argument is forwarded unchanged to the base wrapper; the only
    value added here is the fixed observation type ``"span"``.

    Args:
        otel_span: The OpenTelemetry span to wrap
        langfuse_client: Reference to the parent Langfuse client
        input: Input data for the span (any JSON-serializable object)
        output: Output data from the span (any JSON-serializable object)
        metadata: Additional metadata to associate with the span
        environment: The tracing environment
        release: Release identifier for the application
        version: Version identifier for the code or component
        level: Importance level of the span (info, warning, error)
        status_message: Optional status message for the span
    """
    super().__init__(
        as_type="span",
        otel_span=otel_span,
        langfuse_client=langfuse_client,
        environment=environment,
        release=release,
        version=version,
        input=input,
        output=output,
        metadata=metadata,
        level=level,
        status_message=status_message,
    )
Initialize a new LangfuseSpan.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the span (any JSON-serializable object)
- output: Output data from the span (any JSON-serializable object)
- metadata: Additional metadata to associate with the span
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the code or component
- level: Importance level of the span (info, warning, error)
- status_message: Optional status message for the span
class LangfuseGeneration(LangfuseObservationWrapper):
    """Langfuse observation specialized for AI model generations.

    Fixes the observation type to ``"generation"`` and accepts the extra
    fields that describe an AI/LLM call: model details, token usage, costs
    and the associated prompt.
    """

    def __init__(
        self,
        *,
        otel_span: otel_trace_api.Span,
        langfuse_client: "Langfuse",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
    ):
        """Create a generation-typed observation around an OpenTelemetry span.

        Every argument is forwarded unchanged to the base wrapper.

        Args:
            otel_span: The OpenTelemetry span to wrap
            langfuse_client: Reference to the parent Langfuse client
            input: Input data for the generation (e.g., prompts)
            output: Output from the generation (e.g., completions)
            metadata: Additional metadata to associate with the generation
            environment: The tracing environment
            release: Release identifier for the application
            version: Version identifier for the model or component
            level: Importance level of the generation (info, warning, error)
            status_message: Optional status message for the generation
            completion_start_time: When the model started generating the response
            model: Name/identifier of the AI model used (e.g., "gpt-4")
            model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
            usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
            cost_details: Cost information for the model call
            prompt: Associated prompt template from Langfuse prompt management
        """
        super().__init__(
            otel_span=otel_span,
            langfuse_client=langfuse_client,
            as_type="generation",
            # Fields shared with every observation type.
            environment=environment,
            release=release,
            version=version,
            input=input,
            output=output,
            metadata=metadata,
            level=level,
            status_message=status_message,
            # Generation-specific fields.
            model=model,
            model_parameters=model_parameters,
            prompt=prompt,
            completion_start_time=completion_start_time,
            usage_details=usage_details,
            cost_details=cost_details,
        )
Specialized span implementation for AI model generations in Langfuse.
This class represents a generation span specifically designed for tracking AI/LLM operations. It extends the base LangfuseObservationWrapper with specialized attributes for model details, token usage, and costs.
def __init__(
    self,
    *,
    otel_span: otel_trace_api.Span,
    langfuse_client: "Langfuse",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    environment: Optional[str] = None,
    release: Optional[str] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
):
    """Create a generation-typed observation around an OpenTelemetry span.

    Every argument is forwarded unchanged to the base wrapper; the only
    value added here is the fixed observation type ``"generation"``.

    Args:
        otel_span: The OpenTelemetry span to wrap
        langfuse_client: Reference to the parent Langfuse client
        input: Input data for the generation (e.g., prompts)
        output: Output from the generation (e.g., completions)
        metadata: Additional metadata to associate with the generation
        environment: The tracing environment
        release: Release identifier for the application
        version: Version identifier for the model or component
        level: Importance level of the generation (info, warning, error)
        status_message: Optional status message for the generation
        completion_start_time: When the model started generating the response
        model: Name/identifier of the AI model used (e.g., "gpt-4")
        model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
        usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
        cost_details: Cost information for the model call
        prompt: Associated prompt template from Langfuse prompt management
    """
    super().__init__(
        otel_span=otel_span,
        langfuse_client=langfuse_client,
        as_type="generation",
        # Fields shared with every observation type.
        environment=environment,
        release=release,
        version=version,
        input=input,
        output=output,
        metadata=metadata,
        level=level,
        status_message=status_message,
        # Generation-specific fields.
        model=model,
        model_parameters=model_parameters,
        prompt=prompt,
        completion_start_time=completion_start_time,
        usage_details=usage_details,
        cost_details=cost_details,
    )
Initialize a new LangfuseGeneration span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the generation (e.g., prompts)
- output: Output from the generation (e.g., completions)
- metadata: Additional metadata to associate with the generation
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the model or component
- level: Importance level of the generation (info, warning, error)
- status_message: Optional status message for the generation
- completion_start_time: When the model started generating the response
- model: Name/identifier of the AI model used (e.g., "gpt-4")
- model_parameters: Parameters used for the model (e.g., temperature, max_tokens)
- usage_details: Token usage information (e.g., prompt_tokens, completion_tokens)
- cost_details: Cost information for the model call
- prompt: Associated prompt template from Langfuse prompt management
class LangfuseEvent(LangfuseObservationWrapper):
    """Langfuse observation specialized for events.

    Fixes the observation type to ``"event"``. Events cannot be changed after
    creation, so ``update`` is overridden to be a logged no-op.
    """

    def __init__(
        self,
        *,
        otel_span: otel_trace_api.Span,
        langfuse_client: "Langfuse",
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        environment: Optional[str] = None,
        release: Optional[str] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
    ):
        """Create an event-typed observation around an OpenTelemetry span.

        Every argument is forwarded unchanged to the base wrapper.

        Args:
            otel_span: The OpenTelemetry span to wrap
            langfuse_client: Reference to the parent Langfuse client
            input: Input data for the event
            output: Output from the event
            metadata: Additional metadata to associate with the event
            environment: The tracing environment
            release: Release identifier for the application
            version: Version identifier for the code or component
            level: Importance level of the event (info, warning, error)
            status_message: Optional status message for the event
        """
        super().__init__(
            as_type="event",
            otel_span=otel_span,
            langfuse_client=langfuse_client,
            environment=environment,
            release=release,
            version=version,
            input=input,
            output=output,
            metadata=metadata,
            level=level,
            status_message=status_message,
        )

    def update(
        self,
        *,
        name: Optional[str] = None,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[Any] = None,
        version: Optional[str] = None,
        level: Optional[SpanLevel] = None,
        status_message: Optional[str] = None,
        completion_start_time: Optional[datetime] = None,
        model: Optional[str] = None,
        model_parameters: Optional[Dict[str, MapValue]] = None,
        usage_details: Optional[Dict[str, int]] = None,
        cost_details: Optional[Dict[str, float]] = None,
        prompt: Optional[PromptClient] = None,
        **kwargs: Any,
    ) -> "LangfuseEvent":
        """Reject the update: events cannot be modified after creation.

        All parameters are accepted and ignored. A warning is logged and the
        instance is returned untouched.

        Returns:
            self: The unchanged LangfuseEvent instance
        """
        rejection_notice = "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
        langfuse_logger.warning(rejection_notice)
        return self
Specialized span implementation for Langfuse Events.
def __init__(
    self,
    *,
    otel_span: otel_trace_api.Span,
    langfuse_client: "Langfuse",
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    environment: Optional[str] = None,
    release: Optional[str] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
):
    """Wrap an OpenTelemetry span as a Langfuse event.

    Args:
        otel_span: The OpenTelemetry span to wrap
        langfuse_client: Reference to the parent Langfuse client
        input: Input data for the event
        output: Output from the event
        metadata: Additional metadata to associate with the event
        environment: The tracing environment
        release: Release identifier for the application
        version: Version identifier for the model or component
        level: Importance level of the event (info, warning, error)
        status_message: Optional status message for the event
    """
    # Payload fields are passed through untouched; only the observation
    # type is fixed here.
    event_fields = dict(
        input=input,
        output=output,
        metadata=metadata,
        environment=environment,
        release=release,
        version=version,
        level=level,
        status_message=status_message,
    )
    super().__init__(
        otel_span=otel_span,
        as_type="event",
        langfuse_client=langfuse_client,
        **event_fields,
    )
Initialize a new LangfuseEvent span.
Arguments:
- otel_span: The OpenTelemetry span to wrap
- langfuse_client: Reference to the parent Langfuse client
- input: Input data for the event
- output: Output from the event
- metadata: Additional metadata to associate with the event
- environment: The tracing environment
- release: Release identifier for the application
- version: Version identifier for the model or component
- level: Importance level of the event (info, warning, error)
- status_message: Optional status message for the event
def update(
    self,
    *,
    name: Optional[str] = None,
    input: Optional[Any] = None,
    output: Optional[Any] = None,
    metadata: Optional[Any] = None,
    version: Optional[str] = None,
    level: Optional[SpanLevel] = None,
    status_message: Optional[str] = None,
    completion_start_time: Optional[datetime] = None,
    model: Optional[str] = None,
    model_parameters: Optional[Dict[str, MapValue]] = None,
    usage_details: Optional[Dict[str, int]] = None,
    cost_details: Optional[Dict[str, float]] = None,
    prompt: Optional[PromptClient] = None,
    **kwargs: Any,
) -> "LangfuseEvent":
    """Refuse the update: events cannot be changed after creation.

    Every argument is accepted and ignored. The method only logs a warning.

    Returns:
        self: The unchanged LangfuseEvent instance
    """
    rejection_notice = (
        "Attempted to update LangfuseEvent observation. Events cannot be updated after creation."
    )
    langfuse_logger.warning(rejection_notice)
    return self
Update is not allowed for LangfuseEvent because events cannot be updated.
This method logs a warning and returns self without making changes.
Returns:
self: Returns the unchanged LangfuseEvent instance
class LangfuseOtelSpanAttributes:
    """Attribute-key constants, grouped by the entity they describe."""

    # --- Trace-level keys ---
    TRACE_NAME = "langfuse.trace.name"
    TRACE_USER_ID = "user.id"
    TRACE_SESSION_ID = "session.id"
    TRACE_TAGS = "langfuse.trace.tags"
    TRACE_PUBLIC = "langfuse.trace.public"
    TRACE_METADATA = "langfuse.trace.metadata"
    TRACE_INPUT = "langfuse.trace.input"
    TRACE_OUTPUT = "langfuse.trace.output"

    # --- Keys shared by every observation type ---
    OBSERVATION_TYPE = "langfuse.observation.type"
    OBSERVATION_METADATA = "langfuse.observation.metadata"
    OBSERVATION_LEVEL = "langfuse.observation.level"
    OBSERVATION_STATUS_MESSAGE = "langfuse.observation.status_message"
    OBSERVATION_INPUT = "langfuse.observation.input"
    OBSERVATION_OUTPUT = "langfuse.observation.output"

    # --- Keys for observations of type Generation ---
    OBSERVATION_COMPLETION_START_TIME = "langfuse.observation.completion_start_time"
    OBSERVATION_MODEL = "langfuse.observation.model.name"
    OBSERVATION_MODEL_PARAMETERS = "langfuse.observation.model.parameters"
    OBSERVATION_USAGE_DETAILS = "langfuse.observation.usage_details"
    OBSERVATION_COST_DETAILS = "langfuse.observation.cost_details"
    OBSERVATION_PROMPT_NAME = "langfuse.observation.prompt.name"
    OBSERVATION_PROMPT_VERSION = "langfuse.observation.prompt.version"

    # --- General keys ---
    ENVIRONMENT = "langfuse.environment"
    RELEASE = "langfuse.release"
    VERSION = "langfuse.version"

    # --- Internal keys ---
    AS_ROOT = "langfuse.internal.as_root"
    IS_APP_ROOT = "langfuse.internal.is_app_root"

    # --- Experiment keys ---
    EXPERIMENT_ID = "langfuse.experiment.id"
    EXPERIMENT_NAME = "langfuse.experiment.name"
    EXPERIMENT_DESCRIPTION = "langfuse.experiment.description"
    EXPERIMENT_METADATA = "langfuse.experiment.metadata"
    EXPERIMENT_DATASET_ID = "langfuse.experiment.dataset.id"
    EXPERIMENT_ITEM_ID = "langfuse.experiment.item.id"
    EXPERIMENT_ITEM_EXPECTED_OUTPUT = "langfuse.experiment.item.expected_output"
    EXPERIMENT_ITEM_METADATA = "langfuse.experiment.item.metadata"
    EXPERIMENT_ITEM_ROOT_OBSERVATION_ID = "langfuse.experiment.item.root_observation_id"
class LangfuseAgent(LangfuseObservationWrapper):
    """Agent observation for reasoning blocks that act on tools using LLM guidance."""

    def __init__(self, **kwargs: Any) -> None:
        """Create the wrapper, forcing the observation type to ``"agent"``."""
        # Any caller-supplied ``as_type`` is overridden.
        super().__init__(**{**kwargs, "as_type": "agent"})
Agent observation for reasoning blocks that act on tools using LLM guidance.
class LangfuseTool(LangfuseObservationWrapper):
    """Tool observation representing external tool calls, e.g., calling a weather API."""

    def __init__(self, **kwargs: Any) -> None:
        """Create the wrapper, forcing the observation type to ``"tool"``."""
        # Any caller-supplied ``as_type`` is overridden.
        super().__init__(**{**kwargs, "as_type": "tool"})
Tool observation representing external tool calls, e.g., calling a weather API.
class LangfuseChain(LangfuseObservationWrapper):
    """Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM."""

    def __init__(self, **kwargs: Any) -> None:
        """Create the wrapper, forcing the observation type to ``"chain"``."""
        # Any caller-supplied ``as_type`` is overridden.
        super().__init__(**{**kwargs, "as_type": "chain"})
Chain observation for connecting LLM application steps, e.g. passing context from retriever to LLM.
class LangfuseEmbedding(LangfuseObservationWrapper):
    """Embedding observation for LLM embedding calls, typically used before retrieval."""

    def __init__(self, **kwargs: Any) -> None:
        """Create the wrapper, forcing the observation type to ``"embedding"``."""
        # Any caller-supplied ``as_type`` is overridden.
        super().__init__(**{**kwargs, "as_type": "embedding"})
Embedding observation for LLM embedding calls, typically used before retrieval.
class LangfuseEvaluator(LangfuseObservationWrapper):
    """Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs."""

    def __init__(self, **kwargs: Any) -> None:
        """Create the wrapper, forcing the observation type to ``"evaluator"``."""
        # Any caller-supplied ``as_type`` is overridden.
        super().__init__(**{**kwargs, "as_type": "evaluator"})
Evaluator observation for assessing relevance, correctness, or helpfulness of LLM outputs.
class LangfuseRetriever(LangfuseObservationWrapper):
    """Retriever observation for data retrieval steps, e.g. vector store or database queries."""

    def __init__(self, **kwargs: Any) -> None:
        """Create the wrapper, forcing the observation type to ``"retriever"``."""
        # Any caller-supplied ``as_type`` is overridden.
        super().__init__(**{**kwargs, "as_type": "retriever"})
Retriever observation for data retrieval steps, e.g. vector store or database queries.
class LangfuseGuardrail(LangfuseObservationWrapper):
    """Guardrail observation for protection e.g. against jailbreaks or offensive content."""

    def __init__(self, **kwargs: Any) -> None:
        """Create the wrapper, forcing the observation type to ``"guardrail"``."""
        # Any caller-supplied ``as_type`` is overridden.
        super().__init__(**{**kwargs, "as_type": "guardrail"})
Guardrail observation for protection e.g. against jailbreaks or offensive content.
class Evaluation:
    """Result of evaluating one experiment item or a whole experiment run.

    Evaluator functions return instances of this class. The constructor is
    keyword-only, so every field must be named at the call site.

    Attributes:
        name: Unique identifier for the evaluation metric. Should be descriptive
            and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity").
            Used for aggregation and comparison across experiment runs.
        value: The evaluation score or result. Can be:
            - Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
            - String: For categorical results like "positive", "negative", "neutral"
            - Boolean: For binary assessments like "passes_safety_check"
        comment: Optional human-readable explanation of the evaluation result.
            Useful for providing context, explaining scoring rationale, or noting
            special conditions. Displayed in Langfuse UI for interpretability.
        metadata: Optional structured metadata about the evaluation process.
            Can include confidence scores, intermediate calculations, model versions,
            or any other relevant technical details.
        data_type: Optional score data type. Required if value is not NUMERIC.
            One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
        config_id: Optional Langfuse score config ID.

    Examples:
        Basic accuracy evaluation:
        ```python
        from langfuse import Evaluation

        def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
            if not expected_output:
                return Evaluation(name="accuracy", value=0, comment="No expected output")

            is_correct = output.strip().lower() == expected_output.strip().lower()
            return Evaluation(
                name="accuracy",
                value=1.0 if is_correct else 0.0,
                comment="Correct answer" if is_correct else "Incorrect answer"
            )
        ```

        Multi-metric evaluator:
        ```python
        def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
            return [
                Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
                Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
                Evaluation(
                    name="quality",
                    value=0.85,
                    comment="High quality response",
                    metadata={"confidence": 0.92, "model": "gpt-4"}
                )
            ]
        ```

        Categorical evaluation:
        ```python
        def sentiment_evaluator(*, input, output, **kwargs):
            sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
            return Evaluation(
                name="sentiment",
                value=sentiment,
                comment=f"Response expresses {sentiment} sentiment",
                data_type="CATEGORICAL"
            )
        ```

        Failed evaluation with error handling:
        ```python
        def external_api_evaluator(*, input, output, **kwargs):
            try:
                score = external_api.evaluate(output)
                return Evaluation(name="external_score", value=score)
            except Exception as e:
                return Evaluation(
                    name="external_score",
                    value=0,
                    comment=f"API unavailable: {e}",
                    metadata={"error": str(e), "retry_count": 3}
                )
        ```

    Note:
        All arguments must be passed as keywords. Positional arguments are not allowed
        to ensure code clarity and prevent errors from argument reordering.
    """

    def __init__(
        self,
        *,
        name: str,
        value: Union[int, float, str, bool],
        comment: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        data_type: Optional[ExperimentScoreType] = None,
        config_id: Optional[str] = None,
    ):
        """Store the evaluation fields on the instance without transformation.

        Args:
            name: Unique identifier for the evaluation metric.
            value: The evaluation score or result.
            comment: Optional human-readable explanation of the result.
            metadata: Optional structured metadata about the evaluation process.
            data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
            config_id: Optional Langfuse score config ID.

        Note:
            All arguments must be provided as keywords. Positional arguments will raise a TypeError.
        """
        # The metric itself.
        self.name, self.value = name, value
        # Optional context attached to the metric.
        self.comment, self.metadata = comment, metadata
        # Optional score typing / configuration.
        self.data_type, self.config_id = data_type, config_id
Represents an evaluation result for an experiment item or an entire experiment run.
This class provides a strongly-typed way to create evaluation results in evaluator functions. Users must use keyword arguments when instantiating this class.
Attributes:
- name: Unique identifier for the evaluation metric. Should be descriptive and consistent across runs (e.g., "accuracy", "bleu_score", "toxicity"). Used for aggregation and comparison across experiment runs.
- value: The evaluation score or result. Can be:
- Numeric (int/float): For quantitative metrics like accuracy (0.85), BLEU (0.42)
- String: For categorical results like "positive", "negative", "neutral"
- Boolean: For binary assessments like "passes_safety_check"
- comment: Optional human-readable explanation of the evaluation result. Useful for providing context, explaining scoring rationale, or noting special conditions. Displayed in Langfuse UI for interpretability.
- metadata: Optional structured metadata about the evaluation process. Can include confidence scores, intermediate calculations, model versions, or any other relevant technical details.
- data_type: Optional score data type. Required if value is not NUMERIC. One of NUMERIC, CATEGORICAL, or BOOLEAN. Defaults to NUMERIC.
- config_id: Optional Langfuse score config ID.
Examples:
Basic accuracy evaluation:
```python
from langfuse import Evaluation

def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    if not expected_output:
        return Evaluation(name="accuracy", value=0, comment="No expected output")

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Correct answer" if is_correct else "Incorrect answer"
    )
```

Multi-metric evaluator:

```python
def comprehensive_evaluator(*, input, output, expected_output=None, **kwargs):
    return [
        Evaluation(name="length", value=len(output), comment=f"Output length: {len(output)} chars"),
        Evaluation(name="has_greeting", value="hello" in output.lower(), comment="Contains greeting"),
        Evaluation(
            name="quality",
            value=0.85,
            comment="High quality response",
            metadata={"confidence": 0.92, "model": "gpt-4"}
        )
    ]
```

Categorical evaluation:

```python
def sentiment_evaluator(*, input, output, **kwargs):
    sentiment = analyze_sentiment(output)  # Returns "positive", "negative", or "neutral"
    return Evaluation(
        name="sentiment",
        value=sentiment,
        comment=f"Response expresses {sentiment} sentiment",
        data_type="CATEGORICAL"
    )
```

Failed evaluation with error handling:

```python
def external_api_evaluator(*, input, output, **kwargs):
    try:
        score = external_api.evaluate(output)
        return Evaluation(name="external_score", value=score)
    except Exception as e:
        return Evaluation(
            name="external_score",
            value=0,
            comment=f"API unavailable: {e}",
            metadata={"error": str(e), "retry_count": 3}
        )
```
Note:
All arguments must be passed as keywords. Positional arguments are not allowed to ensure code clarity and prevent errors from argument reordering.
def __init__(
    self,
    *,
    name: str,
    value: Union[int, float, str, bool],
    comment: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    data_type: Optional[ExperimentScoreType] = None,
    config_id: Optional[str] = None,
):
    """Store the evaluation fields on the instance without transformation.

    Args:
        name: Unique identifier for the evaluation metric.
        value: The evaluation score or result.
        comment: Optional human-readable explanation of the result.
        metadata: Optional structured metadata about the evaluation process.
        data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
        config_id: Optional Langfuse score config ID.

    Note:
        All arguments must be provided as keywords. Positional arguments will raise a TypeError.
    """
    # The metric itself.
    self.name, self.value = name, value
    # Optional context attached to the metric.
    self.comment, self.metadata = comment, metadata
    # Optional score typing / configuration.
    self.data_type, self.config_id = data_type, config_id
Initialize an Evaluation with the provided data.
Arguments:
- name: Unique identifier for the evaluation metric.
- value: The evaluation score or result.
- comment: Optional human-readable explanation of the result.
- metadata: Optional structured metadata about the evaluation process.
- data_type: Optional score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
- config_id: Optional Langfuse score config ID.
Note:
All arguments must be provided as keywords. Positional arguments will raise a TypeError.
class EvaluatorInputs:
    """Container of evaluator inputs, as returned by mapper functions.

    A mapper turns an API response object (trace or observation) into this
    standardized shape so that evaluator functions can read input, output,
    expected output, and metadata the same way regardless of the source
    entity type.

    Attributes:
        input: The input data that was provided to generate the output being evaluated.
            For traces, this might be the initial prompt or request. For observations,
            this could be the span's input. The exact meaning depends on your use case.
        output: The actual output that was produced and needs to be evaluated.
            For traces, this is typically the final response. For observations,
            this might be the generation output or span result.
        expected_output: Optional ground truth or expected result for comparison.
            Used by evaluators to assess correctness. May be None if no ground truth
            is available for the entity being evaluated.
        metadata: Optional structured metadata providing additional context for evaluation.
            Can include information about the entity, execution context, user attributes,
            or any other relevant data that evaluators might use.

    Examples:
        Simple mapper for traces:
        ```python
        from langfuse import EvaluatorInputs

        def trace_mapper(trace):
            return EvaluatorInputs(
                input=trace.input,
                output=trace.output,
                expected_output=None,  # No ground truth available
                metadata={"user_id": trace.user_id, "tags": trace.tags}
            )
        ```

        Mapper for observations extracting specific fields:
        ```python
        def observation_mapper(observation):
            # Extract input/output from observation's data
            input_data = observation.input if hasattr(observation, 'input') else None
            output_data = observation.output if hasattr(observation, 'output') else None

            return EvaluatorInputs(
                input=input_data,
                output=output_data,
                expected_output=None,
                metadata={
                    "observation_type": observation.type,
                    "model": observation.model,
                    "latency_ms": observation.end_time - observation.start_time
                }
            )
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        input: Any,
        output: Any,
        expected_output: Any = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """Store the evaluator inputs on the instance without transformation.

        Args:
            input: The input data for evaluation.
            output: The output data to be evaluated.
            expected_output: Optional ground truth for comparison.
            metadata: Optional additional context for evaluation.

        Note:
            All arguments must be provided as keywords.
        """
        # What went in and what came out.
        self.input, self.output = input, output
        # Optional reference answer and extra context.
        self.expected_output, self.metadata = expected_output, metadata
Input data structure for evaluators, returned by mapper functions.
This class provides a strongly-typed container for transforming API response objects (traces, observations) into the standardized format expected by evaluator functions. It ensures consistent access to input, output, expected output, and metadata regardless of the source entity type.
Attributes:
- input: The input data that was provided to generate the output being evaluated. For traces, this might be the initial prompt or request. For observations, this could be the span's input. The exact meaning depends on your use case.
- output: The actual output that was produced and needs to be evaluated. For traces, this is typically the final response. For observations, this might be the generation output or span result.
- expected_output: Optional ground truth or expected result for comparison. Used by evaluators to assess correctness. May be None if no ground truth is available for the entity being evaluated.
- metadata: Optional structured metadata providing additional context for evaluation. Can include information about the entity, execution context, user attributes, or any other relevant data that evaluators might use.
Examples:
Simple mapper for traces:
```python
from langfuse import EvaluatorInputs

def trace_mapper(trace):
    return EvaluatorInputs(
        input=trace.input,
        output=trace.output,
        expected_output=None,  # No ground truth available
        metadata={"user_id": trace.user_id, "tags": trace.tags}
    )
```

Mapper for observations extracting specific fields:

```python
def observation_mapper(observation):
    # Extract input/output from observation's data
    input_data = observation.input if hasattr(observation, 'input') else None
    output_data = observation.output if hasattr(observation, 'output') else None

    return EvaluatorInputs(
        input=input_data,
        output=output_data,
        expected_output=None,
        metadata={
            "observation_type": observation.type,
            "model": observation.model,
            "latency_ms": observation.end_time - observation.start_time
        }
    )
```
Note:
All arguments must be passed as keywords when instantiating this class.
98 def __init__( 99 self, 100 *, 101 input: Any, 102 output: Any, 103 expected_output: Any = None, 104 metadata: Optional[Dict[str, Any]] = None, 105 ): 106 """Initialize EvaluatorInputs with the provided data. 107 108 Args: 109 input: The input data for evaluation. 110 output: The output data to be evaluated. 111 expected_output: Optional ground truth for comparison. 112 metadata: Optional additional context for evaluation. 113 114 Note: 115 All arguments must be provided as keywords. 116 """ 117 self.input = input 118 self.output = output 119 self.expected_output = expected_output 120 self.metadata = metadata
Initialize EvaluatorInputs with the provided data.
Arguments:
- input: The input data for evaluation.
- output: The output data to be evaluated.
- expected_output: Optional ground truth for comparison.
- metadata: Optional additional context for evaluation.
Note:
All arguments must be provided as keywords.
class MapperFunction(Protocol):
    """Protocol defining the interface for mapper functions in batch evaluation.

    Mapper functions transform API response objects (traces or observations)
    into the standardized EvaluatorInputs format that evaluators expect. This abstraction
    allows you to define how to extract and structure evaluation data from different
    entity types.

    Mapper functions:
    - Must accept the entity (trace or observation) as the keyword argument ``item``
    - Must return an EvaluatorInputs instance with input, output, expected_output, metadata
    - Can be either synchronous or asynchronous
    - Should handle missing or malformed data gracefully
    """

    def __call__(
        self,
        *,
        item: Union["TraceWithFullDetails", "ObservationsView"],
        # ``**kwargs: T`` annotates each keyword VALUE as ``T``; extra keywords
        # are arbitrary here, so the value type is ``Any``, not a dict.
        **kwargs: Any,
    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
        """Transform an API response object into evaluator inputs.

        This method defines how to extract evaluation-relevant data from the raw
        API response object. The implementation should map entity-specific fields
        to the standardized input/output/expected_output/metadata structure.

        Args:
            item: The API response object to transform. The type depends on the scope:
                - TraceWithFullDetails: When evaluating traces
                - ObservationsView: When evaluating observations
            **kwargs: Additional keyword arguments; implementations should
                accept and may ignore them.

        Returns:
            EvaluatorInputs: A structured container with:
                - input: The input data that generated the output
                - output: The output to be evaluated
                - expected_output: Optional ground truth for comparison
                - metadata: Optional additional context

            Can return either a direct EvaluatorInputs instance or an awaitable
            (for async mappers that need to fetch additional data).

        Examples:
            Basic trace mapper:
            ```python
            def map_trace(item, **kwargs):
                return EvaluatorInputs(
                    input=item.input,
                    output=item.output,
                    expected_output=None,
                    metadata={"trace_id": item.id, "user": item.user_id}
                )
            ```

            Observation mapper with conditional logic:
            ```python
            def map_observation(item, **kwargs):
                # Extract fields based on observation type
                if item.type == "GENERATION":
                    input_data = item.input
                    output_data = item.output
                else:
                    # For other types, use different fields
                    input_data = item.metadata.get("input")
                    output_data = item.metadata.get("output")

                return EvaluatorInputs(
                    input=input_data,
                    output=output_data,
                    expected_output=None,
                    metadata={"obs_id": item.id, "type": item.type}
                )
            ```

            Async mapper (if additional processing needed):
            ```python
            async def map_trace_async(item, **kwargs):
                # Could do async processing here if needed
                processed_output = await some_async_transformation(item.output)

                return EvaluatorInputs(
                    input=item.input,
                    output=processed_output,
                    expected_output=None,
                    metadata={"trace_id": item.id}
                )
            ```
        """
        ...
Protocol defining the interface for mapper functions in batch evaluation.
Mapper functions transform API response objects (traces or observations) into the standardized EvaluatorInputs format that evaluators expect. This abstraction allows you to define how to extract and structure evaluation data from different entity types.
Mapper functions:
- Must accept the entity (trace or observation) as the keyword argument `item`
- Must return an EvaluatorInputs instance with input, output, expected_output, metadata
- Can be either synchronous or asynchronous
- Should handle missing or malformed data gracefully
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` that resolves to the real one on first use.

    Raises ``TypeError`` when the instance's class is flagged as a protocol.
    Otherwise, if this function is still the class's ``__init__``, it installs
    the first ``__init__`` found in the MRO that is not this function, then
    calls it with the original arguments.
    """
    placeholder = _no_init_or_replace_init
    cls = type(self)

    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # A custom `__init__` is already in place, so there is nothing to
    # resolve. Proceeding could lead to RecursionError. See bpo-45121.
    if cls.__init__ is not placeholder:
        return

    # A protocol subclass starts with this placeholder as its `__init__`.
    # The first instantiation lands here, walks the MRO for the first real
    # `__init__`, and installs it on the class so later instantiations
    # bypass the placeholder. `object.__init__` is the fallback (should not
    # happen, since `object` is always in the MRO).
    cls.__init__ = next(
        (
            base.__dict__['__init__']
            for base in cls.__mro__
            if base.__dict__.get('__init__', placeholder) is not placeholder
        ),
        object.__init__,
    )

    cls.__init__(self, *args, **kwargs)
class CompositeEvaluatorFunction(Protocol):
    """Protocol defining the interface for composite evaluator functions.

    Composite evaluators create aggregate scores from multiple item-level evaluations.
    This is commonly used to compute weighted averages, combined metrics, or other
    composite assessments based on individual evaluation results.

    Composite evaluators:
    - Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
      plus the list of evaluations
    - Return either a single Evaluation, a list of Evaluations, or a dict
    - Can be either synchronous or asynchronous
    - Have access to both raw item data and evaluation results
    """

    def __call__(
        self,
        *,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        expected_output: Optional[Any] = None,
        metadata: Optional[Dict[str, Any]] = None,
        evaluations: List[Evaluation],
        # ``**kwargs: T`` annotates each keyword VALUE as ``T``; extra keywords
        # are arbitrary here, so the value type is ``Any``, not a dict.
        **kwargs: Any,
    ) -> Union[
        Evaluation,
        List[Evaluation],
        Dict[str, Any],
        Awaitable[Evaluation],
        Awaitable[List[Evaluation]],
        Awaitable[Dict[str, Any]],
    ]:
        r"""Create a composite evaluation from item-level evaluation results.

        This method combines multiple evaluation scores into a single composite metric.
        Common use cases include weighted averages, pass/fail decisions based on multiple
        criteria, or custom scoring logic that considers multiple dimensions.

        Args:
            input: The input data that was provided to the system being evaluated.
            output: The output generated by the system being evaluated.
            expected_output: The expected/reference output for comparison (if available).
            metadata: Additional metadata about the evaluation context.
            evaluations: List of evaluation results from item-level evaluators.
                Each evaluation contains name, value, comment, and metadata.
            **kwargs: Additional keyword arguments; implementations should
                accept and may ignore them.

        Returns:
            Can return any of:
            - Evaluation: A single composite evaluation result
            - List[Evaluation]: Multiple composite evaluations
            - Dict: A dict that will be converted to an Evaluation, with keys:
                - name: Identifier for the composite metric (e.g., "composite_score")
                - value: The computed composite value
                - comment: Optional explanation of how the score was computed
                - metadata: Optional details about the composition logic

            Any of these may be returned directly or wrapped in an awaitable
            (for async composite evaluators).

        Examples:
            Simple weighted average:
            ```python
            def weighted_composite(*, input, output, expected_output, metadata, evaluations):
                weights = {
                    "accuracy": 0.5,
                    "relevance": 0.3,
                    "safety": 0.2
                }

                total_score = 0.0
                total_weight = 0.0

                for evaluation in evaluations:
                    if evaluation.name in weights and isinstance(evaluation.value, (int, float)):
                        total_score += evaluation.value * weights[evaluation.name]
                        total_weight += weights[evaluation.name]

                final_score = total_score / total_weight if total_weight > 0 else 0.0

                return Evaluation(
                    name="composite_score",
                    value=final_score,
                    comment=f"Weighted average of {len(evaluations)} metrics"
                )
            ```

            Pass/fail composite based on thresholds:
            ```python
            def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
                # Must pass all criteria
                thresholds = {
                    "accuracy": 0.7,
                    "safety": 0.9,
                    "relevance": 0.6
                }

                passes = True
                failing_metrics = []

                for metric, threshold in thresholds.items():
                    eval_result = next((e for e in evaluations if e.name == metric), None)
                    if eval_result and isinstance(eval_result.value, (int, float)):
                        if eval_result.value < threshold:
                            passes = False
                            failing_metrics.append(metric)

                return Evaluation(
                    name="passes_all_checks",
                    value=passes,
                    comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
                    data_type="BOOLEAN"
                )
            ```

            Async composite with external scoring:
            ```python
            async def llm_composite(*, input, output, expected_output, metadata, evaluations):
                # Use LLM to synthesize multiple evaluation results
                eval_summary = "\n".join(
                    f"- {e.name}: {e.value}" for e in evaluations
                )

                prompt = f"Given these evaluation scores:\n{eval_summary}\n"
                prompt += f"For the output: {output}\n"
                prompt += "Provide an overall quality score from 0-1."

                response = await openai.chat.completions.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}]
                )

                score = float(response.choices[0].message.content.strip())

                return Evaluation(
                    name="llm_composite_score",
                    value=score,
                    comment="LLM-synthesized composite score"
                )
            ```

            Context-aware composite:
            ```python
            def context_composite(*, input, output, expected_output, metadata, evaluations):
                # Adjust weighting based on metadata
                base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}

                # If metadata indicates high importance, prioritize accuracy
                if metadata and metadata.get('importance') == 'high':
                    weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
                else:
                    weights = base_weights

                total = sum(
                    e.value * weights.get(e.name, 0)
                    for e in evaluations
                    if isinstance(e.value, (int, float))
                )

                return Evaluation(
                    name="weighted_composite",
                    value=total,
                    comment="Context-aware weighted composite"
                )
            ```
        """
        ...
Protocol defining the interface for composite evaluator functions.
Composite evaluators create aggregate scores from multiple item-level evaluations. This is commonly used to compute weighted averages, combined metrics, or other composite assessments based on individual evaluation results.
Composite evaluators:
- Accept the same inputs as item-level evaluators (input, output, expected_output, metadata) plus the list of evaluations
- Return either a single Evaluation, a list of Evaluations, or a dict
- Can be either synchronous or asynchronous
- Have access to both raw item data and evaluation results
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` that resolves and installs the real one.

    On the first instantiation of a class still carrying this placeholder,
    the MRO is searched for the first ``__init__`` that is not the
    placeholder; that function is stored on the class (so later
    instantiations bypass this function) and then invoked with the
    original arguments.

    Raises:
        TypeError: If the class of ``self`` is flagged as a protocol
            (``_is_protocol`` is truthy).
    """
    klass = type(self)

    # Protocol classes themselves can never be constructed.
    if klass._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # The class already resolves to a custom `__init__`, so there is nothing
    # to compute. Bailing out here also avoids a RecursionError (bpo-45121).
    if klass.__init__ is not _no_init_or_replace_init:
        return

    # Each class in the MRO contributes its *own* `__init__` (looked up in its
    # `__dict__`, not inherited); classes that define none are represented by
    # the placeholder itself and therefore filtered out below.
    own_inits = (
        ancestor.__dict__.get('__init__', _no_init_or_replace_init)
        for ancestor in klass.__mro__
    )
    klass.__init__ = next(
        (found for found in own_inits if found is not _no_init_or_replace_init),
        object.__init__,  # fallback; should not happen
    )

    klass.__init__(self, *args, **kwargs)
class EvaluatorStats:
    """Statistics for a single evaluator's performance during batch evaluation.

    This class tracks detailed metrics about how a specific evaluator performed
    across all items in a batch evaluation run. It helps identify evaluator issues,
    understand reliability, and optimize evaluation pipelines.

    Attributes:
        name: The name of the evaluator function (extracted from __name__).
        total_runs: Total number of times the evaluator was invoked.
        successful_runs: Number of times the evaluator completed successfully.
        failed_runs: Number of times the evaluator raised an exception or failed.
        total_scores_created: Total number of evaluation scores created by this evaluator.
            Can be higher than successful_runs if the evaluator returns multiple scores.

    Examples:
        Accessing evaluator stats from batch evaluation result:
        ```python
        result = client.run_batched_evaluation(...)

        for stats in result.evaluator_stats:
            # total_runs defaults to 0, so guard the division
            success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
            print(f"Evaluator: {stats.name}")
            print(f"  Success rate: {success_rate:.1%}")
            print(f"  Scores created: {stats.total_scores_created}")

            if stats.failed_runs > 0:
                print(f"  ⚠️ Failed {stats.failed_runs} times")
        ```

        Identifying problematic evaluators:
        ```python
        result = client.run_batched_evaluation(...)

        # Find evaluators with high failure rates
        for stats in result.evaluator_stats:
            if stats.total_runs == 0:
                continue  # never invoked; no failure rate to compute
            failure_rate = stats.failed_runs / stats.total_runs
            if failure_rate > 0.1:  # More than 10% failures
                print(f"⚠️ {stats.name} has {failure_rate:.1%} failure rate")
                print(f"   Consider debugging or removing this evaluator")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        name: str,
        total_runs: int = 0,
        successful_runs: int = 0,
        failed_runs: int = 0,
        total_scores_created: int = 0,
    ):
        """Initialize EvaluatorStats with the provided metrics.

        Args:
            name: The evaluator function name.
            total_runs: Total number of evaluator invocations.
            successful_runs: Number of successful completions.
            failed_runs: Number of failures.
            total_scores_created: Total scores created by this evaluator.

        Note:
            All arguments must be provided as keywords.
        """
        self.name = name
        self.total_runs = total_runs
        self.successful_runs = successful_runs
        self.failed_runs = failed_runs
        self.total_scores_created = total_scores_created

    def __repr__(self) -> str:
        """Return a constructor-style representation listing every counter."""
        return (
            f"{type(self).__name__}("
            f"name={self.name!r}, "
            f"total_runs={self.total_runs!r}, "
            f"successful_runs={self.successful_runs!r}, "
            f"failed_runs={self.failed_runs!r}, "
            f"total_scores_created={self.total_scores_created!r})"
        )
Statistics for a single evaluator's performance during batch evaluation.
This class tracks detailed metrics about how a specific evaluator performed across all items in a batch evaluation run. It helps identify evaluator issues, understand reliability, and optimize evaluation pipelines.
Attributes:
- name: The name of the evaluator function (extracted from __name__).
- total_runs: Total number of times the evaluator was invoked.
- successful_runs: Number of times the evaluator completed successfully.
- failed_runs: Number of times the evaluator raised an exception or failed.
- total_scores_created: Total number of evaluation scores created by this evaluator. Can be higher than successful_runs if the evaluator returns multiple scores.
Examples:
Accessing evaluator stats from batch evaluation result:
```python
result = client.run_batched_evaluation(...)

for stats in result.evaluator_stats:
    print(f"Evaluator: {stats.name}")
    print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
    print(f"  Scores created: {stats.total_scores_created}")

    if stats.failed_runs > 0:
        print(f"  ⚠️ Failed {stats.failed_runs} times")
```

Identifying problematic evaluators:

```python
result = client.run_batched_evaluation(...)

# Find evaluators with high failure rates
for stats in result.evaluator_stats:
    failure_rate = stats.failed_runs / stats.total_runs
    if failure_rate > 0.1:  # More than 10% failures
        print(f"⚠️ {stats.name} has {failure_rate:.1%} failure rate")
        print(f"   Consider debugging or removing this evaluator")
```
Note:
All arguments must be passed as keywords when instantiating this class.
def __init__(
    self,
    *,
    name: str,
    total_runs: int = 0,
    successful_runs: int = 0,
    failed_runs: int = 0,
    total_scores_created: int = 0,
):
    """Store the per-evaluator counters on the new instance.

    Every parameter is keyword-only, and every counter defaults to zero.

    Args:
        name: The evaluator function name.
        total_runs: Total number of evaluator invocations.
        successful_runs: Number of successful completions.
        failed_runs: Number of failures.
        total_scores_created: Total scores created by this evaluator.
    """
    # Identity of the evaluator these numbers belong to.
    self.name = name
    # Invocation counters: overall, succeeded, failed.
    self.total_runs, self.successful_runs, self.failed_runs = (
        total_runs,
        successful_runs,
        failed_runs,
    )
    # Score counter, tracked separately from the run counters.
    self.total_scores_created = total_scores_created
Initialize EvaluatorStats with the provided metrics.
Arguments:
- name: The evaluator function name.
- total_runs: Total number of evaluator invocations.
- successful_runs: Number of successful completions.
- failed_runs: Number of failures.
- total_scores_created: Total scores created by this evaluator.
Note:
All arguments must be provided as keywords.
class BatchEvaluationResumeToken:
    """Token for resuming a failed batch evaluation run.

    This class encapsulates all the information needed to resume a batch evaluation
    that was interrupted or failed partway through. It uses timestamp-based filtering
    to avoid re-processing items that were already evaluated, even if the underlying
    dataset changed between runs.

    Attributes:
        scope: The type of items being evaluated ("traces", "observations").
        filter: The original JSON filter string used to query items.
        last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item.
            Used to construct a filter that only fetches items after this timestamp.
        last_processed_id: The ID of the last successfully processed item, for reference.
        items_processed: Count of items successfully processed before interruption.

    Examples:
        Resuming a failed batch evaluation:
        ```python
        import json

        # Initial run that stops partway through
        result = client.run_batched_evaluation(
            scope="traces",
            mapper=my_mapper,
            evaluators=[evaluator1, evaluator2],
            filter='{"tags": ["production"]}',
            max_items=10000
        )

        # Save the resume token
        if not result.completed and result.resume_token:
            # Store resume token for later (e.g., in a file or database)
            with open("resume_token.json", "w") as f:
                json.dump({
                    "scope": result.resume_token.scope,
                    "filter": result.resume_token.filter,
                    "last_timestamp": result.resume_token.last_processed_timestamp,
                    "last_id": result.resume_token.last_processed_id,
                    "items_done": result.resume_token.items_processed
                }, f)

        # Later, resume from where it left off
        with open("resume_token.json") as f:
            token_data = json.load(f)

        resume_token = BatchEvaluationResumeToken(
            scope=token_data["scope"],
            filter=token_data["filter"],
            last_processed_timestamp=token_data["last_timestamp"],
            last_processed_id=token_data["last_id"],
            items_processed=token_data["items_done"]
        )

        # Resume the evaluation
        result = client.run_batched_evaluation(
            scope="traces",
            mapper=my_mapper,
            evaluators=[evaluator1, evaluator2],
            resume_from=resume_token
        )

        print(f"Processed {result.total_items_processed} additional items")
        ```

        Handling partial completion:
        ```python
        result = client.run_batched_evaluation(...)

        # resume_token can be None, so check it before reading its fields
        if not result.completed and result.resume_token:
            print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
            print(f"Last item: {result.resume_token.last_processed_id}")
            print(f"Resume from: {result.resume_token.last_processed_timestamp}")

            # Optionally retry automatically
            print("Retrying...")
            result = client.run_batched_evaluation(
                scope=result.resume_token.scope,
                mapper=my_mapper,
                evaluators=my_evaluators,
                resume_from=result.resume_token
            )
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
        The timestamp-based approach means that items created after the initial run
        but before the timestamp will be skipped. This is intentional to avoid
        duplicates and ensure consistent evaluation.
    """

    def __init__(
        self,
        *,
        scope: str,
        filter: Optional[str],
        last_processed_timestamp: str,
        last_processed_id: str,
        items_processed: int,
    ):
        """Initialize BatchEvaluationResumeToken with the provided state.

        Args:
            scope: The scope type ("traces", "observations").
            filter: The original JSON filter string.
            last_processed_timestamp: ISO 8601 timestamp of last processed item.
            last_processed_id: ID of last processed item.
            items_processed: Count of items processed before interruption.

        Note:
            All arguments must be provided as keywords.
        """
        self.scope = scope
        self.filter = filter
        self.last_processed_timestamp = last_processed_timestamp
        self.last_processed_id = last_processed_id
        self.items_processed = items_processed

    def __repr__(self) -> str:
        """Return a constructor-style representation of the stored resume state."""
        return (
            f"{type(self).__name__}("
            f"scope={self.scope!r}, "
            f"filter={self.filter!r}, "
            f"last_processed_timestamp={self.last_processed_timestamp!r}, "
            f"last_processed_id={self.last_processed_id!r}, "
            f"items_processed={self.items_processed!r})"
        )
Token for resuming a failed batch evaluation run.
This class encapsulates all the information needed to resume a batch evaluation that was interrupted or failed partway through. It uses timestamp-based filtering to avoid re-processing items that were already evaluated, even if the underlying dataset changed between runs.
Attributes:
- scope: The type of items being evaluated ("traces", "observations").
- filter: The original JSON filter string used to query items.
- last_processed_timestamp: ISO 8601 timestamp of the last successfully processed item. Used to construct a filter that only fetches items after this timestamp.
- last_processed_id: The ID of the last successfully processed item, for reference.
- items_processed: Count of items successfully processed before interruption.
Examples:
Resuming a failed batch evaluation:
```python
# Initial run that fails partway through
try:
    result = client.run_batched_evaluation(
        scope="traces",
        mapper=my_mapper,
        evaluators=[evaluator1, evaluator2],
        filter='{"tags": ["production"]}',
        max_items=10000
    )
except Exception as e:
    print(f"Evaluation failed: {e}")

    # Save the resume token
    if result.resume_token:
        # Store resume token for later (e.g., in a file or database)
        import json
        with open("resume_token.json", "w") as f:
            json.dump({
                "scope": result.resume_token.scope,
                "filter": result.resume_token.filter,
                "last_timestamp": result.resume_token.last_processed_timestamp,
                "last_id": result.resume_token.last_processed_id,
                "items_done": result.resume_token.items_processed
            }, f)

# Later, resume from where it left off
with open("resume_token.json") as f:
    token_data = json.load(f)

resume_token = BatchEvaluationResumeToken(
    scope=token_data["scope"],
    filter=token_data["filter"],
    last_processed_timestamp=token_data["last_timestamp"],
    last_processed_id=token_data["last_id"],
    items_processed=token_data["items_done"]
)

# Resume the evaluation
result = client.run_batched_evaluation(
    scope="traces",
    mapper=my_mapper,
    evaluators=[evaluator1, evaluator2],
    resume_from=resume_token
)

print(f"Processed {result.total_items_processed} additional items")
```

Handling partial completion:

```python
result = client.run_batched_evaluation(...)

if not result.completed:
    print(f"Evaluation incomplete. Processed {result.resume_token.items_processed} items")
    print(f"Last item: {result.resume_token.last_processed_id}")
    print(f"Resume from: {result.resume_token.last_processed_timestamp}")

    # Optionally retry automatically
    if result.resume_token:
        print("Retrying...")
        result = client.run_batched_evaluation(
            scope=result.resume_token.scope,
            mapper=my_mapper,
            evaluators=my_evaluators,
            resume_from=result.resume_token
        )
```
Note:
All arguments must be passed as keywords when instantiating this class. The timestamp-based approach means that items created after the initial run but before the timestamp will be skipped. This is intentional to avoid duplicates and ensure consistent evaluation.
def __init__(
    self,
    *,
    scope: str,
    filter: Optional[str],
    last_processed_timestamp: str,
    last_processed_id: str,
    items_processed: int,
):
    """Record the state needed to resume an interrupted batch evaluation.

    Every parameter is keyword-only and required; ``filter`` may be ``None``.

    Args:
        scope: The scope type ("traces", "observations").
        filter: The original JSON filter string.
        last_processed_timestamp: ISO 8601 timestamp of last processed item.
        last_processed_id: ID of last processed item.
        items_processed: Count of items processed before interruption.
    """
    # What was being queried.
    self.scope, self.filter = scope, filter
    # Position of the last processed item.
    self.last_processed_timestamp, self.last_processed_id = (
        last_processed_timestamp,
        last_processed_id,
    )
    # Count of items processed before the interruption.
    self.items_processed = items_processed
Initialize BatchEvaluationResumeToken with the provided state.
Arguments:
- scope: The scope type ("traces", "observations").
- filter: The original JSON filter string.
- last_processed_timestamp: ISO 8601 timestamp of last processed item.
- last_processed_id: ID of last processed item.
- items_processed: Count of items processed before interruption.
Note:
All arguments must be provided as keywords.
class BatchEvaluationResult:
    r"""Complete result structure for batch evaluation execution.

    This class encapsulates comprehensive statistics and metadata about a batch
    evaluation run, including counts, evaluator-specific metrics, timing information,
    error details, and resume capability.

    Attributes:
        total_items_fetched: Total number of items fetched from the API.
        total_items_processed: Number of items successfully evaluated.
        total_items_failed: Number of items that failed during evaluation.
        total_scores_created: Total scores created by all item-level evaluators.
        total_composite_scores_created: Scores created by the composite evaluator.
        total_evaluations_failed: Number of individual evaluator failures across all items.
        evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
        resume_token: Token for resuming if evaluation was interrupted (None if completed).
        completed: True if all items were processed, False if stopped early or failed.
        duration_seconds: Total time taken to execute the batch evaluation.
        failed_item_ids: List of IDs for items that failed evaluation.
        error_summary: Dictionary mapping error types to occurrence counts.
        has_more_items: True if max_items limit was reached but more items exist.
        item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).

    Examples:
        Basic result inspection:
        ```python
        result = client.run_batched_evaluation(...)

        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
        print(f"Scores created: {result.total_scores_created}")
        print(f"Duration: {result.duration_seconds:.2f}s")
        print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
        ```

        Detailed analysis with evaluator stats:
        ```python
        result = client.run_batched_evaluation(...)

        print(f"\n📊 Batch Evaluation Results")
        print(f"{'='*50}")
        print(f"Items processed: {result.total_items_processed}")
        print(f"Items failed: {result.total_items_failed}")
        print(f"Scores created: {result.total_scores_created}")

        if result.total_composite_scores_created > 0:
            print(f"Composite scores: {result.total_composite_scores_created}")

        print(f"\n📈 Evaluator Performance:")
        for stats in result.evaluator_stats:
            success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
            print(f"\n  {stats.name}:")
            print(f"    Success rate: {success_rate:.1%}")
            print(f"    Scores created: {stats.total_scores_created}")
            if stats.failed_runs > 0:
                print(f"    ⚠️ Failures: {stats.failed_runs}")

        if result.error_summary:
            print(f"\n⚠️ Errors encountered:")
            for error_type, count in result.error_summary.items():
                print(f"  {error_type}: {count}")
        ```

        Handling incomplete runs:
        ```python
        result = client.run_batched_evaluation(...)

        if not result.completed:
            print("⚠️ Evaluation incomplete!")

            if result.resume_token:
                print(f"Processed {result.resume_token.items_processed} items before failure")
                print(f"Use resume_from parameter to continue from:")
                print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
                print(f"  Last ID: {result.resume_token.last_processed_id}")

        if result.has_more_items:
            print(f"ℹ️ More items available beyond max_items limit")
        ```

        Performance monitoring:
        ```python
        result = client.run_batched_evaluation(...)

        items_per_second = result.total_items_processed / result.duration_seconds
        avg_scores_per_item = result.total_scores_created / result.total_items_processed

        print(f"Performance metrics:")
        print(f"  Throughput: {items_per_second:.2f} items/second")
        print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
        print(f"  Total duration: {result.duration_seconds:.2f}s")

        if result.total_evaluations_failed > 0:
            failure_rate = result.total_evaluations_failed / (
                result.total_items_processed * len(result.evaluator_stats)
            )
            print(f"  Evaluation failure rate: {failure_rate:.1%}")
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    def __init__(
        self,
        *,
        total_items_fetched: int,
        total_items_processed: int,
        total_items_failed: int,
        total_scores_created: int,
        total_composite_scores_created: int,
        total_evaluations_failed: int,
        evaluator_stats: List[EvaluatorStats],
        resume_token: Optional[BatchEvaluationResumeToken],
        completed: bool,
        duration_seconds: float,
        failed_item_ids: List[str],
        error_summary: Dict[str, int],
        has_more_items: bool,
        item_evaluations: Dict[str, List["Evaluation"]],
    ):
        """Initialize BatchEvaluationResult with comprehensive statistics.

        Args:
            total_items_fetched: Total items fetched from API.
            total_items_processed: Items successfully evaluated.
            total_items_failed: Items that failed evaluation.
            total_scores_created: Scores from item-level evaluators.
            total_composite_scores_created: Scores from composite evaluator.
            total_evaluations_failed: Individual evaluator failures.
            evaluator_stats: Per-evaluator statistics.
            resume_token: Token for resuming (None if completed).
            completed: Whether all items were processed.
            duration_seconds: Total execution time.
            failed_item_ids: IDs of failed items.
            error_summary: Error types and counts.
            has_more_items: Whether more items exist beyond max_items.
            item_evaluations: Dictionary mapping item IDs to their evaluation results.

        Note:
            All arguments must be provided as keywords.
        """
        self.total_items_fetched = total_items_fetched
        self.total_items_processed = total_items_processed
        self.total_items_failed = total_items_failed
        self.total_scores_created = total_scores_created
        self.total_composite_scores_created = total_composite_scores_created
        self.total_evaluations_failed = total_evaluations_failed
        self.evaluator_stats = evaluator_stats
        self.resume_token = resume_token
        self.completed = completed
        self.duration_seconds = duration_seconds
        self.failed_item_ids = failed_item_ids
        self.error_summary = error_summary
        self.has_more_items = has_more_items
        self.item_evaluations = item_evaluations

    def __str__(self) -> str:
        """Return a formatted string representation of the batch evaluation results.

        Sections that have nothing to report (no failed items, no composite
        scores, no evaluator stats, no errors, ...) are omitted, and every
        ratio is only computed when its denominator is positive.

        Returns:
            A multi-line string with a summary of the evaluation results.
        """
        lines = []
        lines.append("=" * 60)
        lines.append("Batch Evaluation Results")
        lines.append("=" * 60)

        # Summary statistics
        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
        lines.append(f"Duration: {self.duration_seconds:.2f}s")
        lines.append(f"\nItems fetched: {self.total_items_fetched}")
        lines.append(f"Items processed: {self.total_items_processed}")

        if self.total_items_failed > 0:
            lines.append(f"Items failed: {self.total_items_failed}")

        # Success rate
        if self.total_items_fetched > 0:
            success_rate = self.total_items_processed / self.total_items_fetched * 100
            lines.append(f"Success rate: {success_rate:.1f}%")

        # Scores created
        lines.append(f"\nScores created: {self.total_scores_created}")
        if self.total_composite_scores_created > 0:
            lines.append(f"Composite scores: {self.total_composite_scores_created}")

        total_scores = self.total_scores_created + self.total_composite_scores_created
        lines.append(f"Total scores: {total_scores}")

        # Evaluator statistics
        if self.evaluator_stats:
            lines.append("\nEvaluator Performance:")
            for stats in self.evaluator_stats:
                lines.append(f"  {stats.name}:")
                if stats.total_runs > 0:
                    # The enclosing check guarantees a non-zero denominator,
                    # so no fallback value is needed here.
                    success_rate = stats.successful_runs / stats.total_runs * 100
                    lines.append(
                        f"    Runs: {stats.successful_runs}/{stats.total_runs} "
                        f"({success_rate:.1f}% success)"
                    )
                    lines.append(f"    Scores created: {stats.total_scores_created}")
                    if stats.failed_runs > 0:
                        lines.append(f"    Failed runs: {stats.failed_runs}")

        # Performance metrics
        if self.total_items_processed > 0 and self.duration_seconds > 0:
            items_per_sec = self.total_items_processed / self.duration_seconds
            lines.append("\nPerformance:")
            lines.append(f"  Throughput: {items_per_sec:.2f} items/second")
            if self.total_scores_created > 0:
                avg_scores = self.total_scores_created / self.total_items_processed
                lines.append(f"  Avg scores per item: {avg_scores:.2f}")

        # Errors and warnings
        if self.error_summary:
            lines.append("\nErrors encountered:")
            for error_type, count in self.error_summary.items():
                lines.append(f"  {error_type}: {count}")

        # Incomplete run information
        if not self.completed:
            lines.append("\nWarning: Evaluation incomplete")
            if self.resume_token:
                lines.append(
                    f"  Last processed: {self.resume_token.last_processed_timestamp}"
                )
                lines.append(f"  Items processed: {self.resume_token.items_processed}")
                lines.append("  Use resume_from parameter to continue")

        if self.has_more_items:
            lines.append("\nNote: More items available beyond max_items limit")

        lines.append("=" * 60)
        return "\n".join(lines)
Complete result structure for batch evaluation execution.
This class encapsulates comprehensive statistics and metadata about a batch evaluation run, including counts, evaluator-specific metrics, timing information, error details, and resume capability.
Attributes:
- total_items_fetched: Total number of items fetched from the API.
- total_items_processed: Number of items successfully evaluated.
- total_items_failed: Number of items that failed during evaluation.
- total_scores_created: Total scores created by all item-level evaluators.
- total_composite_scores_created: Scores created by the composite evaluator.
- total_evaluations_failed: Number of individual evaluator failures across all items.
- evaluator_stats: List of per-evaluator statistics (success/failure rates, scores created).
- resume_token: Token for resuming if evaluation was interrupted (None if completed).
- completed: True if all items were processed, False if stopped early or failed.
- duration_seconds: Total time taken to execute the batch evaluation.
- failed_item_ids: List of IDs for items that failed evaluation.
- error_summary: Dictionary mapping error types to occurrence counts.
- has_more_items: True if max_items limit was reached but more items exist.
- item_evaluations: Dictionary mapping item IDs to their evaluation results (both regular and composite).
Examples:
Basic result inspection:
```python
result = client.run_batched_evaluation(...)

print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
print(f"Scores created: {result.total_scores_created}")
print(f"Duration: {result.duration_seconds:.2f}s")
print(f"Success rate: {result.total_items_processed / result.total_items_fetched:.1%}")
```

Detailed analysis with evaluator stats:

```python
result = client.run_batched_evaluation(...)

print(f"\n📊 Batch Evaluation Results")
print(f"{'='*50}")
print(f"Items processed: {result.total_items_processed}")
print(f"Items failed: {result.total_items_failed}")
print(f"Scores created: {result.total_scores_created}")

if result.total_composite_scores_created > 0:
    print(f"Composite scores: {result.total_composite_scores_created}")

print(f"\n📈 Evaluator Performance:")
for stats in result.evaluator_stats:
    success_rate = stats.successful_runs / stats.total_runs if stats.total_runs > 0 else 0
    print(f"\n  {stats.name}:")
    print(f"    Success rate: {success_rate:.1%}")
    print(f"    Scores created: {stats.total_scores_created}")
    if stats.failed_runs > 0:
        print(f"    ⚠️ Failures: {stats.failed_runs}")

if result.error_summary:
    print(f"\n⚠️ Errors encountered:")
    for error_type, count in result.error_summary.items():
        print(f"  {error_type}: {count}")
```

Handling incomplete runs:

```python
result = client.run_batched_evaluation(...)

if not result.completed:
    print("⚠️ Evaluation incomplete!")

    if result.resume_token:
        print(f"Processed {result.resume_token.items_processed} items before failure")
        print(f"Use resume_from parameter to continue from:")
        print(f"  Timestamp: {result.resume_token.last_processed_timestamp}")
        print(f"  Last ID: {result.resume_token.last_processed_id}")

if result.has_more_items:
    print(f"ℹ️ More items available beyond max_items limit")
```

Performance monitoring:

```python
result = client.run_batched_evaluation(...)

items_per_second = result.total_items_processed / result.duration_seconds
avg_scores_per_item = result.total_scores_created / result.total_items_processed

print(f"Performance metrics:")
print(f"  Throughput: {items_per_second:.2f} items/second")
print(f"  Avg scores/item: {avg_scores_per_item:.2f}")
print(f"  Total duration: {result.duration_seconds:.2f}s")

if result.total_evaluations_failed > 0:
    failure_rate = result.total_evaluations_failed / (
        result.total_items_processed * len(result.evaluator_stats)
    )
    print(f"  Evaluation failure rate: {failure_rate:.1%}")
```
Note:
All arguments must be passed as keywords when instantiating this class.
def __init__(
    self,
    *,
    total_items_fetched: int,
    total_items_processed: int,
    total_items_failed: int,
    total_scores_created: int,
    total_composite_scores_created: int,
    total_evaluations_failed: int,
    evaluator_stats: List[EvaluatorStats],
    resume_token: Optional[BatchEvaluationResumeToken],
    completed: bool,
    duration_seconds: float,
    failed_item_ids: List[str],
    error_summary: Dict[str, int],
    has_more_items: bool,
    item_evaluations: Dict[str, List["Evaluation"]],
):
    """Store every statistic of a batch evaluation run on the new instance.

    Every parameter is keyword-only and required; each is stored unchanged
    under an attribute of the same name.

    Args:
        total_items_fetched: Total items fetched from API.
        total_items_processed: Items successfully evaluated.
        total_items_failed: Items that failed evaluation.
        total_scores_created: Scores from item-level evaluators.
        total_composite_scores_created: Scores from composite evaluator.
        total_evaluations_failed: Individual evaluator failures.
        evaluator_stats: Per-evaluator statistics.
        resume_token: Token for resuming (None if completed).
        completed: Whether all items were processed.
        duration_seconds: Total execution time.
        failed_item_ids: IDs of failed items.
        error_summary: Error types and counts.
        has_more_items: Whether more items exist beyond max_items.
        item_evaluations: Dictionary mapping item IDs to their evaluation results.
    """
    # Item-level counters.
    (
        self.total_items_fetched,
        self.total_items_processed,
        self.total_items_failed,
    ) = (total_items_fetched, total_items_processed, total_items_failed)

    # Score and evaluator-failure counters.
    (
        self.total_scores_created,
        self.total_composite_scores_created,
        self.total_evaluations_failed,
    ) = (
        total_scores_created,
        total_composite_scores_created,
        total_evaluations_failed,
    )

    # Per-evaluator breakdown and resume state.
    self.evaluator_stats = evaluator_stats
    self.resume_token = resume_token

    # Completion flag and timing.
    self.completed = completed
    self.duration_seconds = duration_seconds

    # Failure details.
    self.failed_item_ids = failed_item_ids
    self.error_summary = error_summary

    # Pagination hint and the per-item evaluation results.
    self.has_more_items = has_more_items
    self.item_evaluations = item_evaluations
Initialize BatchEvaluationResult with comprehensive statistics.
Arguments:
- total_items_fetched: Total items fetched from API.
- total_items_processed: Items successfully evaluated.
- total_items_failed: Items that failed evaluation.
- total_scores_created: Scores from item-level evaluators.
- total_composite_scores_created: Scores from composite evaluator.
- total_evaluations_failed: Individual evaluator failures.
- evaluator_stats: Per-evaluator statistics.
- resume_token: Token for resuming (None if completed).
- completed: Whether all items were processed.
- duration_seconds: Total execution time.
- failed_item_ids: IDs of failed items.
- error_summary: Error types and counts.
- has_more_items: Whether more items exist beyond max_items.
- item_evaluations: Dictionary mapping item IDs to their evaluation results.
Note:
All arguments must be provided as keywords.
class RunnerContext:
    """Wraps :meth:`Langfuse.run_experiment` with CI-injected defaults.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action builds a
    ``RunnerContext`` before invoking the user's ``experiment(context)``
    function. Defaults set here (dataset, metadata tags) are applied when
    the user omits them on the :meth:`run_experiment` call; users can
    override any default by passing the corresponding argument explicitly.
    """

    def __init__(
        self,
        *,
        client: "Langfuse",
        data: Optional[ExperimentData] = None,
        dataset_version: Optional[datetime] = None,
        metadata: Optional[Dict[str, str]] = None,
    ):
        """Build a ``RunnerContext`` populated with defaults for ``run_experiment``.

        Typically called by the ``langfuse/experiment-action`` GitHub Action,
        not by end users directly. Every field except ``client`` is optional:
        fields left as ``None`` simply mean the corresponding argument must be
        supplied on the :meth:`run_experiment` call.

        Args:
            client: Initialized Langfuse SDK client used to execute the
                experiment. The action creates this from the
                ``langfuse_public_key`` / ``langfuse_secret_key`` /
                ``langfuse_base_url`` inputs.
            data: Default dataset items to run the experiment on. Accepts
                either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``.
                Injected by the action when ``dataset_name`` is configured.
                If ``None``, the user must pass ``data=`` to
                :meth:`run_experiment`.
            dataset_version: Optional pinned dataset version. Injected by the
                action when ``dataset_version`` is configured.
            metadata: Default metadata attached to every experiment trace and
                the dataset run. The action injects GitHub-sourced tags (SHA,
                PR link, workflow run link, branch, GH user, etc.). Merged
                with any ``metadata`` passed to :meth:`run_experiment`, with
                user-supplied keys winning on collision.
        """
        self.client = client
        self.data = data
        self.dataset_version = dataset_version
        self.metadata = metadata

    def run_experiment(
        self,
        *,
        name: str,
        run_name: Optional[str] = None,
        description: Optional[str] = None,
        data: Optional[ExperimentData] = None,
        task: TaskFunction,
        evaluators: Optional[List[EvaluatorFunction]] = None,
        composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
        run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
        max_concurrency: int = 50,
        metadata: Optional[Dict[str, str]] = None,
        _dataset_version: Optional[datetime] = None,
    ) -> ExperimentResult:
        """Run an experiment on the wrapped client, applying context defaults.

        ``name``, ``run_name``, ``description``, ``task``,
        ``composite_evaluator`` and ``max_concurrency`` are forwarded to
        ``client.run_experiment`` unchanged.

        Args:
            data: Items to run on. When ``None``, the context's ``data`` is
                used instead.
            evaluators: Item-level evaluators. ``None`` (the default) is
                forwarded as a fresh empty list.
            run_evaluators: Run-level evaluators. ``None`` (the default) is
                forwarded as a fresh empty list.
            metadata: Merged on top of the context's ``metadata``; keys given
                here win on collision. ``None`` is forwarded only when
                neither the context nor the call supplies metadata.
            _dataset_version: When not ``None``, overrides the context's
                ``dataset_version``.

        Returns:
            The value returned by ``client.run_experiment``.

        Raises:
            ValueError: If neither the call nor the context supplies ``data``.
        """
        resolved_data = data if data is not None else self.data
        if resolved_data is None:
            raise ValueError(
                "`data` must be provided either on the RunnerContext or the run_experiment call"
            )

        resolved_dataset_version = (
            _dataset_version if _dataset_version is not None else self.dataset_version
        )

        merged_metadata: Optional[Dict[str, str]]
        if self.metadata is None and metadata is None:
            merged_metadata = None
        else:
            # Call-site keys are unpacked last so they override context keys.
            merged_metadata = {**(self.metadata or {}), **(metadata or {})}

        # Build a new list per call so that no list object is shared between
        # calls (a `[]` default would be one object reused by every call).
        resolved_evaluators = [] if evaluators is None else evaluators
        resolved_run_evaluators = [] if run_evaluators is None else run_evaluators

        return self.client.run_experiment(
            name=name,
            run_name=run_name,
            description=description,
            data=resolved_data,
            task=task,
            evaluators=resolved_evaluators,
            composite_evaluator=composite_evaluator,
            run_evaluators=resolved_run_evaluators,
            max_concurrency=max_concurrency,
            metadata=merged_metadata,
            _dataset_version=resolved_dataset_version,
        )
Wraps Langfuse.run_experiment() with CI-injected defaults.
Intended for use with the langfuse/experiment-action GitHub Action
(https://github.com/langfuse/experiment-action). The action builds a
RunnerContext before invoking the user's experiment(context)
function. Defaults set here (dataset, metadata tags) are applied when
the user omits them on the run_experiment() call; users can
override any default by passing the corresponding argument explicitly.
1073 def __init__( 1074 self, 1075 *, 1076 client: "Langfuse", 1077 data: Optional[ExperimentData] = None, 1078 dataset_version: Optional[datetime] = None, 1079 metadata: Optional[Dict[str, str]] = None, 1080 ): 1081 """Build a ``RunnerContext`` populated with defaults for ``run_experiment``. 1082 1083 Typically called by the ``langfuse/experiment-action`` GitHub Action, 1084 not by end users directly. Every field except ``client`` is optional: 1085 fields left as ``None`` simply mean the corresponding argument must be 1086 supplied on the :meth:`run_experiment` call. 1087 1088 Args: 1089 client: Initialized Langfuse SDK client used to execute the 1090 experiment. The action creates this from the 1091 ``langfuse_public_key`` / ``langfuse_secret_key`` / 1092 ``langfuse_base_url`` inputs. 1093 data: Default dataset items to run the experiment on. Accepts 1094 either ``List[LocalExperimentItem]`` or ``List[DatasetItem]``. 1095 Injected by the action when ``dataset_name`` is configured. 1096 If ``None``, the user must pass ``data=`` to 1097 :meth:`run_experiment`. 1098 dataset_version: Optional pinned dataset version. Injected by the 1099 action when ``dataset_version`` is configured. 1100 metadata: Default metadata attached to every experiment trace and 1101 the dataset run. The action injects GitHub-sourced tags (SHA, 1102 PR link, workflow run link, branch, GH user, etc.). Merged 1103 with any ``metadata`` passed to :meth:`run_experiment`, with 1104 user-supplied keys winning on collision. 1105 """ 1106 self.client = client 1107 self.data = data 1108 self.dataset_version = dataset_version 1109 self.metadata = metadata
Build a RunnerContext populated with defaults for run_experiment.
Typically called by the langfuse/experiment-action GitHub Action,
not by end users directly. Every field except client is optional:
fields left as None simply mean the corresponding argument must be
supplied on the run_experiment() call.
Arguments:
- client: Initialized Langfuse SDK client used to execute the experiment. The action creates this from the `langfuse_public_key` / `langfuse_secret_key` / `langfuse_base_url` inputs.
- data: Default dataset items to run the experiment on. Accepts either `List[LocalExperimentItem]` or `List[DatasetItem]`. Injected by the action when `dataset_name` is configured. If `None`, the user must pass `data=` to `run_experiment()`.
- dataset_version: Optional pinned dataset version. Injected by the action when `dataset_version` is configured.
- metadata: Default metadata attached to every experiment trace and the dataset run. The action injects GitHub-sourced tags (SHA, PR link, workflow run link, branch, GH user, etc.). Merged with any `metadata` passed to `run_experiment()`, with user-supplied keys winning on collision.
def run_experiment(
    self,
    *,
    name: str,
    run_name: Optional[str] = None,
    description: Optional[str] = None,
    data: Optional[ExperimentData] = None,
    task: TaskFunction,
    evaluators: Optional[List[EvaluatorFunction]] = None,
    composite_evaluator: Optional["CompositeEvaluatorFunction"] = None,
    run_evaluators: Optional[List[RunEvaluatorFunction]] = None,
    max_concurrency: int = 50,
    metadata: Optional[Dict[str, str]] = None,
    _dataset_version: Optional[datetime] = None,
) -> ExperimentResult:
    """Run an experiment on the wrapped client, applying context defaults.

    ``name``, ``run_name``, ``description``, ``task``,
    ``composite_evaluator`` and ``max_concurrency`` are forwarded to
    ``client.run_experiment`` unchanged.

    Args:
        data: Items to run on. When ``None``, the context's ``data`` is
            used instead.
        evaluators: Item-level evaluators. ``None`` (the default) is
            forwarded as a fresh empty list.
        run_evaluators: Run-level evaluators. ``None`` (the default) is
            forwarded as a fresh empty list.
        metadata: Merged on top of the context's ``metadata``; keys given
            here win on collision. ``None`` is forwarded only when neither
            the context nor the call supplies metadata.
        _dataset_version: When not ``None``, overrides the context's
            ``dataset_version``.

    Returns:
        The value returned by ``client.run_experiment``.

    Raises:
        ValueError: If neither the call nor the context supplies ``data``.
    """
    resolved_data = data if data is not None else self.data
    if resolved_data is None:
        raise ValueError(
            "`data` must be provided either on the RunnerContext or the run_experiment call"
        )

    resolved_dataset_version = (
        _dataset_version if _dataset_version is not None else self.dataset_version
    )

    merged_metadata: Optional[Dict[str, str]]
    if self.metadata is None and metadata is None:
        merged_metadata = None
    else:
        # Call-site keys are unpacked last so they override context keys.
        merged_metadata = {**(self.metadata or {}), **(metadata or {})}

    # Build a new list per call so that no list object is shared between
    # calls (a `[]` default would be one object reused by every call).
    resolved_evaluators = [] if evaluators is None else evaluators
    resolved_run_evaluators = [] if run_evaluators is None else run_evaluators

    return self.client.run_experiment(
        name=name,
        run_name=run_name,
        description=description,
        data=resolved_data,
        task=task,
        evaluators=resolved_evaluators,
        composite_evaluator=composite_evaluator,
        run_evaluators=resolved_run_evaluators,
        max_concurrency=max_concurrency,
        metadata=merged_metadata,
        _dataset_version=resolved_dataset_version,
    )
class RegressionError(Exception):
    """Raised by a user's ``experiment`` function to signal a CI gate failure.

    Intended for use with the ``langfuse/experiment-action`` GitHub Action
    (https://github.com/langfuse/experiment-action). The action catches this
    exception and, when ``should_fail_on_error`` is enabled, fails the
    workflow run and renders a callout in the PR comment using
    ``metric``/``value``/``threshold`` if supplied, otherwise ``str(exc)``.

    Callers choose one of three forms:

    - ``RegressionError(result=r)`` — minimal, generic message.
    - ``RegressionError(result=r, message="...")`` — free-form message.
    - ``RegressionError(result=r, metric="acc", value=0.7, threshold=0.9)`` —
      structured; ``metric`` and ``value`` must be provided together so the
      action can render a targeted callout without ``None`` placeholders.

    Attributes:
        result: The experiment result passed to the constructor.
        metric: Name of the regressed metric, or ``None``.
        value: Observed metric value, or ``None``.
        threshold: Threshold the value was compared against, or ``None``.
    """

    @overload
    def __init__(self, *, result: ExperimentResult) -> None: ...
    @overload
    def __init__(self, *, result: ExperimentResult, message: str) -> None: ...
    @overload
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: str,
        value: float,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ) -> None: ...
    def __init__(
        self,
        *,
        result: ExperimentResult,
        metric: Optional[str] = None,
        value: Optional[float] = None,
        threshold: Optional[float] = None,
        message: Optional[str] = None,
    ):
        """Store the regression details and build the exception message.

        The message is ``message`` when given. Otherwise, when both
        ``metric`` and ``value`` are given, it is a structured summary that
        mentions the threshold only if one was supplied. In every other case
        a generic fallback message is used.
        """
        self.result = result
        self.metric = metric
        self.value = value
        self.threshold = threshold
        if message is not None:
            formatted = message
        elif metric is not None and value is not None:
            formatted = f"Regression on `{metric}`: {value}"
            # `threshold` is optional in the structured form; leave it out
            # rather than printing "(threshold None)".
            if threshold is not None:
                formatted += f" (threshold {threshold})"
        else:
            formatted = "Experiment regression detected"
        super().__init__(formatted)
Raised by a user's experiment function to signal a CI gate failure.
Intended for use with the langfuse/experiment-action GitHub Action
(https://github.com/langfuse/experiment-action). The action catches this
exception and, when should_fail_on_error is enabled, fails the
workflow run and renders a callout in the PR comment using
metric/value/threshold if supplied, otherwise str(exc).
Callers choose one of three forms:
- `RegressionError(result=r)` — minimal, generic message.
- `RegressionError(result=r, message="...")` — free-form message.
- `RegressionError(result=r, metric="acc", value=0.7, threshold=0.9)` — structured; `metric` and `value` must be provided together so the action can render a targeted callout without `None` placeholders.
1189 def __init__( 1190 self, 1191 *, 1192 result: ExperimentResult, 1193 metric: Optional[str] = None, 1194 value: Optional[float] = None, 1195 threshold: Optional[float] = None, 1196 message: Optional[str] = None, 1197 ): 1198 self.result = result 1199 self.metric = metric 1200 self.value = value 1201 self.threshold = threshold 1202 if message is not None: 1203 formatted = message 1204 elif metric is not None and value is not None: 1205 formatted = f"Regression on `{metric}`: {value} (threshold {threshold})" 1206 else: 1207 formatted = "Experiment regression detected" 1208 super().__init__(formatted)
def is_default_export_span(span: ReadableSpan) -> bool:
    """Tell whether ``span`` is exported by default.

    A span qualifies when at least one of the three predicates below accepts
    it. They are tried in order and evaluation stops at the first match.
    """
    predicates = (is_langfuse_span, is_genai_span, is_known_llm_instrumentor)
    return any(predicate(span) for predicate in predicates)
Return whether a span should be exported by default.
def is_langfuse_span(span: ReadableSpan) -> bool:
    """Tell whether ``span`` was created by the Langfuse SDK tracer.

    The check compares the name of the span's instrumentation scope with
    ``LANGFUSE_TRACER_NAME``; a span without a scope never matches.
    """
    if span.instrumentation_scope is None:
        return False

    return span.instrumentation_scope.name == LANGFUSE_TRACER_NAME
Return whether the span was created by the Langfuse SDK tracer.
def is_genai_span(span: ReadableSpan) -> bool:
    """Tell whether ``span`` carries any ``gen_ai.*`` semantic convention attribute.

    Only string keys are considered, and a key matches when it starts with
    the literal prefix ``gen_ai``. A span whose ``attributes`` is ``None``
    never matches.
    """
    attributes = span.attributes
    if attributes is None:
        return False

    for key in attributes.keys():
        if isinstance(key, str) and key.startswith("gen_ai"):
            return True
    return False
Return whether the span has any gen_ai.* semantic convention attribute.
def is_known_llm_instrumentor(span: ReadableSpan) -> bool:
    """Tell whether ``span`` comes from a known LLM instrumentation scope.

    The span's scope name is tested against every entry of
    ``KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES`` with
    ``_matches_scope_prefix``. A span without a scope never matches.
    """
    if span.instrumentation_scope is None:
        return False

    name = span.instrumentation_scope.name
    for known_prefix in KNOWN_LLM_INSTRUMENTATION_SCOPE_PREFIXES:
        if _matches_scope_prefix(name, known_prefix):
            return True
    return False
Return whether the span comes from a known LLM instrumentation scope.
class MaskOtelSpansFunction(Protocol):
    """Callable protocol for masking OpenTelemetry spans at export time.

    A `mask_otel_spans` callback runs once Langfuse has decided which spans
    this client should export, and after export-stage media handling has
    converted supported media payloads into Langfuse media references. Only
    the spans exported by this Langfuse client are affected: if the same
    OpenTelemetry spans go to another exporter, that exporter receives its
    own unmodified copy.

    The callback is synchronous. It usually executes on the OpenTelemetry
    batch span processor worker thread, but during `flush()` and shutdown it
    may execute on the caller thread. Keep it fast and deterministic, and do
    not rely on request locals, the current active span, or async I/O.

    Returning `None` leaves the whole batch unchanged. Otherwise return a
    `MaskOtelSpansResult` holding sparse patches for the spans that should
    change.

    Example:
        ```python
        from typing import Optional

        from langfuse import Langfuse
        from langfuse.types import (
            MaskOtelSpansParams,
            MaskOtelSpansResult,
            OtelSpanPatch,
        )

        def mask_otel_spans(
            *, params: MaskOtelSpansParams
        ) -> Optional[MaskOtelSpansResult]:
            patches = {}

            for identifier, span in params.spans.items():
                if span.instrumentation_scope_name == "openai":
                    patches[identifier] = OtelSpanPatch(
                        delete_attributes=(
                            "gen_ai.prompt.0.content",
                            "gen_ai.completion.0.content",
                        ),
                        set_attributes={"masking.applied": True},
                    )

            return MaskOtelSpansResult(span_patches=patches)

        langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
        ```
    """

    def __call__(
        self, *, params: MaskOtelSpansParams
    ) -> Optional[MaskOtelSpansResult]: ...
Function protocol for export-stage OpenTelemetry span masking.
mask_otel_spans runs after Langfuse decides which spans this client should
export and after export-stage media handling has converted supported media
payloads into Langfuse media references. It affects only the spans exported
by this Langfuse client. If the same OpenTelemetry spans are sent to another
exporter, that exporter receives its own unmodified copy.
The function is synchronous. It usually runs on the OpenTelemetry batch span
processor worker thread; during flush() and shutdown it may run on the
caller thread. Keep it deterministic and fast, and avoid relying on request
locals, the current active span, or async I/O.
Return None to leave the whole batch unchanged, or return
MaskOtelSpansResult with sparse patches for the spans that should change.
Example:
```python
from typing import Optional

from langfuse import Langfuse
from langfuse.types import (
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanPatch,
)

def mask_otel_spans(
    *, params: MaskOtelSpansParams
) -> Optional[MaskOtelSpansResult]:
    patches = {}

    for identifier, span in params.spans.items():
        if span.instrumentation_scope_name == "openai":
            patches[identifier] = OtelSpanPatch(
                delete_attributes=(
                    "gen_ai.prompt.0.content",
                    "gen_ai.completion.0.content",
                ),
                set_attributes={"masking.applied": True},
            )

    return MaskOtelSpansResult(span_patches=patches)

langfuse = Langfuse(mask_otel_spans=mask_otel_spans)
```
def _no_init_or_replace_init(self, *args, **kwargs):
    """Placeholder ``__init__`` that rejects protocols or installs the real one.

    Raises ``TypeError`` when the class of ``self`` is flagged as a protocol.
    Otherwise, the first time it runs for a class it looks up an ``__init__``
    in the MRO, rebinds ``cls.__init__`` to it and calls it with the
    original arguments.
    """
    cls = type(self)

    if cls._is_protocol:
        raise TypeError('Protocols cannot be instantiated')

    # Already using a custom `__init__`. No need to calculate correct
    # `__init__` to call. This can lead to RecursionError. See bpo-45121.
    if cls.__init__ is not _no_init_or_replace_init:
        return

    # Initially, `__init__` of a protocol subclass is set to `_no_init_or_replace_init`.
    # The first instantiation of the subclass will call `_no_init_or_replace_init` which
    # searches for a proper new `__init__` in the MRO. The new `__init__`
    # replaces the subclass' old `__init__` (i.e. `_no_init_or_replace_init`). Subsequent
    # instantiation of the protocol subclass will thus use the new
    # `__init__` and no longer call `_no_init_or_replace_init`.
    for base in cls.__mro__:
        # Only an `__init__` defined directly on `base` counts; a base that
        # defines none yields this placeholder and is skipped.
        init = base.__dict__.get('__init__', _no_init_or_replace_init)
        if init is not _no_init_or_replace_init:
            cls.__init__ = init
            break
    else:
        # should not happen
        cls.__init__ = object.__init__

    # Delegate to the `__init__` that was just installed on the class.
    cls.__init__(self, *args, **kwargs)
@dataclass(frozen=True)
class MaskOtelSpansParams:
    """Argument object handed to an export-stage span masking function.

    Each call covers one OpenTelemetry export batch, which is not
    necessarily a complete trace, request, or Langfuse observation tree.
    What ends up in a batch depends on OpenTelemetry span processor settings
    such as `flush_at` and `flush_interval`, on explicit `flush()` calls,
    and on shutdown.

    Example:
        ```python
        from typing import Optional

        from langfuse.types import (
            MaskOtelSpansParams,
            MaskOtelSpansResult,
            OtelSpanPatch,
        )

        def mask_otel_spans(
            *, params: MaskOtelSpansParams
        ) -> Optional[MaskOtelSpansResult]:
            patches = {}

            for identifier, span in params.spans.items():
                if "http.request.header.authorization" in span.attributes:
                    patches[identifier] = OtelSpanPatch(
                        delete_attributes=("http.request.header.authorization",),
                        set_attributes={"security.redacted": True},
                    )

            return MaskOtelSpansResult(span_patches=patches)
        ```

    Attributes:
        spans: Read-only mapping from stable span identifiers to span
            snapshots. Use the keys of this mapping when returning patches.
    """

    spans: Mapping[OtelSpanIdentifier, OtelSpanData]
Input passed to an export-stage OpenTelemetry span masking function.
A single call receives one OpenTelemetry export batch, not necessarily a
complete trace, request, or Langfuse observation tree. Batch contents depend
on OpenTelemetry span processor settings such as flush_at,
flush_interval, explicit flush(), and shutdown.
Example:
```python
from typing import Optional

from langfuse.types import (
    MaskOtelSpansParams,
    MaskOtelSpansResult,
    OtelSpanPatch,
)

def mask_otel_spans(
    *, params: MaskOtelSpansParams
) -> Optional[MaskOtelSpansResult]:
    patches = {}

    for identifier, span in params.spans.items():
        if "http.request.header.authorization" in span.attributes:
            patches[identifier] = OtelSpanPatch(
                delete_attributes=("http.request.header.authorization",),
                set_attributes={"security.redacted": True},
            )

    return MaskOtelSpansResult(span_patches=patches)
```
Attributes:
- spans: Read-only mapping from stable span identifiers to span snapshots. Return patches using keys from this mapping.
@dataclass(frozen=True)
class MaskOtelSpansResult:
    """Return value of a `mask_otel_spans` function: patches keyed by span.

    Leave out spans that need no changes; mapping a span to `None` also
    keeps it unchanged. Returning an invalid patch in order to drop a span
    is not a supported API. Use `should_export_span` for span-level export
    filtering instead.

    When `mask_otel_spans` raises, or returns something that is not a
    `MaskOtelSpansResult`, Langfuse drops the whole export batch. When a
    single `OtelSpanPatch` is invalid, Langfuse drops only that span from
    the export batch.

    Attributes:
        span_patches: Mapping from identifiers in `MaskOtelSpansParams.spans`
            to sparse attribute patches. Defaults to an empty read-only
            mapping.
    """

    span_patches: Mapping[OtelSpanIdentifier, Optional[OtelSpanPatch]] = field(
        default_factory=lambda: MappingProxyType({})
    )
Patches returned by a mask_otel_spans function.
Omit spans that do not need changes. A mapping value of None also leaves
that span unchanged. Returning an invalid patch to drop a span is not a
supported API; use should_export_span when you need span-level export
filtering.
If mask_otel_spans raises or returns an object that is not a
MaskOtelSpansResult, Langfuse drops the whole export batch. If one
individual OtelSpanPatch is invalid, Langfuse drops only that span from
the export batch.
Attributes:
- span_patches: Mapping from identifiers in `MaskOtelSpansParams.spans` to sparse attribute patches.
@dataclass(frozen=True)
class OtelSpanData:
    """Immutable snapshot of one OpenTelemetry span, as seen by `mask_otel_spans`.

    The snapshot holds the span data Langfuse is about to export, taken
    after the SDK has applied `should_export_span` filtering and
    export-stage media processing. Its mappings are immutable views, and
    mutating them is not supported. To change exported attributes, return
    an `OtelSpanPatch`.

    `mask_otel_spans` can change span attributes only. The span name, IDs,
    parent relationship, resource attributes, events, links, and
    instrumentation scope cannot be changed.

    Attributes:
        trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
        span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
        parent_span_id: Lowercase hexadecimal parent span ID, or `None` for
            a root span or when the parent is not available.
        name: OpenTelemetry span name.
        instrumentation_scope_name: Name of the instrumentation scope that
            emitted the span, for example `openai` or `langfuse`.
        instrumentation_scope_version: Version of the instrumentation
            scope, if the instrumentation library provided one.
        attributes: Read-only attributes that will be exported unless
            patched. Values use OpenTelemetry `AttributeValue` types:
            strings, booleans, numbers, or homogeneous sequences of those
            scalar values.
        resource_attributes: Read-only resource attributes from the span's
            OpenTelemetry resource. Available for decisions only; they
            cannot be patched through `mask_otel_spans`.
    """

    trace_id: str
    span_id: str
    parent_span_id: Optional[str]
    name: str
    instrumentation_scope_name: Optional[str]
    instrumentation_scope_version: Optional[str]
    attributes: Mapping[str, AttributeValue]
    resource_attributes: Mapping[str, AttributeValue]
Read-only OpenTelemetry span snapshot passed to mask_otel_spans.
The snapshot contains the span data that Langfuse is about to export after
the SDK has applied should_export_span filtering and export-stage media
processing. The mappings are immutable views and mutating them is not
supported; return an OtelSpanPatch to change exported attributes.
mask_otel_spans can only change span attributes. It cannot change the
span name, IDs, parent relationship, resource attributes, events, links, or
instrumentation scope.
Attributes:
- trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
- span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
- parent_span_id: Lowercase hexadecimal parent span ID, or `None` for a root span or when the parent is not available.
- name: OpenTelemetry span name.
- instrumentation_scope_name: Name of the instrumentation scope that emitted the span, for example `openai` or `langfuse`.
- instrumentation_scope_version: Version of the instrumentation scope, if the instrumentation library provided one.
- attributes: Read-only attributes that will be exported unless patched. Values use OpenTelemetry `AttributeValue` types: strings, booleans, numbers, or homogeneous sequences of those scalar values.
- resource_attributes: Read-only resource attributes from the span's OpenTelemetry resource. These are available for decisions only and cannot be patched through `mask_otel_spans`.
@dataclass(frozen=True)
class OtelSpanIdentifier:
    """Stable key identifying one OpenTelemetry span within a masking batch.

    Return patches keyed by this object. Because it is a frozen, hashable
    dataclass, the safest pattern is to reuse the exact identifier object
    found in `MaskOtelSpansParams.spans` instead of rebuilding it.

    Attributes:
        trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
        span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
    """

    trace_id: str
    span_id: str
Stable key for one OpenTelemetry span in a masking batch.
Use this object as the key when returning a patch for a span. It is a
frozen, hashable dataclass, so the safest pattern is to reuse the exact
identifier object from MaskOtelSpansParams.spans instead of rebuilding it.
Attributes:
- trace_id: Lowercase 32-character hexadecimal OpenTelemetry trace ID.
- span_id: Lowercase 16-character hexadecimal OpenTelemetry span ID.
@dataclass(frozen=True)
class OtelSpanPatch:
    """Attribute edits for a single OpenTelemetry span, applied before export.

    A patch is sparse: list only the attributes that should change. Langfuse
    first removes `delete_attributes` and then applies `set_attributes`, so
    a key that appears in both is exported with its `set_attributes` value.

    Values must be valid OpenTelemetry attributes: strings, booleans,
    integers, floats, or homogeneous sequences of those scalar types. If a
    value is not valid for OpenTelemetry, Langfuse removes that attribute
    from the export rather than sending an invalid span.

    Example:
        ```python
        OtelSpanPatch(
            delete_attributes=("gen_ai.prompt.0.content",),
            set_attributes={
                "gen_ai.prompt.redacted": True,
                "app.masking.rule": "drop_prompt_text",
            },
        )
        ```

    Attributes:
        set_attributes: Attribute values to add or replace on the exported
            span. Defaults to an empty read-only mapping.
        delete_attributes: Attribute keys to remove from the exported span.
            Defaults to an empty tuple.
    """

    set_attributes: Mapping[str, AttributeValue] = field(
        default_factory=lambda: MappingProxyType({})
    )
    delete_attributes: Sequence[str] = field(default_factory=tuple)
Attribute changes to apply to one OpenTelemetry span before export.
Patches are sparse: include only the attributes that should change. Langfuse
deletes delete_attributes first and then applies set_attributes, so a key
present in both fields is exported with the value from set_attributes.
Attribute values must be valid OpenTelemetry attributes: strings, booleans, integers, floats, or homogeneous sequences of those scalar types. If one value is not valid for OpenTelemetry, Langfuse removes that attribute from the export rather than sending an invalid span.
Example:
```python
OtelSpanPatch(
    delete_attributes=("gen_ai.prompt.0.content",),
    set_attributes={
        "gen_ai.prompt.redacted": True,
        "app.masking.rule": "drop_prompt_text",
    },
)
```
Attributes:
- set_attributes: Attribute values to add or replace on the exported span.
- delete_attributes: Attribute keys to remove from the exported span.