[Observability][Bugfix] Fix expected an instance of Token, got None error in OpenTelemetry (#18934)

This commit is contained in:
AichiB7A 2025-04-28 10:31:13 +08:00 committed by GitHub
parent 7a62202392
commit 8299614e60
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 166 additions and 226 deletions

View File

@ -52,7 +52,6 @@ def initialize_extensions(app: DifyApp):
ext_mail,
ext_migrate,
ext_otel,
ext_otel_patch,
ext_proxy_fix,
ext_redis,
ext_repositories,
@ -85,7 +84,6 @@ def initialize_extensions(app: DifyApp):
ext_proxy_fix,
ext_blueprints,
ext_commands,
ext_otel_patch, # Apply patch before initializing OpenTelemetry
ext_otel,
]
for ext in extensions:

View File

@ -8,35 +8,77 @@ from typing import Union
from celery.signals import worker_init # type: ignore
from flask_login import user_loaded_from_request, user_logged_in # type: ignore
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.celery import CeleryInstrumentor
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.metrics import get_meter, get_meter_provider, set_meter_provider
from opentelemetry.propagate import set_global_textmap
from opentelemetry.propagators.b3 import B3Format
from opentelemetry.propagators.composite import CompositePropagator
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import ConsoleMetricExporter, PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (
BatchSpanProcessor,
ConsoleSpanExporter,
)
from opentelemetry.sdk.trace.sampling import ParentBasedTraceIdRatio
from opentelemetry.semconv.resource import ResourceAttributes
from opentelemetry.trace import Span, get_current_span, get_tracer_provider, set_tracer_provider
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
from opentelemetry.trace.status import StatusCode
from configs import dify_config
from dify_app import DifyApp
class ExceptionLoggingHandler(logging.Handler):
@user_logged_in.connect
@user_loaded_from_request.connect
def on_user_loaded(_sender, user):
if dify_config.ENABLE_OTEL:
from opentelemetry.trace import get_current_span
if user:
current_span = get_current_span()
if current_span:
current_span.set_attribute("service.tenant.id", user.current_tenant_id)
current_span.set_attribute("service.user.id", user.id)
def init_app(app: DifyApp):
def is_celery_worker():
return "celery" in sys.argv[0].lower()
def instrument_exception_logging():
exception_handler = ExceptionLoggingHandler()
logging.getLogger().addHandler(exception_handler)
def init_flask_instrumentor(app: DifyApp):
meter = get_meter("http_metrics", version=dify_config.CURRENT_VERSION)
_http_response_counter = meter.create_counter(
"http.server.response.count", description="Total number of HTTP responses by status code", unit="{response}"
)
def response_hook(span: Span, status: str, response_headers: list):
if span and span.is_recording():
if status.startswith("2"):
span.set_status(StatusCode.OK)
else:
span.set_status(StatusCode.ERROR, status)
status = status.split(" ")[0]
status_code = int(status)
status_class = f"{status_code // 100}xx"
_http_response_counter.add(1, {"status_code": status_code, "status_class": status_class})
instrumentor = FlaskInstrumentor()
if dify_config.DEBUG:
logging.info("Initializing Flask instrumentor")
instrumentor.instrument_app(app, response_hook=response_hook)
def init_sqlalchemy_instrumentor(app: DifyApp):
with app.app_context():
engines = list(app.extensions["sqlalchemy"].engines.values())
SQLAlchemyInstrumentor().instrument(enable_commenter=True, engines=engines)
def setup_context_propagation():
# Configure propagators
set_global_textmap(
CompositePropagator(
[
TraceContextTextMapPropagator(), # W3C trace context
B3Format(), # B3 propagation (used by many systems)
]
)
)
def shutdown_tracer():
provider = trace.get_tracer_provider()
if hasattr(provider, "force_flush"):
provider.force_flush()
class ExceptionLoggingHandler(logging.Handler):
"""Custom logging handler that creates spans for logging.exception() calls"""
def emit(self, record):
@ -60,19 +102,30 @@ class ExceptionLoggingHandler(logging.Handler):
except Exception:
pass
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.celery import CeleryInstrumentor
from opentelemetry.instrumentation.flask import FlaskInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.metrics import get_meter, get_meter_provider, set_meter_provider
from opentelemetry.propagate import set_global_textmap
from opentelemetry.propagators.b3 import B3Format
from opentelemetry.propagators.composite import CompositePropagator
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import ConsoleMetricExporter, PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (
BatchSpanProcessor,
ConsoleSpanExporter,
)
from opentelemetry.sdk.trace.sampling import ParentBasedTraceIdRatio
from opentelemetry.semconv.resource import ResourceAttributes
from opentelemetry.trace import Span, get_tracer_provider, set_tracer_provider
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
from opentelemetry.trace.status import StatusCode
@user_logged_in.connect
@user_loaded_from_request.connect
def on_user_loaded(_sender, user):
if user:
current_span = get_current_span()
if current_span:
current_span.set_attribute("service.tenant.id", user.current_tenant_id)
current_span.set_attribute("service.user.id", user.id)
def init_app(app: DifyApp):
if dify_config.ENABLE_OTEL:
setup_context_propagation()
# Initialize OpenTelemetry
# Follow Semantic Convertions 1.32.0 to define resource attributes
@ -133,67 +186,19 @@ def init_app(app: DifyApp):
atexit.register(shutdown_tracer)
def is_celery_worker():
return "celery" in sys.argv[0].lower()
def instrument_exception_logging():
exception_handler = ExceptionLoggingHandler()
logging.getLogger().addHandler(exception_handler)
def init_flask_instrumentor(app: DifyApp):
meter = get_meter("http_metrics", version=dify_config.CURRENT_VERSION)
_http_response_counter = meter.create_counter(
"http.server.response.count", description="Total number of HTTP responses by status code", unit="{response}"
)
def response_hook(span: Span, status: str, response_headers: list):
if span and span.is_recording():
if status.startswith("2"):
span.set_status(StatusCode.OK)
else:
span.set_status(StatusCode.ERROR, status)
status = status.split(" ")[0]
status_code = int(status)
status_class = f"{status_code // 100}xx"
_http_response_counter.add(1, {"status_code": status_code, "status_class": status_class})
instrumentor = FlaskInstrumentor()
if dify_config.DEBUG:
logging.info("Initializing Flask instrumentor")
instrumentor.instrument_app(app, response_hook=response_hook)
def init_sqlalchemy_instrumentor(app: DifyApp):
with app.app_context():
engines = list(app.extensions["sqlalchemy"].engines.values())
SQLAlchemyInstrumentor().instrument(enable_commenter=True, engines=engines)
def setup_context_propagation():
# Configure propagators
set_global_textmap(
CompositePropagator(
[
TraceContextTextMapPropagator(), # W3C trace context
B3Format(), # B3 propagation (used by many systems)
]
)
)
def is_enabled():
return dify_config.ENABLE_OTEL
@worker_init.connect(weak=False)
def init_celery_worker(*args, **kwargs):
if dify_config.ENABLE_OTEL:
from opentelemetry.instrumentation.celery import CeleryInstrumentor
from opentelemetry.metrics import get_meter_provider
from opentelemetry.trace import get_tracer_provider
tracer_provider = get_tracer_provider()
metric_provider = get_meter_provider()
if dify_config.DEBUG:
logging.info("Initializing OpenTelemetry for Celery worker")
CeleryInstrumentor(tracer_provider=tracer_provider, meter_provider=metric_provider).instrument()
def shutdown_tracer():
provider = trace.get_tracer_provider()
if hasattr(provider, "force_flush"):
provider.force_flush()

View File

@ -1,63 +0,0 @@
"""
Patch for OpenTelemetry context detach method to handle None tokens gracefully.
This patch addresses the issue where OpenTelemetry's context.detach() method raises a TypeError
when called with a None token. The error occurs in the contextvars_context.py file where it tries
to call reset() on a None token.
Related GitHub issue: https://github.com/langgenius/dify/issues/18496
Error being fixed:
```
Traceback (most recent call last):
File "opentelemetry/context/__init__.py", line 154, in detach
_RUNTIME_CONTEXT.detach(token)
File "opentelemetry/context/contextvars_context.py", line 50, in detach
self._current_context.reset(token) # type: ignore
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: expected an instance of Token, got None
```
Instead of modifying the third-party package directly, this patch monkey-patches the
context.detach method to gracefully handle None tokens.
"""
import logging
from functools import wraps
from opentelemetry import context
logger = logging.getLogger(__name__)
# Store the original detach method
original_detach = context.detach
# Create a patched version that handles None tokens
@wraps(original_detach)
def patched_detach(token):
"""
A patched version of context.detach that handles None tokens gracefully.
"""
if token is None:
logger.debug("Attempted to detach a None token, skipping")
return
return original_detach(token)
def is_enabled():
"""
Check if the extension is enabled.
Always enable this patch to prevent errors even when OpenTelemetry is disabled.
"""
return True
def init_app(app):
"""
Initialize the OpenTelemetry context patch.
"""
# Replace the original detach method with our patched version
context.detach = patched_detach
logger.info("OpenTelemetry context.detach patched to handle None tokens")