# Copyright 2014-present MongoDB, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You
# may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

"""Class to monitor a MongoDB server on a background thread."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import atexit
|
|
import logging
|
|
import time
|
|
import weakref
|
|
from typing import TYPE_CHECKING, Any, Mapping, Optional, cast
|
|
|
|
from pymongo import common
|
|
from pymongo._csot import MovingMinimum
|
|
from pymongo.errors import NetworkTimeout, NotPrimaryError, OperationFailure, _OperationCancelled
|
|
from pymongo.hello import Hello
|
|
from pymongo.lock import _create_lock
|
|
from pymongo.logger import _SDAM_LOGGER, _debug_log, _SDAMStatusMessage
|
|
from pymongo.pool_options import _is_faas
|
|
from pymongo.read_preferences import MovingAverage
|
|
from pymongo.server_description import ServerDescription
|
|
from pymongo.srv_resolver import _SrvResolver
|
|
from pymongo.synchronous import periodic_executor
|
|
from pymongo.synchronous.periodic_executor import _shutdown_executors
|
|
|
|
if TYPE_CHECKING:
|
|
from pymongo.synchronous.pool import Connection, Pool, _CancellationContext
|
|
from pymongo.synchronous.settings import TopologySettings
|
|
from pymongo.synchronous.topology import Topology
|
|
|
|
_IS_SYNC = True
|
|
|
|
|
|
def _sanitize(error: Exception) -> None:
    """PYTHON-2433 Clear error traceback info."""
    error.__traceback__ = None
    error.__context__ = None
    error.__cause__ = None


def _monotonic_duration(start: float) -> float:
    """Return the duration since the given start time.

    Accounts for buggy platforms where time.monotonic() is not monotonic.
    See PYTHON-4600.
    """
    return max(0.0, time.monotonic() - start)


class MonitorBase:
    def __init__(self, topology: Topology, name: str, interval: int, min_interval: float):
        """Base class to do periodic work on a background thread.

        The background thread is signaled to stop when the Topology or
        this instance is freed.
        """

        # We strongly reference the executor and it weakly references us via
        # this closure. When the monitor is freed, stop the executor soon.
        def target() -> bool:
            monitor = self_ref()
            if monitor is None:
                return False  # Stop the executor.
            monitor._run()  # type:ignore[attr-defined]
            return True

        executor = periodic_executor.PeriodicExecutor(
            interval=interval, min_interval=min_interval, target=target, name=name
        )

        self._executor = executor

        def _on_topology_gc(dummy: Optional[Topology] = None) -> None:
            # This prevents GC from waiting 10 seconds for hello to complete
            # See test_cleanup_executors_on_client_del.
            monitor = self_ref()
            if monitor:
                monitor.gc_safe_close()

        # Avoid cycles. When self or topology is freed, stop executor soon.
        self_ref = weakref.ref(self, executor.close)
        self._topology = weakref.proxy(topology, _on_topology_gc)
        _register(self)

    def open(self) -> None:
        """Start monitoring, or restart after a fork.

        Multiple calls have no effect.
        """
        self._executor.open()

    def gc_safe_close(self) -> None:
        """GC safe close."""
        self._executor.close()

    def close(self) -> None:
        """Close and stop monitoring.

        open() restarts the monitor after closing.
        """
        self.gc_safe_close()

    def join(self, timeout: Optional[int] = None) -> None:
        """Wait for the monitor to stop."""
        self._executor.join(timeout)

    def request_check(self) -> None:
        """If the monitor is sleeping, wake it soon."""
        self._executor.wake()


class Monitor(MonitorBase):
    def __init__(
        self,
        server_description: ServerDescription,
        topology: Topology,
        pool: Pool,
        topology_settings: TopologySettings,
    ):
        """Class to monitor a MongoDB server on a background thread.

        Pass an initial ServerDescription, a Topology, a Pool, and
        TopologySettings.

        The Topology is weakly referenced. The Pool must be exclusive to this
        Monitor.
        """
        super().__init__(
            topology,
            "pymongo_server_monitor_thread",
            topology_settings.heartbeat_frequency,
            common.MIN_HEARTBEAT_INTERVAL,
        )
        self._server_description = server_description
        self._pool = pool
        self._settings = topology_settings
        self._listeners = self._settings._pool_options._event_listeners
        self._publish = self._listeners is not None and self._listeners.enabled_for_server_heartbeat
        self._cancel_context: Optional[_CancellationContext] = None
        self._rtt_monitor = _RttMonitor(
            topology,
            topology_settings,
            topology._create_pool_for_monitor(server_description.address),
        )
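        # serverMonitoringMode: "stream" forces the streaming protocol, "poll"
        # forces periodic polling, and the default ("auto") streams except in
        # FaaS environments (see _is_faas), where polling is used instead.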
        if topology_settings.server_monitoring_mode == "stream":
            self._stream = True
        elif topology_settings.server_monitoring_mode == "poll":
            self._stream = False
        else:
            self._stream = not _is_faas()

    def cancel_check(self) -> None:
        """Cancel any concurrent hello check.

        Note: this is called from a weakref.proxy callback and MUST NOT take
        any locks.
        """
        context = self._cancel_context
        if context:
            # Note: we cannot close the socket because doing so may cause
            # concurrent reads/writes to hang until a timeout occurs
            # (depending on the platform).
            context.cancel()

    def _start_rtt_monitor(self) -> None:
        """Start an _RttMonitor that periodically runs ping."""
        # If this monitor is closed directly before (or during) this open()
        # call, the _RttMonitor will not be closed. Checking if this monitor
        # was closed directly after resolves the race.
        self._rtt_monitor.open()
        if self._executor._stopped:
            self._rtt_monitor.close()

    def gc_safe_close(self) -> None:
        self._executor.close()
        self._rtt_monitor.gc_safe_close()
        self.cancel_check()

    def close(self) -> None:
        self.gc_safe_close()
        self._rtt_monitor.close()
        # Increment the generation and maybe close the socket. If the executor
        # thread has the socket checked out, it will be closed when checked in.
        self._reset_connection()

    def _reset_connection(self) -> None:
        # Clear our pooled connection.
        self._pool.reset()

    def _run(self) -> None:
        try:
            prev_sd = self._server_description
            try:
                self._server_description = self._check_server()
            except _OperationCancelled as exc:
                _sanitize(exc)
                # Already closed the connection, wait for the next check.
                self._server_description = ServerDescription(
                    self._server_description.address, error=exc
                )
                if prev_sd.is_server_type_known:
                    # Immediately retry since we've already waited 500ms to
                    # discover that we've been cancelled.
                    self._executor.skip_sleep()
                return

            # Update the Topology and clear the server pool on error.
            self._topology.on_change(
                self._server_description,
                reset_pool=self._server_description.error,
                interrupt_connections=isinstance(self._server_description.error, NetworkTimeout),
            )

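            # Streaming (awaitable) hello responses do not carry an RTT
            # measurement, so a separate _RttMonitor pings the server while
            # streaming is active (see _check_once/_check_with_socket).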
            if self._stream and (
                self._server_description.is_server_type_known
                and self._server_description.topology_version
            ):
                self._start_rtt_monitor()
                # Immediately check for the next streaming response.
                self._executor.skip_sleep()

            if self._server_description.error and prev_sd.is_server_type_known:
                # Immediately retry on network errors.
                self._executor.skip_sleep()
        except ReferenceError:
            # Topology was garbage-collected.
            self.close()

    def _check_server(self) -> ServerDescription:
        """Call hello or read the next streaming response.

        Returns a ServerDescription.
        """
        start = time.monotonic()
        try:
            try:
                return self._check_once()
            except (OperationFailure, NotPrimaryError) as exc:
                # Update max cluster time even when hello fails.
                details = cast(Mapping[str, Any], exc.details)
                self._topology.receive_cluster_time(details.get("$clusterTime"))
                raise
        except ReferenceError:
            raise
        except Exception as error:
            _sanitize(error)
            sd = self._server_description
            address = sd.address
            duration = _monotonic_duration(start)
            awaited = bool(self._stream and sd.is_server_type_known and sd.topology_version)
            if self._publish:
                assert self._listeners is not None
                self._listeners.publish_server_heartbeat_failed(address, duration, error, awaited)
            if _SDAM_LOGGER.isEnabledFor(logging.DEBUG):
                _debug_log(
                    _SDAM_LOGGER,
                    topologyId=self._topology._topology_id,
                    serverHost=address[0],
                    serverPort=address[1],
                    awaited=awaited,
                    durationMS=duration * 1000,
                    failure=error,
                    message=_SDAMStatusMessage.HEARTBEAT_FAIL,
                )
            self._reset_connection()
            if isinstance(error, _OperationCancelled):
                raise
            self._rtt_monitor.reset()
            # Server type defaults to Unknown.
            return ServerDescription(address, error=error)

    def _check_once(self) -> ServerDescription:
        """A single attempt to call hello.

        Returns a ServerDescription, or raises an exception.
        """
        address = self._server_description.address
        sd = self._server_description

        # XXX: "awaited" could be incorrectly set to True in the rare case
        # the pool checkout closes and recreates a connection.
        awaited = bool(
            self._pool.conns and self._stream and sd.is_server_type_known and sd.topology_version
        )
        if self._publish:
            assert self._listeners is not None
            self._listeners.publish_server_heartbeat_started(address, awaited)

        if self._cancel_context and self._cancel_context.cancelled:
            self._reset_connection()
        with self._pool.checkout() as conn:
            if _SDAM_LOGGER.isEnabledFor(logging.DEBUG):
                _debug_log(
                    _SDAM_LOGGER,
                    topologyId=self._topology._topology_id,
                    driverConnectionId=conn.id,
                    serverConnectionId=conn.server_connection_id,
                    serverHost=address[0],
                    serverPort=address[1],
                    awaited=awaited,
                    message=_SDAMStatusMessage.HEARTBEAT_START,
                )

            self._cancel_context = conn.cancel_context
            response, round_trip_time = self._check_with_socket(conn)
            if not response.awaitable:
                self._rtt_monitor.add_sample(round_trip_time)

            avg_rtt, min_rtt = self._rtt_monitor.get()
            sd = ServerDescription(address, response, avg_rtt, min_round_trip_time=min_rtt)
            if self._publish:
                assert self._listeners is not None
                self._listeners.publish_server_heartbeat_succeeded(
                    address, round_trip_time, response, response.awaitable
                )
            if _SDAM_LOGGER.isEnabledFor(logging.DEBUG):
                _debug_log(
                    _SDAM_LOGGER,
                    topologyId=self._topology._topology_id,
                    driverConnectionId=conn.id,
                    serverConnectionId=conn.server_connection_id,
                    serverHost=address[0],
                    serverPort=address[1],
                    awaited=awaited,
                    durationMS=round_trip_time * 1000,
                    reply=response.document,
                    message=_SDAMStatusMessage.HEARTBEAT_SUCCESS,
                )
            return sd

    def _check_with_socket(self, conn: Connection) -> tuple[Hello, float]:
        """Return (Hello, round_trip_time).

        Can raise ConnectionFailure or OperationFailure.
        """
        cluster_time = self._topology.max_cluster_time()
        start = time.monotonic()
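        # Three cases: read the next reply from an already-established
        # exhaust stream, start a new awaitable (streaming) hello, or fall
        # back to a plain polling hello.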
        if conn.more_to_come:
            # Read the next streaming hello (MongoDB 4.4+).
            response = Hello(conn._next_reply(), awaitable=True)
        elif (
            self._stream and conn.performed_handshake and self._server_description.topology_version
        ):
            # Initiate streaming hello (MongoDB 4.4+).
            response = conn._hello(
                cluster_time,
                self._server_description.topology_version,
                self._settings.heartbeat_frequency,
            )
        else:
            # New connection handshake or polling hello (MongoDB <4.4).
            response = conn._hello(cluster_time, None, None)
        duration = _monotonic_duration(start)
        return response, duration


class SrvMonitor(MonitorBase):
    def __init__(self, topology: Topology, topology_settings: TopologySettings):
        """Class to poll SRV records on a background thread.

        Pass a Topology and a TopologySettings.

        The Topology is weakly referenced.
        """
        super().__init__(
            topology,
            "pymongo_srv_polling_thread",
            common.MIN_SRV_RESCAN_INTERVAL,
            topology_settings.heartbeat_frequency,
        )
        self._settings = topology_settings
        self._seedlist = self._settings._seeds
        assert isinstance(self._settings.fqdn, str)
        self._fqdn: str = self._settings.fqdn
        self._startup_time = time.monotonic()

    def _run(self) -> None:
        # Don't poll right after creation; wait 60 seconds first.
        if time.monotonic() < self._startup_time + common.MIN_SRV_RESCAN_INTERVAL:
            return
        seedlist = self._get_seedlist()
        if seedlist:
            self._seedlist = seedlist
            try:
                self._topology.on_srv_update(self._seedlist)
            except ReferenceError:
                # Topology was garbage-collected.
                self.close()

    def _get_seedlist(self) -> Optional[list[tuple[str, Any]]]:
        """Poll SRV records for a seedlist.

        Returns a list of (host, port) address tuples, or None if the lookup
        failed or returned no hosts.
        """
        try:
            resolver = _SrvResolver(
                self._fqdn,
                self._settings.pool_options.connect_timeout,
                self._settings.srv_service_name,
            )
            seedlist, ttl = resolver.get_hosts_and_min_ttl()
            if len(seedlist) == 0:
                # As per the spec: this should be treated as a failure.
                raise Exception
        except Exception:
            # As per the spec, upon encountering an error:
            # - An error must not be raised
            # - SRV records must be rescanned every heartbeatFrequencyMS
            # - Topology must be left unchanged
            self.request_check()
            return None
        else:
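            # Honor the SRV TTL for the next rescan, but never rescan more
            # often than the spec-mandated minimum interval (60 seconds).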
            self._executor.update_interval(max(ttl, common.MIN_SRV_RESCAN_INTERVAL))
            return seedlist


class _RttMonitor(MonitorBase):
    def __init__(self, topology: Topology, topology_settings: TopologySettings, pool: Pool):
        """Maintain round trip times for a server.

        The Topology is weakly referenced.
        """
        super().__init__(
            topology,
            "pymongo_server_rtt_thread",
            topology_settings.heartbeat_frequency,
            common.MIN_HEARTBEAT_INTERVAL,
        )

        self._pool = pool
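        # Track both a moving average and a moving minimum of the heartbeat
        # round trip time; the minimum (MovingMinimum, from pymongo._csot) is
        # used for CSOT (timeoutMS) RTT accounting.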
        self._moving_average = MovingAverage()
        self._moving_min = MovingMinimum()
        self._lock = _create_lock()

    def close(self) -> None:
        self.gc_safe_close()
        # Increment the generation and maybe close the socket. If the executor
        # thread has the socket checked out, it will be closed when checked in.
        self._pool.reset()

    def add_sample(self, sample: float) -> None:
        """Add an RTT sample."""
        with self._lock:
            self._moving_average.add_sample(sample)
            self._moving_min.add_sample(sample)

    def get(self) -> tuple[Optional[float], float]:
        """Get the calculated average RTT (or None if no samples yet) and the minimum RTT."""
        with self._lock:
            return self._moving_average.get(), self._moving_min.get()

    def reset(self) -> None:
        """Reset the average and minimum RTT."""
        with self._lock:
            self._moving_average.reset()
            self._moving_min.reset()

    def _run(self) -> None:
        try:
            # NOTE: This thread is only run when using the streaming
            # heartbeat protocol (MongoDB 4.4+).
            # XXX: Skip check if the server is unknown?
            rtt = self._ping()
            self.add_sample(rtt)
        except ReferenceError:
            # Topology was garbage-collected.
            self.close()
        except Exception:
            self._pool.reset()

    def _ping(self) -> float:
        """Run a "hello" command and return the RTT."""
        with self._pool.checkout() as conn:
            if self._executor._stopped:
                raise Exception("_RttMonitor closed")
            start = time.monotonic()
            conn.hello()
            return _monotonic_duration(start)


# Close monitors to cancel any in progress streaming checks before joining
# executor threads. For an explanation of how this works see the comment
# about _EXECUTORS in periodic_executor.py.
_MONITORS = set()

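# Monitors register themselves here by weak reference, so registration alone
# never keeps a Monitor alive; _unregister runs as the weakref callback once a
# Monitor is garbage-collected.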
def _register(monitor: MonitorBase) -> None:
    ref = weakref.ref(monitor, _unregister)
    _MONITORS.add(ref)


def _unregister(monitor_ref: weakref.ReferenceType[MonitorBase]) -> None:
    _MONITORS.remove(monitor_ref)

def _shutdown_monitors() -> None:
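    # Module globals may already have been cleared to None during interpreter
    # shutdown, so guard before iterating.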
    if _MONITORS is None:
        return

    # Copy the set. Closing monitors removes them.
    monitors = list(_MONITORS)

    # Close all monitors.
    for ref in monitors:
        monitor = ref()
        if monitor:
            monitor.gc_safe_close()

    monitor = None


def _shutdown_resources() -> None:
    # _shutdown_monitors/_shutdown_executors may already be GC'd at shutdown.
    shutdown = _shutdown_monitors
    if shutdown:  # type:ignore[truthy-function]
        shutdown()
    shutdown = _shutdown_executors
    if shutdown:  # type:ignore[truthy-function]
        shutdown()


atexit.register(_shutdown_resources)