From 07ef97527efe3470a4bcb04e5cb9b908f23a211f Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Tue, 24 Dec 2024 09:02:35 -0600 Subject: [PATCH] NAS-133245 / 25.04 / Switch to using syslog for middleware logfiles (#15264) This commit converts middleware to using syslog handler + syslog ident in order to uniquely filter log messages into separate files. This has advantage of streamlining remote syslog handling. Adding new logger destinations is also fairly straightforward. 1. Add new TNLog object with details for new log file 2. Add to tuple of all TNLog objects These are evaluated during syslog-ng configuration. If for some reason syslog is broken on the server then middleware will queue the records internally for when syslog becomes unbroken while writing the records to a fallback file for evaluation by a system administrator. --- src/middlewared/debian/control | 1 - .../syslog-ng/conf.d/tnaudit.conf.mako | 4 + .../syslog-ng/conf.d/tnfilters.conf.mako | 6 + .../etc_files/syslog-ng/syslog-ng.conf.mako | 27 +- src/middlewared/middlewared/logger.py | 251 +++++++++++++++--- src/middlewared/middlewared/main.py | 7 +- tests/unit/test_logger.py | 179 +++++++++++++ 7 files changed, 427 insertions(+), 48 deletions(-) create mode 100644 tests/unit/test_logger.py diff --git a/src/middlewared/debian/control b/src/middlewared/debian/control index df8fc0d1ed180..d9761eb9e0682 100644 --- a/src/middlewared/debian/control +++ b/src/middlewared/debian/control @@ -103,7 +103,6 @@ Depends: alembic, python3-acme, python3-aiohttp, python3-aiorwlock, - python3-anyio, python3-apps-validation, python3-asyncssh, python3-aws-requests-auth, diff --git a/src/middlewared/middlewared/etc_files/syslog-ng/conf.d/tnaudit.conf.mako b/src/middlewared/middlewared/etc_files/syslog-ng/conf.d/tnaudit.conf.mako index cab6b60dd9f7d..e59962c4afcff 100644 --- a/src/middlewared/middlewared/etc_files/syslog-ng/conf.d/tnaudit.conf.mako +++ 
b/src/middlewared/middlewared/etc_files/syslog-ng/conf.d/tnaudit.conf.mako @@ -78,7 +78,11 @@ ${textwrap.indent(get_db(svc), ' ')} % if not audit_custom_section(svc, 'log'): log { +% if svc == 'MIDDLEWARE': + source(tn_middleware_src); +% else: source(s_src); +% endif filter(f_tnaudit_${svc.lower()}); parser(p_tnaudit); rewrite(r_rewrite_success); diff --git a/src/middlewared/middlewared/etc_files/syslog-ng/conf.d/tnfilters.conf.mako b/src/middlewared/middlewared/etc_files/syslog-ng/conf.d/tnfilters.conf.mako index 600a51e2254f6..912d5cd5a4e85 100644 --- a/src/middlewared/middlewared/etc_files/syslog-ng/conf.d/tnfilters.conf.mako +++ b/src/middlewared/middlewared/etc_files/syslog-ng/conf.d/tnfilters.conf.mako @@ -1,5 +1,6 @@ <% from middlewared.plugins.audit.utils import audit_program, AUDITED_SERVICES + from middlewared.logger import ALL_LOG_FILES adv_conf = render_ctx['system.advanced.config'] @@ -53,6 +54,11 @@ filter f_scst { program("kernel") and match("dlm:" value("MESSAGE")); }; +# TrueNAS middleware filters +% for tnlog in ALL_LOG_FILES: +filter f_${tnlog.name or "middleware"} { program("${tnlog.get_ident()[:-2]}"); }; +% endfor + # Temporary SNMP filter: NAS-129124 filter f_snmp { program("snmpd") and match("unexpected header length" value("MESSAGE")); diff --git a/src/middlewared/middlewared/etc_files/syslog-ng/syslog-ng.conf.mako b/src/middlewared/middlewared/etc_files/syslog-ng/syslog-ng.conf.mako index c801052fa5244..f0474c4b7b637 100644 --- a/src/middlewared/middlewared/etc_files/syslog-ng/syslog-ng.conf.mako +++ b/src/middlewared/middlewared/etc_files/syslog-ng/syslog-ng.conf.mako @@ -1,6 +1,13 @@ <% +from middlewared.logger import DEFAULT_SYSLOG_PATH, ALL_LOG_FILES + logger = middleware.logger +# The messages coming in via middleware are already formatted by logger +# and so we don't want to do additional formatting. 
+syslog_template = 'template("${MESSAGE}\n")' + + def generate_syslog_remote_destination(advanced_config): result = "" syslog_server = advanced_config["syslogserver"] @@ -63,7 +70,7 @@ def generate_syslog_remote_destination(advanced_config): result += ' };\n' - result += 'log { source(tn_remote_src_files); filter(f_tnremote); destination(loghost); };\n' + result += 'log { source(tn_middleware_src); filter(f_tnremote); destination(loghost); };\n' result += 'log { source(s_src); filter(f_tnremote); destination(loghost); };\n' return result @@ -92,11 +99,8 @@ options { ################## source s_src { system(); internal(); }; -source tn_remote_src_files { - file("/var/log/middlewared.log"); - file("/var/log/failover.log"); - file("/var/log/fenced.log"); - file("/var/log/zettarepl.log"); +source tn_middleware_src { + unix-stream("${DEFAULT_SYSLOG_PATH}" create-dirs(yes) perm(0600)); }; ################## @@ -124,6 +128,17 @@ log { flags(final); }; + +####################### +# Middlewared-related log files +######################## +% for tnlog in ALL_LOG_FILES: +log { + source(tn_middleware_src); filter(f_${tnlog.name or "middleware"}); + destination { file(${tnlog.logfile} ${syslog_template}); }; +}; +% endfor + log { source(s_src); filter(f_auth); destination(d_auth); }; log { source(s_src); filter(f_cron); destination(d_cron); }; log { source(s_src); filter(f_daemon); destination(d_daemon); }; diff --git a/src/middlewared/middlewared/logger.py b/src/middlewared/middlewared/logger.py index 5f00cf1b39edb..77a5755a367a6 100644 --- a/src/middlewared/middlewared/logger.py +++ b/src/middlewared/middlewared/logger.py @@ -1,10 +1,12 @@ import logging import logging.handlers -import os import queue +import socket import typing import warnings from .logging.console_formatter import ConsoleLogFormatter +from collections import deque +from dataclasses import dataclass # markdown debug is also considered useless logging.getLogger('MARKDOWN').setLevel(logging.INFO) @@ -56,8 
+58,16 @@ DOCKER_IMAGE_LOGFILE = '/var/log/docker_image.log' FAILOVER_LOGFILE = '/var/log/failover.log' LOGFILE = '/var/log/middlewared.log' +DEFAULT_LOGFORMAT = '[%(asctime)s] (%(levelname)s) %(name)s.%(funcName)s():%(lineno)d - %(message)s' +FALLBACK_LOGFILE = '/var/log/fallback-middlewared.log' NETDATA_API_LOGFILE = '/var/log/netdata_api.log' ZETTAREPL_LOGFILE = '/var/log/zettarepl.log' +ZETTAREPL_LOGFORMAT = '[%(asctime)s] %(levelname)-8s [%(threadName)s] [%(name)s] %(message)s' + +DEFAULT_IDENT = 'MIDDLEWARE: ' +MIDDLEWARE_AUDIT_IDENT = 'TNAUDIT_MIDDLEWARE: ' +DEFAULT_SYSLOG_PATH = '/var/run/syslog-ng/middleware.sock' +DEFAULT_PENDING_QUEUE_LEN = 4096 def trace(self, message, *args, **kws): @@ -69,14 +79,199 @@ def trace(self, message, *args, **kws): logging.Logger.trace = trace -class Logger: - """Pseudo-Class for Logger - Wrapper for logging module""" +@dataclass(slots=True, frozen=True) +class TNLog: + name: str | None + logfile: str + logformat: str = DEFAULT_LOGFORMAT + pending_maxlen: int | None = DEFAULT_PENDING_QUEUE_LEN + + def get_ident(self): + if self.name is None: + return DEFAULT_IDENT + + return f'{self.name.upper()}: ' + + +# NOTE if new separate log file needs to be added, create a new TNLog +# object and append to ALL_LOG_FILES tuple. These files are read by the +# syslog-ng config generation scripts and automatically handled. +TNLOG_MIDDLEWARE = TNLog(None, LOGFILE) +TNLOG_APP_LIFECYCLE = TNLog('app_lifecycle', APP_LIFECYCLE_LOGFILE) +TNLOG_APP_MIGRATION = TNLog('app_migration', APP_MIGRATION_LOGFILE) +TNLOG_DOCKER_IMAGE = TNLog('docker_image', DOCKER_IMAGE_LOGFILE) +TNLOG_FAILOVER = TNLog('failover', FAILOVER_LOGFILE) +TNLOG_NETDATA_API = TNLog('netdata_api', NETDATA_API_LOGFILE) +TNLOG_ZETTAREPL = TNLog('zettarepl', ZETTAREPL_LOGFILE, ZETTAREPL_LOGFORMAT) + +# NOTE: this is also consumed by tests/unit/test_logger.py, which validates +# the auto-generated syslog-ng rules place messages in the correct log files. 
+ALL_LOG_FILES = ( + TNLOG_MIDDLEWARE, + TNLOG_APP_LIFECYCLE, + TNLOG_APP_MIGRATION, + TNLOG_DOCKER_IMAGE, + TNLOG_FAILOVER, + TNLOG_NETDATA_API, + TNLOG_ZETTAREPL, +) + +# Audit entries are inserted into audit databases in /audit rather than +# written to files in /var/log and so they are not members of ALL_LOG_FILES +MIDDLEWARE_TNAUDIT = TNLog('TNAUDIT_MIDDLEWARE', '', '', None) + + +class TNSyslogHandler(logging.handlers.SysLogHandler): def __init__( - self, application_name: str, debug_level: typing.Optional[str]=None, - log_format: str='[%(asctime)s] (%(levelname)s) %(name)s.%(funcName)s():%(lineno)d - %(message)s' + self, + address: str = DEFAULT_SYSLOG_PATH, + pending_queue: deque | None = None ): + """ + address - path to Unix socket (should be defined in syslog-ng config) + + pending_queue - deque object that will be used for storing failed + LogRecords if syslog is currently down. + + Note: maxlen should be set unless one wants to queue the log messages + without loss until syslog connection restored. This is probably + desireable for auditing, but not for general purposes (where it's + better to just specify a fallback handler). + """ + self.pending_queue = pending_queue + self.fallback_handler = None + super().__init__(address, socktype=socket.SOCK_STREAM) + + def drain_pending_queue(self) -> bool: + """ + Attempt to emit any log records that have been queued up due to logging + failures to the syslog socket. + + Returns: + True if successfully drained entire queue else False + + Raises: + Should not raise exceptions + """ + while self.pending_queue: + record = self.pending_queue.popleft() + try: + super().emit(record) + except Exception: + # Nope. Still dead. Put it back where we found it + self.pending_queue.appendleft(record) + return False + + return True + + def fallback(self, record: logging.LogRecord) -> None: + """ + Fallback logging mechanism in case the syslog target is down. 
+ + In this case we emit the log record to the fallback handler and ignore + any errors. + + Returns: + None + + Raises: + Should not raise exceptions + """ + if not self.fallback_handler: + return + + try: + self.fallback_handler.emit(record) + except Exception: + pass + + def emit(self, record: logging.LogRecord) -> None: + """ + Emit a LogRecord to syslog. If this fails then add to pending queue and + emit via our fallback handler. + """ + + # First attempt to drain the pending queue + if not self.drain_pending_queue(): + # Failed to drain our pending queue so add this record to the + # ever-growing deque + self.pending_queue.append(record) + self.fallback(record) + return + + try: + super().emit(record) + except Exception: + # logging framework done broke. Queue up + # for drain on next auditd message handled + self.pending_queue.append(record) + self.fallback(record) + + def handleError(self, record: logging.LogRecord) -> None: + """ + Override the default syslog error handler if we have a pending_queue + defined. Exception raised here passes back up to the emit() call. 
+ """ + # re-raise it back up to the emit call + if self.pending_queue is None: + return super().handleError(record) + + raise + + def set_fallback_handler(self, fallback: logging.Handler) -> None: + """ Set a fallback handler (for example to file) that will be used if syslog socket logging fails """ + if not isinstance(fallback, logging.Handler): + raise TypeError(f'{fallback}: not a logging.Handler') + + self.fallback_handler = fallback + + def close(self) -> None: + # Close our socket + super().close() + + if self.fallback_handler: + # close any open file handler + self.fallback_handler.close() + self.fallback_handler = None + + +def setup_syslog_handler(tnlog: TNLog, fallback: logging.Handler | None) -> logging.Logger: + # Use `QueueHandler` to avoid blocking IO in asyncio main loop + log_queue = queue.Queue() + queue_handler = logging.handlers.QueueHandler(log_queue) + + # Set up syslog handler with deque to store failed messages until + # they can be flushed. This can happen if syslog-ng isn't ready yet. + syslog_handler = TNSyslogHandler(pending_queue=deque(maxlen=tnlog.pending_maxlen)) + syslog_handler.setLevel(logging.DEBUG) + + if tnlog.logformat: + syslog_handler.setFormatter(logging.Formatter(tnlog.logformat, '%Y/%m/%d %H:%M:%S')) + + # Set ident for the logger. This becomes program name in syslog-ng and allows + # more precise filtering rules + syslog_handler.ident = tnlog.get_ident() + + # Set fallback for case where syslog is broken. This does not need separate queue + # since emit will happen in separate thread from main loop. 
+ if fallback: + syslog_handler.set_fallback_handler(fallback) + + queue_listener = logging.handlers.QueueListener(log_queue, syslog_handler) + queue_listener.start() + logger = logging.getLogger(tnlog.name) + logger.addHandler(queue_handler) + if tnlog.name is not None: + logging.getLogger(tnlog.name).propagate = False + + return logger + + +class Logger: + """Pseudo-Class for Logger - Wrapper for logging module""" + def __init__(self, application_name: str, debug_level: str = 'DEBUG', log_format: str = DEFAULT_LOGFORMAT): self.application_name = application_name - self.debug_level = debug_level or 'DEBUG' + self.debug_level = debug_level self.log_format = log_format def getLogger(self): @@ -94,40 +289,22 @@ def configure_logging(self, output_option: str): console_handler.setFormatter(ConsoleLogFormatter(self.log_format, datefmt=time_format)) logging.root.addHandler(console_handler) else: - for name, filename, log_format in [ - (None, LOGFILE, self.log_format), - ('app_lifecycle', APP_LIFECYCLE_LOGFILE, self.log_format), - ('app_migrations', APP_MIGRATION_LOGFILE, self.log_format), - ('docker_image', DOCKER_IMAGE_LOGFILE, self.log_format), - ('failover', FAILOVER_LOGFILE, self.log_format), - ('netdata_api', NETDATA_API_LOGFILE, self.log_format), - ('zettarepl', ZETTAREPL_LOGFILE, - '[%(asctime)s] %(levelname)-8s [%(threadName)s] [%(name)s] %(message)s'), - ]: - self.setup_file_logger(name, filename, log_format) + # Set up our fallback logging mechanism (to file) in case syslog is broken + # We internally queue writes to regular log files while waiting for syslog to recover + # while simultaneously writing to the fallback file so that there is less potential to + # lose relevant logs. 
+ fallback_handler = logging.handlers.RotatingFileHandler(FALLBACK_LOGFILE, 'a', 10485760, 5, 'utf-8') + fallback_handler.setLevel(logging.DEBUG) + fallback_handler.setFormatter(logging.Formatter(DEFAULT_LOGFORMAT, '%Y/%m/%d %H:%M:%S')) + + for tnlog in ALL_LOG_FILES: + setup_syslog_handler(tnlog, fallback_handler) logging.root.setLevel(getattr(logging, self.debug_level)) - def setup_file_logger(self, name: typing.Optional[str], filename: str, log_format: typing.Optional[str]): - # Use `QueueHandler` to avoid blocking IO in asyncio main loop - log_queue = queue.Queue() - queue_handler = logging.handlers.QueueHandler(log_queue) - file_handler = logging.handlers.RotatingFileHandler(filename, 'a', 10485760, 5, 'utf-8') - file_handler.setLevel(logging.DEBUG) - file_handler.setFormatter(logging.Formatter(log_format, '%Y/%m/%d %H:%M:%S')) - queue_listener = logging.handlers.QueueListener(log_queue, file_handler) - queue_listener.start() - logging.getLogger(name).addHandler(queue_handler) - if name is not None: - logging.getLogger(name).propagate = False - - # Make sure various log files are not readable by everybody. - # umask could be another approach but chmod was chosen so - # it affects existing installs. 
- try: - os.chmod(filename, 0o640) - except OSError: - pass + +def setup_audit_logging() -> logging.Logger: + return setup_syslog_handler(MIDDLEWARE_TNAUDIT, None) def setup_logging(name: str, debug_level: typing.Optional[str], log_handler: typing.Optional[str]): diff --git a/src/middlewared/middlewared/main.py b/src/middlewared/middlewared/main.py index 65858eb732804..d86d483eaf114 100644 --- a/src/middlewared/middlewared/main.py +++ b/src/middlewared/middlewared/main.py @@ -64,12 +64,11 @@ import typing import uuid -from anyio import create_connected_unix_datagram_socket from systemd.daemon import notify as systemd_notify from truenas_api_client import json -from .logger import Logger, setup_logging +from .logger import Logger, setup_audit_logging, setup_logging SYSTEMD_EXTEND_USECS = 240000000 # 4mins in microseconds @@ -128,6 +127,7 @@ def __init__( self.tasks = set() self.api_versions = None self.api_versions_adapter = None + self.__audit_logger = setup_audit_logging() def create_task(self, coro, *, name=None): task = self.loop.create_task(coro, name=name) @@ -924,8 +924,7 @@ async def log_audit_message(self, app, event, event_data, success): } }) - async with await create_connected_unix_datagram_socket("/dev/log") as s: - await s.send(syslog_message(message)) + self.__audit_logger.debug(message) async def call(self, name, *params, app=None, audit_callback=None, job_on_progress_cb=None, pipes=None, profile=False): diff --git a/tests/unit/test_logger.py b/tests/unit/test_logger.py new file mode 100644 index 0000000000000..185fbd7ec8af5 --- /dev/null +++ b/tests/unit/test_logger.py @@ -0,0 +1,179 @@ +import logging +import logging.handlers +import os +import pytest +import subprocess + +from collections import deque +from contextlib import contextmanager +from middlewared.logger import ( + ALL_LOG_FILES, + DEFAULT_IDENT, + DEFAULT_LOGFORMAT, + DEFAULT_SYSLOG_PATH, + LOGFILE, + setup_syslog_handler, + TNSyslogHandler, +) +from time import sleep + 
+SYSLOG_WRITE_WAIT = 1 + + +def remove_logfile(path): + try: + os.unlink(path) + except FileNotFoundError: + pass + + +@pytest.fixture(scope='function') +def current_test_name(request): + return request.node.name + + +@pytest.fixture(scope='function') +def test_message(current_test_name): + return f'This is a test message for test {current_test_name}' + + +@pytest.fixture(scope='function') +def fallback_handler(tmpdir): + logfile = os.path.join(tmpdir, 'fallback.log') + fallback_handler = logging.handlers.RotatingFileHandler(logfile, 'a', 10485760, 5, 'utf-8') + fallback_handler.setLevel(logging.DEBUG) + fallback_handler.setFormatter(logging.Formatter(DEFAULT_LOGFORMAT, "%Y/%m/%d %H:%M:%S")) + + try: + yield (logfile, fallback_handler) + finally: + remove_logfile(logfile) + + +@contextmanager +def disable_syslog(): + subprocess.run(['service', 'syslog-ng', 'stop']) + + try: + yield + finally: + subprocess.run(['service', 'syslog-ng', 'start']) + # Give a little extra buffer for syslog-ng to fully start + sleep(SYSLOG_WRITE_WAIT) + + +@pytest.fixture(scope='function') +def broken_syslog(current_test_name): + handler = TNSyslogHandler(address='/var/empty/canary', pending_queue=deque()) + logger = logging.getLogger(current_test_name) + logger.addHandler(handler) + + try: + yield (logger, handler) + finally: + handler.close() + + +@pytest.fixture(scope='function') +def broken_syslog_with_fallback(broken_syslog, fallback_handler): + logger, syslog_handler = broken_syslog + logfile, fallback = fallback_handler + + syslog_handler.set_fallback_handler(fallback) + + yield (logger, syslog_handler, logfile) + + +@pytest.fixture(scope='function') +def working_syslog(current_test_name): + """ Set up syslog logger to use middleware rules (via ident) so that it + can successfully write to the middleware log file. 
""" + + handler = TNSyslogHandler(address=DEFAULT_SYSLOG_PATH, pending_queue=deque()) + handler.ident = DEFAULT_IDENT + logger = logging.getLogger(current_test_name) + logger.addHandler(handler) + + try: + yield (logger, handler) + finally: + handler.close() + + +def test__pending_queue(broken_syslog, test_message): + """ Verify that when syslog connection is broken messages are queued """ + logger, handler = broken_syslog + assert handler.pending_queue is not None + assert handler.fallback_handler is None + + logger.critical(test_message) + assert len(handler.pending_queue) == 1 + assert handler.pending_queue[0].msg == test_message + + +def test__fallback_handler(broken_syslog_with_fallback, test_message): + """ Verify that fallback handler results in writes to log file """ + logger, handler, logfile = broken_syslog_with_fallback + + assert handler.pending_queue is not None + assert handler.fallback_handler is not None + + logger.critical(test_message) + assert len(handler.pending_queue) == 1 + assert handler.pending_queue[0].msg == test_message + + with open(logfile, 'r') as f: + contents = f.read() + assert test_message in contents + + +def test__working_syslog_handler(working_syslog, test_message): + """ Verify that syslog handler writes to the middleware log """ + logger, handler = working_syslog + assert handler.pending_queue is not None + assert handler.fallback_handler is None + + logger.critical(test_message) + assert len(handler.pending_queue) == 0 + sleep(SYSLOG_WRITE_WAIT) + + with open(LOGFILE, 'r') as f: + contents = f.read() + assert test_message in contents + + +def test__syslog_handler_recovery(working_syslog, test_message, current_test_name): + """ Verify that pending queue is properly drained and written to syslog target """ + logger, handler = working_syslog + assert handler.pending_queue is not None + assert handler.fallback_handler is None + + with disable_syslog(): + logger.critical(test_message) + assert len(handler.pending_queue) == 1 + + # 
Queue only gets drained on message emit + flush_message = f'Flush message for {current_test_name}' + logger.critical(flush_message) + + # Queue should be fully drained now that syslog is working + assert len(handler.pending_queue) == 0 + sleep(SYSLOG_WRITE_WAIT) + + # Message should now be written to file in order in which + # it was initially emitted + with open(LOGFILE, 'r') as f: + contents = f.read() + assert f'{test_message}\n{flush_message}' in contents + + +@pytest.mark.parametrize('tnlog', ALL_LOG_FILES) +def test__middleware_logger_paths(tnlog, test_message): + """ Verify that syslog filtering rules work properly for backends """ + logger = setup_syslog_handler(tnlog, None) + logger.critical(test_message) + + sleep(SYSLOG_WRITE_WAIT) + with open(tnlog.logfile, 'r') as f: + contents = f.read() + assert test_message in contents