Skip to content

Commit

Permalink
NAS-133245 / 25.04 / Switch to using syslog for middleware logfiles (#…
Browse files Browse the repository at this point in the history
…15264)

This commit converts middleware  to using syslog handler + syslog
ident in order to uniquely filter log messages into separate files.

This has advantage of streamlining remote syslog handling.

Adding new logger destinations is also fairly straightforward.
1. Add new TNLog object with details for new log file
2. Add to tuple of all TNLog objects

These are evaluated during syslog-ng configuration.

If for some reason syslog is broken on the server then middleware
will queue the records internally for when syslog becomes unbroken
while writing the records to a fallback file for evaluation by
a system administrator.
  • Loading branch information
anodos325 authored Dec 24, 2024
1 parent 7c79c21 commit 07ef975
Show file tree
Hide file tree
Showing 7 changed files with 427 additions and 48 deletions.
1 change: 0 additions & 1 deletion src/middlewared/debian/control
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ Depends: alembic,
python3-acme,
python3-aiohttp,
python3-aiorwlock,
python3-anyio,
python3-apps-validation,
python3-asyncssh,
python3-aws-requests-auth,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,11 @@ ${textwrap.indent(get_db(svc), ' ')}

% if not audit_custom_section(svc, 'log'):
log {
% if svc == 'MIDDLEWARE':
source(tn_middleware_src);
% else:
source(s_src);
% endif
filter(f_tnaudit_${svc.lower()});
parser(p_tnaudit);
rewrite(r_rewrite_success);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<%
from middlewared.plugins.audit.utils import audit_program, AUDITED_SERVICES
from middlewared.logger import ALL_LOG_FILES
adv_conf = render_ctx['system.advanced.config']
Expand Down Expand Up @@ -53,6 +54,11 @@ filter f_scst {
program("kernel") and match("dlm:" value("MESSAGE"));
};

# TrueNAS middleware filters
% for tnlog in ALL_LOG_FILES:
filter f_${tnlog.name or "middleware"} { program("${tnlog.get_ident()[:-2]}"); };
% endfor

# Temporary SNMP filter: NAS-129124
filter f_snmp {
program("snmpd") and match("unexpected header length" value("MESSAGE"));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
<%
from middlewared.logger import DEFAULT_SYSLOG_PATH, ALL_LOG_FILES
logger = middleware.logger
# The messages coming in via middleware are already formatted by logger
# and so we don't want to do additional formatting.
syslog_template = 'template("${MESSAGE}\n")'
def generate_syslog_remote_destination(advanced_config):
result = ""
syslog_server = advanced_config["syslogserver"]
Expand Down Expand Up @@ -63,7 +70,7 @@ def generate_syslog_remote_destination(advanced_config):
result += ' };\n'
result += 'log { source(tn_remote_src_files); filter(f_tnremote); destination(loghost); };\n'
result += 'log { source(tn_middleware_src); filter(f_tnremote); destination(loghost); };\n'
result += 'log { source(s_src); filter(f_tnremote); destination(loghost); };\n'
return result
Expand Down Expand Up @@ -92,11 +99,8 @@ options {
##################
source s_src { system(); internal(); };

source tn_remote_src_files {
file("/var/log/middlewared.log");
file("/var/log/failover.log");
file("/var/log/fenced.log");
file("/var/log/zettarepl.log");
source tn_middleware_src {
unix-stream("${DEFAULT_SYSLOG_PATH}" create-dirs(yes) perm(0600));
};

##################
Expand Down Expand Up @@ -124,6 +128,17 @@ log {
flags(final);
};


#######################
# Middlewared-related log files
########################
% for tnlog in ALL_LOG_FILES:
log {
source(tn_middleware_src); filter(f_${tnlog.name or "middleware"});
destination { file(${tnlog.logfile} ${syslog_template}); };
};
% endfor

log { source(s_src); filter(f_auth); destination(d_auth); };
log { source(s_src); filter(f_cron); destination(d_cron); };
log { source(s_src); filter(f_daemon); destination(d_daemon); };
Expand Down
251 changes: 214 additions & 37 deletions src/middlewared/middlewared/logger.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import logging
import logging.handlers
import os
import queue
import socket
import typing
import warnings
from .logging.console_formatter import ConsoleLogFormatter
from collections import deque
from dataclasses import dataclass

# markdown debug is also considered useless
logging.getLogger('MARKDOWN').setLevel(logging.INFO)
Expand Down Expand Up @@ -56,8 +58,16 @@
DOCKER_IMAGE_LOGFILE = '/var/log/docker_image.log'
FAILOVER_LOGFILE = '/var/log/failover.log'
LOGFILE = '/var/log/middlewared.log'
DEFAULT_LOGFORMAT = '[%(asctime)s] (%(levelname)s) %(name)s.%(funcName)s():%(lineno)d - %(message)s'
FALLBACK_LOGFILE = '/var/log/fallback-middlewared.log'
NETDATA_API_LOGFILE = '/var/log/netdata_api.log'
ZETTAREPL_LOGFILE = '/var/log/zettarepl.log'
ZETTAREPL_LOGFORMAT = '[%(asctime)s] %(levelname)-8s [%(threadName)s] [%(name)s] %(message)s'

DEFAULT_IDENT = 'MIDDLEWARE: '
MIDDLEWARE_AUDIT_IDENT = 'TNAUDIT_MIDDLEWARE: '
DEFAULT_SYSLOG_PATH = '/var/run/syslog-ng/middleware.sock'
DEFAULT_PENDING_QUEUE_LEN = 4096


def trace(self, message, *args, **kws):
Expand All @@ -69,14 +79,199 @@ def trace(self, message, *args, **kws):
logging.Logger.trace = trace


class Logger:
"""Pseudo-Class for Logger - Wrapper for logging module"""
@dataclass(slots=True, frozen=True)
class TNLog:
name: str | None
logfile: str
logformat: str = DEFAULT_LOGFORMAT
pending_maxlen: int | None = DEFAULT_PENDING_QUEUE_LEN

def get_ident(self):
if self.name is None:
return DEFAULT_IDENT

return f'{self.name.upper()}: '


# NOTE if new separate log file needs to be added, create a new TNLog
# object and append to ALL_LOG_FILES tuple. These files are read by the
# syslog-ng config generation scripts and automatically handled.
TNLOG_MIDDLEWARE = TNLog(None, LOGFILE)
TNLOG_APP_LIFECYCLE = TNLog('app_lifecycle', APP_LIFECYCLE_LOGFILE)
TNLOG_APP_MIGRATION = TNLog('app_migration', APP_MIGRATION_LOGFILE)
TNLOG_DOCKER_IMAGE = TNLog('docker_image', DOCKER_IMAGE_LOGFILE)
TNLOG_FAILOVER = TNLog('failover', FAILOVER_LOGFILE)
TNLOG_NETDATA_API = TNLog('netdata_api', NETDATA_API_LOGFILE)
TNLOG_ZETTAREPL = TNLog('zettarepl', ZETTAREPL_LOGFILE, ZETTAREPL_LOGFORMAT)

# NOTE: this is also consumed by tests/unit/test_logger.py, which validates
# the auto-generated syslog-ng rules place messages in the correct log files.
ALL_LOG_FILES = (
TNLOG_MIDDLEWARE,
TNLOG_APP_LIFECYCLE,
TNLOG_APP_MIGRATION,
TNLOG_DOCKER_IMAGE,
TNLOG_FAILOVER,
TNLOG_NETDATA_API,
TNLOG_ZETTAREPL,
)

# Audit entries are inserted into audit databases in /audit rather than
# written to files in /var/log and so they are not members of ALL_LOG_FILES
MIDDLEWARE_TNAUDIT = TNLog('TNAUDIT_MIDDLEWARE', '', '', None)


class TNSyslogHandler(logging.handlers.SysLogHandler):
def __init__(
self, application_name: str, debug_level: typing.Optional[str]=None,
log_format: str='[%(asctime)s] (%(levelname)s) %(name)s.%(funcName)s():%(lineno)d - %(message)s'
self,
address: str = DEFAULT_SYSLOG_PATH,
pending_queue: deque | None = None
):
"""
address - path to Unix socket (should be defined in syslog-ng config)
pending_queue - deque object that will be used for storing failed
LogRecords if syslog is currently down.
Note: maxlen should be set unless one wants to queue the log messages
without loss until syslog connection restored. This is probably
desireable for auditing, but not for general purposes (where it's
better to just specify a fallback handler).
"""
self.pending_queue = pending_queue
self.fallback_handler = None
super().__init__(address, socktype=socket.SOCK_STREAM)

def drain_pending_queue(self) -> bool:
"""
Attempt to emit any log records that have been queued up due to logging
failures to the syslog socket.
Returns:
True if successfully drained entire queue else False
Raises:
Should not raise exceptions
"""
while self.pending_queue:
record = self.pending_queue.popleft()
try:
super().emit(record)
except Exception:
# Nope. Still dead. Put it back where we found it
self.pending_queue.appendleft(record)
return False

return True

def fallback(self, record: logging.LogRecord) -> None:
"""
Fallback logging mechanism in case the syslog target is down.
In this case we emit the log record to the fallback handler and ignore
any errors.
Returns:
None
Raises:
Should not raise exceptions
"""
if not self.fallback_handler:
return

try:
self.fallback_handler.emit(record)
except Exception:
pass

def emit(self, record: logging.LogRecord) -> None:
"""
Emit a LogRecord to syslog. If this fails then add to pending queue and
emit via our fallback handler.
"""

# First attempt to drain the pending queue
if not self.drain_pending_queue():
# Failed to drain our pending queue so add this record to the
# ever-growing deque
self.pending_queue.append(record)
self.fallback(record)
return

try:
super().emit(record)
except Exception:
# logging framework done broke. Queue up
# for drain on next auditd message handled
self.pending_queue.append(record)
self.fallback(record)

def handleError(self, record: logging.LogRecord) -> None:
"""
Override the default syslog error handler if we have a pending_queue to
defined. Exception raised here passes back up to the the emit() call.
"""
# re-raise it back up to the emit call
if self.pending_queue is None:
return super().handleError(record)

raise

def set_fallback_handler(self, fallback: logging.Handler) -> None:
""" Set a fallback handler (for example to file) that will be used if syslog socket logging fails """
if not isinstance(fallback, logging.Handler):
raise TypeError(f'{fallback}: not a logging.Handler')

self.fallback_handler = fallback

def close(self) -> None:
# Close our socket
super().close()

if self.fallback_handler:
# close any open file handler
self.fallback_handler.close()
self.fallback_handler = None


def setup_syslog_handler(tnlog: TNLog, fallback: logging.Handler | None) -> logging.Logger:
# Use `QueueHandler` to avoid blocking IO in asyncio main loop
log_queue = queue.Queue()
queue_handler = logging.handlers.QueueHandler(log_queue)

# Set up syslog handler with deque to store failed messages until
# they can be flushed. This can happen if syslog-ng isn't ready yet.
syslog_handler = TNSyslogHandler(pending_queue=deque(maxlen=tnlog.pending_maxlen))
syslog_handler.setLevel(logging.DEBUG)

if tnlog.logformat:
syslog_handler.setFormatter(logging.Formatter(tnlog.logformat, '%Y/%m/%d %H:%M:%S'))

# Set ident for the logger. This becomes program name in syslog-ng and allows
# more precise filtering rules
syslog_handler.ident = tnlog.get_ident()

# Set fallback for case where syslog is broken. This does not need separate queue
# since emit will happen in separate thread from main loop.
if fallback:
syslog_handler.set_fallback_handler(fallback)

queue_listener = logging.handlers.QueueListener(log_queue, syslog_handler)
queue_listener.start()
logger = logging.getLogger(tnlog.name)
logger.addHandler(queue_handler)
if tnlog.name is not None:
logging.getLogger(tnlog.name).propagate = False

return logger


class Logger:
"""Pseudo-Class for Logger - Wrapper for logging module"""
def __init__(self, application_name: str, debug_level: str = 'DEBUG', log_format: str = DEFAULT_LOGFORMAT):
self.application_name = application_name
self.debug_level = debug_level or 'DEBUG'
self.debug_level = debug_level
self.log_format = log_format

def getLogger(self):
Expand All @@ -94,40 +289,22 @@ def configure_logging(self, output_option: str):
console_handler.setFormatter(ConsoleLogFormatter(self.log_format, datefmt=time_format))
logging.root.addHandler(console_handler)
else:
for name, filename, log_format in [
(None, LOGFILE, self.log_format),
('app_lifecycle', APP_LIFECYCLE_LOGFILE, self.log_format),
('app_migrations', APP_MIGRATION_LOGFILE, self.log_format),
('docker_image', DOCKER_IMAGE_LOGFILE, self.log_format),
('failover', FAILOVER_LOGFILE, self.log_format),
('netdata_api', NETDATA_API_LOGFILE, self.log_format),
('zettarepl', ZETTAREPL_LOGFILE,
'[%(asctime)s] %(levelname)-8s [%(threadName)s] [%(name)s] %(message)s'),
]:
self.setup_file_logger(name, filename, log_format)
# Set up our fallback logging mechanism (to file) in case syslog is broken
# We internally queue writes to regular log files while waiting for syslog to recover
# while simultaneously writing to the fallback file so that there is less potential to
# lose relevant logs.
fallback_handler = logging.handlers.RotatingFileHandler(FALLBACK_LOGFILE, 'a', 10485760, 5, 'utf-8')
fallback_handler.setLevel(logging.DEBUG)
fallback_handler.setFormatter(logging.Formatter(DEFAULT_LOGFORMAT, '%Y/%m/%d %H:%M:%S'))

for tnlog in ALL_LOG_FILES:
setup_syslog_handler(tnlog, fallback_handler)

logging.root.setLevel(getattr(logging, self.debug_level))

def setup_file_logger(self, name: typing.Optional[str], filename: str, log_format: typing.Optional[str]):
# Use `QueueHandler` to avoid blocking IO in asyncio main loop
log_queue = queue.Queue()
queue_handler = logging.handlers.QueueHandler(log_queue)
file_handler = logging.handlers.RotatingFileHandler(filename, 'a', 10485760, 5, 'utf-8')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(logging.Formatter(log_format, '%Y/%m/%d %H:%M:%S'))
queue_listener = logging.handlers.QueueListener(log_queue, file_handler)
queue_listener.start()
logging.getLogger(name).addHandler(queue_handler)
if name is not None:
logging.getLogger(name).propagate = False

# Make sure various log files are not readable by everybody.
# umask could be another approach but chmod was chosen so
# it affects existing installs.
try:
os.chmod(filename, 0o640)
except OSError:
pass

def setup_audit_logging() -> logging.Logger:
return setup_syslog_handler(MIDDLEWARE_TNAUDIT, None)


def setup_logging(name: str, debug_level: typing.Optional[str], log_handler: typing.Optional[str]):
Expand Down
Loading

0 comments on commit 07ef975

Please sign in to comment.