Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NAS-133245 / 25.04 / Switch to using syslog for middleware logfiles #15264

Merged
merged 3 commits into from
Dec 24, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/middlewared/debian/control
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ Depends: alembic,
python3-acme,
python3-aiohttp,
python3-aiorwlock,
python3-anyio,
python3-apps-validation,
python3-asyncssh,
python3-aws-requests-auth,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,11 @@ ${textwrap.indent(get_db(svc), ' ')}

% if not audit_custom_section(svc, 'log'):
log {
% if svc == 'MIDDLEWARE':
source(tn_middleware_src);
% else:
source(s_src);
% endif
filter(f_tnaudit_${svc.lower()});
parser(p_tnaudit);
rewrite(r_rewrite_success);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<%
from middlewared.plugins.audit.utils import audit_program, AUDITED_SERVICES
from middlewared.logger import ALL_LOG_FILES

adv_conf = render_ctx['system.advanced.config']

Expand Down Expand Up @@ -53,6 +54,11 @@ filter f_scst {
program("kernel") and match("dlm:" value("MESSAGE"));
};

# TrueNAS middleware filters
% for tnlog in ALL_LOG_FILES:
filter f_${tnlog.name or "middleware"} { program("${tnlog.get_ident()[:-2]}"); };
% endfor

# Temporary SNMP filter: NAS-129124
filter f_snmp {
program("snmpd") and match("unexpected header length" value("MESSAGE"));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
<%
from middlewared.logger import DEFAULT_SYSLOG_PATH, ALL_LOG_FILES

logger = middleware.logger

# The messages coming in via middleware are already formatted by logger
# and so we don't want to do additional formatting.
syslog_template = 'template("${MESSAGE}\n")'


def generate_syslog_remote_destination(advanced_config):
result = ""
syslog_server = advanced_config["syslogserver"]
Expand Down Expand Up @@ -63,7 +70,7 @@ def generate_syslog_remote_destination(advanced_config):


result += ' };\n'
result += 'log { source(tn_remote_src_files); filter(f_tnremote); destination(loghost); };\n'
result += 'log { source(tn_middleware_src); filter(f_tnremote); destination(loghost); };\n'
result += 'log { source(s_src); filter(f_tnremote); destination(loghost); };\n'

return result
Expand Down Expand Up @@ -92,11 +99,8 @@ options {
##################
source s_src { system(); internal(); };

source tn_remote_src_files {
file("/var/log/middlewared.log");
file("/var/log/failover.log");
file("/var/log/fenced.log");
file("/var/log/zettarepl.log");
source tn_middleware_src {
unix-stream("${DEFAULT_SYSLOG_PATH}" create-dirs(yes) perm(0600));
};

##################
Expand Down Expand Up @@ -124,6 +128,17 @@ log {
flags(final);
};


#######################
# Middlewared-related log files
########################
% for tnlog in ALL_LOG_FILES:
log {
source(tn_middleware_src); filter(f_${tnlog.name or "middleware"});
destination { file(${tnlog.logfile} ${syslog_template}); };
};
% endfor

log { source(s_src); filter(f_auth); destination(d_auth); };
log { source(s_src); filter(f_cron); destination(d_cron); };
log { source(s_src); filter(f_daemon); destination(d_daemon); };
Expand Down
249 changes: 212 additions & 37 deletions src/middlewared/middlewared/logger.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import logging
import logging.handlers
import os
import queue
import socket
import typing
import warnings
from .logging.console_formatter import ConsoleLogFormatter
from collections import deque
from dataclasses import dataclass

# markdown debug is also considered useless
logging.getLogger('MARKDOWN').setLevel(logging.INFO)
Expand Down Expand Up @@ -56,8 +58,16 @@
DOCKER_IMAGE_LOGFILE = '/var/log/docker_image.log'
FAILOVER_LOGFILE = '/var/log/failover.log'
LOGFILE = '/var/log/middlewared.log'
DEFAULT_LOGFORMAT = '[%(asctime)s] (%(levelname)s) %(name)s.%(funcName)s():%(lineno)d - %(message)s'
FALLBACK_LOGFILE = '/var/log/fallback-middlewared.log'
NETDATA_API_LOGFILE = '/var/log/netdata_api.log'
ZETTAREPL_LOGFILE = '/var/log/zettarepl.log'
ZETTAREPL_LOGFORMAT = '[%(asctime)s] %(levelname)-8s [%(threadName)s] [%(name)s] %(message)s'

DEFAULT_IDENT = 'MIDDLEWARE: '
MIDDLEWARE_AUDIT_IDENT = 'TNAUDIT_MIDDLEWARE: '
DEFAULT_SYSLOG_PATH = '/var/run/syslog-ng/middleware.sock'
DEFAULT_PENDING_QUEUE_LEN = 4096


def trace(self, message, *args, **kws):
Expand All @@ -69,14 +79,197 @@ def trace(self, message, *args, **kws):
logging.Logger.trace = trace


class Logger:
"""Pseudo-Class for Logger - Wrapper for logging module"""
@dataclass(slots=True, frozen=True)
class TNLog:
name: str | None
logfile: str
logformat: str = DEFAULT_LOGFORMAT
pending_maxlen: int | None = DEFAULT_PENDING_QUEUE_LEN

def get_ident(self):
if self.name is None:
return DEFAULT_IDENT

return f'{self.name.upper()}: '


# NOTE if new separate log file needs to be added, create a new TNLog
# object and append to ALL_LOG_FILES tuple. These files are read by the
# syslog-ng config generation scripts and automatically handled.
TNLOG_MIDDLEWARE = TNLog(None, LOGFILE)
TNLOG_APP_LIFECYCLE = TNLog('app_lifecycle', APP_LIFECYCLE_LOGFILE)
TNLOG_APP_MIGRATION = TNLog('app_migration', APP_MIGRATION_LOGFILE)
TNLOG_DOCKER_IMAGE = TNLog('docker_image', DOCKER_IMAGE_LOGFILE)
TNLOG_FAILOVER = TNLog('failover', FAILOVER_LOGFILE)
TNLOG_NETDATA_API = TNLog('netdata_api', NETDATA_API_LOGFILE)
TNLOG_ZETTAREPL = TNLog('zettarepl', ZETTAREPL_LOGFILE, ZETTAREPL_LOGFORMAT)

ALL_LOG_FILES = (
TNLOG_MIDDLEWARE,
TNLOG_APP_LIFECYCLE,
TNLOG_APP_MIGRATION,
TNLOG_DOCKER_IMAGE,
TNLOG_FAILOVER,
TNLOG_NETDATA_API,
TNLOG_ZETTAREPL,
)

# Audit entries are inserted into audit databases in /audit rather than
# written to files in /var/log and so they are not members of ALL_LOG_FILES
MIDDLEWARE_TNAUDIT = TNLog('TNAUDIT_MIDDLEWARE', '', '', None)


class TNSyslogHandler(logging.handlers.SysLogHandler):
def __init__(
self, application_name: str, debug_level: typing.Optional[str]=None,
log_format: str='[%(asctime)s] (%(levelname)s) %(name)s.%(funcName)s():%(lineno)d - %(message)s'
self,
address: str = DEFAULT_SYSLOG_PATH,
pending_queue: deque | None = None
):
"""
address - path to Unix socket (should be defined in syslog-ng config)

pending_queue - deque object that will be used for storing failed
LogRecords if syslog is currently down.

Note: maxlen should be set unless one wants to queue the log messages
without loss until syslog connection restored. This is probably
desireable for auditing, but not for general purposes (where it's
better to just specify a fallback handler).
"""
self.pending_queue = pending_queue
self.fallback_handler = None
super().__init__(address, socktype=socket.SOCK_STREAM)

def drain_pending_queue(self) -> bool:
"""
Attempt to emit any log records that have been queued up due to logging
failures to the syslog socket.

Returns:
True if successfully drained entire queue else False

Raises:
Should not raise exceptions
"""
while self.pending_queue:
record = self.pending_queue.popleft()
try:
super().emit(record)
except Exception:
# Nope. Still dead. Put it back where we found it
self.pending_queue.appendleft(record)
return False

return True

def fallback(self, record: logging.LogRecord) -> None:
"""
Fallback logging mechanism in case the syslog target is down.

In this case we emit the log record to the fallback handler and ignore
any errors.

Returns:
None

Raises:
Should not raise exceptions
"""
if not self.fallback_handler:
return

try:
self.fallback_handler.emit(record)
except Exception:
pass

def emit(self, record: logging.LogRecord) -> None:
"""
Emit a LogRecord to syslog. If this fails then add to pending queue and
emit via our fallback handler.
"""

# First attempt to drain the pending queue
if not self.drain_pending_queue():
# Failed to drain our pending queue so add this record to the
# ever-growing deque
self.pending_queue.append(record)
self.fallback(record)
return

try:
super().emit(record)
except Exception:
# logging framework done broke. Queue up
# for drain on next auditd message handled
self.pending_queue.append(record)
yocalebo marked this conversation as resolved.
Show resolved Hide resolved
self.fallback(record)

def handleError(self, record: logging.LogRecord) -> None:
"""
Override the default syslog error handler if we have a pending_queue to
defined. Exception raised here passes back up to the the emit() call.
"""
# re-raise it back up to the emit call
if self.pending_queue is None:
return super().handleError(record)

raise

def set_fallback_handler(self, fallback: logging.Handler) -> None:
""" Set a fallback handler (for example to file) that will be used if syslog socket logging fails """
if not isinstance(fallback, logging.Handler):
raise TypeError(f'{fallback}: not a logging.Handler')

self.fallback_handler = fallback

def close(self) -> None:
# Close our socket
super().close()

if self.fallback_handler:
# close any open file handler
self.fallback_handler.close()
self.fallback_handler = None


def setup_syslog_handler(tnlog: TNLog, fallback: logging.Handler | None) -> logging.Logger:
# Use `QueueHandler` to avoid blocking IO in asyncio main loop
log_queue = queue.Queue()
queue_handler = logging.handlers.QueueHandler(log_queue)

# Set up syslog handler with deque to store failed messages until
# they can be flushed. This can happen if syslog-ng isn't ready yet.
syslog_handler = TNSyslogHandler(pending_queue=deque(maxlen=tnlog.pending_maxlen))
syslog_handler.setLevel(logging.DEBUG)

if tnlog.logformat:
syslog_handler.setFormatter(logging.Formatter(tnlog.logformat, '%Y/%m/%d %H:%M:%S'))

# Set ident for the logger. This becomes program name in syslog-ng and allows
# more precise filtering rules
syslog_handler.ident = tnlog.get_ident()

# Set fallback for case where syslog is broken. This does not need separate queue
# since emit will happen in separate thread from main loop.
if fallback:
syslog_handler.set_fallback_handler(fallback)

queue_listener = logging.handlers.QueueListener(log_queue, syslog_handler)
queue_listener.start()
logger = logging.getLogger(tnlog.name)
logger.addHandler(queue_handler)
if tnlog.name is not None:
logging.getLogger(tnlog.name).propagate = False

return logger


class Logger:
"""Pseudo-Class for Logger - Wrapper for logging module"""
def __init__(self, application_name: str, debug_level: str = 'DEBUG', log_format: str = DEFAULT_LOGFORMAT):
self.application_name = application_name
self.debug_level = debug_level or 'DEBUG'
self.debug_level = debug_level
self.log_format = log_format

def getLogger(self):
Expand All @@ -94,40 +287,22 @@ def configure_logging(self, output_option: str):
console_handler.setFormatter(ConsoleLogFormatter(self.log_format, datefmt=time_format))
logging.root.addHandler(console_handler)
else:
for name, filename, log_format in [
(None, LOGFILE, self.log_format),
('app_lifecycle', APP_LIFECYCLE_LOGFILE, self.log_format),
('app_migrations', APP_MIGRATION_LOGFILE, self.log_format),
('docker_image', DOCKER_IMAGE_LOGFILE, self.log_format),
('failover', FAILOVER_LOGFILE, self.log_format),
('netdata_api', NETDATA_API_LOGFILE, self.log_format),
('zettarepl', ZETTAREPL_LOGFILE,
'[%(asctime)s] %(levelname)-8s [%(threadName)s] [%(name)s] %(message)s'),
]:
self.setup_file_logger(name, filename, log_format)
# Set up our fallback logging mechanism (to file) in case syslog is broken
# We internally queue writes to regular log files while waiting for syslog to recover
# while simultaneously writing to the fallback file so that there is less potential to
# lose relevant logs.
fallback_handler = logging.handlers.RotatingFileHandler(FALLBACK_LOGFILE, 'a', 10485760, 5, 'utf-8')
fallback_handler.setLevel(logging.DEBUG)
fallback_handler.setFormatter(logging.Formatter(DEFAULT_LOGFORMAT, '%Y/%m/%d %H:%M:%S'))

for tnlog in ALL_LOG_FILES:
setup_syslog_handler(tnlog, fallback_handler)

logging.root.setLevel(getattr(logging, self.debug_level))

def setup_file_logger(self, name: typing.Optional[str], filename: str, log_format: typing.Optional[str]):
# Use `QueueHandler` to avoid blocking IO in asyncio main loop
log_queue = queue.Queue()
queue_handler = logging.handlers.QueueHandler(log_queue)
file_handler = logging.handlers.RotatingFileHandler(filename, 'a', 10485760, 5, 'utf-8')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(logging.Formatter(log_format, '%Y/%m/%d %H:%M:%S'))
queue_listener = logging.handlers.QueueListener(log_queue, file_handler)
queue_listener.start()
logging.getLogger(name).addHandler(queue_handler)
if name is not None:
logging.getLogger(name).propagate = False

# Make sure various log files are not readable by everybody.
# umask could be another approach but chmod was chosen so
# it affects existing installs.
try:
os.chmod(filename, 0o640)
except OSError:
pass

def setup_audit_logging() -> logging.Logger:
return setup_syslog_handler(MIDDLEWARE_TNAUDIT, None)


def setup_logging(name: str, debug_level: typing.Optional[str], log_handler: typing.Optional[str]):
Expand Down
Loading
Loading