Доведен до ума и стабилизирован модуль ZFS

This commit is contained in:
2026-02-24 14:35:26 +03:00
parent f227824070
commit 347d7388b2
5 changed files with 261 additions and 24 deletions

View File

@@ -43,6 +43,10 @@ class TelegramHandler(logging.Handler):
def __init__(self, config: Dict[str, Any]) -> None:
super().__init__()
self._config = config
self._suppress_levels: List[str] = []
batch = config.get("batch_notifications", {})
if batch.get("enabled"):
self._suppress_levels = [str(x).upper() for x in batch.get("suppress_individual_logs", [])]
self._queue: queue.Queue = queue.Queue(maxsize=1000)
self._bot_token = config.get("bot_token", "")
self._chat_id = str(config.get("chat_id", ""))
@@ -70,11 +74,13 @@ class TelegramHandler(logging.Handler):
def emit(self, record: logging.LogRecord) -> None:
if not self._bot_token or not self._chat_id:
return
levelname = str(getattr(record, "levelname", record.levelname)).upper()
if self._suppress_levels and levelname in self._suppress_levels:
return
if not self._level_allowed(record):
return
try:
msg = self.format(record)
# Экранируем HTML при parse_mode=HTML
if self._parse_mode == "HTML":
msg = msg.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
self._queue.put_nowait(msg)
@@ -183,11 +189,68 @@ class TelegramHandler(logging.Handler):
return True
return False
def flush(self, timeout: float = 15.0) -> None:
"""Дождаться отправки всех сообщений из очереди перед выходом программы."""
deadline = time.monotonic() + timeout
while not self._queue.empty() and time.monotonic() < deadline:
time.sleep(0.2)
self._stop.set()
self._worker.join(timeout=5.0)
def close(self) -> None:
self._stop.set()
super().close()
def flush_telegram_handlers(timeout: float = 15.0) -> None:
"""Ожидание отправки всех сообщений в Telegram перед выходом (вызывать в конце main)."""
root = logging.getLogger(ROOT_LOGGER_NAME)
for h in root.handlers:
if isinstance(h, TelegramHandler):
h.flush(timeout=timeout)
def send_batch_notification(telegram_config: Optional[Dict[str, Any]], text: str) -> None:
"""
Отправить структурированное batch-сообщение напрямую в Telegram (PRD v1.9).
Не идёт через очередь, отправка синхронная с retry.
"""
if not telegram_config or not telegram_config.get("enabled"):
return
token = telegram_config.get("bot_token")
chat_id = telegram_config.get("chat_id")
if not token or not chat_id:
return
retry_cfg = telegram_config.get("retry", {})
attempts = retry_cfg.get("attempts", 3)
delay = retry_cfg.get("delay", 5)
opts = telegram_config.get("options", {})
parse_mode = opts.get("parse_mode", "HTML")
max_len = opts.get("max_message_length", 4096)
if parse_mode == "HTML":
text = text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
parts = [text[i : i + max_len] for i in range(0, len(text), max_len)] if len(text) > max_len else [text]
import requests
url = f"https://api.telegram.org/bot{token}/sendMessage"
for part in parts:
for attempt in range(1, attempts + 1):
try:
resp = requests.post(
url,
data={"chat_id": chat_id, "text": part, "parse_mode": parse_mode},
timeout=10,
)
resp.raise_for_status()
if not resp.json().get("ok"):
raise RuntimeError(resp.json().get("description", "Unknown"))
break
except Exception as e:
if attempt == attempts:
sys.stderr.write(f"[send_batch_notification] Не удалось отправить: {e}\n")
return
time.sleep(delay)
def _parse_level(level: str) -> int:
m = {
"DEBUG": logging.DEBUG,

View File

@@ -10,6 +10,9 @@ import re
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple
# Результат batch-операции: (успешно, список ошибок)
# ошибка = (server/dataset_key, сообщение) — например "gwo2.mps.cln.su/containers/smb2"
from .logger import get_logger
from .protocols import SSHProtocol
from .ssh_base import SSHBase
@@ -130,14 +133,18 @@ def backup_server(
server: Dict[str, Any],
ssh_defaults: Dict[str, Any],
log: logging.Logger,
) -> bool:
collect_errors: bool = False,
) -> Tuple[int, List[Tuple[str, str]]]:
"""
Выполнить бэкап для одного сервера: снапшоты, send/recv, очистка.
Конфиг v1.1: server['pools'] — список пулов с source_pool, datasets, target_pool.
Возвращает (успешно, список_ошибок), где ошибка = (ключ "server/dataset", сообщение).
"""
name = server["name"]
pools = server["pools"]
retention_days = int(server.get("retention_days", 30))
success_count = 0
errors: List[Tuple[str, str]] = []
port = ssh_defaults.get("port", 22)
username = ssh_defaults.get("username", "root")
@@ -148,27 +155,35 @@ def backup_server(
log.info("Сервер %s: дата снапшотов %s", name, date_str)
ssh = SSHBase(hostname=name, port=port, username=username, pkey_file=pkey_file, host_keys=host_keys)
ssh.connect()
pool_counts: Dict[str, int] = {}
total_datasets = 0
try:
ssh.connect()
except Exception as e:
for pool_config in pools:
for dataset in pool_config["datasets"]:
errors.append((f"{name}/{dataset}", str(e)))
return 0, errors
try:
for pool_config in pools:
source_pool = pool_config["source_pool"]
datasets = pool_config["datasets"]
target_pool = pool_config["target_pool"]
pool_counts[source_pool] = len(datasets)
total_datasets += len(datasets)
for dataset in datasets:
full_dataset = f"{source_pool}/{dataset}"
key = f"{name}/{dataset}"
def do_snapshot(fd=full_dataset, d=date_str):
if not _create_snapshot(ssh, fd, d, log):
raise RuntimeError(f"Snapshot {fd}@{d} failed")
_run_with_retry(log, name, f"snapshot {full_dataset}", do_snapshot)
try:
_run_with_retry(log, name, f"snapshot {full_dataset}", do_snapshot)
except Exception as e:
errors.append((key, str(e)))
if not collect_errors:
raise
continue
for pool_config in pools:
source_pool = pool_config["source_pool"]
@@ -178,6 +193,7 @@ def backup_server(
for dataset in datasets:
full_dataset = f"{source_pool}/{dataset}"
target_dataset = f"{target_pool}/{dataset}"
key = f"{name}/{dataset}"
def do_send_recv(fd=full_dataset, td=target_dataset, d=date_str):
prev_date = _get_previous_snapshot_date(ssh, td)
@@ -188,14 +204,20 @@ def backup_server(
if not _replicate_snapshot(ssh, fd, td, d, prev_date, log):
raise RuntimeError(f"Replicate {fd}@{d}{td} failed")
_run_with_retry(log, name, f"send/recv {full_dataset}", do_send_recv)
try:
_run_with_retry(log, name, f"send/recv {full_dataset}", do_send_recv)
success_count += 1
except Exception as e:
errors.append((key, str(e)))
if not collect_errors:
raise
continue
cutoff = datetime.now() - timedelta(days=retention_days)
for pool_config in pools:
source_pool = pool_config["source_pool"]
datasets = pool_config["datasets"]
target_pool = pool_config["target_pool"]
for dataset in datasets:
for pool, ds in [(source_pool, dataset), (target_pool, dataset)]:
full_ds = f"{pool}/{ds}"
@@ -218,9 +240,9 @@ def backup_server(
except ValueError:
continue
parts = ", ".join(f"{p}:{c}" for p, c in sorted(pool_counts.items()))
log.info("%s: %s datasets backed up (%s)", name, total_datasets, parts)
return True
if not collect_errors:
success_count = sum(len(pc["datasets"]) for pc in pools)
return success_count, errors
finally:
ssh.close()