feature: Reorganize files, add server backoff

This commit is contained in:
Keith Solomon
2025-10-19 14:54:48 -05:00
parent d41200d4e2
commit 14490a0f71
5 changed files with 344 additions and 237 deletions

View File

@@ -24,10 +24,12 @@ Designed for **homelab** and **server** environments. Script file is named `cifs
## How It Works
1. **Discovery** — scans `/etc/fstab` for uncommented `cifs` entries like:
```
```ini
//nas.local/media /mnt/media cifs vers=3.0,credentials=/root/.smbcreds 0 0
//192.168.1.50/share /mnt/share cifs credentials=/root/.creds,iocharset=utf8 0 0
```
2. **Reachability** — ensures the server resolves, optionally pings, and has TCP/445 open.
3. **Health probe** — times a quick `ls` against the mountpoint.
4. **Repair** — remount or unmount/remount as needed, with retries and logging.
@@ -37,34 +39,44 @@ Designed for **homelab** and **server** environments. Script file is named `cifs
## First-Run Setup
1. **Install prerequisites**
```bash
sudo apt install cifs-utils
```
2. **Credentials file**
```bash
sudo nano /root/.smbcreds
```
```
```ini
username=myuser
password=mypassword
domain=MYDOMAIN # optional
```
```bash
sudo chmod 600 /root/.smbcreds
```
3. **Add to `/etc/fstab`**
```
```ini
//192.168.1.10/media /mnt/media cifs vers=3.0,credentials=/root/.smbcreds,uid=1000,gid=1000,file_mode=0644,dir_mode=0755 0 0
```
> Avoid `noauto` if you want the script to manage the mount.
4. **Test manually**
```bash
sudo mount -a
sudo ls /mnt/media
```
5. **Verify connectivity**
```bash
ping -c 2 192.168.1.10
nc -zv 192.168.1.10 445
@@ -80,12 +92,14 @@ sudo touch /var/log/cifs-remount.log && sudo chmod 600 /var/log/cifs-remount.log
```
**Test it:**
```bash
sudo /usr/local/sbin/cifs-watch --dry-run --verbose
sudo /usr/local/sbin/cifs-watch --verbose
```
Logs:
- `/var/log/cifs-remount.log`
- `journalctl -t cifs-watch` or `journalctl -u cifs-watch.service`
@@ -96,6 +110,7 @@ Logs:
Create the following two files:
**`/etc/systemd/system/cifs-watch.service`**
```ini
[Unit]
Description=Monitor and repair CIFS mounts
@@ -107,6 +122,7 @@ Nice=10
```
**`/etc/systemd/system/cifs-watch.timer`**
```ini
[Unit]
Description=Run cifs-watch periodically
@@ -122,6 +138,7 @@ WantedBy=timers.target
```
Enable and start:
```bash
sudo systemctl daemon-reload
sudo systemctl enable --now cifs-watch.timer
@@ -149,7 +166,8 @@ Prevents hammering an offline NAS with constant retries.
- Backoff state is kept in `/var/tmp`, which normally persists across reboots; delete the state file to clear it manually.
Example log flow:
```
```log
[WARN] Server NOT reachable: nas.local (skipping /mnt/media)
[INFO] Backoff started: will not retry nas.local for 10 minutes
...
@@ -171,7 +189,7 @@ Example log flow:
## Cron Alternative
```
```ini
*/5 * * * * /usr/local/sbin/cifs-watch >/dev/null 2>&1
```
@@ -188,4 +206,5 @@ Example log flow:
## License
MIT — see `LICENSE`.
Released under the [Unlicense](https://unlicense.org/).
You can do whatever you want with this code. No warranty provided.

319
cifs-watch Normal file
View File

@@ -0,0 +1,319 @@
#!/usr/bin/env bash
# Monitor CIFS mounts from /etc/fstab and (re)mount if needed.
# Designed for cron/systemd timer. Requires root.
# Includes per-server backoff to avoid hammering offline hosts.
# Strict mode: -E lets ERR traps fire inside functions, -e exits on an
# unhandled failure, -u rejects unset variables, and pipefail fails a
# pipeline when any stage fails.
set -Eeuo pipefail
# Word-split only on newline and tab so values containing spaces stay whole.
IFS=$'\n\t'
# ---------------------------
# Config (edit if desired)
# ---------------------------
LOGFILE="/var/log/cifs-remount.log" # Set empty ("") to disable file logging.
PING_COUNT=1 # echo requests per host (passed to ping -c)
PING_TIMEOUT=1 # seconds
TCP_TIMEOUT=2 # seconds for port 445 check
PROBE_TIMEOUT=2 # seconds to test mount health (listing)
REMOUNT_RETRY=2 # attempts
SLEEP_BETWEEN=1 # seconds between retries
SYSLOG_TAG="cifs-watch" # tag handed to logger -t
DRY_RUN=0 # set to 1 by -n/--dry-run
VERBOSE=0 # set to 1 by -v/--verbose
# ---------------------------
# Optional backoff (per-server)
# ---------------------------
# Line-delimited JSON; last entry wins for each host
# NOTE(review): /var/tmp normally survives reboots, so recorded backoff
# state may outlive a restart — confirm this is intended.
BACKOFF_FILE="/var/tmp/cifs-watch-backoff.json"
BACKOFF_MINUTES=10 # set to 0 to disable
# ---------------------------
# Helpers
# ---------------------------
# Write one log record to stdout, syslog and (optionally) the logfile.
# Arguments: $1 - level (DEBUG/INFO/WARN/ERROR); remaining args - message
# Globals:   VERBOSE, SYSLOG_TAG, LOGFILE (read)
# DEBUG records reach stdout only when VERBOSE is 1. Syslog and logfile
# writes are best-effort and never fail the caller.
log() {
  local severity="$1"
  shift
  local text="$*"
  local stamp record
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  record="[$stamp] [$severity] $text"

  # stdout when verbose or for non-debug levels
  if [[ "$VERBOSE" -eq 1 || "$severity" != "DEBUG" ]]; then
    printf '%s\n' "$record"
  fi

  # syslog: the facility.priority is derived from the lower-cased level
  logger -t "${SYSLOG_TAG}[$$]" \
    -p "user.$(tr '[:upper:]' '[:lower:]' <<<"$severity")" -- "$text" || true

  # logfile: written in a subshell so the restrictive umask does not leak
  [[ -n "$LOGFILE" ]] || return 0
  (
    umask 0077
    printf '%s\n' "$record" >>"$LOGFILE"
  ) || true
}
# Emit a DEBUG record through log() only when VERBOSE is 1.
# Always returns 0 when quiet: the previous `[[ ... ]] && log` form returned 1
# with VERBOSE=0, which aborts the script under `set -e` whenever vdbg is
# called as a plain statement.
vdbg() {
  if [[ "$VERBOSE" -eq 1 ]]; then
    log "DEBUG" "$*"
  fi
}
# Log the message at ERROR level, then terminate the script with status 1.
fail() {
  log "ERROR" "$*"
  exit 1
}
# Return 0 if $1 resolves to something the shell can run, else non-zero.
have_cmd() {
  command -v "$1" &>/dev/null
}
# ----- Backoff helpers -----
# Store one JSON object per line; we always use the last one for a host.
# Example line: {"host":"nas.local","last_unreachable":1729030000}
# Print the current time as seconds since the Unix epoch.
now_epoch() {
  date '+%s'
}
# Print the most recent "last_unreachable" epoch recorded for a host.
# Arguments: $1 - host name exactly as stored in the state file
# Globals:   BACKOFF_FILE (read)
# Outputs:   the epoch on stdout, or 0 when the file or host entry is missing
get_backoff_epoch() {
  local host="$1"
  local line epoch
  if [[ ! -f "$BACKOFF_FILE" ]]; then
    echo 0
    return
  fi
  # Match the literal JSON fragment, quotes included. Without the escaped
  # quotes the pattern degrades to host:<name>, which never occurs in the
  # file. The closing quote also keeps "nas" from matching "nas.local".
  # `|| true`: under pipefail a no-match grep would otherwise fail the
  # assignment.
  line="$(grep -F -- "\"host\":\"${host}\"" "$BACKOFF_FILE" | tail -n1 || true)"
  if [[ -z "$line" ]]; then
    echo 0
    return
  fi
  # [0-9][0-9]* instead of the GNU-only \+ so any POSIX sed works.
  epoch="$(sed -n 's/.*"last_unreachable":[[:space:]]*\([0-9][0-9]*\).*/\1/p' <<<"$line")"
  if [[ -n "$epoch" ]]; then
    echo "$epoch"
  else
    echo 0
  fi
}
# Append an "unreachable now" record for a host to the backoff state file.
# Arguments: $1 - host name
# Globals:   BACKOFF_FILE (appended to)
# Best-effort: a failed write is ignored. The subshell keeps the 0077 umask
# from affecting the rest of the script.
set_backoff_epoch() {
  local host="$1"
  local stamp
  stamp="$(now_epoch)"
  (
    umask 0077
    printf '{"host":"%s","last_unreachable":%s}\n' "$host" "$stamp" >>"$BACKOFF_FILE"
  ) || true
}
# Decide whether a host is still inside its backoff window.
# Arguments: $1 - host name
# Globals:   BACKOFF_MINUTES (read)
# Returns:   0 when the host should be skipped (window still open),
#            1 when backoff is disabled, never recorded, or has expired
backoff_active() {
  if [[ "${BACKOFF_MINUTES:-0}" -le 0 ]]; then
    return 1
  fi
  local host="$1"
  local recorded elapsed
  recorded="$(get_backoff_epoch "$host")"
  if [[ "$recorded" -eq 0 ]]; then
    return 1
  fi
  elapsed=$(( $(now_epoch) - recorded ))
  # The arithmetic test's own status (0 = still inside the window) is the result.
  (( elapsed < BACKOFF_MINUTES * 60 ))
}
# Log (at DEBUG level) the configured backoff window and state file path.
# Does nothing when backoff is disabled (BACKOFF_MINUTES <= 0).
announce_backoff_window() {
  [[ "${BACKOFF_MINUTES:-0}" -gt 0 ]] || return 0
  log "DEBUG" "Backoff window is ${BACKOFF_MINUTES} minute(s); state file: ${BACKOFF_FILE}"
}
# Create an empty backoff state file (mode restricted by umask 0077) when
# backoff is enabled and the file does not exist yet. An existing file is
# left untouched; creation failures are ignored.
init_backoff_state() {
  [[ "${BACKOFF_MINUTES:-0}" -gt 0 ]] || return 0
  if [[ -f "$BACKOFF_FILE" ]]; then
    return 0
  fi
  (
    umask 0077
    : >"$BACKOFF_FILE"
  ) || true
}
# Check whether TCP port 445 accepts a connection on a host.
# Arguments: $1 - host name or address
# Globals:   TCP_TIMEOUT (read)
# Returns:   0 if the port is open, non-zero otherwise
# Uses nc when available, otherwise bash's /dev/tcp redirection under timeout.
tcp_open_445() {
  local host="$1"
  if have_cmd nc; then
    nc -z -w "$TCP_TIMEOUT" "$host" 445 >/dev/null 2>&1
  else
    # The host is handed to the child shell as $0 rather than spliced into
    # the command string, so shell metacharacters in an fstab host field can
    # never be executed as code.
    timeout "$TCP_TIMEOUT" bash -c ': >"/dev/tcp/$0/445"' "$host" 2>/dev/null
  fi
}
# Decide whether a server is reachable enough to attempt a mount.
# Arguments: $1 - host name or address
# Returns:   0 when the name resolves and TCP/445 is open, else 1
# A failed ping is only reported via vdbg; it never changes the result.
host_reachable() {
  local target="$1"

  if ! getent ahosts "$target" >/dev/null 2>&1; then
    vdbg "DNS resolution failed for $target"
    return 1
  fi

  if have_cmd ping; then
    if ! ping -c "$PING_COUNT" -W "$PING_TIMEOUT" "$target" >/dev/null 2>&1; then
      vdbg "Ping to $target failed"
    fi
  fi

  if ! tcp_open_445 "$target"; then
    vdbg "TCP/445 closed on $target"
    return 1
  fi
  return 0
}
# Report whether the filesystem findmnt resolves for a path is of type cifs.
# Arguments: $1 - mountpoint path
# Returns:   0 if findmnt reports FSTYPE "cifs" (case-insensitive), else 1
# NOTE(review): `-T` looks up by target path; confirm this gives the wanted
# answer for a directory that merely sits below another CIFS mount.
is_cifs_mounted() {
  local target="$1"
  findmnt -no FSTYPE -T "$target" 2>/dev/null | grep -qi '^cifs$' || return 1
  return 0
}
# Probe a mountpoint by listing it in a child shell bounded by PROBE_TIMEOUT.
# Arguments: $1 - mountpoint path
# Returns:   0 if the listing completes successfully in time, non-zero otherwise
mount_healthy() {
  local dir="$1"
  # "_" fills $0 of the child shell; the path arrives as $1.
  timeout "$PROBE_TIMEOUT" bash -c 'ls -1A -- "$1" >/dev/null 2>&1' _ "$dir"
}
# Bring a mountpoint back to a healthy state.
# Arguments: $1 - mountpoint path (must have an fstab entry)
# Globals:   REMOUNT_RETRY, SLEEP_BETWEEN, DRY_RUN (read)
# Returns:   0 on success (or in dry-run mode), 1 if the mount could not be
#            restored or still fails the health probe
# Strategy: up to REMOUNT_RETRY in-place remounts, then a forced (or lazy)
# unmount followed by a clean mount.
repair_mount() {
  local target="$1"
  local try

  for (( try = 1; try <= REMOUNT_RETRY; try++ )); do
    vdbg "Attempt $try: remounting $target"
    if (( DRY_RUN )); then
      log "INFO" "DRY-RUN: would remount $target"
      return 0
    fi
    if mount -o remount "$target" >/dev/null 2>&1; then
      if mount_healthy "$target"; then
        log "INFO" "Remounted healthy: $target"
        return 0
      fi
      vdbg "Remount completed but health probe failed: $target"
    fi
    sleep "$SLEEP_BETWEEN"
  done

  log "WARN" "Remount failed/unhealthy for $target; trying forced unmount + clean mount"
  if (( DRY_RUN )); then
    log "INFO" "DRY-RUN: would umount -f $target && mount $target"
    return 0
  fi

  # Forced unmount first, lazy unmount as the fallback; failure is not fatal.
  if ! umount -f "$target" >/dev/null 2>&1 && ! umount -l "$target" >/dev/null 2>&1; then
    log "WARN" "Unable to unmount $target; will still attempt a mount"
  fi

  if ! mount "$target" >/dev/null 2>&1; then
    log "ERROR" "Mount failed for $target"
    return 1
  fi
  if ! mount_healthy "$target"; then
    log "WARN" "Mounted but health probe failed: $target"
    return 1
  fi
  log "INFO" "Mounted healthy: $target"
  return 0
}
# Print the command-line help text to stdout.
# The field-count line reads "4-6": discovery requires at least four fstab
# fields and the optional dump/pass columns bring the total to six.
usage() {
  cat <<'USAGE'
cifs-watch [-n|--dry-run] [-v|--verbose] [--logfile PATH]
Monitors CIFS entries in /etc/fstab, checks server reachability, and (re)mounts as needed.
- Processes uncommented lines with type "cifs".
- Accepts fstab lines with 4-6 fields.
- Skips entries containing "noauto".
Options:
-n, --dry-run Show actions without changing anything
-v, --verbose More detailed output
--logfile P Override logfile path (empty to disable file logging)
USAGE
}
# ---------------------------
# Parse args
# ---------------------------
# Sets DRY_RUN, VERBOSE and LOGFILE from the command line. Usage errors are
# written to stderr and exit with status 2.
while [[ $# -gt 0 ]]; do
  case "$1" in
    -n|--dry-run)
      DRY_RUN=1
      shift
      ;;
    -v|--verbose)
      VERBOSE=1
      shift
      ;;
    --logfile)
      # An explicit empty value ("") is valid and disables file logging, so
      # check the argument count, not the value. Without this guard a
      # trailing --logfile makes `shift 2` fail and the script dies silently
      # under `set -e`.
      if [[ $# -lt 2 ]]; then
        echo "Missing value for --logfile" >&2
        usage >&2
        exit 2
      fi
      LOGFILE="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown arg: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done
# Ensure tools present.
# getent and tail are included because the script calls them and a missing
# binary is otherwise indistinguishable from a real failure: without getent
# every server looks unresolvable, and without tail no backoff record is
# ever found.
for bin in findmnt mount umount awk grep sed timeout getent tail; do
  have_cmd "$bin" || fail "Required command not found: $bin"
done
# ---------------------------
# Main: parse /etc/fstab
# ---------------------------
init_backoff_state
announce_backoff_window

# Collect every uncommented fstab line that has at least four fields and
# whose third field (the filesystem type) is "cifs", compared case-insensitively.
mapfile -t CIFS_LINES < <(
  awk '!/^[[:space:]]*#/ && NF >= 4 && tolower($3) == "cifs"' /etc/fstab
)

if (( ${#CIFS_LINES[@]} == 0 )); then
  log "INFO" "No CIFS entries found in /etc/fstab. Nothing to do."
  exit 0
fi
# Walk every discovered CIFS entry: skip noauto and healthy mounts, honour
# the per-server backoff window, then probe the server and repair the mount.
# The script exits 1 if any entry was skipped for backoff, was unreachable,
# or could not be repaired; otherwise 0.
overall_rc=0
for line in "${CIFS_LINES[@]}"; do
  # fields: fs_spec mountpoint fstype options [dump] [pass]
  # The script-wide IFS is newline+tab, so split on blanks explicitly here.
  # Only the fields used below are kept.
  IFS=$' \t' read -r fs_spec mnt_point _ options _ <<<"$line"

  # Skip noauto entries. The option list is lower-cased and wrapped in commas
  # so "noauto" must match a whole option. (The previous grep used ERE
  # syntax without -E and therefore never matched.)
  padded_opts=",${options,,},"
  if [[ "$padded_opts" == *,noauto,* ]]; then
    # `|| true`: a quiet vdbg must never trip `set -e` at top level.
    vdbg "Skipping noauto CIFS entry: $mnt_point ($fs_spec)" || true
    continue
  fi

  # Parse server from //server/share
  if [[ "$fs_spec" =~ ^//([^/]+)/.+$ ]]; then
    server="${BASH_REMATCH[1]}"
  else
    log "WARN" "Could not parse server from fs_spec: $fs_spec (skipping)"
    continue
  fi

  log "INFO" "Checking CIFS mount: $mnt_point (server: $server)"
  if is_cifs_mounted "$mnt_point"; then
    if mount_healthy "$mnt_point"; then
      vdbg "Healthy: $mnt_point" || true
      continue
    else
      log "WARN" "Mounted but unhealthy: $mnt_point"
    fi
  else
    log "WARN" "Not mounted: $mnt_point"
  fi

  # --- Per-server backoff gate ---
  if [[ "${BACKOFF_MINUTES:-0}" -gt 0 ]] && backoff_active "$server"; then
    log "INFO" "Backoff active for $server — skipping $mnt_point (will retry after ${BACKOFF_MINUTES}m since last failure)"
    overall_rc=1
    continue
  fi

  # Probe reachability now (outside backoff or after it expired)
  if host_reachable "$server"; then
    log "INFO" "Server reachable: $server — attempting repair for $mnt_point"
  else
    log "ERROR" "Server NOT reachable: $server — skipping $mnt_point for now"
    # Mark/refresh backoff timestamp
    if [[ "${BACKOFF_MINUTES:-0}" -gt 0 ]]; then
      set_backoff_epoch "$server"
      if backoff_active "$server"; then
        # Compute remaining (best-effort) for log readability
        last="$(get_backoff_epoch "$server")"
        now="$(now_epoch)"
        rem=$(( BACKOFF_MINUTES * 60 - (now - last) ))
        if (( rem < 0 )); then
          rem=0
        fi
        log "INFO" "Backoff started/extended for $server — next retry window in ~$(( rem / 60 ))m"
      fi
    fi
    overall_rc=1
    continue
  fi

  if ! repair_mount "$mnt_point"; then
    log "ERROR" "Repair failed: $mnt_point"
    overall_rc=1
  fi
done
exit "$overall_rc"

View File

@@ -1,229 +0,0 @@
#!/usr/bin/env bash
# Monitor CIFS mounts from /etc/fstab and (re)mount if needed.
# Designed for cron/systemd timer. Requires root.
# Strict mode: -E lets ERR traps fire inside functions, -e exits on an
# unhandled failure, -u rejects unset variables, and pipefail fails a
# pipeline when any stage fails.
set -Eeuo pipefail
# Word-split only on newline and tab so values containing spaces stay whole.
IFS=$'\n\t'
# ---------------------------
# Config (edit if desired)
# ---------------------------
LOGFILE="/var/log/cifs-remount.log" # Set empty ("") to disable file logging.
PING_COUNT=1 # echo requests per host (passed to ping -c)
PING_TIMEOUT=1 # seconds
TCP_TIMEOUT=2 # seconds for port 445 check
PROBE_TIMEOUT=2 # seconds to test mount health (listing)
REMOUNT_RETRY=2 # attempts
SLEEP_BETWEEN=1 # seconds between retries
SYSLOG_TAG="cifs-watch" # tag handed to logger -t
DRY_RUN=0 # set to 1 by -n/--dry-run
VERBOSE=0 # set to 1 by -v/--verbose
# ---------------------------
# Helpers
# ---------------------------
# Write one log record to stdout, syslog and (optionally) the logfile.
# Arguments: $1 - level (DEBUG/INFO/WARN/ERROR); remaining args - message
# Globals:   VERBOSE, SYSLOG_TAG, LOGFILE (read)
# DEBUG records reach stdout only when VERBOSE is 1. Syslog and logfile
# writes are best-effort and never fail the caller.
log() {
  local severity="$1"
  shift
  local text="$*"
  local stamp record
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  record="[$stamp] [$severity] $text"

  # stdout when verbose or for non-debug levels
  if [[ "$VERBOSE" -eq 1 || "$severity" != "DEBUG" ]]; then
    printf '%s\n' "$record"
  fi

  # syslog: the facility.priority is derived from the lower-cased level
  logger -t "${SYSLOG_TAG}[$$]" \
    -p "user.$(tr '[:upper:]' '[:lower:]' <<<"$severity")" -- "$text" || true

  # logfile: written in a subshell so the restrictive umask does not leak
  [[ -n "$LOGFILE" ]] || return 0
  (
    umask 0077
    printf '%s\n' "$record" >>"$LOGFILE"
  ) || true
}
# Emit a DEBUG record through log() only when VERBOSE is 1.
# Always returns 0 when quiet: the previous `[[ ... ]] && log` form returned 1
# with VERBOSE=0, which aborts the script under `set -e` whenever vdbg is
# called as a plain statement.
vdbg() {
  if [[ "$VERBOSE" -eq 1 ]]; then
    log "DEBUG" "$*"
  fi
}
# Log the message at ERROR level, then terminate the script with status 1.
fail() {
  log "ERROR" "$*"
  exit 1
}
# Return 0 if $1 resolves to something the shell can run, else non-zero.
have_cmd() {
  command -v "$1" &>/dev/null
}
# Check whether TCP port 445 accepts a connection on a host.
# Arguments: $1 - host name or address
# Globals:   TCP_TIMEOUT (read)
# Returns:   0 if the port is open, non-zero otherwise
# Uses nc when available, otherwise bash's /dev/tcp redirection under timeout.
tcp_open_445() {
  local host="$1"
  if have_cmd nc; then
    nc -z -w "$TCP_TIMEOUT" "$host" 445 >/dev/null 2>&1
  else
    # The host is handed to the child shell as $0 rather than spliced into
    # the command string, so shell metacharacters in an fstab host field can
    # never be executed as code.
    timeout "$TCP_TIMEOUT" bash -c ': >"/dev/tcp/$0/445"' "$host" 2>/dev/null
  fi
}
# Decide whether a server is reachable enough to attempt a mount.
# Arguments: $1 - host name or address
# Returns:   0 when the name resolves and TCP/445 is open, else 1
# A failed ping is only reported via vdbg; it never changes the result.
host_reachable() {
  local target="$1"

  if ! getent ahosts "$target" >/dev/null 2>&1; then
    vdbg "DNS resolution failed for $target"
    return 1
  fi

  if have_cmd ping; then
    if ! ping -c "$PING_COUNT" -W "$PING_TIMEOUT" "$target" >/dev/null 2>&1; then
      vdbg "Ping to $target failed"
    fi
  fi

  if ! tcp_open_445 "$target"; then
    vdbg "TCP/445 closed on $target"
    return 1
  fi
  return 0
}
# Report whether the filesystem findmnt resolves for a path is of type cifs.
# Arguments: $1 - mountpoint path
# Returns:   0 if findmnt reports FSTYPE "cifs" (case-insensitive), else 1
# NOTE(review): `-T` looks up by target path; confirm this gives the wanted
# answer for a directory that merely sits below another CIFS mount.
is_cifs_mounted() {
  local target="$1"
  findmnt -no FSTYPE -T "$target" 2>/dev/null | grep -qi '^cifs$' || return 1
  return 0
}
# Probe a mountpoint by listing it in a child shell bounded by PROBE_TIMEOUT.
# Arguments: $1 - mountpoint path
# Returns:   0 if the listing completes successfully in time, non-zero otherwise
mount_healthy() {
  local dir="$1"
  # "_" fills $0 of the child shell; the path arrives as $1.
  timeout "$PROBE_TIMEOUT" bash -c 'ls -1A -- "$1" >/dev/null 2>&1' _ "$dir"
}
# Bring a mountpoint back to a healthy state.
# Arguments: $1 - mountpoint path (must have an fstab entry)
# Globals:   REMOUNT_RETRY, SLEEP_BETWEEN, DRY_RUN (read)
# Returns:   0 on success (or in dry-run mode), 1 if the mount could not be
#            restored or still fails the health probe
# Strategy: up to REMOUNT_RETRY in-place remounts, then a forced (or lazy)
# unmount followed by a clean mount.
repair_mount() {
  local target="$1"
  local try

  for (( try = 1; try <= REMOUNT_RETRY; try++ )); do
    vdbg "Attempt $try: remounting $target"
    if (( DRY_RUN )); then
      log "INFO" "DRY-RUN: would remount $target"
      return 0
    fi
    if mount -o remount "$target" >/dev/null 2>&1; then
      if mount_healthy "$target"; then
        log "INFO" "Remounted healthy: $target"
        return 0
      fi
      vdbg "Remount completed but health probe failed: $target"
    fi
    sleep "$SLEEP_BETWEEN"
  done

  log "WARN" "Remount failed/unhealthy for $target; trying forced unmount + clean mount"
  if (( DRY_RUN )); then
    log "INFO" "DRY-RUN: would umount -f $target && mount $target"
    return 0
  fi

  # Forced unmount first, lazy unmount as the fallback; failure is not fatal.
  if ! umount -f "$target" >/dev/null 2>&1 && ! umount -l "$target" >/dev/null 2>&1; then
    log "WARN" "Unable to unmount $target; will still attempt a mount"
  fi

  if ! mount "$target" >/dev/null 2>&1; then
    log "ERROR" "Mount failed for $target"
    return 1
  fi
  if ! mount_healthy "$target"; then
    log "WARN" "Mounted but health probe failed: $target"
    return 1
  fi
  log "INFO" "Mounted healthy: $target"
  return 0
}
# Print the command-line help text to stdout.
# The field-count line reads "4-6": discovery requires at least four fstab
# fields and the optional dump/pass columns bring the total to six.
usage() {
  cat <<'USAGE'
cifs-watch.sh [-n|--dry-run] [-v|--verbose] [--logfile PATH]
Monitors CIFS entries in /etc/fstab, checks server reachability, and (re)mounts as needed.
- Processes uncommented lines with type "cifs".
- Accepts fstab lines with 4-6 fields.
- Skips entries containing "noauto".
Options:
-n, --dry-run Show actions without changing anything
-v, --verbose More detailed output
--logfile P Override logfile path (empty to disable file logging)
USAGE
}
# ---------------------------
# Parse args
# ---------------------------
# Sets DRY_RUN, VERBOSE and LOGFILE from the command line. Usage errors are
# written to stderr and exit with status 2.
while [[ $# -gt 0 ]]; do
  case "$1" in
    -n|--dry-run)
      DRY_RUN=1
      shift
      ;;
    -v|--verbose)
      VERBOSE=1
      shift
      ;;
    --logfile)
      # An explicit empty value ("") is valid and disables file logging, so
      # check the argument count, not the value. Without this guard a
      # trailing --logfile makes `shift 2` fail and the script dies silently
      # under `set -e`.
      if [[ $# -lt 2 ]]; then
        echo "Missing value for --logfile" >&2
        usage >&2
        exit 2
      fi
      LOGFILE="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown arg: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done
# Ensure tools present.
# getent is included because host_reachable depends on it: if it were
# missing, every server would be reported as unresolvable instead of the
# real cause being surfaced here.
for bin in findmnt mount umount awk grep sed timeout getent; do
  have_cmd "$bin" || fail "Required command not found: $bin"
done
# ---------------------------
# Main: parse /etc/fstab
# ---------------------------
# Collect every uncommented fstab line that has at least four fields and
# whose third field (the filesystem type) is "cifs", compared case-insensitively.
mapfile -t CIFS_LINES < <(
  awk '!/^[[:space:]]*#/ && NF >= 4 && tolower($3) == "cifs"' /etc/fstab
)

if (( ${#CIFS_LINES[@]} == 0 )); then
  log "INFO" "No CIFS entries found in /etc/fstab. Nothing to do."
  exit 0
fi
# Walk every discovered CIFS entry: skip noauto and healthy mounts, then
# probe the server and repair the mount. The script exits 1 if any entry was
# unreachable or could not be repaired; otherwise 0.
overall_rc=0
for line in "${CIFS_LINES[@]}"; do
  # fields: fs_spec mountpoint fstype options [dump] [pass]
  # The script-wide IFS is newline+tab, so split on blanks explicitly here.
  # Only the fields used below are kept.
  IFS=$' \t' read -r fs_spec mnt_point _ options _ <<<"$line"

  # Skip noauto entries. The option list is lower-cased and wrapped in commas
  # so "noauto" must match a whole option. (The previous grep used ERE
  # syntax without -E and therefore never matched.)
  padded_opts=",${options,,},"
  if [[ "$padded_opts" == *,noauto,* ]]; then
    # `|| true`: a quiet vdbg must never trip `set -e` at top level.
    vdbg "Skipping noauto CIFS entry: $mnt_point ($fs_spec)" || true
    continue
  fi

  # Parse server from //server/share
  if [[ "$fs_spec" =~ ^//([^/]+)/.+$ ]]; then
    server="${BASH_REMATCH[1]}"
  else
    log "WARN" "Could not parse server from fs_spec: $fs_spec (skipping)"
    continue
  fi

  log "INFO" "Checking CIFS mount: $mnt_point (server: $server)"
  if is_cifs_mounted "$mnt_point"; then
    if mount_healthy "$mnt_point"; then
      vdbg "Healthy: $mnt_point" || true
      continue
    else
      log "WARN" "Mounted but unhealthy: $mnt_point"
    fi
  else
    log "WARN" "Not mounted: $mnt_point"
  fi

  if host_reachable "$server"; then
    log "INFO" "Server reachable: $server — attempting repair for $mnt_point"
  else
    log "ERROR" "Server NOT reachable: $server — skipping $mnt_point for now"
    overall_rc=1
    continue
  fi

  if ! repair_mount "$mnt_point"; then
    log "ERROR" "Repair failed: $mnt_point"
    overall_rc=1
  fi
done
exit "$overall_rc"

View File

@@ -1,4 +1,3 @@
# /etc/systemd/system/cifs-watch.service
[Unit]
Description=Monitor and repair CIFS mounts

View File

@@ -1,4 +1,3 @@
# /etc/systemd/system/cifs-watch.timer
[Unit]
Description=Run cifs-watch periodically