#!/bin/bash
# Signals-stall watchdog for futures-screener
# Detects the "process online but scanners wedged" failure (resync-storm) that
# silently stopped signals for ~6 days (26 May–1 Jun 2026). The existing
# health-check.sh misses it because PM2 stays "online" and HTTP keeps returning 200.
#
# Primary detector: /api/rate-limiter weightAge. In healthy operation a REST scan
# runs every ~60s so weightAge is near-zero; when wedged it grows unbounded.
# Secondary: HTTP unreachable => also a stall.
#
# Action: pm2 restart futures-screener (with a cooldown so it can't loop), log, notify.
# Install (cron, every 5 min):
#   */5 * * * * /home/app/futures-screener/scripts/watchdog.sh >> /home/app/futures-screener/logs/watchdog-cron.log 2>&1

# Shell options: referencing an unset variable is an error, and a pipeline
# fails when any of its stages fails. errexit (-e) is deliberately not enabled;
# failures are handled explicitly where they matter.
set -u
set -o pipefail

# --- Target application ---
APP_NAME='futures-screener'
BASE_URL='http://127.0.0.1:3200'

# --- Thresholds ---
STALL_MS=$((15 * 60 * 1000))   # 15 min of no REST = wedged (healthy is <60s)
COOLDOWN_S=$((15 * 60))        # at most one restart per 15 min

# --- Files (log and cooldown stamp live in the same directory) ---
LOG_DIR='/home/app/futures-screener/logs'
LOG="${LOG_DIR}/watchdog.log"
STATE="${LOG_DIR}/watchdog-last-restart"

# --- Optional Telegram alert: set both FS_WATCHDOG_TG_* variables to enable ---
TG_TOKEN=${FS_WATCHDOG_TG_TOKEN:-}
TG_CHAT=${FS_WATCHDOG_TG_CHAT:-}

# Make sure the log directory exists before anything tries to append to it.
mkdir -p "${LOG%/*}"

# Append one timestamped line to the watchdog log.
# Globals:   LOG (read) - path of the file that is appended to
# Arguments: $1 - message text
# Outputs:   "[<ISO-8601 timestamp>] <message>" appended to $LOG
log() {
  local stamp
  stamp=$(date -Iseconds)
  printf '[%s] %s\n' "$stamp" "$1" >> "$LOG"
}

# Send a best-effort Telegram alert. Does nothing unless both TG_TOKEN and
# TG_CHAT are non-empty.
# Globals:   TG_TOKEN, TG_CHAT (read)
# Arguments: $1 - message text, sent after the "🐕 FS watchdog: " prefix
# Returns:   always 0 - curl failures are deliberately ignored so that an
#            alerting problem never changes the watchdog's own outcome.
notify() {
  local message=$1

  if [ -z "$TG_TOKEN" ] || [ -z "$TG_CHAT" ]; then
    return 0
  fi

  # --data-urlencode instead of -d: -d posts the value verbatim, so a message
  # containing '&', '+' or '%' would be truncated or altered by form decoding.
  curl -s --max-time 8 "https://api.telegram.org/bot${TG_TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${TG_CHAT}" \
    --data-urlencode "text=🐕 FS watchdog: ${message}" >/dev/null 2>&1 || true
}

# --- Read rate-limiter weightAge (ms). Any failure to obtain a usable number
# --- maps to the stall sentinel (WEIGHT_AGE=STALL_MS) with its own REASON. ---

# Print the rounded weightAge (ms) found in the JSON document on stdin.
# Outputs:  one integer on stdout
# Returns:  non-zero (printing nothing) when node cannot run, the input is not
#           JSON, or weightAge is not a finite number.
# NOTE(review): a missing/null weightAge is still reported as 0 (= healthy), as
# before - confirm the endpoint always includes the field once scans have run.
_parse_weight_age() {
  local age
  age=$(node -e '
    const d = JSON.parse(require("fs").readFileSync(0, "utf8"));
    const v = Math.round(Number(d.weightAge || 0));
    if (!Number.isFinite(v)) process.exit(1);
    console.log(String(v));
  ' 2>/dev/null) || return 1
  # Guard against forms the integer test downstream cannot compare (e.g. 1e+21).
  [[ "$age" =~ ^-?[0-9]+$ ]] || return 1
  printf '%s\n' "$age"
}

# -f: an HTTP status >= 400 counts as a failed fetch, so an error body is never
# parsed as if it were a healthy rate-limiter report.
RESP=$(curl -sf --max-time 8 "$BASE_URL/api/rate-limiter" 2>/dev/null) || RESP=""
if [ -z "$RESP" ]; then
  WEIGHT_AGE=$STALL_MS         # unreachable = treat as stalled
  REASON="HTTP unreachable or error status"
elif WEIGHT_AGE=$(printf '%s' "$RESP" | _parse_weight_age); then
  # Only logged when the value is at or above the threshold.
  REASON="weightAge=${WEIGHT_AGE}ms (>=${STALL_MS}ms)"
else
  # Could not extract a number: report that, not a fabricated weightAge.
  WEIGHT_AGE=$STALL_MS
  REASON="unparseable /api/rate-limiter response"
fi

# --- Healthy: nothing to do ---
# A non-numeric WEIGHT_AGE makes the test itself fail, which falls through to
# the stall path below instead of being mistaken for healthy.
if [ "$WEIGHT_AGE" -lt "$STALL_MS" ]; then
  exit 0
fi

# --- Stalled: respect cooldown to avoid restart loops ---
NOW=$(date +%s)
LAST=0
if [ -f "$STATE" ]; then
  LAST=$(cat "$STATE" 2>/dev/null || echo 0)
fi
# An empty or corrupted stamp must not break the arithmetic: treat it as
# "never restarted". 10# forces base 10 so a leading zero is not read as octal.
[[ "$LAST" =~ ^[0-9]+$ ]] || LAST=0
ELAPSED=$((NOW - 10#$LAST))

# A negative ELAPSED means the stamp lies in the future (clock stepped back);
# do not let that hold the watchdog in cooldown indefinitely.
if [ "$ELAPSED" -ge 0 ] && [ "$ELAPSED" -lt "$COOLDOWN_S" ]; then
  log "STALL detected ($REASON) but within cooldown (${ELAPSED}s ago) — skipping restart"
  exit 0
fi

log "STALL detected ($REASON) — restarting $APP_NAME"
# Stamp before restarting so that a failing restart is also rate-limited
# (bounds both retries and alerts to one per cooldown window).
echo "$NOW" > "$STATE"

# pm2 output is appended to the log; its exit status decides what gets reported.
pm2 restart "$APP_NAME" >> "$LOG" 2>&1
RESTART_RC=$?
if [ "$RESTART_RC" -ne 0 ]; then
  log "restart FAILED (pm2 exit code $RESTART_RC)"
  notify "scanners stalled ($REASON) → restart of $APP_NAME FAILED on $(hostname)"
  exit 1
fi

log "restart issued"
notify "scanners stalled ($REASON) → restarted $APP_NAME on $(hostname)"
exit 0

# 📜 Git History
#
# cd8a86b  feat: signals-stall watchdog (auto-restart on wedged scanners) — 5 weeks ago