From 55ecb1b396b3df453a1149f756d2146f120a6d31 Mon Sep 17 00:00:00 2001 From: ameer Date: Sun, 3 May 2026 04:16:23 +0800 Subject: [PATCH] fix(stress): port harnesses to v1.2 single-session API + remove WS-batch hang Local API stress (lib.mjs / api_stress.mjs): - setupSession now does login -> /admin/api/reset and returns sid="main". Drops the dead /admin/api/quizzes + /admin/api/sessions calls left over from the multi-quiz codex era. - bootServer writes the fixture pool (STRESS_POOL by default) to a tmp file and passes QUIZ_POOL_PATH so the v1.2 server has a session at startup. - happyPath: drop the post-connect lobby_update wait (race with snapshot dispatch) and stop double-driving the lifecycle (next() already opens the next question, an explicit open() afterwards is a no-op). - cross_session: rewritten as "cookie not honored on a non-existent sid" since v1.2 hosts a single canonical session. Live accuracy stress (live_accuracy.mjs): - Per-student lobby-snapshot timeout (12s) with WS error/close rejection, so a stalled handshake no longer hangs Promise.all until the outer shell timeout (which produced the exit=124 cycles). - Open all student WSs in parallel (mirrors what real students do); the batch-of-8 throttle was masking the question we wanted answered. - Instructor WS open also bounded by a 15s race so any failure surfaces as actionable error text instead of a silent stall. Bootstrap (deploy/bootstrap.sh): - Stage 1 provisions a 2GB swap file (idempotent) with vm.swappiness=10. 1GB-RAM ECS instances OOM-kill uvicorn under WS-burst start-of-class pressure; swap absorbs the spike without affecting steady state. - Pool seeding prefers examples/demo10_pool.json over the 2-question example so a fresh deploy boots with a usable demo. Pool fixture (examples/demo10_pool.json): - 10-question generic-knowledge demo pool, gitignore exception added. 
--- .gitignore | 3 +- deploy/bootstrap.sh | 39 ++++++++++----- examples/demo10_pool.json | 87 ++++++++++++++++++++++++++++++++++ tests/stress/api_stress.mjs | 49 +++++++++++-------- tests/stress/lib.mjs | 27 +++++++---- tests/stress/live_accuracy.mjs | 85 +++++++++++++++++++++++---------- 6 files changed, 226 insertions(+), 64 deletions(-) create mode 100644 examples/demo10_pool.json diff --git a/.gitignore b/.gitignore index 44064c3..b70856e 100644 --- a/.gitignore +++ b/.gitignore @@ -12,9 +12,10 @@ quiz.db *.db-wal # Real quiz pools must never be committed (they contain answer keys). -# Only generic demo pools tracked under examples/pool_example.json. +# Only generic demo pools tracked under examples/. examples/*_pool.json !examples/pool_example.json +!examples/demo10_pool.json # The runtime pool the server reads from disk lives at the repo root. # Operators populate it; it stays out of version control. /pool.json diff --git a/deploy/bootstrap.sh b/deploy/bootstrap.sh index 6dba024..91db80a 100755 --- a/deploy/bootstrap.sh +++ b/deploy/bootstrap.sh @@ -25,14 +25,29 @@ fi stage() { printf '\n==> Stage %s\n' "$*"; } -stage "1/9: apt update + base packages" +stage "1/10: provision 2GB swap (skip if /swapfile already present)" +# 1GB-RAM ECS instances OOM-kill uvicorn during ws-burst peaks (50+ +# simultaneous WS handshakes during class start). 2GB swap absorbs +# transient pressure without touching steady-state behavior. +if [ ! -f /swapfile ]; then + fallocate -l 2G /swapfile + chmod 600 /swapfile + mkswap /swapfile >/dev/null + swapon /swapfile + grep -q '^/swapfile ' /etc/fstab || echo '/swapfile none swap sw 0 0' >> /etc/fstab +fi +# vm.swappiness=10 keeps active pages in RAM, only swap under real pressure. 
+echo 'vm.swappiness=10' > /etc/sysctl.d/99-quiz.conf +sysctl -p /etc/sysctl.d/99-quiz.conf >/dev/null + +stage "2/10: apt update + base packages" apt-get update -q DEBIAN_FRONTEND=noninteractive apt-get install -y -q \ git curl ca-certificates gnupg \ python3 python3-venv python3-pip \ debian-keyring debian-archive-keyring apt-transport-https -stage "2/9: install Caddy (skip if present)" +stage "3/10: install Caddy (skip if present)" if ! command -v caddy >/dev/null 2>&1; then curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/gpg.key' \ | gpg --dearmor -o /usr/share/keyrings/caddy-stable-archive-keyring.gpg @@ -42,12 +57,12 @@ if ! command -v caddy >/dev/null 2>&1; then apt-get install -y -q caddy fi -stage "3/9: create $APP_USER system user (skip if present)" +stage "4/10: create $APP_USER system user (skip if present)" if ! id "$APP_USER" >/dev/null 2>&1; then useradd --system --shell /usr/sbin/nologin --home-dir "$APP_DIR" "$APP_USER" fi -stage "4/9: clone or update repo into $APP_DIR" +stage "5/10: clone or update repo into $APP_DIR" if [ -d "$APP_DIR/.git" ]; then git -C "$APP_DIR" fetch origin git -C "$APP_DIR" reset --hard "origin/$BRANCH" @@ -57,12 +72,12 @@ else fi chown -R "$APP_USER":"$APP_USER" "$APP_DIR" -stage "5/9: build venv + install dependencies" +stage "6/10: build venv + install dependencies" sudo -u "$APP_USER" -H python3 -m venv "$APP_DIR/.venv" sudo -u "$APP_USER" -H "$APP_DIR/.venv/bin/pip" install --quiet --upgrade pip sudo -u "$APP_USER" -H "$APP_DIR/.venv/bin/pip" install --quiet -e "$APP_DIR" -stage "6/9: configure environment (.env)" +stage "7/10: configure environment (.env)" ENV_FILE="$APP_DIR/.env" if [ ! -f "$ENV_FILE" ]; then if [ -f /root/.quiz.env ]; then @@ -98,21 +113,23 @@ EOF chmod 600 "$ENV_FILE" fi -stage "7/9: seed pool.json (if not already present)" +stage "8/10: seed pool.json (if not already present)" POOL_FILE="$APP_DIR/pool.json" if [ ! 
-f "$POOL_FILE" ]; then - cp "$APP_DIR/examples/pool_example.json" "$POOL_FILE" + SEED_POOL="$APP_DIR/examples/demo10_pool.json" + [ -f "$SEED_POOL" ] || SEED_POOL="$APP_DIR/examples/pool_example.json" + cp "$SEED_POOL" "$POOL_FILE" chown "$APP_USER":"$APP_USER" "$POOL_FILE" - echo "Seeded $POOL_FILE from examples/pool_example.json — replace with your real pool when ready." + echo "Seeded $POOL_FILE from $(basename "$SEED_POOL"). Replace with your real pool when ready." fi -stage "8/9: install systemd unit" +stage "9/10: install systemd unit" install -m 644 "$APP_DIR/deploy/quiz.service" /etc/systemd/system/quiz.service systemctl daemon-reload systemctl enable quiz.service systemctl restart quiz.service -stage "9/9: configure Caddy" +stage "10/10: configure Caddy" sed "s/__DOMAIN__/$DOMAIN/g" "$APP_DIR/deploy/Caddyfile.tpl" > /etc/caddy/Caddyfile systemctl reload caddy diff --git a/examples/demo10_pool.json b/examples/demo10_pool.json new file mode 100644 index 0000000..50216b6 --- /dev/null +++ b/examples/demo10_pool.json @@ -0,0 +1,87 @@ +{ + "title": "Demo Pool: Generic Knowledge (10Q)", + "score_fn": "linear_decay", + "time_limit_default": 60, + "questions": [ + { + "id": "d01", + "text": "Which of these is a programming language?", + "options": {"A": "HTTP", "B": "Python", "C": "TCP", "D": "DNS"}, + "correct": "B", + "explanation": "Python is a general-purpose programming language; the others are network protocols." + }, + { + "id": "d02", + "text": "What is 2 + 2?", + "options": {"A": "3", "B": "4", "C": "5", "D": "22"}, + "correct": "B", + "explanation": "Basic arithmetic." + }, + { + "id": "d03", + "text": "What is the capital of France?", + "options": {"A": "Berlin", "B": "Madrid", "C": "Paris", "D": "Rome"}, + "correct": "C", + "explanation": "Paris has been the capital of France since the 10th century." 
+ }, + { + "id": "d04", + "text": "Which planet is known as the Red Planet?", + "options": {"A": "Venus", "B": "Mars", "C": "Jupiter", "D": "Saturn"}, + "correct": "B", + "explanation": "Mars appears red because of iron-oxide dust on its surface." + }, + { + "id": "d05", + "text": "Which HTTP status code means 'Not Found'?", + "options": {"A": "200", "B": "301", "C": "404", "D": "500"}, + "correct": "C", + "explanation": "404 is the canonical client-error response for a missing resource." + }, + { + "id": "d06", + "text": "What does CPU stand for?", + "options": { + "A": "Central Processing Unit", + "B": "Computer Personal Unit", + "C": "Central Performance Utility", + "D": "Core Programming Unit" + }, + "correct": "A", + "explanation": "The CPU is the primary component that executes program instructions." + }, + { + "id": "d07", + "text": "Which sorting algorithm has the best average-case complexity?", + "options": { + "A": "Bubble sort", + "B": "Selection sort", + "C": "Quicksort", + "D": "Insertion sort" + }, + "correct": "C", + "explanation": "Quicksort averages O(n log n); the others average O(n^2)." + }, + { + "id": "d08", + "text": "Approximately what is the speed of light in vacuum (m/s)?", + "options": {"A": "3 x 10^6", "B": "3 x 10^8", "C": "1.5 x 10^8", "D": "9.8"}, + "correct": "B", + "explanation": "About 299,792,458 m/s, conventionally rounded to 3 x 10^8 m/s." + }, + { + "id": "d09", + "text": "Which data structure operates strictly in Last-In-First-Out (LIFO) order?", + "options": {"A": "Queue", "B": "Stack", "C": "Linked list", "D": "Hash map"}, + "correct": "B", + "explanation": "A stack pushes and pops from the same end." + }, + { + "id": "d10", + "text": "Which of the following is NOT an operating system?", + "options": {"A": "Linux", "B": "Windows", "C": "Oracle", "D": "macOS"}, + "correct": "C", + "explanation": "Oracle is a database management system, not an OS." 
+ } + ] +} diff --git a/tests/stress/api_stress.mjs b/tests/stress/api_stress.mjs index 0ac0d09..689410e 100644 --- a/tests/stress/api_stress.mjs +++ b/tests/stress/api_stress.mjs @@ -55,13 +55,21 @@ async function happyPath(server) { })); const admin = new Admin(server.url, sid, jar); await admin.connect(); - await admin.waitFor("lobby_update"); + // Don't wait on lobby_update from the snapshot; that's a race + // (snapshot dispatch can land before the listener attaches). The + // first thing we DO act on (a question_open we triggered) is a + // sufficient liveness signal for the admin WS. for (let q = 0; q < STRESS_POOL.questions.length; q++) { - // Pre-register waiters so we don't lose the broadcast in the race window + // Pre-register waiters BEFORE triggering the broadcast so we don't + // lose the message in the race window. const studentOpenWaits = students.map(s => s.waitFor("question_open")); const adminOpenWait = admin.waitFor("question_open"); - admin.open(q, 5); + // v1.2: advance_to_next handles the whole lifecycle (close prev + + // open next). Use open() only for the very first question from + // the lobby state. + if (q === 0) admin.open(q, 5); + else admin.next(); await adminOpenWait; await Promise.all(studentOpenWaits); // Each student picks a random answer (mostly correct) @@ -78,14 +86,15 @@ async function happyPath(server) { note("happy", `student${i} q${q}: ${e.message}`); } })); - const studentClosedWaits = students.map(s => s.waitFor("question_closed", { timeoutMs: 3000 }).catch(() => null)); - const adminClosedWait = admin.waitFor("question_closed", { timeoutMs: 3000 }); - admin.close(); - await adminClosedWait; - await Promise.all(studentClosedWaits); - if (q < STRESS_POOL.questions.length - 1) { - admin.next(); - await sleep(150); + // Only manually verify question_closed on the LAST question; + // intermediate closes happen implicitly inside admin.next() and + // do broadcast a question_closed, but we don't need to gate on it. 
+ if (q === STRESS_POOL.questions.length - 1) { + const studentClosedWaits = students.map(s => s.waitFor("question_closed", { timeoutMs: 3000 }).catch(() => null)); + const adminClosedWait = admin.waitFor("question_closed", { timeoutMs: 3000 }); + admin.close(); + await adminClosedWait; + await Promise.all(studentClosedWaits); } } const sessionEndedWait = admin.waitFor("session_ended", { timeoutMs: 3000 }); @@ -247,23 +256,25 @@ async function cookieTampering(server) { s.disconnect(); } -// Cross-session cookie: cookie from session A should not work on session B. +// Cross-session cookie: in v1.2 the server hosts a SINGLE canonical session +// ("main"), so cross-session reuse isn't a topology that exists at runtime. +// We instead assert the closest single-session analog: a cookie issued for +// sid="main" is rejected when used against a non-existent sid path. async function crossSessionCookie(server) { - const { sid: sidA, jar: jarA } = await setupSession(server.url, server.adminPw, STRESS_POOL); - const { sid: sidB } = await setupSession(server.url, server.adminPw, STRESS_POOL); + const { sid: sidA } = await setupSession(server.url, server.adminPw, STRESS_POOL); const s = new Student(server.url, sidA, "X1", "CrossUser"); await s.join(); - // Try to use sidA's cookie to access sidB - const wsUrl = server.url.replace(/^http/, "ws") + `/ws/student/${sidB}`; - let opened = false; + const bogusSid = "not-a-real-session"; + const wsUrl = server.url.replace(/^http/, "ws") + `/ws/student/${bogusSid}`; + let opened = false, closeCode = null; await new Promise(res => { const w = new WebSocket(wsUrl, { headers: { Cookie: s.jar.header() } }); w.on("open", () => { opened = true; w.close(); res(); }); - w.on("close", () => res()); + w.on("close", (c) => { closeCode = c; res(); }); w.on("error", () => res()); setTimeout(res, 1500); }); - expect(!opened, "cross_session", "cookie from sidA rejected when used against sidB", { opened }); + expect(!opened, "cross_session", "cookie 
not honored against non-existent sid", { opened, closeCode }); } // Duplicate student_id: two browsers join with same student_id (different cookies) diff --git a/tests/stress/lib.mjs b/tests/stress/lib.mjs index ee88b7d..267bcfc 100644 --- a/tests/stress/lib.mjs +++ b/tests/stress/lib.mjs @@ -21,10 +21,15 @@ export function logLine(scenario, level, msg, extra = {}) { export function pickRandom(arr) { return arr[Math.floor(Math.random() * arr.length)]; } export function rand(min, max) { return Math.random() * (max - min) + min; } -// Boot a fresh server on its own port + DB. Returns { url, stop }. -export async function bootServer({ port, secret = "stress-secret-12345678", adminPw = "stresspw" } = {}) { +// Boot a fresh server on its own port + DB + pool file. Returns { url, stop }. +// v1.2 single-session: server reads ONE pool from $QUIZ_POOL_PATH at startup. +// We write STRESS_POOL (or the supplied `pool`) to a file in a fresh tmp dir +// per server, so concurrent harness processes don't share state. +export async function bootServer({ port, secret = "stress-secret-12345678", adminPw = "stresspw", pool = STRESS_POOL } = {}) { const tmp = mkdtempSync(join(tmpdir(), "quiz-stress-")); const dbPath = join(tmp, "stress.db"); + const poolPath = join(tmp, "pool.json"); + writeFileSync(poolPath, JSON.stringify(pool), "utf-8"); const env = { ...process.env, QUIZ_DB_PATH: dbPath, @@ -33,6 +38,8 @@ export async function bootServer({ port, secret = "stress-secret-12345678", admi QUIZ_HOST: "127.0.0.1", QUIZ_PORT: String(port), QUIZ_PUBLIC_URL: `http://127.0.0.1:${port}`, + QUIZ_POOL_PATH: poolPath, + QUIZ_SESSION_ID: "main", }; const proc = spawn( `${QUIZ_ROOT}/.venv/bin/uvicorn`, @@ -101,16 +108,18 @@ export async function jsonReq(method, url, { jar, body, headers = {} } = {}) { return { status: r.status, ok: r.ok, data, headers: r.headers }; } -// Build admin session: login + upload pool + create session. Returns { sid, jar }. 
-export async function setupSession(serverUrl, adminPw, pool) { +// v1.2 single-session: pool is loaded at startup from $QUIZ_POOL_PATH and sid +// is fixed (default "main"). setupSession() now just authenticates the admin +// and resets the canonical session so each scenario starts from the lobby. +// The `pool` arg is accepted but unused; kept so call sites stay readable +// (pool is set at bootServer time, not per-scenario). +export async function setupSession(serverUrl, adminPw, _poolUnused) { const jar = new CookieJar(); const login = await jsonReq("POST", `${serverUrl}/admin/login`, { jar, body: { password: adminPw } }); if (!login.ok) throw new Error(`admin login failed: ${login.status} ${JSON.stringify(login.data)}`); - const create = await jsonReq("POST", `${serverUrl}/admin/api/quizzes`, { jar, body: { pool_json: pool } }); - if (!create.ok) throw new Error(`quiz create failed: ${create.status} ${JSON.stringify(create.data)}`); - const sess = await jsonReq("POST", `${serverUrl}/admin/api/sessions`, { jar, body: { quiz_id: create.data.quiz_id } }); - if (!sess.ok) throw new Error(`session create failed: ${sess.status} ${JSON.stringify(sess.data)}`); - return { sid: sess.data.sid, jar }; + const reset = await jsonReq("POST", `${serverUrl}/admin/api/reset`, { jar, body: {} }); + if (!reset.ok) throw new Error(`reset failed: ${reset.status} ${JSON.stringify(reset.data)}`); + return { sid: "main", jar }; } // Student wrapper: join + connect WS + collect messages. diff --git a/tests/stress/live_accuracy.mjs b/tests/stress/live_accuracy.mjs index 01b95ee..e932fde 100644 --- a/tests/stress/live_accuracy.mjs +++ b/tests/stress/live_accuracy.mjs @@ -88,9 +88,11 @@ async function joinStudent(sid, studentId, name) { // Build a Student object: opens the WS, attaches the message listener // IMMEDIATELY (before connection establishes), so no incoming frame is -// ever lost to a listener-attach race. 
Returns a Promise that resolves -// to the bookkeeping struct once the lobby snapshot has arrived. -function makeStudent(sid, cookie, idx) { +// ever lost to a listener-attach race. Returns an object (incl. `state` and +// `lobbyP`); `lobbyP` settles with {ok:true} when the lobby snapshot arrives, +// or {ok:false, err} on WS error / close-before-lobby / per-student timeout. +// Stage-3 must settle inside the timeout regardless of network glitches. +function makeStudent(sid, cookie, idx, lobbyTimeoutMs) { const studentId = `S${String(idx).padStart(3, "0")}`; const ws = new WebSocket(`${wsBase}/ws/student/${SID}`, { headers: { Cookie: cookie }, @@ -105,11 +107,25 @@ function makeStudent(sid, cookie, idx) { closedSeen: new Map(), ended: null, closed: false, + lobbyErr: null, }; - let resolveLobby; - const lobbyP = new Promise((r) => { resolveLobby = r; }); - ws.on("error", () => {}); - ws.on("close", () => { state.closed = true; }); + let settleLobby; + let settled = false; + const lobbyP = new Promise((r) => { settleLobby = r; }); + const settle = (val) => { if (!settled) { settled = true; settleLobby(val); } }; + const timer = setTimeout(() => { + state.lobbyErr = `timeout after ${lobbyTimeoutMs}ms`; + settle({ ok: false, err: state.lobbyErr }); + }, lobbyTimeoutMs); + ws.on("error", (e) => { + state.lobbyErr = `ws error: ${e?.message || e}`; + settle({ ok: false, err: state.lobbyErr }); + }); + ws.on("close", () => { + state.closed = true; + state.lobbyErr ||= "ws closed before lobby"; + settle({ ok: false, err: state.lobbyErr }); + }); ws.on("message", (raw) => { let m; try { m = JSON.parse(raw.toString()); } catch { return; } @@ -117,7 +133,8 @@ function makeStudent(sid, cookie, idx) { case "state": if (m.state === "lobby") { state.inLobby = true; - resolveLobby(); + clearTimeout(timer); + settle({ ok: true }); } break; case "question_open": @@ -149,10 +166,13 @@ function openInstructorWS(adminCookie) { perMessageDeflate: false, }); const ev = { ws, lastQuestionOpen: null }; - let resolveOpen; - 
const openP = new Promise((r) => { resolveOpen = r; }); - ws.on("open", () => resolveOpen()); - ws.on("error", () => {}); + let settle; + let settled = false; + const openP = new Promise((r) => { settle = r; }); + const finish = (val) => { if (!settled) { settled = true; settle(val); } }; + ws.on("open", () => finish({ ok: true })); + ws.on("error", (e) => finish({ ok: false, err: `instructor ws error: ${e?.message || e}` })); + ws.on("close", () => finish({ ok: false, err: "instructor ws closed before open" })); ws.on("message", (raw) => { let m; try { m = JSON.parse(raw.toString()); } catch { return; } if (m.type === "question_open") ev.lastQuestionOpen = m; @@ -179,21 +199,38 @@ async function main() { if ((i + 1) % 10 === 0) process.stdout.write(` joined ${i + 1}/${N}\n`); } - console.log(`[stage 3] opening 1 admin + ${N} student WSs (batched)`); + console.log(`[stage 3] opening 1 admin + ${N} student WSs (parallel)`); const inst = openInstructorWS(adminCookie); - await inst.openP; + const instRes = await Promise.race([ + inst.openP, + sleep(15000).then(() => ({ ok: false, err: "instructor WS did not open within 15s" })), + ]); + if (!instRes.ok) throw new Error(instRes.err); - // Open student WSs in batches of 8, 250ms apart. - const students = []; - const BATCH = 8, GAP_MS = 250; - for (let i = 0; i < cookies.length; i += BATCH) { - const slice = cookies.slice(i, i + BATCH); - const wave = slice.map((c, j) => makeStudent(SID, c, i + j)); - await Promise.all(wave.map((s) => s.lobbyP)); - students.push(...wave.map((s) => s.state)); - if (i + BATCH < cookies.length) await sleep(GAP_MS); + // Open all student WSs in parallel — mirrors what real students do + // (no source-side throttle). Per-student lobby timeout = 12s; if any + // students fail to lobby in time we PROCEED with the survivors and + // log the failure so the cycle records actionable data instead of + // hanging until the outer shell timeout. 
+ const LOBBY_TIMEOUT_MS = 12000; + const wave = cookies.map((c, i) => makeStudent(SID, c, i, LOBBY_TIMEOUT_MS)); + const results = await Promise.all(wave.map((s) => s.lobbyP)); + const survivors = wave.filter((_, i) => results[i].ok).map((s) => s.state); + const failed = results + .map((r, i) => (!r.ok ? { idx: i, err: r.err } : null)) + .filter(Boolean); + if (failed.length) { + console.log(`[stage 3] partial — ${survivors.length}/${N} students lobbied within ${LOBBY_TIMEOUT_MS}ms`); + failed.slice(0, 5).forEach((f) => console.log(` fail S${String(f.idx).padStart(3, "0")}: ${f.err}`)); + // Discard dead WSs cleanly so node doesn't keep them alive + for (let i = 0; i < wave.length; i++) { + if (!results[i].ok) { try { wave[i].state.ws.terminate(); } catch {} } + } + } else { + console.log(`[stage 3] ok — all ${survivors.length} students saw the lobby snapshot`); } - console.log(`[stage 3] ok — all ${students.length} students saw the lobby snapshot`); + if (survivors.length === 0) throw new Error("no students lobbied; aborting cycle"); + const students = survivors; // -- Drive each question --- console.log(`[stage 4] driving ${totalQs} questions via admin "next"`);