From c8a3328e47d65d3c796da649608c3438933eec7b Mon Sep 17 00:00:00 2001
From: Yonatan Dankner <yonatan.dankner@gmail.com>
Date: Fri, 26 Jun 2026 11:29:50 +0200
Subject: [PATCH 1/2] Scrape degree-program branches and add scraper progress
 bars

Crawl the M.Sc. CS / B.Sc. Informatik / M.Sc. ML studiesOffered branches
alongside the VVZ "Gesamtverzeichnis" branch so courses cross-listed from
other faculties (KOG/GTCNEURO/MEDZ/BIOINF) are enumerated. Courses are
deduplicated by unit_id and detail pages are fetched once via the new
ScrapeOptions.skip_unit_ids; --no-programs restores VVZ-only behaviour.

Alias the codes that differ from study_areas.code in the D1 import so the
new courses link to the right study area: M.Sc. ML MACH-* -> ML-* and
B.Sc. Wahlpflicht INFM#### -> PRAK/THEO/TECH/INFO.

Add tqdm progress bars (outer "semesters", inner per-branch "details");
log lines go through tqdm.write so they don't corrupt the bars and --quiet
disables them. Add data_collection/CLAUDE.md and update QUICKSTART.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 backend/scripts/import_alma_json_to_d1.py |  22 +++
 data_collection/CLAUDE.md                 |  99 +++++++++++++
 data_collection/QUICKSTART.md             |  13 +-
 data_collection/alma/cli.py               | 171 ++++++++++++++++------
 data_collection/alma/scraper.py           |  61 +++++++-
 data_collection/pyproject.toml            |   1 +
 data_collection/requirements.txt          |   1 +
 7 files changed, 315 insertions(+), 53 deletions(-)
 create mode 100644 data_collection/CLAUDE.md

diff --git a/backend/scripts/import_alma_json_to_d1.py b/backend/scripts/import_alma_json_to_d1.py
index dab0914..45aeb6c 100644
--- a/backend/scripts/import_alma_json_to_d1.py
+++ b/backend/scripts/import_alma_json_to_d1.py
@@ -497,6 +497,28 @@ def _get_or_create_lecturer(lecturer_id_by_name: dict[str, int], name: str) -> i
 JOIN study_areas AS sa ON sa.code = je.value
 WHERE f."key" = '_categories_json';
 
+-- Some programs expose study-area membership under codes that differ from the
+-- seeded study_areas.code: M.Sc. Machine Learning detail pages use MACH-*
+-- (seeded as ML-*), and B.Sc. Informatik Wahlpflicht modules appear as their
+-- INFM module numbers. Map those aliases so cross-listed courses still link to
+-- the right study area. The original scraped code is kept as source_code.
+INSERT OR IGNORE INTO course_study_area_links (course_id, study_area_id, source_code)
+SELECT f.course_id, sa.id, je.value
+FROM course_fields AS f
+JOIN json_each(f.value) AS je
+JOIN (
+    SELECT 'MACH-FML' AS src, 'ML-FOUND' AS dst
+    UNION ALL SELECT 'MACH-DTML', 'ML-DIVERSE'
+    UNION ALL SELECT 'MACH-GCS', 'ML-CS'
+    UNION ALL SELECT 'MACH-EP', 'ML-EXP'
+    UNION ALL SELECT 'INFM3110', 'PRAK'
+    UNION ALL SELECT 'INFM3410', 'THEO'
+    UNION ALL SELECT 'INFM3310', 'TECH'
+    UNION ALL SELECT 'INFM2510', 'INFO'
+) AS alias ON alias.src = je.value
+JOIN study_areas AS sa ON sa.code = alias.dst
+WHERE f."key" = '_categories_json';
+
 INSERT OR IGNORE INTO course_curriculum_matches (course_id, module_id, match_type, confidence)
 SELECT f.course_id, cm.id, 'category_code', 0.9
 FROM course_fields AS f
diff --git a/data_collection/CLAUDE.md b/data_collection/CLAUDE.md
new file mode 100644
index 0000000..d659c76
--- /dev/null
+++ b/data_collection/CLAUDE.md
@@ -0,0 +1,99 @@
+# CLAUDE.md — data_collection
+
+Project instructions for the ALMA course-catalog scraper. Read this before
+changing anything under `data_collection/`. For runnable commands see
+[`QUICKSTART.md`](QUICKSTART.md); for environment setup see [`SETUP.md`](SETUP.md).
+
+## What this is
+
+A standalone Python scraper for the public ALMA course catalog at
+`alma.uni-tuebingen.de`. It crawls the JSF catalog tree, fetches course detail
+pages, and writes a JSON file. That JSON is then turned into the D1 seed by
+`backend/scripts/import_alma_json_to_d1.py` (a separate step — the scraper does
+not touch the database).
+
+- `alma/scraper.py` — `AlmaScraper` (session, JSF navigation, parsing) + pure
+  parse helpers.
+- `alma/cli.py` — argparse entry point (`python -m alma.cli`), single-period and
+  multi-period orchestration.
+- Output: `output/<timestamp>/courses_multi_semester.json` (multi-period) or
+  `courses.json` (single).
+
+## The two catalog trees (important)
+
+ALMA exposes the same courses through two different trees of the public
+`showCourseCatalog-flow`:
+
+1. **VVZ** — "Gesamtverzeichnis Lehrveranstaltungen Informatik". A flat-ish
+   per-faculty listing. Includes department-wide offerings (Oberseminare,
+   Kolloquien, info events, Mathe-Vorkurs) that are **not** tied to any degree
+   module. None of those award ECTS.
+2. **studiesOffered** — degree programs (B.Sc./M.Sc. ...). Each program tree is
+   `[Modul] <study-area> → [Veranstaltungskonto] → [Veranstaltungsgruppe] (N CP)
+   → [Veranstaltung]`. It lists courses **cross-listed from other faculties**
+   (KOG, GTCNEURO, MEDZ, BIOINF) that count toward a study area but are absent
+   from the VVZ Informatik branch.
+
+Neither tree is a superset of the other, so in multi-period mode the scraper
+crawls **both**: the VVZ branch (`INFORMATICS_BRANCH_CHAIN`) plus the
+degree-program branches (`PROGRAM_BRANCH_CHAINS`: M.Sc. CS, B.Sc. Informatik,
+M.Sc. ML). Courses are deduplicated by `unit_id`; `ScrapeOptions.skip_unit_ids`
+stops later branches from re-fetching detail pages an earlier branch already
+fetched. `--no-programs` falls back to VVZ-only. See `cli._scrape_period_branches`.
+
+The logged-in "Studienplaner mit Modulplan" (`studyPlanner-flow`) returns **403
+anonymously** — do not target it. The studiesOffered tree above is the
+anonymously-reachable equivalent.
+
+## Period ids ↔ semesters
+
+Period ids are opaque ALMA ints; the mapping is **not** chronological by number:
+
+| id | semester | id | semester |
+|----|----------|----|----------|
+| 225 | SoSe 2022 | 233 | WiSe 2022/23 |
+| 226 | SoSe 2023 | 234 | WiSe 2023/24 |
+| 227 | SoSe 2024 | 235 | WiSe 2024/25 |
+| 228 | SoSe 2025 | 236 | WiSe 2025/26 |
+| 229 | SoSe 2026 | | |
+
+`--from-semester LABEL` selects every period at or after `LABEL`
+(`parse_semester_tuple` understands e.g. `"Sommer 2026"`, `"Winter 2022/23"`).
+Deep-path `title:NNNN` ids differ per period, so branches are rediscovered each
+period by title chain via `find_branch_permalink`.
+
+## Study-area attribution (how courses link to INFO-INFO etc.)
+
+Each course detail page has a "Module / Studiengänge" table; the scraper stores
+those codes as the `_categories_json` course field. The importer joins them to
+`study_areas.code`. Codes mostly match directly (M.Sc. CS: `INFO-BASIS`,
+`INFO-FOKUS`, `INFO-INFO`, `INFO-PRAK`, `INFO-TECH`, `INFO-THEO`), but some need
+aliasing — handled in `import_alma_json_to_d1.py`:
+
+- M.Sc. ML detail pages use `MACH-*`; seeded study areas are `ML-*`
+  (`MACH-FML→ML-FOUND`, `MACH-DTML→ML-DIVERSE`, `MACH-GCS→ML-CS`, `MACH-EP→ML-EXP`).
+- B.Sc. Wahlpflicht appears as `INFM####` (`INFM3110→PRAK`, `INFM3410→THEO`,
+  `INFM3310→TECH`, `INFM2510→INFO`).
+
+Enumeration is the hard part: once a cross-listed course is scraped and its
+detail page fetched, the importer's category-code join links it to its study
+area. B.Sc. *compulsory* modules (Mathe, Teamprojekt) carry no Wahlpflicht
+code, so they are enumerated but not category-linked (known gap).
+
+## Gotchas
+
+- **Mojibake**: ALMA text often arrives UTF-8-as-cp1252. Use `repair_mojibake`
+  before comparing/printing titles; never assume clean text.
+- **Politeness**: keep `polite_delay` between requests; do not parallelize.
+- **Progress**: `tqdm` shows an outer "semesters" bar and an inner per-branch
+  detail bar. Log lines go through `tqdm.write` so they don't corrupt the bars.
+  `--quiet` hides bars and progress messages; `progress.json` is still written per course.
+- Coverage is scoped to the three Informatik programs above. Adding e.g.
+  Medieninformatik (which is where `User Experience` lives) is a one-entry
+  addition to `PROGRAM_BRANCH_CHAINS` plus any needed code aliases.
+
+## Conventions
+
+Follow the repo-wide `AGENTS.md`. Python: explicit type hints, small pure
+helpers, comments explain *why*. New scraper logic should be exercised by a real
+run against one period before merging (no DB write needed).
diff --git a/data_collection/QUICKSTART.md b/data_collection/QUICKSTART.md
index 7c94edc..998856d 100644
--- a/data_collection/QUICKSTART.md
+++ b/data_collection/QUICKSTART.md
@@ -51,14 +51,21 @@ Output: `output/YYYY-MM-DD_HH-MM-SS/courses.json`
 ### Multiple semesters
 
 Scrape every semester from a given label up to the most recent. Per-period
-the scraper switches via ALMA's Semesterauswahl dropdown and rediscovers the
-Informatik branch by title chain (the deep-path IDs differ between
-semesters):
+the scraper switches via ALMA's Semesterauswahl dropdown and rediscovers each
+branch by title chain (the deep-path IDs differ between semesters):
 
 ```powershell
 uv run python -m alma_scraper.cli --details --from-semester "Sommer 2022"
 ```
 
+In multi-period mode the scraper crawls the VVZ "Gesamtverzeichnis
+Lehrveranstaltungen Informatik" branch **and** the degree-program branches
+(M.Sc. Computer Science, B.Sc. Informatik, M.Sc. Machine Learning). The
+program branches surface courses cross-listed from other faculties that count
+toward a study area but are missing from the VVZ branch. Courses shared
+between branches are deduplicated by `unit_id` and their detail pages are
+fetched only once. Pass `--no-programs` to crawl the VVZ branch alone.
+
 Each course in the output gets `period_id` and `period_label` fields so you
 can tell semesters apart. The output file is rewritten after every period,
 so an interrupted run still leaves a usable file.
diff --git a/data_collection/alma/cli.py b/data_collection/alma/cli.py
index 789df1e..f033762 100644
--- a/data_collection/alma/cli.py
+++ b/data_collection/alma/cli.py
@@ -6,6 +6,8 @@
 from datetime import datetime
 from pathlib import Path
 
+from tqdm import tqdm
+
 from .scraper import (
     AlmaScraper,
     PeriodOption,
@@ -56,6 +58,16 @@ def build_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="Do not skip older '(Version YYYY)' catalog branches.",
     )
+    parser.add_argument(
+        "--no-programs",
+        action="store_true",
+        help=(
+            "Multi-period only: scrape just the VVZ 'Gesamtverzeichnis' branch "
+            "and skip the degree-program branches (M.Sc. CS, B.Sc. Informatik, "
+            "M.Sc. ML). By default those program branches are also scraped so "
+            "courses cross-listed from other faculties are included."
+        ),
+    )
     parser.add_argument(
         "--timeout",
         type=float,
@@ -170,8 +182,9 @@ def build_parser() -> argparse.ArgumentParser:
 def main() -> None:
     args = build_parser().parse_args()
     def progress(message: str) -> None:
+        # tqdm.write keeps log lines from corrupting any active progress bar.
         if not args.quiet:
-            print(message, file=sys.stderr, flush=True)
+            tqdm.write(message, file=sys.stderr)
 
     scraper = AlmaScraper(timeout=args.timeout, progress=progress)
 
@@ -349,10 +362,99 @@ def _run_single_period_scrape(
         checkpoint_every=args.checkpoint_every,
         max_runtime_seconds=args.max_runtime_seconds,
         max_expansions=args.max_expansions,
+        progress_bar=not args.quiet,
+        progress_label="courses",
     )
     return scraper.scrape(options)
 
 
+def _scrape_period_branches(
+    scraper: AlmaScraper,
+    args: argparse.Namespace,
+    period: PeriodOption,
+    progress_path: str,
+) -> tuple[list[dict], list[dict], bool] | None:
+    """Scrape the VVZ Informatik branch and, unless ``--no-programs``, the
+    degree-program branches for one period, then merge the results.
+
+    The VVZ "Gesamtverzeichnis" branch is the base: if it cannot be located
+    the whole period is skipped (``None`` is returned). Program branches that
+    cannot be located are skipped individually with a warning, since not every
+    program necessarily exists for every archived semester.
+
+    Courses are deduplicated by ``unit_id`` (first branch wins), and the
+    ``skip_unit_ids`` option keeps later branches from re-fetching detail pages
+    for courses an earlier branch already collected. Catalog nodes are
+    deduplicated by ``node_id`` (they share absolute, root-relative ids across
+    branches). Every kept course/node is tagged with the period.
+
+    Returns ``(courses, catalog_nodes, partial)`` where ``partial`` is true if
+    any branch crawl hit a runtime/expansion limit.
+    """
+    chains: list[tuple[str, ...]] = [AlmaScraper.INFORMATICS_BRANCH_CHAIN]
+    if not args.no_programs:
+        chains.extend(AlmaScraper.PROGRAM_BRANCH_CHAINS)
+
+    merged_courses: dict[str, dict] = {}
+    merged_nodes: dict[str, dict] = {}
+    seen_unit_ids: set[str] = set()
+    partial = False
+
+    for chain in chains:
+        is_base = chain is AlmaScraper.INFORMATICS_BRANCH_CHAIN
+        permalink = scraper.find_branch_permalink(period.period_id, chain)
+        if not permalink:
+            if is_base:
+                return None
+            tqdm.write(
+                f"  ! branch {chain[-1]!r} not found for {period.label}; "
+                "skipping this branch",
+                file=sys.stderr,
+            )
+            continue
+
+        branch_label = "VVZ" if is_base else chain[-1].split(" (")[0]
+        options = ScrapeOptions(
+            start_url=permalink,
+            branch_title=None,
+            max_depth=args.max_depth,
+            max_courses=args.max_courses,
+            fetch_details=args.details or args.full_catalog,
+            latest_versions_only=not args.include_old_versions,
+            progress_file=progress_path,
+            checkpoint_path=None,
+            checkpoint_every=args.checkpoint_every,
+            max_runtime_seconds=args.max_runtime_seconds,
+            max_expansions=args.max_expansions,
+            restrict_to_start_path=True,
+            skip_unit_ids=frozenset(seen_unit_ids) or None,
+            progress_bar=not args.quiet,
+            progress_label=f"{period.label}: {branch_label}",
+        )
+        result = scraper.scrape(options)
+        partial = partial or bool(result["source"].get("partial"))
+
+        for course in result["courses"]:
+            key = course.get("unit_id") or course.get("detail_url")
+            if not key or key in merged_courses:
+                continue
+            merged_courses[key] = course
+            if course.get("unit_id"):
+                seen_unit_ids.add(course["unit_id"])
+        for node in result["catalog_nodes"]:
+            merged_nodes.setdefault(node["node_id"], node)
+
+    courses = list(merged_courses.values())
+    nodes = list(merged_nodes.values())
+    for course in courses:
+        course["period_id"] = period.period_id
+        course["period_label"] = period.label
+    for node in nodes:
+        node["period_id"] = period.period_id
+        node["period_label"] = period.label
+    return courses, nodes, partial
+
+
 def _run_multi_period_scrape(
     scraper: AlmaScraper,
     args: argparse.Namespace,
@@ -362,14 +464,12 @@ def _run_multi_period_scrape(
 ) -> dict:
     """Run the scraper once per discovered period and merge results.
 
-    For each period:
-      1. Switch to that period via the Semesterauswahl dropdown
-         (:meth:`AlmaScraper.find_branch_permalink`), discovering the
-         period-specific Informatik permalink because the deep-path title
-         IDs are not stable across semesters.
-      2. Scrape using that permalink as ``start_url`` so the existing
-         start-path scoping just works.
-      3. Tag every course with ``period_id`` and ``period_label``.
+    For each period, :func:`_scrape_period_branches` discovers the
+    period-specific permalink for the VVZ Informatik branch and (unless
+    ``--no-programs``) the degree-program branches, scrapes each, and merges
+    them. Branch permalinks are rediscovered per period because the deep-path
+    title IDs are not stable across semesters. Every kept course is tagged
+    with ``period_id`` and ``period_label``.
 
     A checkpoint is written after every period so an interrupted run still
     leaves a usable output file.
@@ -425,21 +525,16 @@ def _run_multi_period_scrape(
             flush=True,
         )
 
-    for index, period in enumerate(remaining, start=1):
-        print(
-            f"=== [{index}/{len(remaining)}] period {period.period_id} "
-            f"({period.label}) ===",
-            file=sys.stderr,
-            flush=True,
-        )
-        permalink = scraper.find_branch_permalink(
-            period.period_id, AlmaScraper.INFORMATICS_BRANCH_CHAIN
-        )
-        if not permalink:
-            print(
+    period_bar = tqdm(
+        remaining, desc="semesters", unit="sem", disable=args.quiet,
+    )
+    for period in period_bar:
+        period_bar.set_postfix_str(period.label)
+        scraped = _scrape_period_branches(scraper, args, period, progress_path)
+        if scraped is None:
+            tqdm.write(
                 f"  ! could not find Informatik branch for {period.label}; skipping",
                 file=sys.stderr,
-                flush=True,
             )
             per_period_summary.append(
                 {
@@ -456,36 +551,16 @@ def _run_multi_period_scrape(
             )
             continue
 
-        period_options = ScrapeOptions(
-            start_url=permalink,
-            branch_title=None,
-            max_depth=args.max_depth,
-            max_courses=args.max_courses,
-            fetch_details=args.details or args.full_catalog,
-            latest_versions_only=not args.include_old_versions,
-            progress_file=progress_path,
-            checkpoint_path=None,
-            checkpoint_every=args.checkpoint_every,
-            max_runtime_seconds=args.max_runtime_seconds,
-            max_expansions=args.max_expansions,
-            restrict_to_start_path=True,
-        )
-        result = scraper.scrape(period_options)
-        for course in result["courses"]:
-            course["period_id"] = period.period_id
-            course["period_label"] = period.label
-        for node in result["catalog_nodes"]:
-            node["period_id"] = period.period_id
-            node["period_label"] = period.label
-        all_courses.extend(result["courses"])
-        all_catalog_nodes.extend(result["catalog_nodes"])
+        period_courses, period_nodes, partial = scraped
+        all_courses.extend(period_courses)
+        all_catalog_nodes.extend(period_nodes)
         per_period_summary.append(
             {
                 "period_id": period.period_id,
                 "period_label": period.label,
-                "courses": len(result["courses"]),
-                "catalog_nodes": len(result["catalog_nodes"]),
-                "partial": bool(result["source"].get("partial")),
+                "courses": len(period_courses),
+                "catalog_nodes": len(period_nodes),
+                "partial": partial,
             }
         )
         _write_multi_period_checkpoint(
diff --git a/data_collection/alma/scraper.py b/data_collection/alma/scraper.py
index a884676..7d7752c 100644
--- a/data_collection/alma/scraper.py
+++ b/data_collection/alma/scraper.py
@@ -12,6 +12,7 @@
 
 import requests
 from bs4 import BeautifulSoup, Tag
+from tqdm import tqdm
 
 
 CATALOG_PREFIX = "hierarchy:content-container:courseCatalogFieldset:courseCatalog:"
@@ -45,6 +46,16 @@ class ScrapeOptions:
     restrict_to_start_path: bool = True
     max_runtime_seconds: int | None = None
     max_expansions: int | None = None
+    # unit_ids already collected by an earlier branch in the same period.
+    # Courses with these ids are dropped before detail fetching so the
+    # multi-branch crawl does not re-download shared courses.
+    skip_unit_ids: frozenset[str] | None = None
+    # Show a tqdm progress bar over the detail-fetch loop. When on, the noisy
+    # per-course stderr message is suppressed (the bar replaces it); the
+    # progress_file is still written every course.
+    progress_bar: bool = False
+    # Short label for the detail progress bar (e.g. "SoSe 2026 · M.Sc. CS").
+    progress_label: str | None = None
 
 
 @dataclass(slots=True)
@@ -105,6 +116,29 @@ class AlmaScraper:
         "Informatik",
         "Gesamtverzeichnis Lehrveranstaltungen Informatik",
     )
+    # Degree-program branches under studiesOffered. Their module trees list
+    # courses (incl. ones cross-listed from other faculties) that count toward
+    # a study area but are absent from the VVZ "Gesamtverzeichnis" branch. The
+    # last chain entry is matched as a case-insensitive substring against
+    # catalog row titles, so the "(Version 2021)" suffix keeps it on the
+    # current examination regulation.
+    PROGRAM_BRANCH_CHAINS: tuple[tuple[str, ...], ...] = (
+        (
+            "Mathematisch-Naturwissenschaftliche",
+            "Informatik",
+            "M.Sc. Informatik / Computer Science (Version 2021)",
+        ),
+        (
+            "Mathematisch-Naturwissenschaftliche",
+            "Informatik",
+            "B.Sc. Informatik (Version 2021)",
+        ),
+        (
+            "Mathematisch-Naturwissenschaftliche",
+            "Informatik",
+            "M.Sc. Machine Learning (Version 2021)",
+        ),
+    )
 
     def __init__(
         self,
@@ -295,12 +329,23 @@ def scrape(self, options: ScrapeOptions) -> dict[str, Any]:
                 if any(needle in title.casefold() for title in node.path_titles)
             ]
         courses = unique_courses(courses)
+        if options.skip_unit_ids:
+            courses = [
+                node for node in courses if node.unit_id not in options.skip_unit_ids
+            ]
         courses.sort(key=lambda item: item.node_id)
         if options.max_courses is not None:
             courses = courses[: options.max_courses]
 
         course_records: list[dict[str, Any]] = []
         total_courses = len(courses)
+        detail_bar = tqdm(
+            total=total_courses,
+            desc=options.progress_label or "details",
+            unit="course",
+            leave=False,
+            disable=not options.progress_bar,
+        )
         for index, course in enumerate(courses, start=1):
             if self._runtime_exceeded(options, started_at):
                 self._progress(
@@ -311,10 +356,12 @@ def scrape(self, options: ScrapeOptions) -> dict[str, Any]:
                     total_courses=total_courses,
                 )
                 break
+            # When the bar is on it replaces the noisy per-course stderr line.
             self._progress(
                 options,
                 "details",
                 f"Fetching detail {index}/{total_courses}: {course.title}",
+                to_stderr=not options.progress_bar,
                 course_index=index,
                 total_courses=total_courses,
                 current_course=course.title,
@@ -324,8 +371,10 @@ def scrape(self, options: ScrapeOptions) -> dict[str, Any]:
                 record["details"] = self.fetch_course_details(course.detail_url)
                 time.sleep(self.polite_delay)
             course_records.append(record)
+            detail_bar.update(1)
             if options.checkpoint_path and index % max(options.checkpoint_every, 1) == 0:
                 self._write_checkpoint(options, started_at, course_records)
+        detail_bar.close()
 
         result = {
             "source": {
@@ -706,7 +755,15 @@ def _runtime_exceeded(options: ScrapeOptions, started_at: int) -> bool:
             return False
         return time.time() - started_at >= options.max_runtime_seconds
 
-    def _progress(self, options: ScrapeOptions, stage: str, message: str, **extra: Any) -> None:
+    def _progress(
+        self,
+        options: ScrapeOptions,
+        stage: str,
+        message: str,
+        *,
+        to_stderr: bool = True,
+        **extra: Any,
+    ) -> None:
         payload = {
             "stage": stage,
             "message": message,
@@ -715,7 +772,7 @@ def _progress(self, options: ScrapeOptions, stage: str, message: str, **extra: A
             "skipped_old_version_nodes": len(self.skipped_old_version_node_ids),
             **extra,
         }
-        if self.progress:
+        if self.progress and to_stderr:
             self.progress(message)
         if options.progress_file:
             self._write_json(Path(options.progress_file), payload)
diff --git a/data_collection/pyproject.toml b/data_collection/pyproject.toml
index bc6f366..2f39d71 100644
--- a/data_collection/pyproject.toml
+++ b/data_collection/pyproject.toml
@@ -7,6 +7,7 @@ requires-python = ">=3.11"
 dependencies = [
     "beautifulsoup4>=4.12",
     "requests>=2.31",
+    "tqdm>=4.62",
 ]
 
 [build-system]
diff --git a/data_collection/requirements.txt b/data_collection/requirements.txt
index e35775c..8b078a4 100644
--- a/data_collection/requirements.txt
+++ b/data_collection/requirements.txt
@@ -1,2 +1,3 @@
 beautifulsoup4>=4.12
 requests>=2.31
+tqdm>=4.62

From 6133ead05ca79c465042df06908339054be671dc Mon Sep 17 00:00:00 2001
From: Yonatan Dankner <yonatan.dankner@gmail.com>
Date: Fri, 26 Jun 2026 11:51:35 +0200
Subject: [PATCH 2/2] Address review feedback on the program-branch scrape

- Fix the per-branch `partial` flag to also reflect the --max-expansions
  crawl limit (it previously tracked only the runtime limit), so a branch
  that stops early is not silently recorded complete and skipped by
  --continue.
- Program-scope the study-area alias join (JOIN study_programs) so the
  generic B.Sc. codes PRAK/THEO/TECH/INFO cannot mislink a course to
  another program that happens to reuse the same bare code.
- Close both tqdm bars via context managers so an exception mid-run no
  longer leaks a bar / corrupts the terminal.
- Fix progress-label example in the ScrapeOptions docstring.
- Correct the module name in QUICKSTART.md / SETUP.md (alma.cli, not the
  non-existent alma_scraper.cli).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 backend/scripts/import_alma_json_to_d1.py | 24 ++++---
 data_collection/QUICKSTART.md             | 18 +++---
 data_collection/SETUP.md                  |  4 +-
 data_collection/alma/cli.py               | 76 +++++++++++------------
 data_collection/alma/scraper.py           | 60 +++++++++---------
 5 files changed, 95 insertions(+), 87 deletions(-)

diff --git a/backend/scripts/import_alma_json_to_d1.py b/backend/scripts/import_alma_json_to_d1.py
index 45aeb6c..8c57473 100644
--- a/backend/scripts/import_alma_json_to_d1.py
+++ b/backend/scripts/import_alma_json_to_d1.py
@@ -501,22 +501,26 @@ def _get_or_create_lecturer(lecturer_id_by_name: dict[str, int], name: str) -> i
 -- seeded study_areas.code: M.Sc. Machine Learning detail pages use MACH-*
 -- (seeded as ML-*), and B.Sc. Informatik Wahlpflicht modules appear as their
 -- INFM module numbers. Map those aliases so cross-listed courses still link to
--- the right study area. The original scraped code is kept as source_code.
+-- the right study area. The alias destination is scoped to its program
+-- (study_areas.code is only unique per program; the B.Sc. codes PRAK/THEO/
+-- TECH/INFO are deliberately generic), and the original scraped code is kept
+-- as source_code.
 INSERT OR IGNORE INTO course_study_area_links (course_id, study_area_id, source_code)
 SELECT f.course_id, sa.id, je.value
 FROM course_fields AS f
 JOIN json_each(f.value) AS je
 JOIN (
-    SELECT 'MACH-FML' AS src, 'ML-FOUND' AS dst
-    UNION ALL SELECT 'MACH-DTML', 'ML-DIVERSE'
-    UNION ALL SELECT 'MACH-GCS', 'ML-CS'
-    UNION ALL SELECT 'MACH-EP', 'ML-EXP'
-    UNION ALL SELECT 'INFM3110', 'PRAK'
-    UNION ALL SELECT 'INFM3410', 'THEO'
-    UNION ALL SELECT 'INFM3310', 'TECH'
-    UNION ALL SELECT 'INFM2510', 'INFO'
+    SELECT 'MACH-FML' AS src, 'MSC_ML_2021' AS prog, 'ML-FOUND' AS dst
+    UNION ALL SELECT 'MACH-DTML', 'MSC_ML_2021', 'ML-DIVERSE'
+    UNION ALL SELECT 'MACH-GCS', 'MSC_ML_2021', 'ML-CS'
+    UNION ALL SELECT 'MACH-EP', 'MSC_ML_2021', 'ML-EXP'
+    UNION ALL SELECT 'INFM3110', 'BSC_INFO_2021', 'PRAK'
+    UNION ALL SELECT 'INFM3410', 'BSC_INFO_2021', 'THEO'
+    UNION ALL SELECT 'INFM3310', 'BSC_INFO_2021', 'TECH'
+    UNION ALL SELECT 'INFM2510', 'BSC_INFO_2021', 'INFO'
 ) AS alias ON alias.src = je.value
-JOIN study_areas AS sa ON sa.code = alias.dst
+JOIN study_programs AS sp ON sp.code = alias.prog
+JOIN study_areas AS sa ON sa.program_id = sp.id AND sa.code = alias.dst
 WHERE f."key" = '_categories_json';
 
 INSERT OR IGNORE INTO course_curriculum_matches (course_id, module_id, match_type, confidence)
diff --git a/data_collection/QUICKSTART.md b/data_collection/QUICKSTART.md
index 998856d..a6e00ac 100644
--- a/data_collection/QUICKSTART.md
+++ b/data_collection/QUICKSTART.md
@@ -11,7 +11,7 @@
 
 2. **Run scraper:**
    ```powershell
-   uv run python -m alma_scraper.cli --details
+   uv run python -m alma.cli --details
    ```
 
 ### Option 2: Using `pip`
@@ -29,7 +29,7 @@
 
 3. **Run scraper:**
    ```powershell
-   python -m alma_scraper.cli --details
+   python -m alma.cli --details
    ```
 
 ## Usage
@@ -40,7 +40,7 @@ Scrape the Informatik course catalog (Gesamtverzeichnis Lehrveranstaltungen
 Informatik) with course details:
 
 ```powershell
-uv run python -m alma_scraper.cli --details
+uv run python -m alma.cli --details
 ```
 
 Each course detail includes a `categories` list — the module/study-program
@@ -55,7 +55,7 @@ the scraper switches via ALMA's Semesterauswahl dropdown and rediscovers each
 branch by title chain (the deep-path IDs differ between semesters):
 
 ```powershell
-uv run python -m alma_scraper.cli --details --from-semester "Sommer 2022"
+uv run python -m alma.cli --details --from-semester "Sommer 2022"
 ```
 
 In multi-period mode the scraper crawls the VVZ "Gesamtverzeichnis
@@ -73,7 +73,7 @@ so an interrupted run still leaves a usable file.
 If a run was interrupted, resume it without redoing the completed semesters:
 
 ```powershell
-uv run python -m alma_scraper.cli --details --continue output/<timestamp>/courses_multi_semester.json
+uv run python -m alma.cli --details --continue output/<timestamp>/courses_multi_semester.json
 ```
 
 Fully completed periods are kept and skipped; partial or skipped ones are
@@ -82,7 +82,7 @@ redone. Output is written back to the same path.
 ### List available semesters
 
 ```powershell
-uv run python -m alma_scraper.cli --list-periods
+uv run python -m alma.cli --list-periods
 ```
 
 ### Quick Test (2 minutes)
@@ -90,7 +90,7 @@ uv run python -m alma_scraper.cli --list-periods
 Test scraping:
 
 ```powershell
-uv run python -m alma_scraper.cli --details --max-runtime-seconds 120
+uv run python -m alma.cli --details --max-runtime-seconds 120
 ```
 
 ### Full Catalog
@@ -98,7 +98,7 @@ uv run python -m alma_scraper.cli --details --max-runtime-seconds 120
 Scrape entire university:
 
 ```powershell
-uv run python -m alma_scraper.cli --full-catalog
+uv run python -m alma.cli --full-catalog
 ```
 
 ### Watch Progress
@@ -129,4 +129,4 @@ output/
 - `--pretty` - Pretty-print JSON
 - `--list-periods` - Print available period IDs and labels
 
-For full help: `uv run python -m alma_scraper.cli --help`
+For full help: `uv run python -m alma.cli --help`
diff --git a/data_collection/SETUP.md b/data_collection/SETUP.md
index 37fc872..b8878ea 100644
--- a/data_collection/SETUP.md
+++ b/data_collection/SETUP.md
@@ -9,7 +9,7 @@
 
 2. **Run scraper:**
    ```powershell
-   uv run python -m alma_scraper.cli --details
+   uv run python -m alma.cli --details
    ```
 
 ## Option 2: Using `pip` (Virtual Environment)
@@ -27,7 +27,7 @@
 
 3. **Run scraper:**
    ```powershell
-   python -m alma_scraper.cli --details
+   python -m alma.cli --details
    ```
 
 ## Output
diff --git a/data_collection/alma/cli.py b/data_collection/alma/cli.py
index f033762..b23c40a 100644
--- a/data_collection/alma/cli.py
+++ b/data_collection/alma/cli.py
@@ -525,52 +525,52 @@ def _run_multi_period_scrape(
             flush=True,
         )
 
-    period_bar = tqdm(
-        remaining, desc="semesters", unit="sem", disable=args.quiet,
-    )
-    for period in period_bar:
-        period_bar.set_postfix_str(period.label)
-        scraped = _scrape_period_branches(scraper, args, period, progress_path)
-        if scraped is None:
-            tqdm.write(
-                f"  ! could not find Informatik branch for {period.label}; skipping",
-                file=sys.stderr,
-            )
+    # Context-managed so the outer bar is always closed, even if a period
+    # scrape raises partway through the run.
+    with tqdm(remaining, desc="semesters", unit="sem", disable=args.quiet) as period_bar:
+        for period in period_bar:
+            period_bar.set_postfix_str(period.label)
+            scraped = _scrape_period_branches(scraper, args, period, progress_path)
+            if scraped is None:
+                tqdm.write(
+                    f"  ! could not find Informatik branch for {period.label}; skipping",
+                    file=sys.stderr,
+                )
+                per_period_summary.append(
+                    {
+                        "period_id": period.period_id,
+                        "period_label": period.label,
+                        "courses": 0,
+                        "catalog_nodes": 0,
+                        "skipped": True,
+                    }
+                )
+                _write_multi_period_checkpoint(
+                    out_path, args, periods, per_period_summary,
+                    all_catalog_nodes, all_courses,
+                )
+                continue
+
+            period_courses, period_nodes, partial = scraped
+            all_courses.extend(period_courses)
+            all_catalog_nodes.extend(period_nodes)
             per_period_summary.append(
                 {
                     "period_id": period.period_id,
                     "period_label": period.label,
-                    "courses": 0,
-                    "catalog_nodes": 0,
-                    "skipped": True,
+                    "courses": len(period_courses),
+                    "catalog_nodes": len(period_nodes),
+                    "partial": partial,
                 }
             )
             _write_multi_period_checkpoint(
-                out_path, args, periods, per_period_summary,
-                all_catalog_nodes, all_courses,
+                out_path,
+                args,
+                periods,
+                per_period_summary,
+                all_catalog_nodes,
+                all_courses,
             )
-            continue
-
-        period_courses, period_nodes, partial = scraped
-        all_courses.extend(period_courses)
-        all_catalog_nodes.extend(period_nodes)
-        per_period_summary.append(
-            {
-                "period_id": period.period_id,
-                "period_label": period.label,
-                "courses": len(period_courses),
-                "catalog_nodes": len(period_nodes),
-                "partial": partial,
-            }
-        )
-        _write_multi_period_checkpoint(
-            out_path,
-            args,
-            periods,
-            per_period_summary,
-            all_catalog_nodes,
-            all_courses,
-        )
 
     return _multi_period_result(
         args, periods, per_period_summary, all_catalog_nodes, all_courses
diff --git a/data_collection/alma/scraper.py b/data_collection/alma/scraper.py
index 7d7752c..78f4318 100644
--- a/data_collection/alma/scraper.py
+++ b/data_collection/alma/scraper.py
@@ -54,7 +54,7 @@ class ScrapeOptions:
     # per-course stderr message is suppressed (the bar replaces it); the
     # progress_file is still written every course.
     progress_bar: bool = False
-    # Short label for the detail progress bar (e.g. "SoSe 2026 · M.Sc. CS").
+    # Short label for the detail progress bar (e.g. "SoSe 2026: VVZ").
     progress_label: str | None = None
 
 
@@ -339,42 +339,43 @@ def scrape(self, options: ScrapeOptions) -> dict[str, Any]:
 
         course_records: list[dict[str, Any]] = []
         total_courses = len(courses)
-        detail_bar = tqdm(
+        # Context-managed so the bar is always closed, even if a detail fetch
+        # raises, instead of leaving a corrupted terminal line behind.
+        with tqdm(
             total=total_courses,
             desc=options.progress_label or "details",
             unit="course",
             leave=False,
             disable=not options.progress_bar,
-        )
-        for index, course in enumerate(courses, start=1):
-            if self._runtime_exceeded(options, started_at):
+        ) as detail_bar:
+            for index, course in enumerate(courses, start=1):
+                if self._runtime_exceeded(options, started_at):
+                    self._progress(
+                        options,
+                        "paused",
+                        f"Runtime limit reached before detail {index}/{total_courses}",
+                        course_index=index,
+                        total_courses=total_courses,
+                    )
+                    break
+                # When the bar is on, it replaces the noisy per-course stderr line.
                 self._progress(
                     options,
-                    "paused",
-                    f"Runtime limit reached before detail {index}/{total_courses}",
+                    "details",
+                    f"Fetching detail {index}/{total_courses}: {course.title}",
+                    to_stderr=not options.progress_bar,
                     course_index=index,
                     total_courses=total_courses,
+                    current_course=course.title,
                 )
-                break
-            # When the bar is on it replaces the noisy per-course stderr line.
-            self._progress(
-                options,
-                "details",
-                f"Fetching detail {index}/{total_courses}: {course.title}",
-                to_stderr=not options.progress_bar,
-                course_index=index,
-                total_courses=total_courses,
-                current_course=course.title,
-            )
-            record = asdict(course)
-            if options.fetch_details and course.detail_url:
-                record["details"] = self.fetch_course_details(course.detail_url)
-                time.sleep(self.polite_delay)
-            course_records.append(record)
-            detail_bar.update(1)
-            if options.checkpoint_path and index % max(options.checkpoint_every, 1) == 0:
-                self._write_checkpoint(options, started_at, course_records)
-        detail_bar.close()
+                record = asdict(course)
+                if options.fetch_details and course.detail_url:
+                    record["details"] = self.fetch_course_details(course.detail_url)
+                    time.sleep(self.polite_delay)
+                course_records.append(record)
+                detail_bar.update(1)
+                if options.checkpoint_path and index % max(options.checkpoint_every, 1) == 0:
+                    self._write_checkpoint(options, started_at, course_records)
 
         result = {
             "source": {
@@ -382,7 +383,10 @@ def scrape(self, options: ScrapeOptions) -> dict[str, Any]:
                 "branch_title": branch_title,
                 "latest_versions_only": options.latest_versions_only,
                 "skipped_old_version_nodes": sorted(self.skipped_old_version_node_ids),
-                "partial": self._runtime_exceeded(options, started_at),
+                # timed_out covers both the runtime and max-expansions crawl
+                # limits; the runtime check also catches the detail-fetch loop
+                # breaking early. Either means the period is incomplete.
+                "partial": timed_out or self._runtime_exceeded(options, started_at),
                 "fetched_at_unix": started_at,
                 "finished_at_unix": int(time.time()),
             },