From c8a3328e47d65d3c796da649608c3438933eec7b Mon Sep 17 00:00:00 2001 From: Yonatan Dankner Date: Fri, 26 Jun 2026 11:29:50 +0200 Subject: [PATCH 1/2] Scrape degree-program branches and add scraper progress bars Crawl the M.Sc. CS / B.Sc. Informatik / M.Sc. ML studiesOffered branches alongside the VVZ "Gesamtverzeichnis" branch so courses cross-listed from other faculties (KOG/GTCNEURO/MEDZ/BIOINF) are enumerated. Courses are deduplicated by unit_id and detail pages are fetched once via the new ScrapeOptions.skip_unit_ids; --no-programs restores VVZ-only behaviour. Alias the codes that differ from study_areas.code in the D1 import so the new courses link to the right study area: M.Sc. ML MACH-* -> ML-* and B.Sc. Wahlpflicht INFM#### -> PRAK/THEO/TECH/INFO. Add tqdm progress bars (outer "semesters", inner per-branch "details"); log lines go through tqdm.write so they don't corrupt the bars and --quiet disables them. Add data_collection/CLAUDE.md and update QUICKSTART. Co-Authored-By: Claude Opus 4.8 --- backend/scripts/import_alma_json_to_d1.py | 22 +++ data_collection/CLAUDE.md | 99 +++++++++++++ data_collection/QUICKSTART.md | 13 +- data_collection/alma/cli.py | 171 ++++++++++++++++------ data_collection/alma/scraper.py | 61 +++++++- data_collection/pyproject.toml | 1 + data_collection/requirements.txt | 1 + 7 files changed, 315 insertions(+), 53 deletions(-) create mode 100644 data_collection/CLAUDE.md diff --git a/backend/scripts/import_alma_json_to_d1.py b/backend/scripts/import_alma_json_to_d1.py index dab0914..45aeb6c 100644 --- a/backend/scripts/import_alma_json_to_d1.py +++ b/backend/scripts/import_alma_json_to_d1.py @@ -497,6 +497,28 @@ def _get_or_create_lecturer(lecturer_id_by_name: dict[str, int], name: str) -> i JOIN study_areas AS sa ON sa.code = je.value WHERE f."key" = '_categories_json'; +-- Some programs expose study-area membership under codes that differ from the +-- seeded study_areas.code: M.Sc. 
Machine Learning detail pages use MACH-* +-- (seeded as ML-*), and B.Sc. Informatik Wahlpflicht modules appear as their +-- INFM module numbers. Map those aliases so cross-listed courses still link to +-- the right study area. The original scraped code is kept as source_code. +INSERT OR IGNORE INTO course_study_area_links (course_id, study_area_id, source_code) +SELECT f.course_id, sa.id, je.value +FROM course_fields AS f +JOIN json_each(f.value) AS je +JOIN ( + SELECT 'MACH-FML' AS src, 'ML-FOUND' AS dst + UNION ALL SELECT 'MACH-DTML', 'ML-DIVERSE' + UNION ALL SELECT 'MACH-GCS', 'ML-CS' + UNION ALL SELECT 'MACH-EP', 'ML-EXP' + UNION ALL SELECT 'INFM3110', 'PRAK' + UNION ALL SELECT 'INFM3410', 'THEO' + UNION ALL SELECT 'INFM3310', 'TECH' + UNION ALL SELECT 'INFM2510', 'INFO' +) AS alias ON alias.src = je.value +JOIN study_areas AS sa ON sa.code = alias.dst +WHERE f."key" = '_categories_json'; + INSERT OR IGNORE INTO course_curriculum_matches (course_id, module_id, match_type, confidence) SELECT f.course_id, cm.id, 'category_code', 0.9 FROM course_fields AS f diff --git a/data_collection/CLAUDE.md b/data_collection/CLAUDE.md new file mode 100644 index 0000000..d659c76 --- /dev/null +++ b/data_collection/CLAUDE.md @@ -0,0 +1,99 @@ +# CLAUDE.md — data_collection + +Project instructions for the ALMA course-catalog scraper. Read this before +changing anything under `data_collection/`. For runnable commands see +[`QUICKSTART.md`](QUICKSTART.md); for environment setup see [`SETUP.md`](SETUP.md). + +## What this is + +A standalone Python scraper for the public ALMA course catalog at +`alma.uni-tuebingen.de`. It crawls the JSF catalog tree, fetches course detail +pages, and writes a JSON file. That JSON is then turned into the D1 seed by +`backend/scripts/import_alma_json_to_d1.py` (a separate step — the scraper does +not touch the database). + +- `alma/scraper.py` — `AlmaScraper` (session, JSF navigation, parsing) + pure + parse helpers. 
+- `alma/cli.py` — argparse entry point (`python -m alma.cli`), single-period and + multi-period orchestration. +- Output: `output/YYYY-MM-DD_HH-MM-SS/courses_multi_semester.json` (multi-period) or + `courses.json` (single). + +## The two catalog trees (important) + +ALMA exposes the same courses through two different trees of the public +`showCourseCatalog-flow`: + +1. **VVZ** — "Gesamtverzeichnis Lehrveranstaltungen Informatik". A flat-ish + per-faculty listing. Includes department-wide offerings (Oberseminare, + Kolloquien, info events, Mathe-Vorkurs) that are **not** tied to any degree + module. None of those award ECTS. +2. **studiesOffered** — degree programs (B.Sc./M.Sc. ...). Each program tree is + `[Modul] → [Veranstaltungskonto] → [Veranstaltungsgruppe] (N CP) + → [Veranstaltung]`. It lists courses **cross-listed from other faculties** + (KOG, GTCNEURO, MEDZ, BIOINF) that count toward a study area but are absent + from the VVZ Informatik branch. + +Neither tree is a superset of the other, so the scraper crawls **both**: the VVZ +branch (`INFORMATICS_BRANCH_CHAIN`) plus the degree-program branches +(`PROGRAM_BRANCH_CHAINS`: M.Sc. CS, B.Sc. Informatik, M.Sc. ML). Courses are +deduplicated by `unit_id`; `ScrapeOptions.skip_unit_ids` stops later branches +from re-fetching detail pages a previous branch already got. `--no-programs` +falls back to VVZ-only. See `cli._scrape_period_branches`. + +The logged-in "Studienplaner mit Modulplan" (`studyPlanner-flow`) returns **403 +anonymously** — do not target it. The studiesOffered tree above is the +anonymously-reachable equivalent. 
+ +## Period ids ↔ semesters + +Period ids are opaque ALMA ints; the mapping is **not** chronological by number: + +| id | semester | id | semester | +|----|----------|----|----------| +| 225 | SoSe 2022 | 233 | WiSe 2022/23 | +| 226 | SoSe 2023 | 234 | WiSe 2023/24 | +| 227 | SoSe 2024 | 235 | WiSe 2024/25 | +| 228 | SoSe 2025 | 236 | WiSe 2025/26 | +| 229 | SoSe 2026 | | | + +`--from-semester LABEL` selects every period at or after `LABEL` +(`parse_semester_tuple` understands e.g. `"Sommer 2026"`, `"Winter 2022/23"`). +Deep-path `title:NNNN` ids differ per period, so branches are rediscovered each +period by title chain via `find_branch_permalink`. + +## Study-area attribution (how courses link to INFO-INFO etc.) + +Each course detail page has a "Module / Studiengänge" table; the scraper stores +those codes as the `_categories_json` course field. The importer joins them to +`study_areas.code`. Codes mostly match directly (M.Sc. CS: `INFO-BASIS`, +`INFO-FOKUS`, `INFO-INFO`, `INFO-PRAK`, `INFO-TECH`, `INFO-THEO`), but some need +aliasing — handled in `import_alma_json_to_d1.py`: + +- M.Sc. ML detail pages use `MACH-*`; seeded study areas are `ML-*` + (`MACH-FML→ML-FOUND`, `MACH-DTML→ML-DIVERSE`, `MACH-GCS→ML-CS`, `MACH-EP→ML-EXP`). +- B.Sc. Wahlpflicht appears as `INFM####` (`INFM3110→PRAK`, `INFM3410→THEO`, + `INFM3310→TECH`, `INFM2510→INFO`). + +Enumeration is the hard part: once a cross-listed course is scraped and its +detail page fetched, the existing category-code join attributes it. B.Sc. +*compulsory* modules (Mathe, Teamprojekt) carry no Wahlpflicht code, so they are +enumerated but not category-linked (known gap). + +## Gotchas + +- **Mojibake**: ALMA text often arrives UTF-8-as-cp1252. Use `repair_mojibake` + before comparing/printing titles; never assume clean text. +- **Politeness**: keep `polite_delay` between requests; do not parallelize. +- **Progress**: `tqdm` shows an outer "semesters" bar and an inner per-branch + detail bar. 
Log lines go through `tqdm.write` so they don't corrupt the bars; + `--quiet` disables both. `progress.json` is still written every course. +- Coverage is scoped to the three Informatik programs above. Adding e.g. + Medieninformatik (which is where `User Experience` lives) is a one-entry + addition to `PROGRAM_BRANCH_CHAINS` plus any needed code aliases. + +## Conventions + +Follow the repo-wide `AGENTS.md`. Python: explicit type hints, small pure +helpers, comments explain *why*. New scraper logic should be exercised by a real +run against one period before merging (no DB write needed). diff --git a/data_collection/QUICKSTART.md b/data_collection/QUICKSTART.md index 7c94edc..998856d 100644 --- a/data_collection/QUICKSTART.md +++ b/data_collection/QUICKSTART.md @@ -51,14 +51,21 @@ Output: `output/YYYY-MM-DD_HH-MM-SS/courses.json` ### Multiple semesters Scrape every semester from a given label up to the most recent. Per-period -the scraper switches via ALMA's Semesterauswahl dropdown and rediscovers the -Informatik branch by title chain (the deep-path IDs differ between -semesters): +the scraper switches via ALMA's Semesterauswahl dropdown and rediscovers each +branch by title chain (the deep-path IDs differ between semesters): ```powershell uv run python -m alma_scraper.cli --details --from-semester "Sommer 2022" ``` +In multi-period mode the scraper crawls the VVZ "Gesamtverzeichnis +Lehrveranstaltungen Informatik" branch **and** the degree-program branches +(M.Sc. Computer Science, B.Sc. Informatik, M.Sc. Machine Learning). The +program branches surface courses cross-listed from other faculties that count +toward a study area but are missing from the VVZ branch. Courses shared +between branches are deduplicated by `unit_id` and their detail pages are +fetched only once. Pass `--no-programs` to crawl the VVZ branch alone. + Each course in the output gets `period_id` and `period_label` fields so you can tell semesters apart. 
The output file is rewritten after every period, so an interrupted run still leaves a usable file. diff --git a/data_collection/alma/cli.py b/data_collection/alma/cli.py index 789df1e..f033762 100644 --- a/data_collection/alma/cli.py +++ b/data_collection/alma/cli.py @@ -6,6 +6,8 @@ from datetime import datetime from pathlib import Path +from tqdm import tqdm + from .scraper import ( AlmaScraper, PeriodOption, @@ -56,6 +58,16 @@ def build_parser() -> argparse.ArgumentParser: action="store_true", help="Do not skip older '(Version YYYY)' catalog branches.", ) + parser.add_argument( + "--no-programs", + action="store_true", + help=( + "Multi-period only: scrape just the VVZ 'Gesamtverzeichnis' branch " + "and skip the degree-program branches (M.Sc. CS, B.Sc. Informatik, " + "M.Sc. ML). By default those program branches are also scraped so " + "courses cross-listed from other faculties are included." + ), + ) parser.add_argument( "--timeout", type=float, @@ -170,8 +182,9 @@ def build_parser() -> argparse.ArgumentParser: def main() -> None: args = build_parser().parse_args() def progress(message: str) -> None: + # tqdm.write keeps log lines from corrupting any active progress bar. if not args.quiet: - print(message, file=sys.stderr, flush=True) + tqdm.write(message, file=sys.stderr) scraper = AlmaScraper(timeout=args.timeout, progress=progress) @@ -349,10 +362,99 @@ def _run_single_period_scrape( checkpoint_every=args.checkpoint_every, max_runtime_seconds=args.max_runtime_seconds, max_expansions=args.max_expansions, + progress_bar=not args.quiet, + progress_label="courses", ) return scraper.scrape(options) +def _scrape_period_branches( + scraper: AlmaScraper, + args: argparse.Namespace, + period: PeriodOption, + progress_path: str, +) -> tuple[list[dict], list[dict], bool] | None: + """Scrape the VVZ Informatik branch and, unless ``--no-programs``, the + degree-program branches for one period, then merge the results. 
+ + The VVZ "Gesamtverzeichnis" branch is the base: if it cannot be located + the whole period is skipped (``None`` is returned). Program branches that + cannot be located are skipped individually with a warning, since not every + program necessarily exists for every archived semester. + + Courses are deduplicated by ``unit_id`` (first branch wins), and the + ``skip_unit_ids`` option keeps later branches from re-fetching detail pages + for courses an earlier branch already collected. Catalog nodes are + deduplicated by ``node_id`` (they share absolute, root-relative ids across + branches). Every kept course/node is tagged with the period. + + Returns ``(courses, catalog_nodes, partial)`` where ``partial`` is true if + any branch crawl hit a runtime/expansion limit. + """ + chains: list[tuple[str, ...]] = [AlmaScraper.INFORMATICS_BRANCH_CHAIN] + if not args.no_programs: + chains.extend(AlmaScraper.PROGRAM_BRANCH_CHAINS) + + merged_courses: dict[str, dict] = {} + merged_nodes: dict[str, dict] = {} + seen_unit_ids: set[str] = set() + partial = False + + for chain in chains: + is_base = chain is AlmaScraper.INFORMATICS_BRANCH_CHAIN + permalink = scraper.find_branch_permalink(period.period_id, chain) + if not permalink: + if is_base: + return None + tqdm.write( + f" ! 
branch {chain[-1]!r} not found for {period.label}; " + "skipping this branch", + file=sys.stderr, + ) + continue + + branch_label = "VVZ" if is_base else chain[-1].split(" (")[0] + options = ScrapeOptions( + start_url=permalink, + branch_title=None, + max_depth=args.max_depth, + max_courses=args.max_courses, + fetch_details=args.details or args.full_catalog, + latest_versions_only=not args.include_old_versions, + progress_file=progress_path, + checkpoint_path=None, + checkpoint_every=args.checkpoint_every, + max_runtime_seconds=args.max_runtime_seconds, + max_expansions=args.max_expansions, + restrict_to_start_path=True, + skip_unit_ids=frozenset(seen_unit_ids) or None, + progress_bar=not args.quiet, + progress_label=f"{period.label}: {branch_label}", + ) + result = scraper.scrape(options) + partial = partial or bool(result["source"].get("partial")) + + for course in result["courses"]: + key = course.get("unit_id") or course.get("detail_url") + if not key or key in merged_courses: + continue + merged_courses[key] = course + if course.get("unit_id"): + seen_unit_ids.add(course["unit_id"]) + for node in result["catalog_nodes"]: + merged_nodes.setdefault(node["node_id"], node) + + courses = list(merged_courses.values()) + nodes = list(merged_nodes.values()) + for course in courses: + course["period_id"] = period.period_id + course["period_label"] = period.label + for node in nodes: + node["period_id"] = period.period_id + node["period_label"] = period.label + return courses, nodes, partial + + def _run_multi_period_scrape( scraper: AlmaScraper, args: argparse.Namespace, @@ -362,14 +464,12 @@ def _run_multi_period_scrape( ) -> dict: """Run the scraper once per discovered period and merge results. - For each period: - 1. Switch to that period via the Semesterauswahl dropdown - (:meth:`AlmaScraper.find_branch_permalink`), discovering the - period-specific Informatik permalink because the deep-path title - IDs are not stable across semesters. - 2. 
Scrape using that permalink as ``start_url`` so the existing - start-path scoping just works. - 3. Tag every course with ``period_id`` and ``period_label``. + For each period, :func:`_scrape_period_branches` discovers the + period-specific permalink for the VVZ Informatik branch and (unless + ``--no-programs``) the degree-program branches, scrapes each, and merges + them. Branch permalinks are rediscovered per period because the deep-path + title IDs are not stable across semesters. Every kept course is tagged + with ``period_id`` and ``period_label``. A checkpoint is written after every period so an interrupted run still leaves a usable output file. @@ -425,21 +525,16 @@ def _run_multi_period_scrape( flush=True, ) - for index, period in enumerate(remaining, start=1): - print( - f"=== [{index}/{len(remaining)}] period {period.period_id} " - f"({period.label}) ===", - file=sys.stderr, - flush=True, - ) - permalink = scraper.find_branch_permalink( - period.period_id, AlmaScraper.INFORMATICS_BRANCH_CHAIN - ) - if not permalink: - print( + period_bar = tqdm( + remaining, desc="semesters", unit="sem", disable=args.quiet, + ) + for period in period_bar: + period_bar.set_postfix_str(period.label) + scraped = _scrape_period_branches(scraper, args, period, progress_path) + if scraped is None: + tqdm.write( f" ! 
could not find Informatik branch for {period.label}; skipping", file=sys.stderr, - flush=True, ) per_period_summary.append( { @@ -456,36 +551,16 @@ def _run_multi_period_scrape( ) continue - period_options = ScrapeOptions( - start_url=permalink, - branch_title=None, - max_depth=args.max_depth, - max_courses=args.max_courses, - fetch_details=args.details or args.full_catalog, - latest_versions_only=not args.include_old_versions, - progress_file=progress_path, - checkpoint_path=None, - checkpoint_every=args.checkpoint_every, - max_runtime_seconds=args.max_runtime_seconds, - max_expansions=args.max_expansions, - restrict_to_start_path=True, - ) - result = scraper.scrape(period_options) - for course in result["courses"]: - course["period_id"] = period.period_id - course["period_label"] = period.label - for node in result["catalog_nodes"]: - node["period_id"] = period.period_id - node["period_label"] = period.label - all_courses.extend(result["courses"]) - all_catalog_nodes.extend(result["catalog_nodes"]) + period_courses, period_nodes, partial = scraped + all_courses.extend(period_courses) + all_catalog_nodes.extend(period_nodes) per_period_summary.append( { "period_id": period.period_id, "period_label": period.label, - "courses": len(result["courses"]), - "catalog_nodes": len(result["catalog_nodes"]), - "partial": bool(result["source"].get("partial")), + "courses": len(period_courses), + "catalog_nodes": len(period_nodes), + "partial": partial, } ) _write_multi_period_checkpoint( diff --git a/data_collection/alma/scraper.py b/data_collection/alma/scraper.py index a884676..7d7752c 100644 --- a/data_collection/alma/scraper.py +++ b/data_collection/alma/scraper.py @@ -12,6 +12,7 @@ import requests from bs4 import BeautifulSoup, Tag +from tqdm import tqdm CATALOG_PREFIX = "hierarchy:content-container:courseCatalogFieldset:courseCatalog:" @@ -45,6 +46,16 @@ class ScrapeOptions: restrict_to_start_path: bool = True max_runtime_seconds: int | None = None max_expansions: int | 
None = None + # unit_ids already collected by an earlier branch in the same period. + # Courses with these ids are dropped before detail fetching so the + # multi-branch crawl does not re-download shared courses. + skip_unit_ids: frozenset[str] | None = None + # Show a tqdm progress bar over the detail-fetch loop. When on, the noisy + # per-course stderr message is suppressed (the bar replaces it); the + # progress_file is still written every course. + progress_bar: bool = False + # Short label for the detail progress bar (e.g. "SoSe 2026 · M.Sc. CS"). + progress_label: str | None = None @dataclass(slots=True) @@ -105,6 +116,29 @@ class AlmaScraper: "Informatik", "Gesamtverzeichnis Lehrveranstaltungen Informatik", ) + # Degree-program branches under studiesOffered. Their module trees list + # courses (incl. ones cross-listed from other faculties) that count toward + # a study area but are absent from the VVZ "Gesamtverzeichnis" branch. The + # last chain entry is matched as a case-insensitive substring against + # catalog row titles, so the "(Version 2021)" suffix keeps it on the + # current examination regulation. + PROGRAM_BRANCH_CHAINS: tuple[tuple[str, ...], ...] = ( + ( + "Mathematisch-Naturwissenschaftliche", + "Informatik", + "M.Sc. Informatik / Computer Science (Version 2021)", + ), + ( + "Mathematisch-Naturwissenschaftliche", + "Informatik", + "B.Sc. Informatik (Version 2021)", + ), + ( + "Mathematisch-Naturwissenschaftliche", + "Informatik", + "M.Sc. 
Machine Learning (Version 2021)", + ), + ) def __init__( self, @@ -295,12 +329,23 @@ def scrape(self, options: ScrapeOptions) -> dict[str, Any]: if any(needle in title.casefold() for title in node.path_titles) ] courses = unique_courses(courses) + if options.skip_unit_ids: + courses = [ + node for node in courses if node.unit_id not in options.skip_unit_ids + ] courses.sort(key=lambda item: item.node_id) if options.max_courses is not None: courses = courses[: options.max_courses] course_records: list[dict[str, Any]] = [] total_courses = len(courses) + detail_bar = tqdm( + total=total_courses, + desc=options.progress_label or "details", + unit="course", + leave=False, + disable=not options.progress_bar, + ) for index, course in enumerate(courses, start=1): if self._runtime_exceeded(options, started_at): self._progress( @@ -311,10 +356,12 @@ def scrape(self, options: ScrapeOptions) -> dict[str, Any]: total_courses=total_courses, ) break + # When the bar is on it replaces the noisy per-course stderr line. 
self._progress( options, "details", f"Fetching detail {index}/{total_courses}: {course.title}", + to_stderr=not options.progress_bar, course_index=index, total_courses=total_courses, current_course=course.title, @@ -324,8 +371,10 @@ def scrape(self, options: ScrapeOptions) -> dict[str, Any]: record["details"] = self.fetch_course_details(course.detail_url) time.sleep(self.polite_delay) course_records.append(record) + detail_bar.update(1) if options.checkpoint_path and index % max(options.checkpoint_every, 1) == 0: self._write_checkpoint(options, started_at, course_records) + detail_bar.close() result = { "source": { @@ -706,7 +755,15 @@ def _runtime_exceeded(options: ScrapeOptions, started_at: int) -> bool: return False return time.time() - started_at >= options.max_runtime_seconds - def _progress(self, options: ScrapeOptions, stage: str, message: str, **extra: Any) -> None: + def _progress( + self, + options: ScrapeOptions, + stage: str, + message: str, + *, + to_stderr: bool = True, + **extra: Any, + ) -> None: payload = { "stage": stage, "message": message, @@ -715,7 +772,7 @@ def _progress(self, options: ScrapeOptions, stage: str, message: str, **extra: A "skipped_old_version_nodes": len(self.skipped_old_version_node_ids), **extra, } - if self.progress: + if self.progress and to_stderr: self.progress(message) if options.progress_file: self._write_json(Path(options.progress_file), payload) diff --git a/data_collection/pyproject.toml b/data_collection/pyproject.toml index bc6f366..2f39d71 100644 --- a/data_collection/pyproject.toml +++ b/data_collection/pyproject.toml @@ -7,6 +7,7 @@ requires-python = ">=3.11" dependencies = [ "beautifulsoup4>=4.12", "requests>=2.31", + "tqdm>=4.62", ] [build-system] diff --git a/data_collection/requirements.txt b/data_collection/requirements.txt index e35775c..8b078a4 100644 --- a/data_collection/requirements.txt +++ b/data_collection/requirements.txt @@ -1,2 +1,3 @@ beautifulsoup4>=4.12 requests>=2.31 +tqdm>=4.62 From 
6133ead05ca79c465042df06908339054be671dc Mon Sep 17 00:00:00 2001 From: Yonatan Dankner Date: Fri, 26 Jun 2026 11:51:35 +0200 Subject: [PATCH 2/2] Address review feedback on the program-branch scrape - Fix the per-branch `partial` flag to also reflect the --max-expansions crawl limit (it previously tracked only the runtime limit), so a branch that stops early is not silently recorded complete and skipped by --continue. - Program-scope the study-area alias join (JOIN study_programs) so the generic B.Sc. codes PRAK/THEO/TECH/INFO cannot mislink a course to another program that happens to reuse the same bare code. - Close both tqdm bars via context managers so an exception mid-run no longer leaks a bar / corrupts the terminal. - Fix progress-label example in the ScrapeOptions docstring. - Correct the module name in QUICKSTART.md / SETUP.md (alma.cli, not the non-existent alma_scraper.cli). Co-Authored-By: Claude Opus 4.8 --- backend/scripts/import_alma_json_to_d1.py | 24 ++++--- data_collection/QUICKSTART.md | 18 +++--- data_collection/SETUP.md | 4 +- data_collection/alma/cli.py | 76 +++++++++++------------ data_collection/alma/scraper.py | 60 +++++++++--------- 5 files changed, 95 insertions(+), 87 deletions(-) diff --git a/backend/scripts/import_alma_json_to_d1.py b/backend/scripts/import_alma_json_to_d1.py index 45aeb6c..8c57473 100644 --- a/backend/scripts/import_alma_json_to_d1.py +++ b/backend/scripts/import_alma_json_to_d1.py @@ -501,22 +501,26 @@ def _get_or_create_lecturer(lecturer_id_by_name: dict[str, int], name: str) -> i -- seeded study_areas.code: M.Sc. Machine Learning detail pages use MACH-* -- (seeded as ML-*), and B.Sc. Informatik Wahlpflicht modules appear as their -- INFM module numbers. Map those aliases so cross-listed courses still link to --- the right study area. The original scraped code is kept as source_code. +-- the right study area. The alias destination is scoped to its program +-- (study_areas.code is only unique per program; the B.Sc. 
codes PRAK/THEO/ +-- TECH/INFO are deliberately generic), and the original scraped code is kept +-- as source_code. INSERT OR IGNORE INTO course_study_area_links (course_id, study_area_id, source_code) SELECT f.course_id, sa.id, je.value FROM course_fields AS f JOIN json_each(f.value) AS je JOIN ( - SELECT 'MACH-FML' AS src, 'ML-FOUND' AS dst - UNION ALL SELECT 'MACH-DTML', 'ML-DIVERSE' - UNION ALL SELECT 'MACH-GCS', 'ML-CS' - UNION ALL SELECT 'MACH-EP', 'ML-EXP' - UNION ALL SELECT 'INFM3110', 'PRAK' - UNION ALL SELECT 'INFM3410', 'THEO' - UNION ALL SELECT 'INFM3310', 'TECH' - UNION ALL SELECT 'INFM2510', 'INFO' + SELECT 'MACH-FML' AS src, 'MSC_ML_2021' AS prog, 'ML-FOUND' AS dst + UNION ALL SELECT 'MACH-DTML', 'MSC_ML_2021', 'ML-DIVERSE' + UNION ALL SELECT 'MACH-GCS', 'MSC_ML_2021', 'ML-CS' + UNION ALL SELECT 'MACH-EP', 'MSC_ML_2021', 'ML-EXP' + UNION ALL SELECT 'INFM3110', 'BSC_INFO_2021', 'PRAK' + UNION ALL SELECT 'INFM3410', 'BSC_INFO_2021', 'THEO' + UNION ALL SELECT 'INFM3310', 'BSC_INFO_2021', 'TECH' + UNION ALL SELECT 'INFM2510', 'BSC_INFO_2021', 'INFO' ) AS alias ON alias.src = je.value -JOIN study_areas AS sa ON sa.code = alias.dst +JOIN study_programs AS sp ON sp.code = alias.prog +JOIN study_areas AS sa ON sa.program_id = sp.id AND sa.code = alias.dst WHERE f."key" = '_categories_json'; INSERT OR IGNORE INTO course_curriculum_matches (course_id, module_id, match_type, confidence) diff --git a/data_collection/QUICKSTART.md b/data_collection/QUICKSTART.md index 998856d..a6e00ac 100644 --- a/data_collection/QUICKSTART.md +++ b/data_collection/QUICKSTART.md @@ -11,7 +11,7 @@ 2. **Run scraper:** ```powershell - uv run python -m alma_scraper.cli --details + uv run python -m alma.cli --details ``` ### Option 2: Using `pip` @@ -29,7 +29,7 @@ 3. 
**Run scraper:** ```powershell - python -m alma_scraper.cli --details + python -m alma.cli --details ``` ## Usage @@ -40,7 +40,7 @@ Scrape the Informatik course catalog (Gesamtverzeichnis Lehrveranstaltungen Informatik) with course details: ```powershell -uv run python -m alma_scraper.cli --details +uv run python -m alma.cli --details ``` Each course detail includes a `categories` list — the module/study-program @@ -55,7 +55,7 @@ the scraper switches via ALMA's Semesterauswahl dropdown and rediscovers each branch by title chain (the deep-path IDs differ between semesters): ```powershell -uv run python -m alma_scraper.cli --details --from-semester "Sommer 2022" +uv run python -m alma.cli --details --from-semester "Sommer 2022" ``` In multi-period mode the scraper crawls the VVZ "Gesamtverzeichnis @@ -73,7 +73,7 @@ so an interrupted run still leaves a usable file. If a run was interrupted, resume it without redoing the completed semesters: ```powershell -uv run python -m alma_scraper.cli --details --continue output//courses_multi_semester.json +uv run python -m alma.cli --details --continue output//courses_multi_semester.json ``` Fully completed periods are kept and skipped; partial or skipped ones are @@ -82,7 +82,7 @@ redone. Output is written back to the same path. 
### List available semesters ```powershell -uv run python -m alma_scraper.cli --list-periods +uv run python -m alma.cli --list-periods ``` ### Quick Test (2 minutes) @@ -90,7 +90,7 @@ uv run python -m alma_scraper.cli --list-periods Test scraping: ```powershell -uv run python -m alma_scraper.cli --details --max-runtime-seconds 120 +uv run python -m alma.cli --details --max-runtime-seconds 120 ``` ### Full Catalog @@ -98,7 +98,7 @@ uv run python -m alma_scraper.cli --details --max-runtime-seconds 120 Scrape entire university: ```powershell -uv run python -m alma_scraper.cli --full-catalog +uv run python -m alma.cli --full-catalog ``` ### Watch Progress @@ -129,4 +129,4 @@ output/ - `--pretty` - Pretty-print JSON - `--list-periods` - Print available period IDs and labels -For full help: `uv run python -m alma_scraper.cli --help` +For full help: `uv run python -m alma.cli --help` diff --git a/data_collection/SETUP.md b/data_collection/SETUP.md index 37fc872..b8878ea 100644 --- a/data_collection/SETUP.md +++ b/data_collection/SETUP.md @@ -9,7 +9,7 @@ 2. **Run scraper:** ```powershell - uv run python -m alma_scraper.cli --details + uv run python -m alma.cli --details ``` ## Option 2: Using `pip` (Virtual Environment) @@ -27,7 +27,7 @@ 3. **Run scraper:** ```powershell - python -m alma_scraper.cli --details + python -m alma.cli --details ``` ## Output diff --git a/data_collection/alma/cli.py b/data_collection/alma/cli.py index f033762..b23c40a 100644 --- a/data_collection/alma/cli.py +++ b/data_collection/alma/cli.py @@ -525,52 +525,52 @@ def _run_multi_period_scrape( flush=True, ) - period_bar = tqdm( - remaining, desc="semesters", unit="sem", disable=args.quiet, - ) - for period in period_bar: - period_bar.set_postfix_str(period.label) - scraped = _scrape_period_branches(scraper, args, period, progress_path) - if scraped is None: - tqdm.write( - f" ! 
could not find Informatik branch for {period.label}; skipping", - file=sys.stderr, - ) + # Context-managed so the outer bar is always closed, even if a period + # scrape raises partway through the run. + with tqdm(remaining, desc="semesters", unit="sem", disable=args.quiet) as period_bar: + for period in period_bar: + period_bar.set_postfix_str(period.label) + scraped = _scrape_period_branches(scraper, args, period, progress_path) + if scraped is None: + tqdm.write( + f" ! could not find Informatik branch for {period.label}; skipping", + file=sys.stderr, + ) + per_period_summary.append( + { + "period_id": period.period_id, + "period_label": period.label, + "courses": 0, + "catalog_nodes": 0, + "skipped": True, + } + ) + _write_multi_period_checkpoint( + out_path, args, periods, per_period_summary, + all_catalog_nodes, all_courses, + ) + continue + + period_courses, period_nodes, partial = scraped + all_courses.extend(period_courses) + all_catalog_nodes.extend(period_nodes) per_period_summary.append( { "period_id": period.period_id, "period_label": period.label, - "courses": 0, - "catalog_nodes": 0, - "skipped": True, + "courses": len(period_courses), + "catalog_nodes": len(period_nodes), + "partial": partial, } ) _write_multi_period_checkpoint( - out_path, args, periods, per_period_summary, - all_catalog_nodes, all_courses, + out_path, + args, + periods, + per_period_summary, + all_catalog_nodes, + all_courses, ) - continue - - period_courses, period_nodes, partial = scraped - all_courses.extend(period_courses) - all_catalog_nodes.extend(period_nodes) - per_period_summary.append( - { - "period_id": period.period_id, - "period_label": period.label, - "courses": len(period_courses), - "catalog_nodes": len(period_nodes), - "partial": partial, - } - ) - _write_multi_period_checkpoint( - out_path, - args, - periods, - per_period_summary, - all_catalog_nodes, - all_courses, - ) return _multi_period_result( args, periods, per_period_summary, all_catalog_nodes, all_courses 
diff --git a/data_collection/alma/scraper.py b/data_collection/alma/scraper.py index 7d7752c..78f4318 100644 --- a/data_collection/alma/scraper.py +++ b/data_collection/alma/scraper.py @@ -54,7 +54,7 @@ class ScrapeOptions: # per-course stderr message is suppressed (the bar replaces it); the # progress_file is still written every course. progress_bar: bool = False - # Short label for the detail progress bar (e.g. "SoSe 2026 · M.Sc. CS"). + # Short label for the detail progress bar (e.g. "SoSe 2026: VVZ"). progress_label: str | None = None @@ -339,42 +339,43 @@ def scrape(self, options: ScrapeOptions) -> dict[str, Any]: course_records: list[dict[str, Any]] = [] total_courses = len(courses) - detail_bar = tqdm( + # Context-managed so the bar is always closed, even if a detail fetch + # raises, instead of leaving a corrupted terminal line behind. + with tqdm( total=total_courses, desc=options.progress_label or "details", unit="course", leave=False, disable=not options.progress_bar, - ) - for index, course in enumerate(courses, start=1): - if self._runtime_exceeded(options, started_at): + ) as detail_bar: + for index, course in enumerate(courses, start=1): + if self._runtime_exceeded(options, started_at): + self._progress( + options, + "paused", + f"Runtime limit reached before detail {index}/{total_courses}", + course_index=index, + total_courses=total_courses, + ) + break + # When the bar is on it replaces the noisy per-course stderr line. self._progress( options, - "paused", - f"Runtime limit reached before detail {index}/{total_courses}", + "details", + f"Fetching detail {index}/{total_courses}: {course.title}", + to_stderr=not options.progress_bar, course_index=index, total_courses=total_courses, + current_course=course.title, ) - break - # When the bar is on it replaces the noisy per-course stderr line. 
- self._progress( - options, - "details", - f"Fetching detail {index}/{total_courses}: {course.title}", - to_stderr=not options.progress_bar, - course_index=index, - total_courses=total_courses, - current_course=course.title, - ) - record = asdict(course) - if options.fetch_details and course.detail_url: - record["details"] = self.fetch_course_details(course.detail_url) - time.sleep(self.polite_delay) - course_records.append(record) - detail_bar.update(1) - if options.checkpoint_path and index % max(options.checkpoint_every, 1) == 0: - self._write_checkpoint(options, started_at, course_records) - detail_bar.close() + record = asdict(course) + if options.fetch_details and course.detail_url: + record["details"] = self.fetch_course_details(course.detail_url) + time.sleep(self.polite_delay) + course_records.append(record) + detail_bar.update(1) + if options.checkpoint_path and index % max(options.checkpoint_every, 1) == 0: + self._write_checkpoint(options, started_at, course_records) result = { "source": { @@ -382,7 +383,10 @@ def scrape(self, options: ScrapeOptions) -> dict[str, Any]: "branch_title": branch_title, "latest_versions_only": options.latest_versions_only, "skipped_old_version_nodes": sorted(self.skipped_old_version_node_ids), - "partial": self._runtime_exceeded(options, started_at), + # timed_out covers both the runtime and max-expansions crawl + # limits; the runtime check also catches the detail-fetch loop + # breaking early. Either means the period is incomplete. + "partial": timed_out or self._runtime_exceeded(options, started_at), "fetched_at_unix": started_at, "finished_at_unix": int(time.time()), },