From 57f723047c0df8da39591321767e386d9638f522 Mon Sep 17 00:00:00 2001 From: Peter Chapman Date: Thu, 18 Jun 2026 13:19:57 +1200 Subject: [PATCH] Fix incorrect chapter numbers returned from usfm_structure_extractor --- machine/punctuation_analysis/chapter.py | 2 +- .../quotation_mark_update_first_pass.py | 2 +- .../usfm_structure_extractor.py | 16 +++--- .../test_quotation_denormalization.py | 16 +++++- .../test_usfm_structure_extractor.py | 56 ++++++++++++++----- 5 files changed, 65 insertions(+), 27 deletions(-) diff --git a/machine/punctuation_analysis/chapter.py b/machine/punctuation_analysis/chapter.py index d1e42af4..0abc15e4 100644 --- a/machine/punctuation_analysis/chapter.py +++ b/machine/punctuation_analysis/chapter.py @@ -6,4 +6,4 @@ @dataclass(frozen=True) class Chapter: verses: list[Verse] - chatper_num: int = 0 + chapter_num: int = 0 diff --git a/machine/punctuation_analysis/quotation_mark_update_first_pass.py b/machine/punctuation_analysis/quotation_mark_update_first_pass.py index 47088080..3bc4cfbe 100644 --- a/machine/punctuation_analysis/quotation_mark_update_first_pass.py +++ b/machine/punctuation_analysis/quotation_mark_update_first_pass.py @@ -58,7 +58,7 @@ def find_best_chapter_strategies(self) -> List[Tuple[int, QuotationMarkUpdateStr best_actions_by_chapter: List[Tuple[int, QuotationMarkUpdateStrategy]] = [] for chapter in self.get_chapters(): - best_actions_by_chapter.append((chapter.chatper_num, self._find_best_strategy_for_chapter(chapter))) + best_actions_by_chapter.append((chapter.chapter_num, self._find_best_strategy_for_chapter(chapter))) return best_actions_by_chapter diff --git a/machine/punctuation_analysis/usfm_structure_extractor.py b/machine/punctuation_analysis/usfm_structure_extractor.py index 7a6fdac3..f88fc76d 100644 --- a/machine/punctuation_analysis/usfm_structure_extractor.py +++ b/machine/punctuation_analysis/usfm_structure_extractor.py @@ -91,6 +91,14 @@ def get_chapters(self, include_chapters: Optional[Dict[int, List[int]]] = None) current_chapter_verses: list[Verse] = [] current_verse_segments: list[TextSegment] = [] for text_segment in self._text_segments: + if text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE): + if len(current_verse_segments) > 0: + current_chapter_verses.append(Verse(current_verse_segments)) + current_verse_segments = [] + if text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHAPTER): + if len(current_chapter_verses) > 0: + chapters.append(Chapter(current_chapter_verses, current_chapter)) + current_chapter_verses = [] if text_segment.book is not None: current_book = book_id_to_number(text_segment.book) if text_segment.chapter is not None: @@ -104,14 +112,6 @@ def get_chapters(self, include_chapters: Optional[Dict[int, List[int]]] = None) and current_chapter not in include_chapters[current_book] ): continue - if text_segment.marker_is_in_preceding_context(UsfmMarkerType.VERSE): - if len(current_verse_segments) > 0: - current_chapter_verses.append(Verse(current_verse_segments)) - current_verse_segments = [] - if text_segment.marker_is_in_preceding_context(UsfmMarkerType.CHAPTER): - if len(current_chapter_verses) > 0: - chapters.append(Chapter(current_chapter_verses, current_chapter)) - current_chapter_verses = [] current_verse_segments.append(text_segment) if len(current_verse_segments) > 0: current_chapter_verses.append(Verse(current_verse_segments)) diff --git a/tests/punctuation_analysis/test_quotation_denormalization.py b/tests/punctuation_analysis/test_quotation_denormalization.py index 44cc459b..ba7724a7 100644 --- a/tests/punctuation_analysis/test_quotation_denormalization.py +++ b/tests/punctuation_analysis/test_quotation_denormalization.py @@ -12,7 +12,13 @@ def test_full_quotation_denormalization_pipeline() -> None: normalized_usfm = """ \\id GEN - \\c 1 + \\c 2 + \\v 1 Thus the heavens and the earth were completed in all their vast array. + \\v 2 And by the seventh day God had finished the work He had been doing; + so on that day He rested from all His work. + \\v 3 Then God blessed the seventh day and sanctified it, + because on that day He rested from all the work of creation that He had accomplished. + \\c 3 \\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, "Has God really said, @@ -24,7 +30,11 @@ def test_full_quotation_denormalization_pipeline() -> None: """ expected_denormalized_usfm = """\\id GEN -\\c 1 +\\c 2 +\\v 1 Thus the heavens and the earth were completed in all their vast array. +\\v 2 And by the seventh day God had finished the work He had been doing; so on that day He rested from all His work. +\\v 3 Then God blessed the seventh day and sanctified it, because on that day He rested from all the work of creation that He had accomplished. +\\c 3 \\v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?” \\v 2 The woman said to the serpent, “We may eat fruit from the trees of the garden, \\v 3 but not the fruit of the tree which is in the middle of the garden. God has said, ‘You shall not eat of it. You shall not touch it, lest you die.’” @@ -38,7 +48,7 @@ def test_full_quotation_denormalization_pipeline() -> None: parse_usfm(normalized_usfm, quotation_mark_denormalization_first_pass) best_chapter_strategies = quotation_mark_denormalization_first_pass.find_best_chapter_strategies() - assert [chapter for chapter, _ in best_chapter_strategies] == [1] + assert [chapter for chapter, _ in best_chapter_strategies] == [2, 3] quotation_mark_denormalizer = QuotationMarkDenormalizationUsfmUpdateBlockHandler( standard_english_quote_convention, diff --git a/tests/punctuation_analysis/test_usfm_structure_extractor.py b/tests/punctuation_analysis/test_usfm_structure_extractor.py index 571b0f8c..790b6e2a 100644 --- a/tests/punctuation_analysis/test_usfm_structure_extractor.py +++ b/tests/punctuation_analysis/test_usfm_structure_extractor.py @@ -46,7 +46,8 @@ def test_get_chapters_filter_by_chapter(): .build() ] ) - ] + ], + 2, ) ] @@ -58,6 +59,7 @@ def test_get_chapters_filter_by_chapter(): def test_chapter_and_verse_markers(): usfm_structure_extractor = UsfmStructureExtractor() + verse_text_parser_state.verse_ref.chapter_num = 1 usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) usfm_structure_extractor.text(verse_text_parser_state, "test") @@ -74,7 +76,8 @@ def test_chapter_and_verse_markers(): .build() ] ) - ] + ], + 1, ) ] @@ -86,6 +89,7 @@ def test_chapter_and_verse_markers(): def test_start_paragraph_marker(): usfm_structure_extractor = UsfmStructureExtractor() + verse_text_parser_state.verse_ref.chapter_num = 1 usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) usfm_structure_extractor.start_para(verse_text_parser_state, "p", False, None) @@ -104,7 +108,8 @@ def test_start_paragraph_marker(): .build() ] ) - ] + ], + 1, ) ] @@ -116,6 +121,7 @@ def test_start_paragraph_marker(): def test_start_character_marker(): usfm_structure_extractor = UsfmStructureExtractor() + verse_text_parser_state.verse_ref.chapter_num = 1 usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) usfm_structure_extractor.start_char(verse_text_parser_state, "k", False, None) @@ -134,7 +140,8 @@ def test_start_character_marker(): .build() ] ) - ] + ], + 1, ) ] @@ -146,6 +153,7 @@ def test_start_character_marker(): def test_end_character_marker(): usfm_structure_extractor = UsfmStructureExtractor() + verse_text_parser_state.verse_ref.chapter_num = 1 usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) usfm_structure_extractor.end_char(verse_text_parser_state, "k", None, False) @@ -164,7 +172,8 @@ def test_end_character_marker(): .build() ] ) - ] + ], + 1, ) ] @@ -176,6 +185,7 @@ def test_end_character_marker(): def test_end_note_marker(): usfm_structure_extractor = UsfmStructureExtractor() + verse_text_parser_state.verse_ref.chapter_num = 1 usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) usfm_structure_extractor.end_note(verse_text_parser_state, "f", False) @@ -194,7 +204,8 @@ def test_end_note_marker(): .build() ] ) - ] + ], + 1, ) ] @@ -206,6 +217,7 @@ def test_end_note_marker(): def test_end_table_marker(): usfm_structure_extractor = UsfmStructureExtractor() + verse_text_parser_state.verse_ref.chapter_num = 1 usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) usfm_structure_extractor.end_note(verse_text_parser_state, "tr", False) @@ -224,7 +236,8 @@ def test_end_table_marker(): .build() ] ) - ] + ], + 1, ) ] @@ -236,6 +249,7 @@ def test_end_table_marker(): def test_ref_marker(): usfm_structure_extractor = UsfmStructureExtractor() + verse_text_parser_state.verse_ref.chapter_num = 1 usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) usfm_structure_extractor.end_note(verse_text_parser_state, "x", False) @@ -254,7 +268,8 @@ def test_ref_marker(): .build() ] ) - ] + ], + 1, ) ] @@ -266,6 +281,7 @@ def test_ref_marker(): def test_sidebar_marker(): usfm_structure_extractor = UsfmStructureExtractor() + verse_text_parser_state.verse_ref.chapter_num = 1 usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) usfm_structure_extractor.end_note(verse_text_parser_state, "esb", False) @@ -284,7 +300,8 @@ def test_sidebar_marker(): .build() ] ) - ] + ], + 1, ) ] @@ -296,6 +313,7 @@ def test_sidebar_marker(): def test_multiple_verses(): usfm_structure_extractor = UsfmStructureExtractor() + verse_text_parser_state.verse_ref.chapter_num = 1 usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) usfm_structure_extractor.text(verse_text_parser_state, "test") @@ -323,7 +341,8 @@ def test_multiple_verses(): .build() ] ), - ] + ], + 1, ) ] @@ -337,9 +356,11 @@ def test_multiple_verses(): def test_multiple_chapters(): usfm_structure_extractor = UsfmStructureExtractor() + verse_text_parser_state.verse_ref.chapter_num = 1 usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) usfm_structure_extractor.text(verse_text_parser_state, "test") + verse_text_parser_state.verse_ref.chapter_num = 2 usfm_structure_extractor.chapter(verse_text_parser_state, "2", "c", None, None) usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) usfm_structure_extractor.text(verse_text_parser_state, "test2") @@ -356,7 +377,8 @@ def test_multiple_chapters(): .build() ] ), - ] + ], + 1, ), Chapter( [ @@ -369,7 +391,8 @@ def test_multiple_chapters(): .build() ] ), - ] + ], + 2, ), ] @@ -383,6 +406,7 @@ def test_multiple_chapters(): def test_character_marker_in_text(): usfm_structure_extractor = UsfmStructureExtractor() + verse_text_parser_state.verse_ref.chapter_num = 1 usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) usfm_structure_extractor.text(verse_text_parser_state, "test") @@ -407,7 +431,8 @@ def test_character_marker_in_text(): .build(), ] ), - ] + ], + 1, ) ] @@ -424,6 +449,7 @@ def test_character_marker_in_text(): def test_empty_text(): usfm_structure_extractor = UsfmStructureExtractor() + verse_text_parser_state.verse_ref.chapter_num = 1 usfm_structure_extractor.chapter(verse_text_parser_state, "1", "c", None, None) usfm_structure_extractor.verse(verse_text_parser_state, "1", "v", None, None) usfm_structure_extractor.text(verse_text_parser_state, "test") @@ -450,7 +476,8 @@ def test_empty_text(): .build(), ] ), - ] + ], + 1, ) ] @@ -468,6 +495,7 @@ def test_empty_text(): def assert_chapter_equal(expected_chapters: List[Chapter], actual_chapters: List[Chapter]): assert len(expected_chapters) == len(actual_chapters) for expected_chapter, actual_chapter in zip(expected_chapters, actual_chapters): + assert expected_chapter.chapter_num == actual_chapter.chapter_num assert len(expected_chapter.verses) == len(actual_chapter.verses) for expected_verse, actual_verse in zip(expected_chapter.verses, actual_chapter.verses): assert len(expected_verse._text_segments) == len(actual_verse._text_segments)