From 3b7d2a7b4f75bb943e028708defc6cbb06e0b186 Mon Sep 17 00:00:00 2001 From: Peter Chapman Date: Thu, 18 Jun 2026 13:49:56 +1200 Subject: [PATCH] Fix incorrect chapter numbers returned from UsfmStructureExtractor --- .../UsfmStructureExtractor.cs | 22 +- .../QuotationDenormalizationTests.cs | 16 +- .../UsfmStructureExtractorTests.cs | 360 ++++++++++-------- 3 files changed, 232 insertions(+), 166 deletions(-) diff --git a/src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs b/src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs index 685c5d58..91b060a2 100644 --- a/src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs +++ b/src/SIL.Machine/PunctuationAnalysis/UsfmStructureExtractor.cs @@ -141,17 +141,6 @@ public List GetChapters(IReadOnlyDictionary> includeChap var currentVerseSegments = new List(); foreach (TextSegment textSegment in _textSegments) { - if (textSegment.Book != null) - currentBook = Canon.BookIdToNumber(textSegment.Book); - if (textSegment.Chapter > 0) - currentChapter = textSegment.Chapter; - if (includeChapters != null && currentBook > 0) - { - if (!includeChapters.TryGetValue(currentBook, out List bookChapters)) - continue; - if (currentChapter > 0 && bookChapters.Count > 0 && !bookChapters.Contains(currentChapter)) - continue; - } if (textSegment.MarkerIsInPrecedingContext(UsfmMarkerType.Verse)) { if (currentVerseSegments.Count > 0) @@ -168,6 +157,17 @@ public List GetChapters(IReadOnlyDictionary> includeChap } currentChapterVerses = new List(); } + if (textSegment.Book != null) + currentBook = Canon.BookIdToNumber(textSegment.Book); + if (textSegment.Chapter > 0) + currentChapter = textSegment.Chapter; + if (includeChapters != null && currentBook > 0) + { + if (!includeChapters.TryGetValue(currentBook, out List bookChapters)) + continue; + if (currentChapter > 0 && bookChapters.Count > 0 && !bookChapters.Contains(currentChapter)) + continue; + } currentVerseSegments.Add(textSegment); } if (currentVerseSegments.Count > 0) diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationDenormalizationTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationDenormalizationTests.cs index fa3f31c0..1856840f 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationDenormalizationTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/QuotationDenormalizationTests.cs @@ -12,7 +12,13 @@ public void FullQuotationDenormalizationPipeline() string normalizedUsfm = @" \id GEN - \c 1 + \c 2 + \v 1 Thus the heavens and the earth were completed in all their vast array. + \v 2 And by the seventh day God had finished the work He had been doing; + so on that day He rested from all His work. + \v 3 Then God blessed the seventh day and sanctified it, + because on that day He rested from all the work of creation that He had accomplished. + \c 3 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, ""Has God really said, @@ -25,7 +31,11 @@ of the field which Yahweh God had made. string expectedDenormalizedUsfm = @"\id GEN -\c 1 +\c 2 +\v 1 Thus the heavens and the earth were completed in all their vast array. +\v 2 And by the seventh day God had finished the work He had been doing; so on that day He rested from all His work. +\v 3 Then God blessed the seventh day and sanctified it, because on that day He rested from all the work of creation that He had accomplished. +\c 3 \v 1 Now the serpent was more subtle than any animal of the field which Yahweh God had made. He said to the woman, “Has God really said, ‘You shall not eat of any tree of the garden’?” \v 2 The woman said to the serpent, “We may eat fruit from the trees of the garden, \v 3 but not the fruit of the tree which is in the middle of the garden. God has said, ‘You shall not eat of it. You shall not touch it, lest you die.’” @@ -44,7 +54,7 @@ of the field which Yahweh God had made. List<(int ChapterNumber, QuotationMarkUpdateStrategy Strategy)> bestChapterStrategies = quotationMarkDenormalizationFirstPass.FindBestChapterStrategies(); - Assert.That(bestChapterStrategies.Select(tuple => tuple.ChapterNumber).SequenceEqual([1])); + Assert.That(bestChapterStrategies.Select(tuple => tuple.ChapterNumber).SequenceEqual([2, 3])); var quotationMarkDenormalizer = new QuotationMarkDenormalizationUsfmUpdateBlockHandler( standardEnglishQuoteConvention, diff --git a/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs b/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs index c2a9be25..ec129931 100644 --- a/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs +++ b/tests/SIL.Machine.Tests/PunctuationAnalysis/UsfmStructureExtractorTests.cs @@ -51,15 +51,18 @@ public void GetChaptersFilterByChapter() List expectedChapters = [ - new Chapter([ - new Verse([ - new TextSegment.Builder() - .SetText("test2") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .Build(), - ]), - ]), + new Chapter( + [ + new Verse([ + new TextSegment.Builder() + .SetText("test2") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build(), + ]), + ], + 2 + ), ]; List actualChapters = usfmStructureExtractor.GetChapters( new Dictionary> { { 40, [2] } } @@ -73,21 +76,25 @@ public void GetChaptersFilterByChapter() public void ChapterAndVerseMarkers() { var usfmStructureExtractor = new UsfmStructureExtractor(); + _verseTextParserState.SetChapterNum(1); usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); usfmStructureExtractor.Text(_verseTextParserState, "test"); List expectedChapters = [ - new Chapter([ - new Verse([ - new TextSegment.Builder() - .SetText("test") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .Build(), - ]), - ]), + new Chapter( + [ + new Verse([ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build(), + ]), + ], + 1 + ), ]; List actualChapters = usfmStructureExtractor.GetChapters(); @@ -100,6 +107,7 @@ public void ChapterAndVerseMarkers() public void StartParagraphMarker() { var usfmStructureExtractor = new UsfmStructureExtractor(); + _verseTextParserState.SetChapterNum(1); usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); usfmStructureExtractor.StartPara(_verseTextParserState, "p", false, null); @@ -107,16 +115,19 @@ public void StartParagraphMarker() List expectedChapters = [ - new Chapter([ - new Verse([ - new TextSegment.Builder() - .SetText("test") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .AddPrecedingMarker(UsfmMarkerType.Paragraph) - .Build(), - ]), - ]), + new Chapter( + [ + new Verse([ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Paragraph) + .Build(), + ]), + ], + 1 + ), ]; List actualChapters = usfmStructureExtractor.GetChapters(); @@ -129,6 +140,7 @@ public void StartParagraphMarker() public void StartCharacterMarker() { var usfmStructureExtractor = new UsfmStructureExtractor(); + _verseTextParserState.SetChapterNum(1); usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); usfmStructureExtractor.StartChar(_verseTextParserState, "k", false, null); @@ -136,16 +148,19 @@ public void StartCharacterMarker() List expectedChapters = [ - new Chapter([ - new Verse([ - new TextSegment.Builder() - .SetText("test") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .AddPrecedingMarker(UsfmMarkerType.Character) - .Build(), - ]), - ]), + new Chapter( + [ + new Verse([ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Character) + .Build(), + ]), + ], + 1 + ), ]; List actualChapters = usfmStructureExtractor.GetChapters(); @@ -158,6 +173,7 @@ public void StartCharacterMarker() public void EndCharacterMarker() { var usfmStructureExtractor = new UsfmStructureExtractor(); + _verseTextParserState.SetChapterNum(1); usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); usfmStructureExtractor.EndChar(_verseTextParserState, "k", null, false); @@ -165,16 +181,19 @@ public void EndCharacterMarker() List expectedChapters = [ - new Chapter([ - new Verse([ - new TextSegment.Builder() - .SetText("test") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .AddPrecedingMarker(UsfmMarkerType.Character) - .Build(), - ]), - ]), + new Chapter( + [ + new Verse([ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Character) + .Build(), + ]), + ], + 1 + ), ]; List actualChapters = usfmStructureExtractor.GetChapters(); @@ -187,6 +206,7 @@ public void EndCharacterMarker() public void EndNoteMarker() { var usfmStructureExtractor = new UsfmStructureExtractor(); + _verseTextParserState.SetChapterNum(1); usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); usfmStructureExtractor.EndNote(_verseTextParserState, "f", false); @@ -194,16 +214,19 @@ public void EndNoteMarker() List expectedChapters = [ - new Chapter([ - new Verse([ - new TextSegment.Builder() - .SetText("test") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .AddPrecedingMarker(UsfmMarkerType.Embed) - .Build(), - ]), - ]), + new Chapter( + [ + new Verse([ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Embed) + .Build(), + ]), + ], + 1 + ), ]; List actualChapters = usfmStructureExtractor.GetChapters(); @@ -216,6 +239,7 @@ public void EndNoteMarker() public void EndTableMarker() { var usfmStructureExtractor = new UsfmStructureExtractor(); + _verseTextParserState.SetChapterNum(1); usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); usfmStructureExtractor.EndNote(_verseTextParserState, "tr", false); @@ -223,16 +247,19 @@ public void EndTableMarker() List expectedChapters = [ - new Chapter([ - new Verse([ - new TextSegment.Builder() - .SetText("test") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .AddPrecedingMarker(UsfmMarkerType.Embed) - .Build(), - ]), - ]), + new Chapter( + [ + new Verse([ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Embed) + .Build(), + ]), + ], + 1 + ), ]; List actualChapters = usfmStructureExtractor.GetChapters(); @@ -245,6 +272,7 @@ public void EndTableMarker() public void RefMarker() { var usfmStructureExtractor = new UsfmStructureExtractor(); + _verseTextParserState.SetChapterNum(1); usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); usfmStructureExtractor.EndNote(_verseTextParserState, "x", false); @@ -252,16 +280,19 @@ public void RefMarker() List expectedChapters = [ - new Chapter([ - new Verse([ - new TextSegment.Builder() - .SetText("test") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .AddPrecedingMarker(UsfmMarkerType.Embed) - .Build(), - ]), - ]), + new Chapter( + [ + new Verse([ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Embed) + .Build(), + ]), + ], + 1 + ), ]; List actualChapters = usfmStructureExtractor.GetChapters(); @@ -274,6 +305,7 @@ public void RefMarker() public void SidebarMarker() { var usfmStructureExtractor = new UsfmStructureExtractor(); + _verseTextParserState.SetChapterNum(1); usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); usfmStructureExtractor.EndNote(_verseTextParserState, "esb", false); @@ -281,16 +313,19 @@ public void SidebarMarker() List expectedChapters = [ - new Chapter([ - new Verse([ - new TextSegment.Builder() - .SetText("test") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .AddPrecedingMarker(UsfmMarkerType.Embed) - .Build(), - ]), - ]), + new Chapter( + [ + new Verse([ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Embed) + .Build(), + ]), + ], + 1 + ), ]; List actualChapters = usfmStructureExtractor.GetChapters(); @@ -303,6 +338,7 @@ public void SidebarMarker() public void MultipleVerses() { var usfmStructureExtractor = new UsfmStructureExtractor(); + _verseTextParserState.SetChapterNum(1); usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); usfmStructureExtractor.Text(_verseTextParserState, "test"); @@ -311,22 +347,25 @@ public void MultipleVerses() List expectedChapters = [ - new Chapter([ - new Verse([ - new TextSegment.Builder() - .SetText("test") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .Build(), - ]), - new Verse([ - new TextSegment.Builder() - .SetText("test2") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .Build(), - ]), - ]), + new Chapter( + [ + new Verse([ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build(), + ]), + new Verse([ + new TextSegment.Builder() + .SetText("test2") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build(), + ]), + ], + 1 + ), ]; List actualChapters = usfmStructureExtractor.GetChapters(); @@ -341,33 +380,41 @@ public void MultipleVerses() public void MultipleChapters() { var usfmStructureExtractor = new UsfmStructureExtractor(); + _verseTextParserState.SetChapterNum(1); usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); usfmStructureExtractor.Text(_verseTextParserState, "test"); + _verseTextParserState.SetChapterNum(2); usfmStructureExtractor.Chapter(_verseTextParserState, "2", "c", null, null); usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); usfmStructureExtractor.Text(_verseTextParserState, "test2"); List expectedChapters = [ - new Chapter([ - new Verse([ - new TextSegment.Builder() - .SetText("test") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .Build(), - ]), - ]), - new Chapter([ - new Verse([ - new TextSegment.Builder() - .SetText("test2") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .Build(), - ]), - ]), + new Chapter( + [ + new Verse([ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build(), + ]), + ], + 1 + ), + new Chapter( + [ + new Verse([ + new TextSegment.Builder() + .SetText("test2") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build(), + ]), + ], + 2 + ), ]; List actualChapters = usfmStructureExtractor.GetChapters(); @@ -382,6 +429,7 @@ public void MultipleChapters() public void CharacterMarkerInText() { var usfmStructureExtractor = new UsfmStructureExtractor(); + _verseTextParserState.SetChapterNum(1); usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); usfmStructureExtractor.Text(_verseTextParserState, "test"); @@ -390,21 +438,24 @@ public void CharacterMarkerInText() List expectedChapters = [ - new Chapter([ - new Verse([ - new TextSegment.Builder() - .SetText("test") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .Build(), - new TextSegment.Builder() - .SetText("test2") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .AddPrecedingMarker(UsfmMarkerType.Character) - .Build(), - ]), - ]), + new Chapter( + [ + new Verse([ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build(), + new TextSegment.Builder() + .SetText("test2") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Character) + .Build(), + ]), + ], + 1 + ), ]; List actualChapters = usfmStructureExtractor.GetChapters(); @@ -423,6 +474,7 @@ public void CharacterMarkerInText() public void EmptyText() { var usfmStructureExtractor = new UsfmStructureExtractor(); + _verseTextParserState.SetChapterNum(1); usfmStructureExtractor.Chapter(_verseTextParserState, "1", "c", null, null); usfmStructureExtractor.Verse(_verseTextParserState, "1", "v", null, null); usfmStructureExtractor.Text(_verseTextParserState, "test"); @@ -433,21 +485,24 @@ public void EmptyText() List expectedChapters = [ - new Chapter([ - new Verse([ - new TextSegment.Builder() - .SetText("test") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .Build(), - new TextSegment.Builder() - .SetText("test2") - .AddPrecedingMarker(UsfmMarkerType.Chapter) - .AddPrecedingMarker(UsfmMarkerType.Verse) - .AddPrecedingMarker(UsfmMarkerType.Character) - .Build(), - ]), - ]), + new Chapter( + [ + new Verse([ + new TextSegment.Builder() + .SetText("test") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .Build(), + new TextSegment.Builder() + .SetText("test2") + .AddPrecedingMarker(UsfmMarkerType.Chapter) + .AddPrecedingMarker(UsfmMarkerType.Verse) + .AddPrecedingMarker(UsfmMarkerType.Character) + .Build(), + ]), + ], + 1 + ), ]; List actualChapters = usfmStructureExtractor.GetChapters(); @@ -467,6 +522,7 @@ private static void AssertChapterEqual(List expectedChapters, List