diff --git a/src/SIL.Machine/Corpora/FileParatextProjectVersificationErrorDetector.cs b/src/SIL.Machine/Corpora/FileParatextProjectVersificationErrorDetector.cs deleted file mode 100644 index a99f8d0e..00000000 --- a/src/SIL.Machine/Corpora/FileParatextProjectVersificationErrorDetector.cs +++ /dev/null @@ -1,14 +0,0 @@ -namespace SIL.Machine.Corpora -{ - public class FileParatextProjectVersificationErrorDetector : ParatextProjectVersificationErrorDetectorBase - { - public FileParatextProjectVersificationErrorDetector( - string projectDir, - ParatextProjectSettings parentSettings = null - ) - : base( - new FileParatextProjectFileHandler(projectDir), - FileParatextProjectSettingsParser.Parse(projectDir, parentSettings) - ) { } - } -} diff --git a/src/SIL.Machine/Corpora/FileUsfmVersificationAnalyzer.cs b/src/SIL.Machine/Corpora/FileUsfmVersificationAnalyzer.cs new file mode 100644 index 00000000..67330c04 --- /dev/null +++ b/src/SIL.Machine/Corpora/FileUsfmVersificationAnalyzer.cs @@ -0,0 +1,11 @@ +namespace SIL.Machine.Corpora +{ + public class FileUsfmVersificationAnalyzer : UsfmVersificationAnalyzerBase + { + public FileUsfmVersificationAnalyzer(string projectDir, ParatextProjectSettings parentSettings = null) + : base( + new FileParatextProjectFileHandler(projectDir), + FileParatextProjectSettingsParser.Parse(projectDir, parentSettings) + ) { } + } +} diff --git a/src/SIL.Machine/Corpora/ScrVersExtensions.cs b/src/SIL.Machine/Corpora/ScrVersExtensions.cs index 7114b10c..8e64cd89 100644 --- a/src/SIL.Machine/Corpora/ScrVersExtensions.cs +++ b/src/SIL.Machine/Corpora/ScrVersExtensions.cs @@ -30,12 +30,18 @@ public static IEnumerable GetReferencesForBook(this ScrVers scrVers, i return references; } - public static IEnumerable AllIncludedVerses(this ScrVers scrVers) + public static IEnumerable AllIncludedVerses(this ScrVers scrVers, HashSet onlyBooks = null) { for (int book = 1; book <= scrVers.GetLastBook(); book++) { - if (!Canon.IsCanonical(book) || (book > 86 && book < 93)) + if ( + !Canon.IsCanonical(book) + || (book > 86 && book < 93) + || (onlyBooks != null && !onlyBooks.Contains(book)) + ) + { continue; + } for (int chapter = 1; chapter <= scrVers.GetLastChapter(book); chapter++) { VerseRef? firstVerse = scrVers.FirstIncludedVerse(book, chapter); diff --git a/src/SIL.Machine/Corpora/UsfmVersificationAnalyzer.cs b/src/SIL.Machine/Corpora/UsfmVersificationAnalyzer.cs new file mode 100644 index 00000000..e2df3b1b --- /dev/null +++ b/src/SIL.Machine/Corpora/UsfmVersificationAnalyzer.cs @@ -0,0 +1,333 @@ +using System.Collections.Generic; +using System.Linq; +using SIL.Scripture; + +namespace SIL.Machine.Corpora +{ + public enum UsfmVersificationDiagnosticType + { + Missing, //Missing content + Extra, // Extra content + Invalid, // Invalid verse or chapter reference + IncorrectVerseSegment, // Verse segment in vrs but not in USFM or segment in USFM but not in vrs + UnsupportedVerseRange, // Verse range that will cross chapter boundaries when mapped to ScrVers.Original + } + + public class UsfmVersificationDiagnostic + { + public UsfmVersificationDiagnosticType Type { get; set; } + public int NumAffectedVerses => References.Sum(vr => vr.AllVerses().Count()); + public List References { get; set; } //Expected verses for Missing, actual verses for Extra and Invalid + public string Filename { get; set; } + public List LineNumbers { get; set; } + + public void Extend(VerseRef verseReference) + { + if (References.Count > 0) // Combine contiguous references + { + VerseRef lastReference = References[References.Count - 1]; + if (verseReference.Book == lastReference.Book && verseReference.ChapterNum == lastReference.ChapterNum) + { + int lastVerseNum = lastReference.AllVerses().Last().VerseNum; + int nextVerseNum = verseReference.AllVerses().First().VerseNum; + if ( + nextVerseNum == lastVerseNum + 1 + && VerseRef.TryParse( + $"{verseReference.Book} {verseReference.ChapterNum}:{lastReference.AllVerses().First().VerseNum}-{verseReference.AllVerses().Last().VerseNum}", + out VerseRef updatedVerseReference + ) + ) + { + References[References.Count - 1] = updatedVerseReference; + return; + } + } + + References.Add(verseReference); + } + else + { + References.Add(verseReference); + } + } + + public void Extend(VerseRef verseReference, int lineNumber) + { + Extend(verseReference); + LineNumbers.Add(lineNumber); + } + } + + public class UsfmVersificationAnalysis + { + public int TotalNumAffectedVerses { get; internal set; } + public int TotalNumEncounteredVerses { get; internal set; } + public IReadOnlyList Diagnostics { get; internal set; } + public ParatextProjectSettings ProjectSettings { get; internal set; } + } + + public class UsfmVersificationAnalyzer : UsfmParserHandlerBase + { + private readonly ParatextProjectSettings _settings; + private readonly IEnumerator _expectedVerses; + private readonly List _diagnostics; + private string _filename; + private bool _lastVerseInError; + private bool _lastVerseWasExtra; + private bool _lastVerseWasInvalid; + private int _totalVersesAnalyzed; + private int _lastLineNumber; + private bool _hasMore; + private VerseRef _nextExpectedVerse; + private VerseRef _prevEncounteredVerseRef; + + private void GetNextExpectedVerse() + { + _nextExpectedVerse = _expectedVerses.Current; + _hasMore = _expectedVerses.MoveNext(); + } + + private UsfmVersificationDiagnostic CurrentError => _diagnostics[_diagnostics.Count - 1]; + + public UsfmVersificationAnalyzer(ParatextProjectSettings settings, HashSet onlyBooks) + { + _settings = settings; + _expectedVerses = _settings.Versification.AllIncludedVerses(onlyBooks).GetEnumerator(); + _hasMore = _expectedVerses.MoveNext(); + _prevEncounteredVerseRef = new VerseRef(1, 1, 0); + _diagnostics = new List(); + _filename = null; + _lastVerseInError = false; + _lastVerseWasExtra = false; + _lastVerseWasInvalid = false; + _totalVersesAnalyzed = 0; + _lastLineNumber = 1; + } + + public UsfmVersificationAnalysis GetAnalysis() + { + while (_hasMore) + { + if (!_lastVerseWasInvalid) + GetNextExpectedVerse(); + HandleMissingVerse(); + _lastVerseWasInvalid = false; + } + return new UsfmVersificationAnalysis + { + TotalNumAffectedVerses = _diagnostics.Sum(d => d.NumAffectedVerses), + Diagnostics = _diagnostics, + TotalNumEncounteredVerses = _totalVersesAnalyzed, + ProjectSettings = _settings, + }; + } + + public override void StartBook(UsfmParserState state, string marker, string code) + { + _filename = _settings.GetBookFileName(state.VerseRef.Book); + } + + public override void Chapter( + UsfmParserState state, + string number, + string marker, + string altNumber, + string pubNumber + ) + { + VerseRef verseRef = state.VerseRef; + if (!Canon.IsCanonical(verseRef.Book)) + return; + verseRef.Chapter = number; + if (verseRef.ChapterNum == -1) + { + _diagnostics.Add( + new UsfmVersificationDiagnostic + { + Type = UsfmVersificationDiagnosticType.Invalid, + References = new List { verseRef }, + Filename = _filename, + LineNumbers = new List { state.LineNumber }, + } + ); + _lastVerseInError = true; + } + } + + public override void Verse( + UsfmParserState state, + string number, + string marker, + string altNumber, + string pubNumber + ) + { + VerseRef currentVerses = state.VerseRef; + + VerseRef verseRef = currentVerses; + if (!Canon.IsCanonical(verseRef.Book)) + return; + verseRef.Verse = number; + bool invalidVerseNum = verseRef.VerseNum == -1; + bool badVerseRange = ( + verseRef.ValidStatus == VerseRef.ValidStatusType.VerseOutOfOrder + || verseRef.ValidStatus == VerseRef.ValidStatusType.VerseRepeated + ); + if (invalidVerseNum || badVerseRange) + { + HandleInvalidVerse(state, verseRef); + _lastVerseWasInvalid = true; + } + else + { + _lastVerseWasInvalid = false; + } + + bool segmentMismatch = string.IsNullOrEmpty(currentVerses.Segment()) == currentVerses.HasSegmentsDefined; + if (segmentMismatch) + { + HandleIncorrectVerseSegment(state, verseRef); + } + + if (currentVerses.HasMultiple) + { + VerseRef copy = currentVerses; + bool hasCrossChapterVerseRange = !copy.ChangeVersificationWithRanges(ScrVers.Original); + if (hasCrossChapterVerseRange) + { + _diagnostics.Add( + new UsfmVersificationDiagnostic + { + Type = UsfmVersificationDiagnosticType.UnsupportedVerseRange, + References = new List { currentVerses }, + Filename = _filename, + LineNumbers = new List { state.LineNumber }, + } + ); + } + } + + foreach (VerseRef currentVerse in currentVerses.AllVerses().OrderBy(v => v)) + { + if (_prevEncounteredVerseRef.CompareTo(currentVerse, null, true, compareSegments: false) < 0) //Properly handle verse segments + { + if (!_lastVerseWasExtra && _hasMore) + { + GetNextExpectedVerse(); + } + _totalVersesAnalyzed++; + } + int compare = _nextExpectedVerse.CompareTo(currentVerse, null, true, compareSegments: false); + if (compare < 0 && _hasMore) + { + HandleMissingVerse(); + GetNextExpectedVerse(); + while ( + _hasMore && _nextExpectedVerse.CompareTo(currentVerse, null, true, compareSegments: false) < 0 + ) + { + CurrentError.Extend(_nextExpectedVerse); + GetNextExpectedVerse(); + } + } + else if ((compare > 0 && !_lastVerseWasInvalid) || (compare < 0 && !_hasMore)) //We want Invalid and Extra to be mutually exclusive to avoid duplicate errors for every Invalid/Extra verse + { + if (!_hasMore && _nextExpectedVerse.CompareTo(_prevEncounteredVerseRef) > 0) + { + _diagnostics.Add( + new UsfmVersificationDiagnostic + { + Type = UsfmVersificationDiagnosticType.Missing, + References = new List { _nextExpectedVerse }, + Filename = _filename, + LineNumbers = new List { _lastLineNumber }, + } + ); + } + + HandleExtraVerse(state.LineNumber, currentVerse); + } + if (compare <= 0) + _lastVerseWasExtra = false; + + _prevEncounteredVerseRef = currentVerse; + } + + _lastLineNumber = state.LineNumber; + } + + private void HandleInvalidVerse(UsfmParserState state, VerseRef verseRef) + { + _diagnostics.Add( + new UsfmVersificationDiagnostic + { + Type = UsfmVersificationDiagnosticType.Invalid, + References = new List { verseRef }, + Filename = _filename, + LineNumbers = new List { state.LineNumber }, + } + ); + _lastVerseInError = true; + } + + private void HandleIncorrectVerseSegment(UsfmParserState state, VerseRef verseRef) + { + _diagnostics.Add( + new UsfmVersificationDiagnostic + { + Type = UsfmVersificationDiagnosticType.IncorrectVerseSegment, + References = new List { verseRef }, + Filename = _filename, + LineNumbers = new List { state.LineNumber }, + } + ); + _lastVerseInError = true; + } + + private void HandleExtraVerse(int lineNumber, VerseRef currentVerse) + { + if (!_lastVerseInError || (_lastVerseInError && CurrentError.Type != UsfmVersificationDiagnosticType.Extra)) + { + _diagnostics.Add( + new UsfmVersificationDiagnostic + { + Type = UsfmVersificationDiagnosticType.Extra, + References = new List { currentVerse }, + Filename = _filename, + LineNumbers = new List { lineNumber }, + } + ); + _lastVerseInError = true; + } + else + { + CurrentError.Extend(currentVerse, lineNumber); + } + _lastVerseWasExtra = true; + } + + private void HandleMissingVerse() + { + if ( + !_lastVerseInError + || (_lastVerseInError && CurrentError.Type != UsfmVersificationDiagnosticType.Missing) + ) + { + _diagnostics.Add( + new UsfmVersificationDiagnostic + { + Type = UsfmVersificationDiagnosticType.Missing, + References = new List { _nextExpectedVerse }, + Filename = _filename, + LineNumbers = new List { _lastLineNumber }, + } + ); + _lastVerseInError = true; + } + else + { + CurrentError.Extend(_nextExpectedVerse); + } + } + } +} diff --git a/src/SIL.Machine/Corpora/ParatextProjectVersificationErrorDetectorBase.cs b/src/SIL.Machine/Corpora/UsfmVersificationAnalyzerBase.cs similarity index 70% rename from src/SIL.Machine/Corpora/ParatextProjectVersificationErrorDetectorBase.cs rename to src/SIL.Machine/Corpora/UsfmVersificationAnalyzerBase.cs index 98660613..bb326221 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectVersificationErrorDetectorBase.cs +++ b/src/SIL.Machine/Corpora/UsfmVersificationAnalyzerBase.cs @@ -1,17 +1,18 @@ using System; using System.Collections.Generic; using System.IO; +using System.Linq; using System.Text; using SIL.Scripture; namespace SIL.Machine.Corpora { - public abstract class ParatextProjectVersificationErrorDetectorBase + public abstract class UsfmVersificationAnalyzerBase { private readonly ParatextProjectSettings _settings; private readonly IParatextProjectFileHandler _paratextProjectFileHandler; - protected ParatextProjectVersificationErrorDetectorBase( + protected UsfmVersificationAnalyzerBase( IParatextProjectFileHandler paratextProjectFileHandler, ParatextProjectSettings settings ) @@ -20,12 +21,23 @@ ParatextProjectSettings settings _paratextProjectFileHandler = paratextProjectFileHandler; } - public IReadOnlyList GetUsfmVersificationErrors( - UsfmVersificationErrorDetector handler = null, - HashSet books = null + public UsfmVersificationAnalysis AnalyzeUsfmVersification( + HashSet books, + UsfmVersificationAnalyzer handler = null ) { - handler = handler ?? new UsfmVersificationErrorDetector(_settings); + return AnalyzeUsfmVersification( + books != null ? new HashSet(books.Select(b => Canon.BookIdToNumber(b))) : null, + handler + ); + } + + public UsfmVersificationAnalysis AnalyzeUsfmVersification( + HashSet books, + UsfmVersificationAnalyzer handler = null + ) + { + handler = handler ?? new UsfmVersificationAnalyzer(_settings, books); foreach (string bookId in _settings.GetAllScriptureBookIds()) { string fileName = _settings.GetBookFileName(bookId); @@ -56,7 +68,7 @@ public IReadOnlyList GetUsfmVersificationErrors( throw new InvalidOperationException(sb.ToString(), ex); } } - return handler.Errors; + return handler.GetAnalysis(); } } } diff --git a/src/SIL.Machine/Corpora/UsfmVersificationErrorDetector.cs b/src/SIL.Machine/Corpora/UsfmVersificationErrorDetector.cs deleted file mode 100644 index 4107cb7f..00000000 --- a/src/SIL.Machine/Corpora/UsfmVersificationErrorDetector.cs +++ /dev/null @@ -1,356 +0,0 @@ -using System.Collections.Generic; -using System.ComponentModel; -using System.Linq; -using SIL.Scripture; - -namespace SIL.Machine.Corpora -{ - public enum UsfmVersificationErrorType - { - MissingChapter, - MissingVerse, - ExtraVerse, - InvalidVerseRange, - MissingVerseSegment, - ExtraVerseSegment, - InvalidChapterNumber, - InvalidVerseNumber, - } - - public class UsfmVersificationError - { - private readonly int _bookNum; - private readonly int _expectedChapter; - private readonly int _expectedVerse; - private readonly int _actualChapter; - private readonly int _actualVerse; - private readonly string _actualValue; - private VerseRef? _verseRef = null; - - public UsfmVersificationError( - int bookNum, - int expectedChapter, - int expectedVerse, - int actualChapter, - int actualVerse, - string projectName, - VerseRef? verseRef = null - ) - { - _bookNum = bookNum; - _expectedChapter = expectedChapter; - _expectedVerse = expectedVerse; - _actualChapter = actualChapter; - _actualVerse = actualVerse; - _verseRef = verseRef; - ProjectName = projectName; - } - - public UsfmVersificationError( - int bookNum, - int expectedChapter, - string actualValue, - string projectName, - UsfmVersificationErrorType type - ) - { - _bookNum = bookNum; - _expectedChapter = expectedChapter; - _actualValue = actualValue; - ProjectName = projectName; - Type = type; - } - - public string ProjectName { get; private set; } - - public UsfmVersificationErrorType Type { get; private set; } - - // Returns true if there is an error - public bool CheckError() - { - //A non-empty chapter is expected - if (_expectedChapter > _actualChapter && _expectedVerse != 0) - { - Type = UsfmVersificationErrorType.MissingChapter; - return true; - } - if (_expectedVerse > _actualVerse && _expectedChapter == _actualChapter) - { - Type = UsfmVersificationErrorType.MissingVerse; - return true; - } - if (_verseRef != null) - { - if (string.IsNullOrEmpty(_verseRef.Value.Segment()) && _verseRef.Value.HasSegmentsDefined) - { - Type = UsfmVersificationErrorType.MissingVerseSegment; - return true; - } - if (!string.IsNullOrEmpty(_verseRef.Value.Segment()) && !_verseRef.Value.HasSegmentsDefined) - { - Type = UsfmVersificationErrorType.ExtraVerseSegment; - return true; - } - if (!_verseRef.Value.Valid) - { - Type = Map(_verseRef.Value.ValidStatus); - return true; - } - } - return false; - } - - private static UsfmVersificationErrorType Map(VerseRef.ValidStatusType validStatus) - { - switch (validStatus) - { - case VerseRef.ValidStatusType.OutOfRange: - return UsfmVersificationErrorType.ExtraVerse; - case VerseRef.ValidStatusType.VerseRepeated: - case VerseRef.ValidStatusType.VerseOutOfOrder: - return UsfmVersificationErrorType.InvalidVerseRange; - default: - throw new InvalidEnumArgumentException( - nameof(validStatus), - (int)validStatus, - typeof(VerseRef.ValidStatusType) - ); - } - } - - public string ExpectedVerseRef - { - get - { - if ( - Type == UsfmVersificationErrorType.ExtraVerse - || Type == UsfmVersificationErrorType.InvalidChapterNumber - || Type == UsfmVersificationErrorType.InvalidVerseNumber - ) - { - return ""; - } - - // We do not want to throw an exception here, and the VerseRef constructor can throw - // an exception with certain invalid verse data; use TryParse instead. - if ( - !VerseRef.TryParse( - $"{Canon.BookNumberToId(_bookNum)} {_expectedChapter}:{_expectedVerse}", - out VerseRef defaultVerseRef - ) - ) - { - return DefaultVerse(_expectedChapter, _expectedVerse); - } - if ( - Type == UsfmVersificationErrorType.MissingVerseSegment - && VerseRef.TryParse( - $"{defaultVerseRef.Book} {defaultVerseRef.Chapter}:{defaultVerseRef.Verse}a", - out VerseRef verseWithSegment - ) - ) - { - return verseWithSegment.ToString(); - } - if (Type == UsfmVersificationErrorType.InvalidVerseRange) - { - List sortedAllUniqueVerses = _verseRef - .Value.AllVerses() - .Distinct() - .OrderBy(v => v) - .ToList(); - VerseRef firstVerse = sortedAllUniqueVerses[0]; - VerseRef lastVerse = sortedAllUniqueVerses[sortedAllUniqueVerses.Count - 1]; - if (firstVerse.Equals(lastVerse)) - { - return firstVerse.ToString(); - } - else if ( - VerseRef.TryParse( - $"{firstVerse.Book} {firstVerse.Chapter}:{firstVerse.Verse}-{lastVerse.Verse}", - out VerseRef correctedVerseRangeRef - ) - ) - { - return correctedVerseRangeRef.ToString(); - } - } - return defaultVerseRef.ToString(); - } - } - - public string ActualVerseRef - { - get - { - if (Type == UsfmVersificationErrorType.InvalidChapterNumber) - { - return $"{Canon.BookNumberToId(_bookNum)} {_actualValue}"; - } - else if (Type == UsfmVersificationErrorType.InvalidVerseNumber) - { - return $"{Canon.BookNumberToId(_bookNum)} {_expectedChapter}:{_actualValue}"; - } - else if (_verseRef != null) - { - return _verseRef.ToString(); - } - else - { - if ( - VerseRef.TryParse( - $"{Canon.BookNumberToId(_bookNum)} {_actualChapter}:{_actualVerse}", - out VerseRef actualVerseRef - ) - ) - { - return actualVerseRef.ToString(); - } - } - return DefaultVerse(_actualChapter, _actualVerse); - } - } - - private string DefaultVerse(int chapter, int verse) - { - string verseString = _actualVerse == -1 ? "" : verse.ToString(); - return $"{Canon.BookNumberToId(_bookNum)} {chapter}:{verseString}"; - } - } - - public class UsfmVersificationErrorDetector : UsfmParserHandlerBase - { - private readonly string _projectName; - private readonly ScrVers _versification; - private int _currentBook; - private int _currentChapter; - private VerseRef _currentVerse; - private readonly List _errors; - - public UsfmVersificationErrorDetector(ParatextProjectSettings settings) - { - _projectName = settings.Name; - _versification = settings.Versification; - _currentBook = 0; - _currentChapter = 0; - _currentVerse = new VerseRef(); - _errors = new List(); - } - - public IReadOnlyList Errors => _errors; - - public override void EndUsfm(UsfmParserState state) - { - if (_currentBook > 0 && Canon.IsCanonical(_currentBook)) - { - var versificationError = new UsfmVersificationError( - _currentBook, - _versification.GetLastChapter(_currentBook), - _versification.GetLastVerse(_currentBook, _versification.GetLastChapter(_currentBook)), - _currentChapter, - _currentVerse.AllVerses().Last().VerseNum, - _projectName - ); - if (versificationError.CheckError()) - _errors.Add(versificationError); - } - } - - public override void StartBook(UsfmParserState state, string marker, string code) - { - _currentBook = state.VerseRef.BookNum; - _currentChapter = 0; - _currentVerse = new VerseRef(); - } - - public override void Chapter( - UsfmParserState state, - string number, - string marker, - string altNumber, - string pubNumber - ) - { - if (_currentBook > 0 && Canon.IsCanonical(_currentBook) && _currentChapter > 0) - { - var versificationError = new UsfmVersificationError( - _currentBook, - _currentChapter, - _versification.GetLastVerse(_currentBook, _currentChapter), - _currentChapter, - _currentVerse.AllVerses().Last().VerseNum, - _projectName - ); - if (versificationError.CheckError()) - _errors.Add(versificationError); - } - - _currentChapter = state.VerseRef.ChapterNum; - _currentVerse = new VerseRef(); - - // See whether the chapter number is invalid - VerseRef verseRef = state.VerseRef; - verseRef.Chapter = number; - if (verseRef.ChapterNum == -1) - { - _errors.Add( - new UsfmVersificationError( - _currentBook, - _currentChapter, - number, - _projectName, - UsfmVersificationErrorType.InvalidChapterNumber - ) - ); - } - } - - public override void Verse( - UsfmParserState state, - string number, - string marker, - string altNumber, - string pubNumber - ) - { - bool verseInError = false; - _currentVerse = state.VerseRef; - if (_currentBook > 0 && Canon.IsCanonical(_currentBook) && _currentChapter > 0) - { - var versificationError = new UsfmVersificationError( - _currentBook, - _currentChapter, - _currentVerse.AllVerses().Last().VerseNum, - _currentChapter, - _currentVerse.AllVerses().Last().VerseNum, - _projectName, - _currentVerse - ); - if (versificationError.CheckError()) - { - _errors.Add(versificationError); - verseInError = true; - } - } - - if (!verseInError) - { - // See whether the verse number is invalid - VerseRef verseRef = _currentVerse; - verseRef.Verse = number; - if (verseRef.VerseNum == -1) - { - _errors.Add( - new UsfmVersificationError( - _currentBook, - _currentChapter, - number, - _projectName, - UsfmVersificationErrorType.InvalidVerseNumber - ) - ); - } - } - } - } -} diff --git a/src/SIL.Machine/Corpora/ZipParatextProjectVersificationErrorDetector.cs b/src/SIL.Machine/Corpora/ZipParatextProjectVersificationErrorDetector.cs deleted file mode 100644 index a77fd629..00000000 --- a/src/SIL.Machine/Corpora/ZipParatextProjectVersificationErrorDetector.cs +++ /dev/null @@ -1,16 +0,0 @@ -using System.IO.Compression; - -namespace SIL.Machine.Corpora -{ - public class ZipParatextProjectVersificationErrorDetector : ParatextProjectVersificationErrorDetectorBase - { - public ZipParatextProjectVersificationErrorDetector( - ZipArchive archive, - ParatextProjectSettings parentSettings = null - ) - : base( - new ZipParatextProjectFileHandler(archive), - ZipParatextProjectSettingsParser.Parse(archive, parentSettings) - ) { } - } -} diff --git a/src/SIL.Machine/Corpora/ZipUsfmVersificationAnalyzer.cs b/src/SIL.Machine/Corpora/ZipUsfmVersificationAnalyzer.cs new file mode 100644 index 00000000..d1a0ef55 --- /dev/null +++ b/src/SIL.Machine/Corpora/ZipUsfmVersificationAnalyzer.cs @@ -0,0 +1,13 @@ +using System.IO.Compression; + +namespace SIL.Machine.Corpora +{ + public class ZipUsfmVersificationAnalyzer : UsfmVersificationAnalyzerBase + { + public ZipUsfmVersificationAnalyzer(ZipArchive archive, ParatextProjectSettings parentSettings = null) + : base( + new ZipParatextProjectFileHandler(archive), + ZipParatextProjectSettingsParser.Parse(archive, parentSettings) + ) { } + } +} diff --git a/tests/SIL.Machine.Tests/Corpora/MemoryParatextProjectVersificationErrorDetector.cs b/tests/SIL.Machine.Tests/Corpora/MemoryUsfmVersificationAnalyzer.cs similarity index 67% rename from tests/SIL.Machine.Tests/Corpora/MemoryParatextProjectVersificationErrorDetector.cs rename to tests/SIL.Machine.Tests/Corpora/MemoryUsfmVersificationAnalyzer.cs index c624d5c9..db9d1b3d 100644 --- a/tests/SIL.Machine.Tests/Corpora/MemoryParatextProjectVersificationErrorDetector.cs +++ b/tests/SIL.Machine.Tests/Corpora/MemoryUsfmVersificationAnalyzer.cs @@ -1,10 +1,10 @@ namespace SIL.Machine.Corpora; -public class MemoryParatextProjectVersificationErrorDetector( +public class MemoryUsfmVersificationAnalyzer( IDictionary? files = null, ParatextProjectSettings? settings = null ) - : ParatextProjectVersificationErrorDetectorBase( + : UsfmVersificationAnalyzerBase( new MemoryParatextProjectFileHandler(files), settings ?? new DefaultParatextProjectSettings() ); diff --git a/tests/SIL.Machine.Tests/Corpora/ParatextProjectVersificationErrorTests.cs b/tests/SIL.Machine.Tests/Corpora/ParatextProjectVersificationErrorTests.cs deleted file mode 100644 index b4678fd9..00000000 --- a/tests/SIL.Machine.Tests/Corpora/ParatextProjectVersificationErrorTests.cs +++ /dev/null @@ -1,472 +0,0 @@ -using System.Text; -using System.Text.Json; -using NUnit.Framework; -using SIL.Scripture; - -namespace SIL.Machine.Corpora; - -[TestFixture] -public class ParatextProjectVersificationErrorDetectorTests -{ - [Test] - public void GetUsfmVersificationErrors_NoErrors() - { - var env = new TestEnvironment( - files: new Dictionary() - { - { - "653JNTest.SFM", - @"\id 3JN - \c 1 - \v 1 - \v 2 - \v 3 - \v 4 - \v 5 - \v 6 - \v 7 - \v 8 - \v 9 - \v 10 - \v 11 - \v 12 - \v 13 - \v 14 - \v 15 - " - }, - } - ); - Assert.That( - env.GetUsfmVersificationErrors(), - Has.Count.EqualTo(0), - JsonSerializer.Serialize(env.GetUsfmVersificationErrors()) - ); - } - - [Test] - public void GetUsfmVersificationErrors_MissingVerse() - { - var env = new TestEnvironment( - files: new Dictionary() - { - { - "653JNTest.SFM", - @"\id 3JN - \c 1 - \v 1 - \v 2 - \v 3 - \v 4 - \v 5 - \v 6 - \v 7 - \v 8 - \v 9 - \v 10 - \v 11 - \v 12 - \v 13 - \v 14 - " - }, - } - ); - IReadOnlyList errors = env.GetUsfmVersificationErrors(); - Assert.That(errors, Has.Count.EqualTo(1), JsonSerializer.Serialize(errors)); - Assert.That(errors[0].Type, Is.EqualTo(UsfmVersificationErrorType.MissingVerse)); - Assert.That(errors[0].ExpectedVerseRef, Is.EqualTo("3JN 1:15")); - Assert.That(errors[0].ActualVerseRef, Is.EqualTo("3JN 1:14")); - } - - [Test] - public void GetUsfmVersificationErrors_MissingChapter() - { - var env = new TestEnvironment( - files: new Dictionary() - { - { - "653JNTest.SFM", - @"\id 3JN - " - }, - } - ); - IReadOnlyList errors = env.GetUsfmVersificationErrors(); - Assert.That(errors, Has.Count.EqualTo(1), JsonSerializer.Serialize(errors)); - Assert.That(errors[0].Type, Is.EqualTo(UsfmVersificationErrorType.MissingChapter)); - Assert.That(errors[0].ExpectedVerseRef, Is.EqualTo("3JN 1:15")); - Assert.That(errors[0].ActualVerseRef, Is.EqualTo("3JN 0:0")); - } - - [Test] - public void GetUsfmVersificationErrors_ExtraVerse() - { - var env = new TestEnvironment( - files: new Dictionary() - { - { - "653JNTest.SFM", - @"\id 3JN - \c 1 - \v 1 - \v 2 - \v 3 - \v 4 - \v 5 - \v 6 - \v 7 - \v 8 - \v 9 - \v 10 - \v 11 - \v 12 - \v 13 - \v 14 - \v 15 - \v 16 - " - }, - } - ); - IReadOnlyList errors = env.GetUsfmVersificationErrors(); - Assert.That(errors, Has.Count.EqualTo(1), JsonSerializer.Serialize(errors)); - Assert.That(errors[0].Type, Is.EqualTo(UsfmVersificationErrorType.ExtraVerse)); - Assert.That(errors[0].ExpectedVerseRef, Is.EqualTo("")); - Assert.That(errors[0].ActualVerseRef, Is.EqualTo("3JN 1:16")); - } - - [Test] - public void GetUsfmVersificationErrors_InvalidVerse() - { - var env = new TestEnvironment( - files: new Dictionary() - { - { - "653JNTest.SFM", - @"\id 3JN - \c 1 - \v 1 - \v 2 - \v 3 - \v 4 - \v 5 - \v 6 - \v 7 - \v 8 - \v 9 - \v 10 - \v 11 - \v 13-12 - \v 14 - \v 15 - " - }, - } - ); - IReadOnlyList errors = env.GetUsfmVersificationErrors(); - Assert.That(errors, Has.Count.EqualTo(1), JsonSerializer.Serialize(errors)); - Assert.That(errors[0].Type, Is.EqualTo(UsfmVersificationErrorType.InvalidVerseRange)); - Assert.That(errors[0].ExpectedVerseRef, Is.EqualTo("3JN 1:12-13")); - Assert.That(errors[0].ActualVerseRef, Is.EqualTo("3JN 1:13-12")); - } - - [Test] - public void GetUsfmVersificationErrors_ExtraVerseSegment() - { - var env = new TestEnvironment( - files: new Dictionary() - { - { - "653JNTest.SFM", - @"\id 3JN - \c 1 - \v 1 - \v 2 - \v 3 - \v 4 - \v 5 - \v 6 - \v 7 - \v 8 - \v 9 - \v 10 - \v 11 - \v 12 - \v 13 - \v 14a - \v 14b - \v 15 - " - }, - } - ); - IReadOnlyList errors = env.GetUsfmVersificationErrors(); - Assert.That(errors, Has.Count.EqualTo(2), JsonSerializer.Serialize(errors)); - Assert.That(errors[0].Type, Is.EqualTo(UsfmVersificationErrorType.ExtraVerseSegment)); - Assert.That(errors[0].ExpectedVerseRef, Is.EqualTo("3JN 1:14")); - Assert.That(errors[0].ActualVerseRef, Is.EqualTo("3JN 1:14a")); - } - - [Test] - public void GetUsfmVersificationErrors_MissingVerseSegment() - { - var env = new TestEnvironment( - settings: new DefaultParatextProjectSettings(versification: GetCustomVersification(@"*3JN 1:13,a,b")), - files: new Dictionary() - { - { - "653JNTest.SFM", - @"\id 3JN - \c 1 - \v 1 - \v 2 - \v 3 - \v 4 - \v 5 - \v 6 - \v 7 - \v 8 - \v 9 - \v 10 - \v 11 - \v 12 - \v 13 - \v 14 - \v 15 - " - }, - } - ); - IReadOnlyList errors = env.GetUsfmVersificationErrors(); - Assert.That(errors, Has.Count.EqualTo(1), JsonSerializer.Serialize(errors)); - Assert.That(errors[0].Type, Is.EqualTo(UsfmVersificationErrorType.MissingVerseSegment)); - Assert.That(errors[0].ExpectedVerseRef, Is.EqualTo("3JN 1:13a")); - Assert.That(errors[0].ActualVerseRef, Is.EqualTo("3JN 1:13")); - } - - [Test] - public void GetUsfmVersificationErrors_IgnoreNonCanonicals() - { - var env = new TestEnvironment( - files: new Dictionary() - { - { - "98XXETest.SFM", - @"\id XXE - \c 1 - \v 3-2 - " - }, - } - ); - IReadOnlyList errors = env.GetUsfmVersificationErrors(); - Assert.That(errors, Has.Count.EqualTo(0), JsonSerializer.Serialize(errors)); - } - - [Test] - public void GetUsfmVersificationErrors_ExtraVerse_ExcludedInCustomVrs() - { - var env = new TestEnvironment( - settings: new DefaultParatextProjectSettings(versification: GetCustomVersification(@"-3JN 1:13")), - files: new Dictionary() - { - { - "653JNTest.SFM", - @"\id 3JN - \c 1 - \v 1 - \v 2 - \v 3 - \v 4 - \v 5 - \v 6 - \v 7 - \v 8 - \v 9 - \v 10 - \v 11 - \v 12 - \v 13 - \v 14 - \v 15 - " - }, - } - ); - IReadOnlyList errors = env.GetUsfmVersificationErrors(); - Assert.That(errors, Has.Count.EqualTo(1), JsonSerializer.Serialize(errors)); - Assert.That(errors[0].Type, Is.EqualTo(UsfmVersificationErrorType.ExtraVerse)); - Assert.That(errors[0].ExpectedVerseRef, Is.EqualTo("")); - Assert.That(errors[0].ActualVerseRef, Is.EqualTo("3JN 1:13")); - } - - [Test] - public void GetUsfmVersificationErrors_MultipleBooks() - { - var env = new TestEnvironment( - files: new Dictionary() - { - { - "642JNTest.SFM", - @"\id 2JN - \c 1 - \v 1 - \v 2 - \v 3 - \v 4 - \v 5 - \v 6 - \v 7 - \v 8 - \v 9 - \v 10 - \v 11 - \v 12 - " - }, - { - "653JNTest.SFM", - @"\id 3JN - \c 1 - \v 1 - \v 2 - \v 3 - \v 4 - \v 5 - \v 6 - \v 7 - \v 8 - \v 9 - \v 10 - \v 11 - \v 12 - \v 13 - \v 14 - \v 15 - " - }, - } - ); - IReadOnlyList errors = env.GetUsfmVersificationErrors(); - Assert.That(errors, Has.Count.EqualTo(1), JsonSerializer.Serialize(errors)); - Assert.That(errors[0].Type, Is.EqualTo(UsfmVersificationErrorType.MissingVerse)); - Assert.That(errors[0].ExpectedVerseRef, Is.EqualTo("2JN 1:13")); - Assert.That(errors[0].ActualVerseRef, Is.EqualTo("2JN 1:12")); - } - - [Test] - public void GetUsfmVersificationErrors_MultipleChapters() - { - var env = new TestEnvironment( - files: new Dictionary() - { - { - "642JNTest.SFM", - @"\id 2JN - \c 1 - \v 1 - \v 2 - \v 3 - \v 4 - \v 5 - \v 6 - \v 7 - \v 8 - \v 9 - \v 10 - \v 11 - \v 12 - \c 2 - \v 1 - " - }, - } - ); - IReadOnlyList errors = env.GetUsfmVersificationErrors(); - Assert.That(errors, Has.Count.EqualTo(2), JsonSerializer.Serialize(errors)); - Assert.That(errors[0].Type, Is.EqualTo(UsfmVersificationErrorType.MissingVerse)); - Assert.That(errors[1].Type, Is.EqualTo(UsfmVersificationErrorType.ExtraVerse)); - Assert.That(errors[0].ExpectedVerseRef, Is.EqualTo("2JN 1:13")); - Assert.That(errors[0].ActualVerseRef, Is.EqualTo("2JN 1:12")); - Assert.That(errors[1].ExpectedVerseRef, Is.EqualTo("")); - Assert.That(errors[1].ActualVerseRef, Is.EqualTo("2JN 2:1")); - } - - [Test] - public void GetUsfmVersificationErrors_InvalidChapterNumber() - { - var env = new TestEnvironment( - files: new Dictionary() - { - { - "653JNTest.SFM", - @"\id 3JN - \c 1. - " - }, - } - ); - IReadOnlyList errors = env.GetUsfmVersificationErrors(); - Assert.That(errors, Has.Count.EqualTo(2), JsonSerializer.Serialize(errors)); - Assert.That(errors[0].Type, Is.EqualTo(UsfmVersificationErrorType.InvalidChapterNumber)); - Assert.That(errors[1].Type, Is.EqualTo(UsfmVersificationErrorType.MissingChapter)); - Assert.That(errors[0].ExpectedVerseRef, Is.Empty); - Assert.That(errors[1].ExpectedVerseRef, Is.EqualTo("3JN 1:15")); - Assert.That(errors[0].ActualVerseRef, Is.EqualTo("3JN 1.")); - Assert.That(errors[1].ActualVerseRef, Is.EqualTo("3JN -1:0")); - } - - [Test] - public void GetUsfmVersificationErrors_InvalidVerseNumber() - { - var env = new TestEnvironment( - files: new Dictionary() - { - { - "653JNTest.SFM", - @"\id 3JN - \c 1 - \v v1 - " - }, - } - ); - IReadOnlyList errors = env.GetUsfmVersificationErrors(); - Assert.That(errors, Has.Count.EqualTo(2), JsonSerializer.Serialize(errors)); - Assert.That(errors[0].Type, Is.EqualTo(UsfmVersificationErrorType.InvalidVerseNumber)); - Assert.That(errors[1].Type, Is.EqualTo(UsfmVersificationErrorType.MissingVerse)); - Assert.That(errors[0].ExpectedVerseRef, Is.Empty); - Assert.That(errors[1].ExpectedVerseRef, Is.EqualTo("3JN 1:15")); - Assert.That(errors[0].ActualVerseRef, Is.EqualTo("3JN 1:v1")); - Assert.That(errors[1].ActualVerseRef, Is.EqualTo("3JN 1:0")); - } - - private class TestEnvironment(ParatextProjectSettings? settings = null, Dictionary? files = null) - { - public ParatextProjectVersificationErrorDetectorBase Detector { get; } = - new MemoryParatextProjectVersificationErrorDetector(files, settings); - - public IReadOnlyList GetUsfmVersificationErrors() - { - return Detector.GetUsfmVersificationErrors(); - } - } - - private static ScrVers GetCustomVersification(string customVrsContents, ScrVers? baseVersification = null) - { - baseVersification ??= ScrVers.English; - ScrVers customVersification = baseVersification; - using (var reader = new StreamReader(new MemoryStream(Encoding.UTF8.GetBytes(customVrsContents)))) - { - customVersification = Versification.Table.Implementation.Load( - reader, - "custom.vrs", - baseVersification, - baseVersification.ToString() + "-" + customVrsContents.GetHashCode() - ); - } - Versification.Table.Implementation.RemoveAllUnknownVersifications(); - return customVersification; - } -} diff --git a/tests/SIL.Machine.Tests/Corpora/ScrVersExtensionsTests.cs b/tests/SIL.Machine.Tests/Corpora/ScrVersExtensionsTests.cs index d006e255..c6c3d740 100644 --- a/tests/SIL.Machine.Tests/Corpora/ScrVersExtensionsTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ScrVersExtensionsTests.cs @@ -12,22 +12,30 @@ public void AllIncludedVerses() List originalVerses = ScrVers.Original.AllIncludedVerses().ToList(); Assert.That(originalVerses, Has.Count.EqualTo(41899)); Assert.That(originalVerses[21899].BBBCCCVVV, Is.EqualTo(27003024)); + List englishVerses = ScrVers.English.AllIncludedVerses().ToList(); Assert.That(englishVerses, Has.Count.EqualTo(38393)); Assert.That(englishVerses[englishVerses.Count - 1].BBBCCCVVV, Is.EqualTo(123001020)); + List russianOrthodoxVerses = ScrVers.RussianOrthodox.AllIncludedVerses().ToList(); Assert.That(russianOrthodoxVerses, Has.Count.EqualTo(37280)); Assert.That(russianOrthodoxVerses[russianOrthodoxVerses.Count - 1].BBBCCCVVV, Is.EqualTo(83001015)); + + List originalVersesGenesis = ScrVers.Original.AllIncludedVerses([1]).ToList(); + Assert.That(originalVersesGenesis, Has.Count.EqualTo(1533)); } [Test] public void HasCrossBookMappings() { - Assert.That(!ScrVers.Original.HasCrossBookMappings()); - Assert.That(ScrVers.English.HasCrossBookMappings()); - Assert.That(ScrVers.RussianOrthodox.HasCrossBookMappings()); - Assert.That(!ScrVers.RussianProtestant.HasCrossBookMappings()); - Assert.That(ScrVers.Vulgate.HasCrossBookMappings()); - Assert.That(ScrVers.Vulgate.HasCrossBookMappings(ScrVers.English)); + using (Assert.EnterMultipleScope()) + { + Assert.That(!ScrVers.Original.HasCrossBookMappings()); + Assert.That(ScrVers.English.HasCrossBookMappings()); + Assert.That(ScrVers.RussianOrthodox.HasCrossBookMappings()); + Assert.That(!ScrVers.RussianProtestant.HasCrossBookMappings()); + Assert.That(ScrVers.Vulgate.HasCrossBookMappings()); + Assert.That(ScrVers.Vulgate.HasCrossBookMappings(ScrVers.English)); + } } } diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs index 4692fbfd..8bcb7472 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmManualTests.cs @@ -2,6 +2,7 @@ using System.Text.Json; using NUnit.Framework; using SIL.Machine.PunctuationAnalysis; +using SIL.Scripture; namespace SIL.Machine.Corpora; @@ -85,13 +86,15 @@ public void AnalyzeCorporaQuoteConventions() public void ValidateUsfmVersification() { using ZipArchive zipArchive = ZipFile.OpenRead(CorporaTestHelpers.UsfmSourceProjectZipPath); - var versificationErrorDetector = new ZipParatextProjectVersificationErrorDetector(zipArchive); - IReadOnlyList errors = versificationErrorDetector.GetUsfmVersificationErrors(); + var versificationAnalyzer = new ZipUsfmVersificationAnalyzer(zipArchive); + UsfmVersificationAnalysis analysis = versificationAnalyzer.AnalyzeUsfmVersification( + Canon.AllBookIds.ToHashSet() + ); Assert.That( - errors, + analysis.Diagnostics, Has.Count.EqualTo(0), - JsonSerializer.Serialize(errors, new JsonSerializerOptions { WriteIndented = true }) + JsonSerializer.Serialize(analysis.Diagnostics, new JsonSerializerOptions { WriteIndented = true }) ); } } diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmVersificationAnalyzerTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmVersificationAnalyzerTests.cs new file mode 100644 index 00000000..fe495ffd --- /dev/null +++ b/tests/SIL.Machine.Tests/Corpora/UsfmVersificationAnalyzerTests.cs @@ -0,0 +1,560 @@ +using System.Text; +using System.Text.Json; +using NUnit.Framework; +using SIL.Scripture; + +namespace SIL.Machine.Corpora; + +[TestFixture] +public class UsfmVersificationAnalyzerTests +{ + [Test] + public void GetUsfmVersificationErrors_NoErrors() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "653JNTest.SFM", + @"\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \v 13 + \v 14 + \v 15 + " + }, + } + ); + UsfmVersificationAnalysis analysis = env.AnalyzeUsfmVersification(["3JN"]); + Assert.That(analysis.TotalNumEncounteredVerses, Is.EqualTo(15)); + Assert.That(analysis.Diagnostics, Has.Count.EqualTo(0), JsonSerializer.Serialize(analysis.Diagnostics)); + } + + [Test] + public void GetUsfmVersificationErrors_MissingVerse() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "653JNTest.SFM", + @"\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \v 13 + \v 14 + " + }, + } + ); + UsfmVersificationAnalysis analysis = env.AnalyzeUsfmVersification(["3JN"]); + Assert.That(analysis.Diagnostics, Has.Count.EqualTo(1), JsonSerializer.Serialize(analysis.Diagnostics)); + Assert.That(analysis.TotalNumEncounteredVerses, Is.EqualTo(14)); + Assert.That(analysis.Diagnostics[0].Type, Is.EqualTo(UsfmVersificationDiagnosticType.Missing)); + Assert.That(analysis.Diagnostics[0].NumAffectedVerses, Is.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].Filename, Is.EqualTo("653JNTest.SFM")); + Assert.That(analysis.Diagnostics[0].LineNumbers.SequenceEqual([16])); + Assert.That(analysis.Diagnostics[0].References, Has.Count.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].References[0].ToString(), Is.EqualTo("3JN 1:15")); + } + + [Test] + public void GetUsfmVersificationErrors_MissingChapter() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "653JNTest.SFM", + @"\id 3JN + " + }, + } + ); + UsfmVersificationAnalysis analysis = env.AnalyzeUsfmVersification(["3JN"]); + Assert.That(analysis.Diagnostics, Has.Count.EqualTo(1), JsonSerializer.Serialize(analysis.Diagnostics)); + Assert.That(analysis.TotalNumEncounteredVerses, Is.EqualTo(0)); + Assert.That(analysis.Diagnostics[0].Type, Is.EqualTo(UsfmVersificationDiagnosticType.Missing)); + Assert.That(analysis.Diagnostics[0].NumAffectedVerses, Is.EqualTo(15)); + Assert.That(analysis.Diagnostics[0].LineNumbers.SequenceEqual([1])); + Assert.That(analysis.Diagnostics[0].References, Has.Count.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].References[0].ToString(), Is.EqualTo("3JN 1:1-15")); + } + + [Test] + public void GetUsfmVersificationErrors_ExtraVerse() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "653JNTest.SFM", + @"\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \v 13 + \v 14 + \v 15 + \v 16 + " + }, + } + ); + UsfmVersificationAnalysis analysis = env.AnalyzeUsfmVersification(["3JN"]); + Assert.That(analysis.Diagnostics, Has.Count.EqualTo(1), JsonSerializer.Serialize(analysis.Diagnostics)); + Assert.That(analysis.TotalNumEncounteredVerses, Is.EqualTo(16)); + Assert.That(analysis.Diagnostics[0].Type, Is.EqualTo(UsfmVersificationDiagnosticType.Extra)); + Assert.That(analysis.Diagnostics[0].NumAffectedVerses, Is.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].LineNumbers.SequenceEqual([18])); + Assert.That(analysis.Diagnostics[0].References, Has.Count.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].References[0].ToString(), Is.EqualTo("3JN 1:16")); + } + + [Test] + public void GetUsfmVersificationErrors_InvalidVerse() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "653JNTest.SFM", + @"\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 13-12 + \v 14 + \v 15 + " + }, + } + ); + UsfmVersificationAnalysis analysis = env.AnalyzeUsfmVersification(["3JN"]); + Assert.That(analysis.Diagnostics, Has.Count.EqualTo(1), JsonSerializer.Serialize(analysis.Diagnostics)); + Assert.That(analysis.TotalNumEncounteredVerses, Is.EqualTo(15)); + Assert.That(analysis.Diagnostics[0].Type, Is.EqualTo(UsfmVersificationDiagnosticType.Invalid)); + Assert.That(analysis.Diagnostics[0].NumAffectedVerses, Is.EqualTo(2)); + Assert.That(analysis.Diagnostics[0].LineNumbers.SequenceEqual([14])); + Assert.That(analysis.Diagnostics[0].References, Has.Count.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].References[0].ToString(), Is.EqualTo("3JN 1:13-12")); + } + + [Test] + public void GetUsfmVersificationErrors_ExtraVerseSegment() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "653JNTest.SFM", + @"\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \v 13 + \v 14a + \v 14b + \v 15 + " + }, + } + ); + UsfmVersificationAnalysis analysis = env.AnalyzeUsfmVersification(["3JN"]); + Assert.That(analysis.Diagnostics, Has.Count.EqualTo(2), JsonSerializer.Serialize(analysis.Diagnostics)); + Assert.That(analysis.TotalNumEncounteredVerses, Is.EqualTo(15)); + Assert.That(analysis.Diagnostics[0].Type, Is.EqualTo(UsfmVersificationDiagnosticType.IncorrectVerseSegment)); + Assert.That(analysis.Diagnostics[0].NumAffectedVerses, Is.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].LineNumbers.SequenceEqual([16])); + Assert.That(analysis.Diagnostics[0].References, Has.Count.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].References[0].ToString(), Is.EqualTo("3JN 1:14a")); + } + + [Test] + public void GetUsfmVersificationErrors_MissingVerseSegment() + { + var env = new TestEnvironment( + settings: new DefaultParatextProjectSettings(versification: GetCustomVersification(@"*3JN 1:13,a,b")), + files: new Dictionary() + { + { + "653JNTest.SFM", + @"\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \v 13 + \v 14 + \v 15 + " + }, + } + ); + UsfmVersificationAnalysis analysis = env.AnalyzeUsfmVersification(["3JN"]); + Assert.That(analysis.Diagnostics, Has.Count.EqualTo(1), JsonSerializer.Serialize(analysis.Diagnostics)); + Assert.That(analysis.TotalNumEncounteredVerses, Is.EqualTo(15)); + Assert.That(analysis.Diagnostics[0].Type, Is.EqualTo(UsfmVersificationDiagnosticType.IncorrectVerseSegment)); + Assert.That(analysis.Diagnostics[0].NumAffectedVerses, Is.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].LineNumbers.SequenceEqual([15])); + Assert.That(analysis.Diagnostics[0].References, Has.Count.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].References[0].ToString(), Is.EqualTo("3JN 1:13")); + } + + [Test] + public void GetUsfmVersificationErrors_IgnoreNonCanonicals() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "98XXETest.SFM", + @"\id XXE + \c 1 + \v 3-2 + " + }, + } + ); + UsfmVersificationAnalysis analysis = env.AnalyzeUsfmVersification(["XXE"]); + Assert.That(analysis.Diagnostics, Has.Count.EqualTo(0), JsonSerializer.Serialize(analysis.Diagnostics)); + } + + [Test] + public void GetUsfmVersificationErrors_ExtraVerse_ExcludedInCustomVrs() + { + var env = new TestEnvironment( + settings: new DefaultParatextProjectSettings(versification: GetCustomVersification(@"-3JN 1:13")), + files: new Dictionary() + { + { + "653JNTest.SFM", + @"\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \v 13 + \v 14 + \v 15 + " + }, + } + ); + UsfmVersificationAnalysis analysis = env.AnalyzeUsfmVersification(["3JN"]); + Assert.That(analysis.Diagnostics, Has.Count.EqualTo(1), JsonSerializer.Serialize(analysis.Diagnostics)); + Assert.That(analysis.TotalNumEncounteredVerses, Is.EqualTo(15)); + Assert.That(analysis.Diagnostics[0].Type, Is.EqualTo(UsfmVersificationDiagnosticType.Extra)); + Assert.That(analysis.Diagnostics[0].NumAffectedVerses, Is.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].LineNumbers.SequenceEqual([15])); + Assert.That(analysis.Diagnostics[0].References, Has.Count.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].References[0].ToString(), Is.EqualTo("3JN 1:13")); + } + + [Test] + public void GetUsfmVersificationErrors_MultipleBooks() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "642JNTest.SFM", + @"\id 2JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + " + }, + { + "653JNTest.SFM", + @"\id 3JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \v 13 + \v 14 + \v 15 + " + }, + } + ); + UsfmVersificationAnalysis analysis = env.AnalyzeUsfmVersification(["2JN", "3JN"]); + Assert.That(analysis.Diagnostics, Has.Count.EqualTo(1), JsonSerializer.Serialize(analysis.Diagnostics)); + Assert.That(analysis.TotalNumEncounteredVerses, Is.EqualTo(27)); + Assert.That(analysis.Diagnostics[0].Type, Is.EqualTo(UsfmVersificationDiagnosticType.Missing)); + Assert.That(analysis.Diagnostics[0].NumAffectedVerses, Is.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].LineNumbers.SequenceEqual([14])); + Assert.That(analysis.Diagnostics[0].References, Has.Count.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].References[0].ToString(), Is.EqualTo("2JN 1:13")); + } + + [Test] + public void GetUsfmVersificationErrors_MultipleChapters() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "642JNTest.SFM", + @"\id 2JN + \c 1 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6 + \v 7 + \v 8 + \v 9 + \v 10 + \v 11 + \v 12 + \c 2 + \v 1 + " + }, + } + ); + UsfmVersificationAnalysis analysis = env.AnalyzeUsfmVersification(["2JN"]); + Assert.That(analysis.Diagnostics, Has.Count.EqualTo(2), JsonSerializer.Serialize(analysis.Diagnostics)); + Assert.That(analysis.TotalNumEncounteredVerses, Is.EqualTo(13)); + + Assert.That(analysis.Diagnostics[0].Type, Is.EqualTo(UsfmVersificationDiagnosticType.Missing)); + Assert.That(analysis.Diagnostics[0].NumAffectedVerses, Is.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].LineNumbers.SequenceEqual([14])); + Assert.That(analysis.Diagnostics[0].References, Has.Count.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].References[0].ToString(), Is.EqualTo("2JN 1:13")); + + Assert.That(analysis.Diagnostics[1].Type, Is.EqualTo(UsfmVersificationDiagnosticType.Extra)); + Assert.That(analysis.Diagnostics[1].NumAffectedVerses, Is.EqualTo(1)); + Assert.That(analysis.Diagnostics[1].LineNumbers.SequenceEqual([16])); + Assert.That(analysis.Diagnostics[1].References, Has.Count.EqualTo(1)); + Assert.That(analysis.Diagnostics[1].References[0].ToString(), Is.EqualTo("2JN 2:1")); + } + + [Test] + public void GetUsfmVersificationErrors_InvalidChapterNumber() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "653JNTest.SFM", + @"\id 3JN + \c 1. + " + }, + } + ); + UsfmVersificationAnalysis analysis = env.AnalyzeUsfmVersification(["3JN"]); + Assert.That(analysis.Diagnostics, Has.Count.EqualTo(2), JsonSerializer.Serialize(analysis.Diagnostics)); + Assert.That(analysis.TotalNumEncounteredVerses, Is.EqualTo(0)); + + Assert.That(analysis.Diagnostics[0].Type, Is.EqualTo(UsfmVersificationDiagnosticType.Invalid)); + Assert.That(analysis.Diagnostics[0].NumAffectedVerses, Is.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].LineNumbers.SequenceEqual([2])); + Assert.That(analysis.Diagnostics[0].References, Has.Count.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].References[0].ToString(), Is.EqualTo("3JN :0")); + + Assert.That(analysis.Diagnostics[1].Type, Is.EqualTo(UsfmVersificationDiagnosticType.Missing)); + Assert.That(analysis.Diagnostics[1].NumAffectedVerses, Is.EqualTo(15)); + Assert.That(analysis.Diagnostics[1].LineNumbers.SequenceEqual([1])); + Assert.That(analysis.Diagnostics[1].References, Has.Count.EqualTo(1)); + Assert.That(analysis.Diagnostics[1].References[0].ToString(), Is.EqualTo("3JN 1:1-15")); + } + + [Test] + public void GetUsfmVersificationErrors_InvalidVerseNumber() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "653JNTest.SFM", + @"\id 3JN + \c 1 + \v v1 + " + }, + } + ); + UsfmVersificationAnalysis analysis = env.AnalyzeUsfmVersification(["3JN"]); + Assert.That(analysis.Diagnostics, Has.Count.EqualTo(2), JsonSerializer.Serialize(analysis.Diagnostics)); + Assert.That(analysis.TotalNumEncounteredVerses, Is.EqualTo(1)); + + Assert.That(analysis.Diagnostics[0].Type, Is.EqualTo(UsfmVersificationDiagnosticType.Invalid)); + Assert.That(analysis.Diagnostics[0].NumAffectedVerses, Is.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].LineNumbers.SequenceEqual([3])); + Assert.That(analysis.Diagnostics[0].References, Has.Count.EqualTo(1)); + Assert.That(analysis.Diagnostics[0].References[0].ToString(), Is.EqualTo("3JN 1:v1")); + + Assert.That(analysis.Diagnostics[1].Type, Is.EqualTo(UsfmVersificationDiagnosticType.Missing)); + Assert.That(analysis.Diagnostics[1].NumAffectedVerses, Is.EqualTo(15)); + Assert.That(analysis.Diagnostics[1].LineNumbers.SequenceEqual([3])); + Assert.That(analysis.Diagnostics[1].References, Has.Count.EqualTo(1)); + Assert.That(analysis.Diagnostics[1].References[0].ToString(), Is.EqualTo("3JN 1:1-15")); + } + + [Test] + public void GetUsfmVersificationErrors_UnsupportedCrossChapterVerseReference() + { + var env = new TestEnvironment( + files: new Dictionary() + { + { + "03LEVTest.SFM", + @"\id LEV + \c 6 + \v 1 + \v 2 + \v 3 + \v 4 + \v 5 + \v 6-9 + \v 10-30 + " + }, + } + ); // LEV 6:6-9 maps to 5:25-6:2 in the Original versification + UsfmVersificationAnalysis analysis = env.AnalyzeUsfmVersification(["LEV"]); + Assert.That(analysis.Diagnostics, Has.Count.EqualTo(3), JsonSerializer.Serialize(analysis.Diagnostics)); + Assert.That(analysis.TotalNumEncounteredVerses, Is.EqualTo(30)); + Assert.That(analysis.TotalNumAffectedVerses, Is.EqualTo(833)); + + Assert.That(analysis.Diagnostics[0].Type, Is.EqualTo(UsfmVersificationDiagnosticType.Missing)); + Assert.That(analysis.Diagnostics[0].NumAffectedVerses, Is.EqualTo(104)); + Assert.That(analysis.Diagnostics[0].LineNumbers.SequenceEqual([1])); + Assert.That(analysis.Diagnostics[0].References, Has.Count.EqualTo(5)); + Assert.That(analysis.Diagnostics[0].References[0].ToString(), Is.EqualTo("LEV 1:1-17")); + + Assert.That(analysis.Diagnostics[1].Type, Is.EqualTo(UsfmVersificationDiagnosticType.UnsupportedVerseRange)); + Assert.That(analysis.Diagnostics[1].NumAffectedVerses, Is.EqualTo(4)); + Assert.That(analysis.Diagnostics[1].LineNumbers.SequenceEqual([8])); + Assert.That(analysis.Diagnostics[1].References, Has.Count.EqualTo(1)); + Assert.That(analysis.Diagnostics[1].References[0].ToString(), Is.EqualTo("LEV 6:6-9")); + + Assert.That(analysis.Diagnostics[2].Type, Is.EqualTo(UsfmVersificationDiagnosticType.Missing)); + Assert.That(analysis.Diagnostics[2].NumAffectedVerses, Is.EqualTo(725)); + Assert.That(analysis.Diagnostics[2].LineNumbers.SequenceEqual([9])); + Assert.That(analysis.Diagnostics[2].References, Has.Count.EqualTo(21)); + Assert.That(analysis.Diagnostics[2].References[0].ToString(), Is.EqualTo("LEV 7:1-38")); + } + + private class TestEnvironment(ParatextProjectSettings? settings = null, Dictionary? files = null) + { + public UsfmVersificationAnalyzerBase Analyzer { get; } = new MemoryUsfmVersificationAnalyzer(files, settings); + + public UsfmVersificationAnalysis AnalyzeUsfmVersification(HashSet? onlyBooks = null) + { + return Analyzer.AnalyzeUsfmVersification(onlyBooks); + } + } + + private static ScrVers GetCustomVersification(string customVrsContents, ScrVers? baseVersification = null) + { + baseVersification ??= ScrVers.English; + ScrVers customVersification = baseVersification; + using (var reader = new StreamReader(new MemoryStream(Encoding.UTF8.GetBytes(customVrsContents)))) + { + customVersification = Versification.Table.Implementation.Load( + reader, + "custom.vrs", + baseVersification, + baseVersification.ToString() + "-" + customVrsContents.GetHashCode() + ); + } + Versification.Table.Implementation.RemoveAllUnknownVersifications(); + return customVersification; + } +}