diff --git a/docs/critical_import_tool.md b/docs/critical_import_tool.md index 3971e3e..eb8b74d 100644 --- a/docs/critical_import_tool.md +++ b/docs/critical_import_tool.md @@ -238,9 +238,10 @@ The currently enabled phase-3 table set is: Current phase-3 notes: - header detection now tolerates minor `top` misalignment across the `A-E` header glyphs +- first-row body parsing can now begin slightly above the first roll-band label when the PDF places prose between the header row and the label, which prevents clipped `01-05` cells such as `Mana.pdf` - row boundaries can snap to the last affix-to-prose transition between adjacent roll labels when midpoint slicing would leak into the next row - affix symbols are learned from the footer legend before body parsing, so symbol-only affix fragments are classified correctly -- affix fragments that cross a column boundary in the XML can be split on hard internal spacing before column assignment, which is required for `Mana.pdf` +- cross-column text fragments can now be split at geometry-aligned whitespace boundaries before column assignment, while affix fragments still split on hard internal spacing - footer page numbers are filtered out before body parsing - validation allows a single contiguous affix block either before or after prose diff --git a/src/RolemasterDb.App/rolemaster.db b/src/RolemasterDb.App/rolemaster.db index 598b2ed..d34c0bf 100644 Binary files a/src/RolemasterDb.App/rolemaster.db and b/src/RolemasterDb.App/rolemaster.db differ diff --git a/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs b/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs index 3c1b018..8cdc803 100644 --- a/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs +++ b/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs @@ -201,6 +201,107 @@ public sealed class StandardCriticalTableParserIntegrationTests Assert.DoesNotContain('\uF06C', row100C.DescriptionText); } + [Fact] + public async Task Arcane_aether_first_row_keeps_c_and_d_text_in_separate_columns() + { + var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "arcane-aether", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + var row01C = parseResult.Table.Results.Single(item => + item.GroupKey is null && + string.Equals(item.RollBandLabel, "01-05", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "C", StringComparison.Ordinal)); + var row01D = parseResult.Table.Results.Single(item => + item.GroupKey is null && + string.Equals(item.RollBandLabel, "01-05", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "D", StringComparison.Ordinal)); + + Assert.Equal("Ooooh. That's the way to frighten him", row01C.DescriptionText); + Assert.Equal("That looked like it hurt. It didn't.", row01D.DescriptionText); + } + + [Fact] + public async Task Arcane_aether_31_40_keeps_a_and_b_text_in_separate_columns() + { + var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "arcane-aether", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + var row31A = parseResult.Table.Results.Single(item => + item.GroupKey is null && + string.Equals(item.RollBandLabel, "31-40", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "A", StringComparison.Ordinal)); + var row31B = parseResult.Table.Results.Single(item => + item.GroupKey is null && + string.Equals(item.RollBandLabel, "31-40", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "B", StringComparison.Ordinal)); + + Assert.Equal("Burns cause foe to bring up his guard.", row31A.DescriptionText); + Assert.Equal("Confused foe brings up his guard. He loses initiative for two rounds.", row31B.DescriptionText); + } + + [Fact] + public async Task Arcane_aether_41_50_keeps_d_and_e_text_in_separate_columns() + { + var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "arcane-aether", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + var row41D = parseResult.Table.Results.Single(item => + item.GroupKey is null && + string.Equals(item.RollBandLabel, "41-50", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "D", StringComparison.Ordinal)); + var row41E = parseResult.Table.Results.Single(item => + item.GroupKey is null && + string.Equals(item.RollBandLabel, "41-50", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "E", StringComparison.Ordinal)); + + Assert.Equal("Foe is spun by a strike to his shoulder.", row41D.DescriptionText); + Assert.Equal("Powerful blast knocks foe back three steps and cause him to drop all objects.", row41E.DescriptionText); + } + + [Fact] + public async Task Mana_first_row_keeps_all_five_columns_populated() + { + var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + + Assert.Equal("Lots of fireworks, but little effect.", FindResult(parseResult, "01-05", "A").DescriptionText); + Assert.Equal("Somewhere a bell tolls.", FindResult(parseResult, "01-05", "B").DescriptionText); + Assert.Equal("Weak blast.", FindResult(parseResult, "01-05", "C").DescriptionText); + Assert.Equal("Foe dances around your blast.", FindResult(parseResult, "01-05", "D").DescriptionText); + Assert.Equal("Foe does damage trying to dodge.", FindResult(parseResult, "01-05", "E").DescriptionText); + } + + [Fact] + public async Task Arcane_nether_first_row_keeps_b_column_populated() + { + var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "arcane-nether", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + + Assert.Equal("Glancing blow.", FindResult(parseResult, "01-05", "B").DescriptionText); + } + + [Fact] + public async Task Krush_36_45_keeps_a_b_and_c_content_in_separate_columns() + { + var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "krush", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + + Assert.Equal("Bust foe's shin. You have initiative.", FindResult(parseResult, "36-45", "A").DescriptionText); + Assert.Equal("Blow to foe's left calf. You gain initiative.", FindResult(parseResult, "36-45", "B").DescriptionText); + Assert.Equal("Catch foe in lower leg. You gain initiative, while foe regains footing.", FindResult(parseResult, "36-45", "C").DescriptionText); + } + + [Fact] + public async Task Super_large_creature_weapon_99_100_holy_arms_does_not_capture_previous_row_text() + { + var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "super_large_creature_weapon", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + var row99HolyArms = parseResult.Table.Results.Single(item => + item.GroupKey is null && + string.Equals(item.RollBandLabel, "99-100", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "HOLY_ARMS", StringComparison.Ordinal)); + + Assert.StartsWith("Strike through foe's heart kills him instantly.", row99HolyArms.DescriptionText, StringComparison.Ordinal); + Assert.DoesNotContain("all allies get (+10)", row99HolyArms.DescriptionText, StringComparison.OrdinalIgnoreCase); + } + [Fact] public async Task Mana_affix_boundaries_keep_71_75_a_and_b_separate() { @@ -551,6 +652,16 @@ public sealed class StandardCriticalTableParserIntegrationTests private static CriticalImportManifest LoadManifest() => new CriticalImportManifestLoader().Load(Path.Combine(GetRepositoryRoot(), "sources", "critical-import-manifest.json")); + private static ParsedCriticalResult FindResult( + CriticalTableParseResult parseResult, + string rollBandLabel, + string columnKey, + string? groupKey = null) => + parseResult.Table.Results.Single(item => + string.Equals(item.GroupKey, groupKey, StringComparison.Ordinal) && + string.Equals(item.RollBandLabel, rollBandLabel, StringComparison.Ordinal) && + string.Equals(item.ColumnKey, columnKey, StringComparison.Ordinal)); + private static string GetArtifactCacheRoot() { var cacheRoot = Path.Combine(Path.GetTempPath(), "RolemasterDb.ImportTool.Tests"); diff --git a/src/RolemasterDb.ImportTool/Parsing/CriticalTableParserSupport.cs b/src/RolemasterDb.ImportTool/Parsing/CriticalTableParserSupport.cs index acc92d3..f50ad60 100644 --- a/src/RolemasterDb.ImportTool/Parsing/CriticalTableParserSupport.cs +++ b/src/RolemasterDb.ImportTool/Parsing/CriticalTableParserSupport.cs @@ -9,15 +9,17 @@ namespace RolemasterDb.ImportTool.Parsing; internal static class CriticalTableParserSupport { internal const int HeaderToBodyMinimumGap = 20; + internal const int HeaderToRowLabelMinimumGap = 10; internal const int FooterLabelExclusionGap = 15; internal const int FooterPageNumberExclusionGap = 80; internal const int RowLabelDuplicateTolerance = 15; internal const int TopGroupingTolerance = 2; + internal const int BoundarySplitSearchRadiusChars = 12; private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled); - private static readonly Regex SentenceFragmentSplitRegex = new(@"\S.*?(?:[.!?](?:['"")\]]*)|$)", RegexOptions.Compiled); private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled); private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|–)\d+\)$", RegexOptions.Compiled); + private static readonly Regex BoundaryBonusLineRegex = new(@"^(?:all allies|all foe's allies|all foes|all opponents)\b", RegexOptions.IgnoreCase | RegexOptions.Compiled); internal static List LoadFragments(string xmlContent) { @@ -405,7 +407,7 @@ internal static class CriticalTableParserSupport .ToList(); var isAffixLike = columnTexts.Count > 0 && - columnTexts.All(text => IsAffixLikeLine(text, affixLegendSymbols)); + columnTexts.All(text => IsAffixLikeLine(text, affixLegendSymbols) || IsBoundaryBonusLine(text)); bodyLines.Add((lineFragments[0].Top, isAffixLike)); } @@ -448,6 +450,20 @@ internal static class CriticalTableParserSupport .Select((item, index) => new RowAnchor(NormalizeRollBandLabel(item.Text), item.Top, index + 1)) .ToList(); + internal static int ResolveBodyStartTop(int headerTop, IReadOnlyList rowAnchors) + { + if (rowAnchors.Count == 0) + { + return headerTop + HeaderToBodyMinimumGap; + } + + return Math.Min( + headerTop + HeaderToBodyMinimumGap, + Math.Max( + headerTop + HeaderToRowLabelMinimumGap, + rowAnchors[0].Top - HeaderToRowLabelMinimumGap - TopGroupingTolerance)); + } + internal static List BuildBodyFragments( IReadOnlyList fragments, int bodyStartTop, @@ -618,12 +634,30 @@ internal static class CriticalTableParserSupport IReadOnlyList<(string Key, double CenterX)> columnCenters, IReadOnlySet affixLegendSymbols) { - if (!TryGetBoundaryCrossingPattern(fragment, columnCenters, affixLegendSymbols, out var splitPattern)) + if (!CrossesColumnBoundary(fragment, columnCenters)) { return [fragment]; } - var matches = splitPattern.Matches(fragment.Text); + if (IsAffixLikeLine(fragment.Text, affixLegendSymbols) && + fragment.Text.Contains(" ", StringComparison.Ordinal)) + { + return BuildSplitFragmentsFromMatches(fragment, MultiFragmentSplitRegex.Matches(fragment.Text), columnCenters); + } + + if (TrySplitProseFragmentAtBoundaries(fragment, columnCenters, out var splitFragments)) + { + return splitFragments; + } + + return [fragment]; + } + + private static IReadOnlyList BuildSplitFragmentsFromMatches( + XmlTextFragment fragment, + MatchCollection matches, + IReadOnlyList<(string Key, double CenterX)> columnCenters) + { if (matches.Count < 2) { return [fragment]; @@ -668,34 +702,158 @@ internal static class CriticalTableParserSupport : [fragment]; } - private static bool TryGetBoundaryCrossingPattern( + private static bool TrySplitProseFragmentAtBoundaries( XmlTextFragment fragment, IReadOnlyList<(string Key, double CenterX)> columnCenters, - IReadOnlySet affixLegendSymbols, - out Regex splitPattern) + out IReadOnlyList splitFragments) { - splitPattern = null!; + splitFragments = null!; - if (!CrossesColumnBoundary(fragment, columnCenters)) + var boundaryIndexes = FindBoundarySplitIndexes(fragment, columnCenters); + if (boundaryIndexes.Count == 0) { return false; } - if (IsAffixLikeLine(fragment.Text, affixLegendSymbols) && - fragment.Text.Contains(" ", StringComparison.Ordinal)) + var segments = new List(); + var segmentStart = 0; + var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1); + + foreach (var splitIndex in boundaryIndexes) { - splitPattern = MultiFragmentSplitRegex; - return true; + var segment = CreateFragmentSegment(fragment, segmentStart, splitIndex - segmentStart, characterWidth); + if (segment is not null) + { + segments.Add(segment); + } + + segmentStart = splitIndex; } - if (!IsAffixLikeLine(fragment.Text, affixLegendSymbols) && - CountSentenceLikeSegments(fragment.Text) >= 2) + var trailingSegment = CreateFragmentSegment(fragment, segmentStart, fragment.Text.Length - segmentStart, characterWidth); + if (trailingSegment is not null) { - splitPattern = SentenceFragmentSplitRegex; - return true; + segments.Add(trailingSegment); } - return false; + if (segments.Count < 2) + { + return false; + } + + splitFragments = segments; + return true; + } + + private static List FindBoundarySplitIndexes( + XmlTextFragment fragment, + IReadOnlyList<(string Key, double CenterX)> columnCenters) + { + var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1); + var fragmentRight = fragment.Left + fragment.Width; + var splitIndexes = new List(); + var minimumIndex = 1; + + for (var index = 0; index < columnCenters.Count - 1; index++) + { + var boundary = (columnCenters[index].CenterX + columnCenters[index + 1].CenterX) / 2.0; + if (fragment.Left >= boundary || fragmentRight <= boundary) + { + continue; + } + + var targetIndex = (int)Math.Round((boundary - fragment.Left) / characterWidth); + var splitIndex = FindNearestWhitespaceSplitIndex(fragment.Text, targetIndex, minimumIndex); + if (splitIndex is null) + { + return []; + } + + splitIndexes.Add(splitIndex.Value); + minimumIndex = splitIndex.Value + 1; + } + + return splitIndexes; + } + + private static int? FindNearestWhitespaceSplitIndex(string text, int targetIndex, int minimumIndex) + { + var start = Math.Max(minimumIndex, targetIndex - BoundarySplitSearchRadiusChars); + var end = Math.Min(text.Length - 1, targetIndex + BoundarySplitSearchRadiusChars); + int? bestIndex = null; + var bestDistance = int.MaxValue; + + for (var index = start; index <= end; index++) + { + if (!char.IsWhiteSpace(text[index])) + { + continue; + } + + var candidate = index; + while (candidate < text.Length && char.IsWhiteSpace(text[candidate])) + { + candidate++; + } + + if (candidate <= minimumIndex || candidate >= text.Length) + { + continue; + } + + var distance = Math.Abs(candidate - targetIndex); + if (distance >= bestDistance) + { + continue; + } + + bestDistance = distance; + bestIndex = candidate; + } + + return bestIndex; + } + + private static XmlTextFragment? CreateFragmentSegment( + XmlTextFragment fragment, + int startIndex, + int length, + double characterWidth) + { + if (length <= 0) + { + return null; + } + + var rawSegment = fragment.Text.Substring(startIndex, length); + var trimmedStart = 0; + while (trimmedStart < rawSegment.Length && char.IsWhiteSpace(rawSegment[trimmedStart])) + { + trimmedStart++; + } + + var trimmedEnd = rawSegment.Length - 1; + while (trimmedEnd >= trimmedStart && char.IsWhiteSpace(rawSegment[trimmedEnd])) + { + trimmedEnd--; + } + + if (trimmedEnd < trimmedStart) + { + return null; + } + + var actualStart = startIndex + trimmedStart; + var actualLength = trimmedEnd - trimmedStart + 1; + var segmentText = CollapseWhitespace(fragment.Text.Substring(actualStart, actualLength)); + + return new XmlTextFragment( + fragment.PageNumber, + fragment.Top, + fragment.Left + (int)Math.Round(characterWidth * actualStart), + Math.Max(1, (int)Math.Round(characterWidth * actualLength)), + fragment.Height, + segmentText); } private static bool CrossesColumnBoundary( @@ -716,10 +874,8 @@ internal static class CriticalTableParserSupport return false; } - private static int CountSentenceLikeSegments(string text) => - SentenceFragmentSplitRegex.Matches(text) - .Select(match => CollapseWhitespace(match.Value)) - .Count(value => !string.IsNullOrWhiteSpace(value)); + private static bool IsBoundaryBonusLine(string text) => + BoundaryBonusLineRegex.IsMatch(text.Trim()); private static void AddLegendMatch( IDictionary symbolEffects, diff --git a/src/RolemasterDb.ImportTool/Parsing/GroupedVariantCriticalTableParser.cs b/src/RolemasterDb.ImportTool/Parsing/GroupedVariantCriticalTableParser.cs index fb759e2..e521d9e 100644 --- a/src/RolemasterDb.ImportTool/Parsing/GroupedVariantCriticalTableParser.cs +++ b/src/RolemasterDb.ImportTool/Parsing/GroupedVariantCriticalTableParser.cs @@ -32,10 +32,9 @@ public sealed class GroupedVariantCriticalTableParser }) .ToList(); - var bodyStartTop = Math.Max( - groupHeaders.Max(item => item.Top), - columnHeaders.Max(item => item.Top)) - + CriticalTableParserSupport.HeaderToBodyMinimumGap; + var headerTop = Math.Max( + groupHeaders.Max(item => item.Top), + columnHeaders.Max(item => item.Top)); var keyTop = CriticalTableParserSupport.FindKeyTop(fragments); var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop); var affixLegendSymbols = affixLegend.ClassificationSymbols; @@ -43,9 +42,10 @@ public sealed class GroupedVariantCriticalTableParser var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments( fragments, leftCutoff, - bodyStartTop, + headerTop + CriticalTableParserSupport.HeaderToRowLabelMinimumGap, keyTop); var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments); + var bodyStartTop = CriticalTableParserSupport.ResolveBodyStartTop(headerTop, rowAnchors); if (rowAnchors.Count == 0) { diff --git a/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs b/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs index df85985..50c975f 100644 --- a/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs +++ b/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs @@ -14,7 +14,7 @@ public sealed class StandardCriticalTableParser .Select(item => (Key: item.Text.ToUpperInvariant(), CenterX: item.CenterX)) .ToList(); - var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap; + var headerTop = headerFragments.Max(item => item.Top); var keyTop = CriticalTableParserSupport.FindKeyTop(fragments); var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop); var affixLegendSymbols = affixLegend.ClassificationSymbols; @@ -22,9 +22,10 @@ public sealed class StandardCriticalTableParser var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments( fragments, leftCutoff, - bodyStartTop, + headerTop + CriticalTableParserSupport.HeaderToRowLabelMinimumGap, keyTop); var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments); + var bodyStartTop = CriticalTableParserSupport.ResolveBodyStartTop(headerTop, rowAnchors); if (rowAnchors.Count == 0) { diff --git a/src/RolemasterDb.ImportTool/Parsing/VariantColumnCriticalTableParser.cs b/src/RolemasterDb.ImportTool/Parsing/VariantColumnCriticalTableParser.cs index 8ceec30..c2c53bf 100644 --- a/src/RolemasterDb.ImportTool/Parsing/VariantColumnCriticalTableParser.cs +++ b/src/RolemasterDb.ImportTool/Parsing/VariantColumnCriticalTableParser.cs @@ -27,7 +27,7 @@ public sealed class VariantColumnCriticalTableParser }) .ToList(); - var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap; + var headerTop = headerFragments.Max(item => item.Top); var keyTop = CriticalTableParserSupport.FindKeyTop(fragments); var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop); var affixLegendSymbols = affixLegend.ClassificationSymbols; @@ -35,9 +35,10 @@ public sealed class VariantColumnCriticalTableParser var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments( fragments, leftCutoff, - bodyStartTop, + headerTop + CriticalTableParserSupport.HeaderToRowLabelMinimumGap, keyTop); var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments); + var bodyStartTop = CriticalTableParserSupport.ResolveBodyStartTop(headerTop, rowAnchors); if (rowAnchors.Count == 0) {