diff --git a/docs/critical_import_tool.md b/docs/critical_import_tool.md index cf0dbc8..5e481c9 100644 --- a/docs/critical_import_tool.md +++ b/docs/critical_import_tool.md @@ -240,6 +240,7 @@ Current phase-3 notes: - header detection now tolerates minor `top` misalignment across the `A-E` header glyphs - row boundaries can snap to the last affix-to-prose transition between adjacent roll labels when midpoint slicing would leak into the next row - affix symbols are learned from the footer legend before body parsing, so symbol-only affix fragments are classified correctly +- affix fragments that cross a column boundary in the XML can be split on hard internal spacing before column assignment, which is required for `Mana.pdf` - footer page numbers are filtered out before body parsing - validation allows a single contiguous affix block either before or after prose diff --git a/src/RolemasterDb.App/rolemaster.db b/src/RolemasterDb.App/rolemaster.db index 8992895..f9fa532 100644 Binary files a/src/RolemasterDb.App/rolemaster.db and b/src/RolemasterDb.App/rolemaster.db differ diff --git a/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs b/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs index bc957b5..3c44cfa 100644 --- a/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs +++ b/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs @@ -137,6 +137,70 @@ public sealed class StandardCriticalTableParserIntegrationTests Assert.DoesNotContain('\uF06C', row100C.DescriptionText); } + [Fact] + public async Task Mana_affix_boundaries_keep_71_75_a_and_b_separate() + { + var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + var row71A = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "A", StringComparison.Ordinal)); + var row71B = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "B", StringComparison.Ordinal)); + + Assert.DoesNotContain("+10H -", row71A.RawAffixText, StringComparison.Ordinal); + Assert.Contains("+10H -", row71B.RawAffixText, StringComparison.Ordinal); + } + + [Fact] + public async Task Mana_affix_boundaries_keep_71_75_d_and_e_separate() + { + var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + var row71D = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "D", StringComparison.Ordinal)); + var row71E = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "E", StringComparison.Ordinal)); + + Assert.DoesNotContain("+16H - 6", row71D.RawAffixText, StringComparison.Ordinal); + Assert.Contains("+16H - 6", row71E.RawAffixText, StringComparison.Ordinal); + } + + [Fact] + public async Task Mana_affix_boundaries_keep_91_95_b_and_c_separate() + { + var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + var row91B = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "91-95", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "B", StringComparison.Ordinal)); + var row91C = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "91-95", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "C", StringComparison.Ordinal)); + + Assert.DoesNotContain("+19H - 9", row91B.RawAffixText, StringComparison.Ordinal); + Assert.Contains("+19H - 9", row91C.RawAffixText, StringComparison.Ordinal); + } + + [Fact] + public async Task Mana_affix_boundaries_keep_86_90_b_and_c_separate() + { + var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + var row86B = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "86-90", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "B", StringComparison.Ordinal)); + var row86C = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "86-90", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "C", StringComparison.Ordinal)); + + Assert.DoesNotContain("+16H - 8", row86B.RawAffixText, StringComparison.Ordinal); + Assert.Contains("+16H - 8", row86C.RawAffixText, StringComparison.Ordinal); + } + private static async Task LoadParseResultAsync(CriticalImportManifestEntry entry) { var xmlPath = Path.Combine(GetArtifactCacheRoot(), $"{entry.Slug}.xml"); diff --git a/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs b/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs index ea572cb..15db26e 100644 --- a/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs +++ b/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs @@ -11,6 +11,7 @@ public sealed class StandardCriticalTableParser private const int FooterPageNumberExclusionGap = 80; private const int RowLabelDuplicateTolerance = 15; private const int TopGroupingTolerance = 2; + private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled); private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled); private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-)\d+\)$", RegexOptions.Compiled); @@ -55,6 +56,7 @@ public sealed class StandardCriticalTableParser !rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) && !headerFragments.Contains(item)) .ToList(); + bodyFragments = SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols); var bodyLines = BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols); var parsedRollBands = rowAnchors @@ -460,6 +462,101 @@ public sealed class StandardCriticalTableParser return symbols; } + private static List SplitBoundaryCrossingAffixFragments( + IReadOnlyList bodyFragments, + IReadOnlyList columnCenters, + ISet affixLegendSymbols) + { + var splitFragments = new List(bodyFragments.Count); + + foreach (var fragment in bodyFragments) + { + splitFragments.AddRange(SplitBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols)); + } + + return splitFragments; + } + + private static IReadOnlyList SplitBoundaryCrossingAffixFragment( + XmlTextFragment fragment, + IReadOnlyList columnCenters, + ISet affixLegendSymbols) + { + if (!LooksLikeBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols)) + { + return [fragment]; + } + + var matches = MultiFragmentSplitRegex.Matches(fragment.Text); + if (matches.Count < 2) + { + return [fragment]; + } + + var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1); + var splitFragments = new List(matches.Count); + + foreach (Match match in matches) + { + var segmentText = CollapseWhitespace(match.Value); + if (segmentText.Length == 0) + { + continue; + } + + var segmentLeft = fragment.Left + (int)Math.Round(characterWidth * match.Index); + var segmentWidth = Math.Max(1, (int)Math.Round(characterWidth * match.Length)); + + splitFragments.Add(new XmlTextFragment( + fragment.PageNumber, + fragment.Top, + segmentLeft, + segmentWidth, + fragment.Height, + segmentText)); + } + + if (splitFragments.Count < 2) + { + return [fragment]; + } + + var originalColumn = ResolveColumn(fragment.CenterX, columnCenters); + var distinctColumns = splitFragments + .Select(item => ResolveColumn(item.CenterX, columnCenters)) + .Distinct(StringComparer.OrdinalIgnoreCase) + .ToList(); + + return distinctColumns.Count > 1 || distinctColumns.Any(item => !string.Equals(item, originalColumn, StringComparison.OrdinalIgnoreCase)) + ? splitFragments + : [fragment]; + } + + private static bool LooksLikeBoundaryCrossingAffixFragment( + XmlTextFragment fragment, + IReadOnlyList columnCenters, + ISet affixLegendSymbols) + { + if (!IsAffixLikeLine(fragment.Text, affixLegendSymbols) || + !fragment.Text.Contains(" ", StringComparison.Ordinal)) + { + return false; + } + + var fragmentRight = fragment.Left + fragment.Width; + + for (var index = 0; index < columnCenters.Count - 1; index++) + { + var boundary = (columnCenters[index].CenterX + columnCenters[index + 1].CenterX) / 2.0; + if (fragment.Left < boundary && fragmentRight > boundary) + { + return true; + } + } + + return false; + } + private static void AddLegendMatch(HashSet symbols, string value, string pattern) { foreach (Match match in Regex.Matches(value, pattern, RegexOptions.IgnoreCase))