Fix critical importer row and column boundary parsing

This commit is contained in:
2026-03-14 14:34:27 +01:00
parent eb7de020b1
commit 28587fc6df
7 changed files with 302 additions and 32 deletions

View File

@@ -9,15 +9,17 @@ namespace RolemasterDb.ImportTool.Parsing;
internal static class CriticalTableParserSupport
{
// Layout thresholds shared by the critical-table parsers.
// NOTE(review): presumably expressed in the same units as XmlTextFragment.Top/Left — confirm.

// Offset added to the header top to obtain the latest possible body start (see ResolveBodyStartTop).
internal const int HeaderToBodyMinimumGap = 20;
// Offset added to the header top to obtain the earliest possible body start; ResolveBodyStartTop
// also subtracts it from the first row anchor's top.
internal const int HeaderToRowLabelMinimumGap = 10;
internal const int FooterLabelExclusionGap = 15;
internal const int FooterPageNumberExclusionGap = 80;
internal const int RowLabelDuplicateTolerance = 15;
// Extra slack subtracted from the first row anchor's top in ResolveBodyStartTop.
internal const int TopGroupingTolerance = 2;
// Number of characters scanned on each side of the estimated boundary position when
// FindNearestWhitespaceSplitIndex looks for a whitespace split point.
internal const int BoundarySplitSearchRadiusChars = 12;
// Matches a chunk that starts and ends on a non-whitespace character and is followed by
// two or more whitespace characters or the end of the input.
private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled);
// Matches from a non-whitespace character up to the first '.', '!' or '?' (plus any trailing
// quote, ')' or ']' characters), or up to the end of the input when no terminator is found.
private static readonly Regex SentenceFragmentSplitRegex = new(@"\S.*?(?:[.!?](?:['"")\]]*)|$)", RegexOptions.Compiled);
// Matches text that begins with digits immediately followed by 'H', one of the symbols
// ∑ ∏ π ∫, or optional whitespace and a dash.
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[-])", RegexOptions.Compiled);
// Matches a whole line consisting of optional leading digits and a parenthesised,
// optionally signed integer, e.g. "(+10)", "(-5)" or "3(10)".
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|)\d+\)$", RegexOptions.Compiled);
// Matches text that starts (case-insensitively) with "all allies", "all foe's allies",
// "all foes" or "all opponents" as a complete word.
private static readonly Regex BoundaryBonusLineRegex = new(@"^(?:all allies|all foe's allies|all foes|all opponents)\b", RegexOptions.IgnoreCase | RegexOptions.Compiled);
internal static List<XmlTextFragment> LoadFragments(string xmlContent)
{
@@ -405,7 +407,7 @@ internal static class CriticalTableParserSupport
.ToList();
var isAffixLike = columnTexts.Count > 0 &&
columnTexts.All(text => IsAffixLikeLine(text, affixLegendSymbols));
columnTexts.All(text => IsAffixLikeLine(text, affixLegendSymbols) || IsBoundaryBonusLine(text));
bodyLines.Add((lineFragments[0].Top, isAffixLike));
}
@@ -448,6 +450,20 @@ internal static class CriticalTableParserSupport
.Select((item, index) => new RowAnchor(NormalizeRollBandLabel(item.Text), item.Top, index + 1))
.ToList();
/// <summary>
/// Picks the top coordinate at which the table body starts, based on the header position
/// and the first detected row anchor.
/// </summary>
/// <param name="headerTop">Top coordinate of the header.</param>
/// <param name="rowAnchors">Row anchors; only the first one is inspected.</param>
/// <returns>
/// <c>headerTop + HeaderToBodyMinimumGap</c> when there are no anchors. Otherwise the position
/// just above the first anchor (its top minus <c>HeaderToRowLabelMinimumGap</c> and
/// <c>TopGroupingTolerance</c>), kept between <c>headerTop + HeaderToRowLabelMinimumGap</c>
/// and <c>headerTop + HeaderToBodyMinimumGap</c>.
/// </returns>
internal static int ResolveBodyStartTop(int headerTop, IReadOnlyList<RowAnchor> rowAnchors)
{
    var latestStart = headerTop + HeaderToBodyMinimumGap;
    if (rowAnchors.Count == 0)
    {
        return latestStart;
    }

    var earliestStart = headerTop + HeaderToRowLabelMinimumGap;
    var aboveFirstRow = rowAnchors[0].Top - HeaderToRowLabelMinimumGap - TopGroupingTolerance;

    // Raise the candidate to the earliest allowed start, then cap it at the latest one.
    var candidate = aboveFirstRow > earliestStart ? aboveFirstRow : earliestStart;
    return candidate < latestStart ? candidate : latestStart;
}
internal static List<XmlTextFragment> BuildBodyFragments(
IReadOnlyList<XmlTextFragment> fragments,
int bodyStartTop,
@@ -618,12 +634,30 @@ internal static class CriticalTableParserSupport
IReadOnlyList<(string Key, double CenterX)> columnCenters,
IReadOnlySet<string> affixLegendSymbols)
{
if (!TryGetBoundaryCrossingPattern(fragment, columnCenters, affixLegendSymbols, out var splitPattern))
if (!CrossesColumnBoundary(fragment, columnCenters))
{
return [fragment];
}
var matches = splitPattern.Matches(fragment.Text);
if (IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
fragment.Text.Contains(" ", StringComparison.Ordinal))
{
return BuildSplitFragmentsFromMatches(fragment, MultiFragmentSplitRegex.Matches(fragment.Text), columnCenters);
}
if (TrySplitProseFragmentAtBoundaries(fragment, columnCenters, out var splitFragments))
{
return splitFragments;
}
return [fragment];
}
private static IReadOnlyList<XmlTextFragment> BuildSplitFragmentsFromMatches(
XmlTextFragment fragment,
MatchCollection matches,
IReadOnlyList<(string Key, double CenterX)> columnCenters)
{
if (matches.Count < 2)
{
return [fragment];
@@ -668,34 +702,158 @@ internal static class CriticalTableParserSupport
: [fragment];
}
private static bool TryGetBoundaryCrossingPattern(
/// <summary>
/// Tries to cut a fragment into several fragments at the split indexes returned by
/// FindBoundarySplitIndexes.
/// </summary>
/// <returns>
/// <c>true</c> with the resulting segments in <c>splitFragments</c> when at least two
/// non-empty segments were produced; otherwise <c>false</c>, with <c>splitFragments</c> left null.
/// </returns>
private static bool TrySplitProseFragmentAtBoundaries(
XmlTextFragment fragment,
IReadOnlyList<(string Key, double CenterX)> columnCenters,
IReadOnlySet<string> affixLegendSymbols,
out Regex splitPattern)
out IReadOnlyList<XmlTextFragment> splitFragments)
{
splitPattern = null!;
splitFragments = null!;
if (!CrossesColumnBoundary(fragment, columnCenters))
// No usable split position means the fragment is left untouched by the caller.
var boundaryIndexes = FindBoundarySplitIndexes(fragment, columnCenters);
if (boundaryIndexes.Count == 0)
{
return false;
}
if (IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
fragment.Text.Contains(" ", StringComparison.Ordinal))
var segments = new List<XmlTextFragment>();
var segmentStart = 0;
// Average width per character: the fragment width divided by its text length (at least 1).
var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
foreach (var splitIndex in boundaryIndexes)
{
splitPattern = MultiFragmentSplitRegex;
return true;
var segment = CreateFragmentSegment(fragment, segmentStart, splitIndex - segmentStart, characterWidth);
if (segment is not null)
{
segments.Add(segment);
}
segmentStart = splitIndex;
}
if (!IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
CountSentenceLikeSegments(fragment.Text) >= 2)
var trailingSegment = CreateFragmentSegment(fragment, segmentStart, fragment.Text.Length - segmentStart, characterWidth);
if (trailingSegment is not null)
{
splitPattern = SentenceFragmentSplitRegex;
return true;
segments.Add(trailingSegment);
}
return false;
// Segments that are empty or whitespace-only are dropped above, so fewer than two may remain.
if (segments.Count < 2)
{
return false;
}
splitFragments = segments;
return true;
}
/// <summary>
/// Computes the character indexes at which a fragment should be cut so that each piece
/// stays on one side of the midpoints between adjacent column centers.
/// </summary>
/// <param name="fragment">Fragment whose text and horizontal extent are examined.</param>
/// <param name="columnCenters">Column centers; adjacent pairs define the boundaries.</param>
/// <returns>
/// Strictly increasing split indexes, one per boundary the fragment spans. The list is empty
/// when no boundary is spanned or when any spanned boundary has no whitespace split point
/// nearby.
/// </returns>
private static List<int> FindBoundarySplitIndexes(
    XmlTextFragment fragment,
    IReadOnlyList<(string Key, double CenterX)> columnCenters)
{
    var result = new List<int>();
    var averageCharWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
    var rightEdge = fragment.Left + fragment.Width;
    var lowestAllowedIndex = 1;

    for (var right = 1; right < columnCenters.Count; right++)
    {
        var midpoint = (columnCenters[right - 1].CenterX + columnCenters[right].CenterX) / 2.0;

        // Skip boundaries that do not fall strictly inside the fragment's horizontal extent.
        var liesOutside = fragment.Left >= midpoint || rightEdge <= midpoint;
        if (liesOutside)
        {
            continue;
        }

        // Convert the horizontal boundary position into an estimated character offset.
        var estimatedIndex = (int)Math.Round((midpoint - fragment.Left) / averageCharWidth);
        if (FindNearestWhitespaceSplitIndex(fragment.Text, estimatedIndex, lowestAllowedIndex) is not int chosen)
        {
            // All-or-nothing: one unsplittable boundary discards every split found so far.
            return [];
        }

        result.Add(chosen);
        lowestAllowedIndex = chosen + 1;
    }

    return result;
}
/// <summary>
/// Looks for the split point closest to <paramref name="targetIndex"/>. A split point is the
/// index of the first non-whitespace character that follows a whitespace run.
/// </summary>
/// <param name="text">Text to scan.</param>
/// <param name="targetIndex">Preferred split position.</param>
/// <param name="minimumIndex">Lower bound of the scan; split points must be greater than it.</param>
/// <returns>
/// The nearest split index, preferring the earlier one on ties, or <c>null</c> when no
/// whitespace within <c>BoundarySplitSearchRadiusChars</c> of the target yields a valid one.
/// </returns>
private static int? FindNearestWhitespaceSplitIndex(string text, int targetIndex, int minimumIndex)
{
    var windowStart = Math.Max(minimumIndex, targetIndex - BoundarySplitSearchRadiusChars);
    var windowEnd = Math.Min(text.Length - 1, targetIndex + BoundarySplitSearchRadiusChars);

    int? nearest = null;
    var nearestDistance = int.MaxValue;

    for (var position = windowStart; position <= windowEnd; position++)
    {
        if (char.IsWhiteSpace(text[position]))
        {
            // Advance past the whole whitespace run; the split lands on the next visible character.
            var resumeAt = position + 1;
            while (resumeAt < text.Length && char.IsWhiteSpace(text[resumeAt]))
            {
                resumeAt++;
            }

            // Trailing whitespace (nothing after the run) and positions at or before the
            // minimum do not give a usable split.
            if (resumeAt > minimumIndex && resumeAt < text.Length)
            {
                var offset = Math.Abs(resumeAt - targetIndex);
                if (offset < nearestDistance)
                {
                    nearestDistance = offset;
                    nearest = resumeAt;
                }
            }
        }
    }

    return nearest;
}
/// <summary>
/// Builds a new fragment from a character range of <paramref name="fragment"/>, trimming
/// surrounding whitespace and deriving the position and width from the character offsets.
/// </summary>
/// <param name="fragment">Source fragment providing text, page, top, left and height.</param>
/// <param name="startIndex">Index of the first character of the range.</param>
/// <param name="length">Number of characters in the range.</param>
/// <param name="characterWidth">Width attributed to each character of the source text.</param>
/// <returns>
/// The trimmed segment with whitespace collapsed, or <c>null</c> when the range is empty or
/// contains only whitespace.
/// </returns>
private static XmlTextFragment? CreateFragmentSegment(
    XmlTextFragment fragment,
    int startIndex,
    int length,
    double characterWidth)
{
    if (length <= 0)
    {
        return null;
    }

    var slice = fragment.Text.Substring(startIndex, length);
    var withoutLeading = slice.TrimStart();
    if (withoutLeading.Length == 0)
    {
        return null;
    }

    // The number of leading whitespace characters removed shifts the segment's start offset.
    var contentStart = startIndex + (slice.Length - withoutLeading.Length);
    var content = withoutLeading.TrimEnd();
    var segmentText = CollapseWhitespace(content);

    var segmentLeft = fragment.Left + (int)Math.Round(characterWidth * contentStart);
    // Never report a zero-width segment.
    var segmentWidth = Math.Max(1, (int)Math.Round(characterWidth * content.Length));

    return new XmlTextFragment(
        fragment.PageNumber,
        fragment.Top,
        segmentLeft,
        segmentWidth,
        fragment.Height,
        segmentText);
}
private static bool CrossesColumnBoundary(
@@ -716,10 +874,8 @@ internal static class CriticalTableParserSupport
return false;
}
/// <summary>
/// Counts the matches of SentenceFragmentSplitRegex in <paramref name="text"/> that are not
/// blank after whitespace collapsing.
/// </summary>
private static int CountSentenceLikeSegments(string text) =>
SentenceFragmentSplitRegex.Matches(text)
.Select(match => CollapseWhitespace(match.Value))
.Count(value => !string.IsNullOrWhiteSpace(value));
/// <summary>
/// Tells whether <paramref name="text"/>, ignoring surrounding whitespace, matches
/// <c>BoundaryBonusLineRegex</c>.
/// </summary>
private static bool IsBoundaryBonusLine(string text)
{
    var candidate = text.Trim();
    return BoundaryBonusLineRegex.IsMatch(candidate);
}
private static void AddLegendMatch(
IDictionary<string, string> symbolEffects,

View File

@@ -32,10 +32,9 @@ public sealed class GroupedVariantCriticalTableParser
})
.ToList();
var bodyStartTop = Math.Max(
groupHeaders.Max(item => item.Top),
columnHeaders.Max(item => item.Top))
+ CriticalTableParserSupport.HeaderToBodyMinimumGap;
var headerTop = Math.Max(
groupHeaders.Max(item => item.Top),
columnHeaders.Max(item => item.Top));
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop);
var affixLegendSymbols = affixLegend.ClassificationSymbols;
@@ -43,9 +42,10 @@ public sealed class GroupedVariantCriticalTableParser
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
fragments,
leftCutoff,
bodyStartTop,
headerTop + CriticalTableParserSupport.HeaderToRowLabelMinimumGap,
keyTop);
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
var bodyStartTop = CriticalTableParserSupport.ResolveBodyStartTop(headerTop, rowAnchors);
if (rowAnchors.Count == 0)
{

View File

@@ -14,7 +14,7 @@ public sealed class StandardCriticalTableParser
.Select(item => (Key: item.Text.ToUpperInvariant(), CenterX: item.CenterX))
.ToList();
var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap;
var headerTop = headerFragments.Max(item => item.Top);
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop);
var affixLegendSymbols = affixLegend.ClassificationSymbols;
@@ -22,9 +22,10 @@ public sealed class StandardCriticalTableParser
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
fragments,
leftCutoff,
bodyStartTop,
headerTop + CriticalTableParserSupport.HeaderToRowLabelMinimumGap,
keyTop);
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
var bodyStartTop = CriticalTableParserSupport.ResolveBodyStartTop(headerTop, rowAnchors);
if (rowAnchors.Count == 0)
{

View File

@@ -27,7 +27,7 @@ public sealed class VariantColumnCriticalTableParser
})
.ToList();
var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap;
var headerTop = headerFragments.Max(item => item.Top);
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop);
var affixLegendSymbols = affixLegend.ClassificationSymbols;
@@ -35,9 +35,10 @@ public sealed class VariantColumnCriticalTableParser
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
fragments,
leftCutoff,
bodyStartTop,
headerTop + CriticalTableParserSupport.HeaderToRowLabelMinimumGap,
keyTop);
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
var bodyStartTop = CriticalTableParserSupport.ResolveBodyStartTop(headerTop, rowAnchors);
if (rowAnchors.Count == 0)
{