Fix critical importer row and column boundary parsing
This commit is contained in:
@@ -9,15 +9,17 @@ namespace RolemasterDb.ImportTool.Parsing;
|
||||
internal static class CriticalTableParserSupport
|
||||
{
|
||||
// Offset added to the header top to obtain the body start when no row anchors exist;
// also the upper limit of the value returned by ResolveBodyStartTop.
internal const int HeaderToBodyMinimumGap = 20;
|
||||
// Offset added to the header top to obtain the lower limit of the body start in
// ResolveBodyStartTop; also subtracted from the first row anchor's top there.
internal const int HeaderToRowLabelMinimumGap = 10;
|
||||
// NOTE(review): usage is not visible in this excerpt; by name, a vertical gap used to
// exclude footer labels - confirm against the consuming code.
internal const int FooterLabelExclusionGap = 15;
|
||||
// NOTE(review): usage is not visible in this excerpt; by name, a vertical gap used to
// exclude footer page numbers - confirm against the consuming code.
internal const int FooterPageNumberExclusionGap = 80;
|
||||
// NOTE(review): usage is not visible in this excerpt; by name, a tolerance for treating
// row labels as duplicates - confirm against the consuming code.
internal const int RowLabelDuplicateTolerance = 15;
|
||||
// Extra slack subtracted from the first row anchor's top in ResolveBodyStartTop.
// NOTE(review): other usages (if any) are not visible in this excerpt.
internal const int TopGroupingTolerance = 2;
|
||||
// Number of characters searched on each side of the estimated boundary position when
// FindNearestWhitespaceSplitIndex looks for a whitespace split point.
internal const int BoundarySplitSearchRadiusChars = 12;
|
||||
|
||||
// Matches a chunk that starts and ends on a non-whitespace character and is followed by
// two or more whitespace characters or the end of the input.
private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled);
|
||||
// Matches from a non-whitespace character up to the first '.', '!' or '?' (plus any
// trailing quote, ')' or ']' characters), or up to the end of the input.
private static readonly Regex SentenceFragmentSplitRegex = new(@"\S.*?(?:[.!?](?:['"")\]]*)|$)", RegexOptions.Compiled);
|
||||
// Matches text starting with digits that are followed by 'H', one of the symbols
// ∑ ∏ π ∫, or optional whitespace and an en dash or hyphen.
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled);
|
||||
// Matches a whole line consisting of optional digits followed by a parenthesised signed
// number, e.g. "(+10)" or "3(-5)"; the sign may be '+', '-' or an en dash.
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|–)\d+\)$", RegexOptions.Compiled);
|
||||
// Matches text beginning (case-insensitively) with one of the phrases "all allies",
// "all foe's allies", "all foes" or "all opponents", ending on a word boundary.
private static readonly Regex BoundaryBonusLineRegex = new(@"^(?:all allies|all foe's allies|all foes|all opponents)\b", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||
|
||||
internal static List<XmlTextFragment> LoadFragments(string xmlContent)
|
||||
{
|
||||
@@ -405,7 +407,7 @@ internal static class CriticalTableParserSupport
|
||||
.ToList();
|
||||
|
||||
var isAffixLike = columnTexts.Count > 0 &&
|
||||
columnTexts.All(text => IsAffixLikeLine(text, affixLegendSymbols));
|
||||
columnTexts.All(text => IsAffixLikeLine(text, affixLegendSymbols) || IsBoundaryBonusLine(text));
|
||||
|
||||
bodyLines.Add((lineFragments[0].Top, isAffixLike));
|
||||
}
|
||||
@@ -448,6 +450,20 @@ internal static class CriticalTableParserSupport
|
||||
.Select((item, index) => new RowAnchor(NormalizeRollBandLabel(item.Text), item.Top, index + 1))
|
||||
.ToList();
|
||||
|
||||
/// <summary>
/// Determines the top coordinate at which table body content is considered to begin.
/// </summary>
/// <param name="headerTop">Top coordinate of the header row.</param>
/// <param name="rowAnchors">Row anchors; only the first entry is consulted.</param>
/// <returns>
/// <c>headerTop + HeaderToBodyMinimumGap</c> when there are no anchors. Otherwise the first
/// anchor's top minus <c>HeaderToRowLabelMinimumGap + TopGroupingTolerance</c>, raised to at
/// least <c>headerTop + HeaderToRowLabelMinimumGap</c> and then capped at
/// <c>headerTop + HeaderToBodyMinimumGap</c>.
/// </returns>
internal static int ResolveBodyStartTop(int headerTop, IReadOnlyList<RowAnchor> rowAnchors)
{
    var latestStart = headerTop + HeaderToBodyMinimumGap;
    if (rowAnchors.Count == 0)
    {
        return latestStart;
    }

    var earliestStart = headerTop + HeaderToRowLabelMinimumGap;
    var anchorBasedStart = rowAnchors[0].Top - HeaderToRowLabelMinimumGap - TopGroupingTolerance;

    // Apply the lower limit first, then the upper limit (same order as Min(upper, Max(lower, value))).
    var raised = earliestStart >= anchorBasedStart ? earliestStart : anchorBasedStart;
    return latestStart <= raised ? latestStart : raised;
}
|
||||
|
||||
internal static List<XmlTextFragment> BuildBodyFragments(
|
||||
IReadOnlyList<XmlTextFragment> fragments,
|
||||
int bodyStartTop,
|
||||
@@ -618,12 +634,30 @@ internal static class CriticalTableParserSupport
|
||||
IReadOnlyList<(string Key, double CenterX)> columnCenters,
|
||||
IReadOnlySet<string> affixLegendSymbols)
|
||||
{
|
||||
if (!TryGetBoundaryCrossingPattern(fragment, columnCenters, affixLegendSymbols, out var splitPattern))
|
||||
if (!CrossesColumnBoundary(fragment, columnCenters))
|
||||
{
|
||||
return [fragment];
|
||||
}
|
||||
|
||||
var matches = splitPattern.Matches(fragment.Text);
|
||||
if (IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
|
||||
fragment.Text.Contains(" ", StringComparison.Ordinal))
|
||||
{
|
||||
return BuildSplitFragmentsFromMatches(fragment, MultiFragmentSplitRegex.Matches(fragment.Text), columnCenters);
|
||||
}
|
||||
|
||||
if (TrySplitProseFragmentAtBoundaries(fragment, columnCenters, out var splitFragments))
|
||||
{
|
||||
return splitFragments;
|
||||
}
|
||||
|
||||
return [fragment];
|
||||
}
|
||||
|
||||
private static IReadOnlyList<XmlTextFragment> BuildSplitFragmentsFromMatches(
|
||||
XmlTextFragment fragment,
|
||||
MatchCollection matches,
|
||||
IReadOnlyList<(string Key, double CenterX)> columnCenters)
|
||||
{
|
||||
if (matches.Count < 2)
|
||||
{
|
||||
return [fragment];
|
||||
@@ -668,34 +702,158 @@ internal static class CriticalTableParserSupport
|
||||
: [fragment];
|
||||
}
|
||||
|
||||
private static bool TryGetBoundaryCrossingPattern(
|
||||
private static bool TrySplitProseFragmentAtBoundaries(
|
||||
XmlTextFragment fragment,
|
||||
IReadOnlyList<(string Key, double CenterX)> columnCenters,
|
||||
IReadOnlySet<string> affixLegendSymbols,
|
||||
out Regex splitPattern)
|
||||
out IReadOnlyList<XmlTextFragment> splitFragments)
|
||||
{
|
||||
splitPattern = null!;
|
||||
splitFragments = null!;
|
||||
|
||||
if (!CrossesColumnBoundary(fragment, columnCenters))
|
||||
var boundaryIndexes = FindBoundarySplitIndexes(fragment, columnCenters);
|
||||
if (boundaryIndexes.Count == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
if (IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
|
||||
fragment.Text.Contains(" ", StringComparison.Ordinal))
|
||||
var segments = new List<XmlTextFragment>();
|
||||
var segmentStart = 0;
|
||||
var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
|
||||
|
||||
foreach (var splitIndex in boundaryIndexes)
|
||||
{
|
||||
splitPattern = MultiFragmentSplitRegex;
|
||||
return true;
|
||||
var segment = CreateFragmentSegment(fragment, segmentStart, splitIndex - segmentStart, characterWidth);
|
||||
if (segment is not null)
|
||||
{
|
||||
segments.Add(segment);
|
||||
}
|
||||
|
||||
segmentStart = splitIndex;
|
||||
}
|
||||
|
||||
if (!IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
|
||||
CountSentenceLikeSegments(fragment.Text) >= 2)
|
||||
var trailingSegment = CreateFragmentSegment(fragment, segmentStart, fragment.Text.Length - segmentStart, characterWidth);
|
||||
if (trailingSegment is not null)
|
||||
{
|
||||
splitPattern = SentenceFragmentSplitRegex;
|
||||
return true;
|
||||
segments.Add(trailingSegment);
|
||||
}
|
||||
|
||||
return false;
|
||||
if (segments.Count < 2)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
splitFragments = segments;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// <summary>
/// Computes the character indexes at which a fragment should be split, one for every
/// column boundary (midpoint between two adjacent column centers) that lies strictly
/// between the fragment's left and right edges.
/// </summary>
/// <param name="fragment">Fragment whose text and horizontal extent are examined.</param>
/// <param name="columnCenters">Column centers, consulted pairwise in list order.</param>
/// <returns>
/// Split indexes in the order the boundaries were visited, or an empty list as soon as any
/// crossed boundary has no usable whitespace split point nearby.
/// </returns>
private static List<int> FindBoundarySplitIndexes(
    XmlTextFragment fragment,
    IReadOnlyList<(string Key, double CenterX)> columnCenters)
{
    var text = fragment.Text;
    // Average horizontal space per character; the divisor is clamped so empty text cannot divide by zero.
    var widthPerChar = fragment.Width / (double)Math.Max(text.Length, 1);
    var rightEdge = fragment.Left + fragment.Width;

    var result = new List<int>();
    // Each subsequent split must fall after the previous one.
    var earliestAllowed = 1;

    for (var next = 1; next < columnCenters.Count; next++)
    {
        var midpoint = (columnCenters[next - 1].CenterX + columnCenters[next].CenterX) / 2.0;

        // Boundaries at or outside the fragment's edges are not crossed by it.
        if (fragment.Left >= midpoint || rightEdge <= midpoint)
        {
            continue;
        }

        // Translate the horizontal boundary position into an approximate character index.
        var estimatedIndex = (int)Math.Round((midpoint - fragment.Left) / widthPerChar);
        var located = FindNearestWhitespaceSplitIndex(text, estimatedIndex, earliestAllowed);
        if (located is not int found)
        {
            // One unsplittable boundary abandons the whole split.
            return [];
        }

        result.Add(found);
        earliestAllowed = found + 1;
    }

    return result;
}
|
||||
|
||||
/// <summary>
/// Looks for the split position closest to <paramref name="targetIndex"/>, where a split
/// position is the index of the first non-whitespace character following a whitespace run.
/// </summary>
/// <param name="text">Text to scan.</param>
/// <param name="targetIndex">Preferred character index for the split.</param>
/// <param name="minimumIndex">Scan start lower bound; accepted positions must be greater than this.</param>
/// <returns>
/// The accepted position nearest to the target (the earliest one on ties), or <c>null</c>
/// when no whitespace within <c>BoundarySplitSearchRadiusChars</c> of the target yields one.
/// </returns>
private static int? FindNearestWhitespaceSplitIndex(string text, int targetIndex, int minimumIndex)
{
    var firstProbe = Math.Max(minimumIndex, targetIndex - BoundarySplitSearchRadiusChars);
    var lastProbe = Math.Min(text.Length - 1, targetIndex + BoundarySplitSearchRadiusChars);

    int? nearest = null;
    var nearestDistance = int.MaxValue;

    var probe = firstProbe;
    while (probe <= lastProbe)
    {
        if (!char.IsWhiteSpace(text[probe]))
        {
            probe++;
            continue;
        }

        // Advance to the first non-whitespace character after this run.
        var wordStart = probe + 1;
        while (wordStart < text.Length && char.IsWhiteSpace(text[wordStart]))
        {
            wordStart++;
        }

        // Reject positions that do not advance past minimumIndex or that leave nothing after the split.
        if (wordStart > minimumIndex && wordStart < text.Length)
        {
            var distance = Math.Abs(wordStart - targetIndex);
            if (distance < nearestDistance)
            {
                nearestDistance = distance;
                nearest = wordStart;
            }
        }

        // Every character of this whitespace run maps to the same position, so resume after the run.
        probe = wordStart;
    }

    return nearest;
}
|
||||
|
||||
/// <summary>
/// Builds a new fragment from a slice of <paramref name="fragment"/>'s text, with surrounding
/// whitespace removed and its horizontal position and width estimated from the character width.
/// </summary>
/// <param name="fragment">Source fragment; page number, top and height are copied from it.</param>
/// <param name="startIndex">Index in the source text where the slice begins.</param>
/// <param name="length">Number of characters in the slice.</param>
/// <param name="characterWidth">Estimated horizontal width of one character.</param>
/// <returns>
/// The new fragment, or <c>null</c> when <paramref name="length"/> is not positive or the slice
/// contains only whitespace.
/// </returns>
private static XmlTextFragment? CreateFragmentSegment(
    XmlTextFragment fragment,
    int startIndex,
    int length,
    double characterWidth)
{
    if (length <= 0)
    {
        return null;
    }

    var slice = fragment.Text.Substring(startIndex, length);
    var withoutLeading = slice.TrimStart();
    var core = withoutLeading.TrimEnd();
    if (core.Length == 0)
    {
        return null;
    }

    // The number of leading whitespace characters dropped shifts the segment's start.
    var actualStart = startIndex + (slice.Length - withoutLeading.Length);
    var actualLength = core.Length;

    var estimatedLeft = fragment.Left + (int)Math.Round(characterWidth * actualStart);
    // The estimated width is never allowed to fall below 1.
    var estimatedWidth = Math.Max(1, (int)Math.Round(characterWidth * actualLength));

    return new XmlTextFragment(
        fragment.PageNumber,
        fragment.Top,
        estimatedLeft,
        estimatedWidth,
        fragment.Height,
        CollapseWhitespace(core));
}
|
||||
|
||||
private static bool CrossesColumnBoundary(
|
||||
@@ -716,10 +874,8 @@ internal static class CriticalTableParserSupport
|
||||
return false;
|
||||
}
|
||||
|
||||
/// <summary>
/// Counts the matches of <c>SentenceFragmentSplitRegex</c> in <paramref name="text"/> whose
/// value, after <c>CollapseWhitespace</c>, is not null, empty or whitespace-only.
/// </summary>
/// <param name="text">Text to scan for sentence-like segments.</param>
/// <returns>The number of non-blank matches.</returns>
private static int CountSentenceLikeSegments(string text)
{
    var segmentCount = 0;

    foreach (Match sentence in SentenceFragmentSplitRegex.Matches(text))
    {
        var collapsed = CollapseWhitespace(sentence.Value);
        if (!string.IsNullOrWhiteSpace(collapsed))
        {
            segmentCount++;
        }
    }

    return segmentCount;
}
|
||||
/// <summary>
/// Reports whether <paramref name="text"/>, after trimming surrounding whitespace, matches
/// <c>BoundaryBonusLineRegex</c>.
/// </summary>
/// <param name="text">Line text to classify.</param>
/// <returns><c>true</c> when the trimmed text matches; otherwise <c>false</c>.</returns>
private static bool IsBoundaryBonusLine(string text)
{
    var trimmed = text.Trim();
    return BoundaryBonusLineRegex.IsMatch(trimmed);
}
|
||||
|
||||
private static void AddLegendMatch(
|
||||
IDictionary<string, string> symbolEffects,
|
||||
|
||||
@@ -32,10 +32,9 @@ public sealed class GroupedVariantCriticalTableParser
|
||||
})
|
||||
.ToList();
|
||||
|
||||
var bodyStartTop = Math.Max(
|
||||
groupHeaders.Max(item => item.Top),
|
||||
columnHeaders.Max(item => item.Top))
|
||||
+ CriticalTableParserSupport.HeaderToBodyMinimumGap;
|
||||
var headerTop = Math.Max(
|
||||
groupHeaders.Max(item => item.Top),
|
||||
columnHeaders.Max(item => item.Top));
|
||||
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
|
||||
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop);
|
||||
var affixLegendSymbols = affixLegend.ClassificationSymbols;
|
||||
@@ -43,9 +42,10 @@ public sealed class GroupedVariantCriticalTableParser
|
||||
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
|
||||
fragments,
|
||||
leftCutoff,
|
||||
bodyStartTop,
|
||||
headerTop + CriticalTableParserSupport.HeaderToRowLabelMinimumGap,
|
||||
keyTop);
|
||||
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
|
||||
var bodyStartTop = CriticalTableParserSupport.ResolveBodyStartTop(headerTop, rowAnchors);
|
||||
|
||||
if (rowAnchors.Count == 0)
|
||||
{
|
||||
|
||||
@@ -14,7 +14,7 @@ public sealed class StandardCriticalTableParser
|
||||
.Select(item => (Key: item.Text.ToUpperInvariant(), CenterX: item.CenterX))
|
||||
.ToList();
|
||||
|
||||
var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap;
|
||||
var headerTop = headerFragments.Max(item => item.Top);
|
||||
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
|
||||
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop);
|
||||
var affixLegendSymbols = affixLegend.ClassificationSymbols;
|
||||
@@ -22,9 +22,10 @@ public sealed class StandardCriticalTableParser
|
||||
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
|
||||
fragments,
|
||||
leftCutoff,
|
||||
bodyStartTop,
|
||||
headerTop + CriticalTableParserSupport.HeaderToRowLabelMinimumGap,
|
||||
keyTop);
|
||||
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
|
||||
var bodyStartTop = CriticalTableParserSupport.ResolveBodyStartTop(headerTop, rowAnchors);
|
||||
|
||||
if (rowAnchors.Count == 0)
|
||||
{
|
||||
|
||||
@@ -27,7 +27,7 @@ public sealed class VariantColumnCriticalTableParser
|
||||
})
|
||||
.ToList();
|
||||
|
||||
var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap;
|
||||
var headerTop = headerFragments.Max(item => item.Top);
|
||||
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
|
||||
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop);
|
||||
var affixLegendSymbols = affixLegend.ClassificationSymbols;
|
||||
@@ -35,9 +35,10 @@ public sealed class VariantColumnCriticalTableParser
|
||||
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
|
||||
fragments,
|
||||
leftCutoff,
|
||||
bodyStartTop,
|
||||
headerTop + CriticalTableParserSupport.HeaderToRowLabelMinimumGap,
|
||||
keyTop);
|
||||
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
|
||||
var bodyStartTop = CriticalTableParserSupport.ResolveBodyStartTop(headerTop, rowAnchors);
|
||||
|
||||
if (rowAnchors.Count == 0)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user