Fix mana affix column leakage

This commit is contained in:
2026-03-14 02:53:33 +01:00
parent 73ce64e879
commit a391a1421a
4 changed files with 162 additions and 0 deletions

View File

@@ -11,6 +11,7 @@ public sealed class StandardCriticalTableParser
// NOTE(review): judging by the name, a gap threshold applied when excluding footer page numbers;
// the use site and units are not visible here — confirm.
private const int FooterPageNumberExclusionGap = 80;
// NOTE(review): judging by the name, a tolerance for treating row labels as duplicates;
// the use site and units are not visible here — confirm.
private const int RowLabelDuplicateTolerance = 15;
// NOTE(review): judging by the name, a tolerance for grouping fragments by their Top value;
// the use site is not visible here — confirm.
private const int TopGroupingTolerance = 2;
// Matches each run of text that starts and ends with a non-whitespace character and is
// immediately followed by two or more whitespace characters or by the end of the input.
// The run is extended lazily, so single whitespace characters stay inside a match while
// wider gaps separate matches.
private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled);
// Matches text that begins with one or more digits directly followed by H, ∑, ∏, π or ∫,
// or by optional whitespace and a hyphen.
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[-])", RegexOptions.Compiled);
// Matches text that consists solely of an optional run of digits followed by a
// parenthesised signed number, e.g. "(+10)" or "5(-20)".
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-)\d+\)$", RegexOptions.Compiled);
@@ -55,6 +56,7 @@ public sealed class StandardCriticalTableParser
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
!headerFragments.Contains(item))
.ToList();
bodyFragments = SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
var bodyLines = BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
var parsedRollBands = rowAnchors
@@ -460,6 +462,101 @@ public sealed class StandardCriticalTableParser
return symbols;
}
/// <summary>
/// Passes every body fragment through <see cref="SplitBoundaryCrossingAffixFragment"/> and
/// flattens the per-fragment results into a single list, preserving the input order.
/// </summary>
/// <param name="bodyFragments">Fragments to examine, in the order they should be emitted.</param>
/// <param name="columnCenters">Column anchors forwarded to the per-fragment split.</param>
/// <param name="affixLegendSymbols">Affix legend symbols forwarded to the per-fragment split.</param>
/// <returns>A new list holding, for each input fragment, either the fragment itself or its split pieces.</returns>
private static List<XmlTextFragment> SplitBoundaryCrossingAffixFragments(
    IReadOnlyList<XmlTextFragment> bodyFragments,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    return bodyFragments
        .SelectMany(candidate => SplitBoundaryCrossingAffixFragment(candidate, columnCenters, affixLegendSymbols))
        .ToList();
}
/// <summary>
/// Splits a single fragment into several narrower fragments when it looks like affix text
/// that straddles a column boundary; otherwise returns the fragment unchanged.
/// </summary>
/// <remarks>
/// Segments come from <c>MultiFragmentSplitRegex</c>. Each segment's left offset and width are
/// estimated by spreading the fragment's width evenly over the characters of its text, so the
/// positions are approximations rather than measured glyph positions. The split is kept only
/// when the resulting pieces resolve to more than one column, or to a column other than the
/// one the whole fragment resolves to.
/// </remarks>
/// <param name="fragment">The fragment to inspect.</param>
/// <param name="columnCenters">Column anchors used to resolve which column a centre X belongs to.</param>
/// <param name="affixLegendSymbols">Affix legend symbols used by the affix-likeness check.</param>
/// <returns>A single-element list containing <paramref name="fragment"/>, or the list of split pieces.</returns>
private static IReadOnlyList<XmlTextFragment> SplitBoundaryCrossingAffixFragment(
    XmlTextFragment fragment,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    if (!LooksLikeBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
    {
        return [fragment];
    }

    var segmentMatches = MultiFragmentSplitRegex.Matches(fragment.Text);
    if (segmentMatches.Count < 2)
    {
        return [fragment];
    }

    // Average horizontal extent of one character; the Math.Max guards against empty text.
    var widthPerCharacter = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);

    var pieces = new List<XmlTextFragment>(segmentMatches.Count);
    for (var matchIndex = 0; matchIndex < segmentMatches.Count; matchIndex++)
    {
        var segment = segmentMatches[matchIndex];
        var pieceText = CollapseWhitespace(segment.Value);
        if (pieceText.Length > 0)
        {
            // Position the piece proportionally to where the segment sits inside the text.
            var pieceLeft = fragment.Left + (int)Math.Round(widthPerCharacter * segment.Index);
            var pieceWidth = Math.Max(1, (int)Math.Round(widthPerCharacter * segment.Length));
            pieces.Add(new XmlTextFragment(
                fragment.PageNumber,
                fragment.Top,
                pieceLeft,
                pieceWidth,
                fragment.Height,
                pieceText));
        }
    }

    if (pieces.Count < 2)
    {
        return [fragment];
    }

    var wholeFragmentColumn = ResolveColumn(fragment.CenterX, columnCenters);
    var pieceColumns = pieces
        .Select(piece => ResolveColumn(piece.CenterX, columnCenters))
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .ToList();

    // Keep the split only if it actually moves text into a different column.
    var changesColumnAssignment =
        pieceColumns.Count > 1 ||
        pieceColumns.Any(column => !string.Equals(column, wholeFragmentColumn, StringComparison.OrdinalIgnoreCase));
    if (changesColumnAssignment)
    {
        return pieces;
    }

    return [fragment];
}
/// <summary>
/// Decides whether a fragment is a candidate for splitting: its text must pass
/// <c>IsAffixLikeLine</c>, contain a space, and its horizontal extent must strictly span the
/// midpoint between at least one pair of adjacent column centres.
/// </summary>
/// <param name="fragment">The fragment to test.</param>
/// <param name="columnCenters">Column anchors; boundaries are the midpoints between consecutive entries.</param>
/// <param name="affixLegendSymbols">Affix legend symbols passed to the affix-likeness check.</param>
/// <returns><c>true</c> when the fragment is affix-like, contains a space and crosses a column boundary.</returns>
private static bool LooksLikeBoundaryCrossingAffixFragment(
    XmlTextFragment fragment,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    var isSpacedAffixText =
        IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
        fragment.Text.Contains(" ", StringComparison.Ordinal);
    if (!isSpacedAffixText)
    {
        return false;
    }

    var rightEdge = fragment.Left + fragment.Width;

    // Pair each column with its successor; with fewer than two columns there are no pairs.
    return columnCenters
        .Zip(columnCenters.Skip(1), (current, next) => (current.CenterX + next.CenterX) / 2.0)
        .Any(midpoint => fragment.Left < midpoint && rightEdge > midpoint);
}
private static void AddLegendMatch(HashSet<string> symbols, string value, string pattern)
{
foreach (Match match in Regex.Matches(value, pattern, RegexOptions.IgnoreCase))