Fix puncture prose boundary splitting

This commit is contained in:
2026-03-14 13:11:27 +01:00
parent f2a45656de
commit 839241ea62
3 changed files with 82 additions and 10 deletions

View File

@@ -15,6 +15,7 @@ internal static class CriticalTableParserSupport
internal const int TopGroupingTolerance = 2;
private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled);
private static readonly Regex SentenceFragmentSplitRegex = new(@"\S.*?(?:[.!?](?:['"")\]]*)|$)", RegexOptions.Compiled);
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[-])", RegexOptions.Compiled);
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|)\d+\)$", RegexOptions.Compiled);
@@ -373,7 +374,7 @@ internal static class CriticalTableParserSupport
supportsPowerPointModifier: footerText.Contains("powerpoint modification", StringComparison.OrdinalIgnoreCase));
}
internal static List<XmlTextFragment> SplitBoundaryCrossingAffixFragments(
internal static List<XmlTextFragment> SplitBoundaryCrossingFragments(
IReadOnlyList<XmlTextFragment> bodyFragments,
IReadOnlyList<(string Key, double CenterX)> columnCenters,
IReadOnlySet<string> affixLegendSymbols)
@@ -382,7 +383,7 @@ internal static class CriticalTableParserSupport
foreach (var fragment in bodyFragments)
{
splitFragments.AddRange(SplitBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols));
splitFragments.AddRange(SplitBoundaryCrossingFragment(fragment, columnCenters, affixLegendSymbols));
}
return splitFragments;
@@ -467,7 +468,7 @@ internal static class CriticalTableParserSupport
!excludedFragments.Contains(item))
.ToList();
return SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
return SplitBoundaryCrossingFragments(bodyFragments, columnCenters, affixLegendSymbols);
}
internal static void RepairLeadingAffixLeakage(List<ColumnarCellEntry> cellEntries, IReadOnlySet<string> affixLegendSymbols)
@@ -612,17 +613,17 @@ internal static class CriticalTableParserSupport
return true;
}
private static IReadOnlyList<XmlTextFragment> SplitBoundaryCrossingAffixFragment(
private static IReadOnlyList<XmlTextFragment> SplitBoundaryCrossingFragment(
XmlTextFragment fragment,
IReadOnlyList<(string Key, double CenterX)> columnCenters,
IReadOnlySet<string> affixLegendSymbols)
{
if (!LooksLikeBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
if (!TryGetBoundaryCrossingPattern(fragment, columnCenters, affixLegendSymbols, out var splitPattern))
{
return [fragment];
}
var matches = MultiFragmentSplitRegex.Matches(fragment.Text);
var matches = splitPattern.Matches(fragment.Text);
if (matches.Count < 2)
{
return [fragment];
@@ -667,17 +668,40 @@ internal static class CriticalTableParserSupport
: [fragment];
}
private static bool LooksLikeBoundaryCrossingAffixFragment(
private static bool TryGetBoundaryCrossingPattern(
XmlTextFragment fragment,
IReadOnlyList<(string Key, double CenterX)> columnCenters,
IReadOnlySet<string> affixLegendSymbols)
IReadOnlySet<string> affixLegendSymbols,
out Regex splitPattern)
{
if (!IsAffixLikeLine(fragment.Text, affixLegendSymbols) ||
!fragment.Text.Contains(" ", StringComparison.Ordinal))
splitPattern = null!;
if (!CrossesColumnBoundary(fragment, columnCenters))
{
return false;
}
if (IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
fragment.Text.Contains(" ", StringComparison.Ordinal))
{
splitPattern = MultiFragmentSplitRegex;
return true;
}
if (!IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
CountSentenceLikeSegments(fragment.Text) >= 2)
{
splitPattern = SentenceFragmentSplitRegex;
return true;
}
return false;
}
private static bool CrossesColumnBoundary(
XmlTextFragment fragment,
IReadOnlyList<(string Key, double CenterX)> columnCenters)
{
var fragmentRight = fragment.Left + fragment.Width;
for (var index = 0; index < columnCenters.Count - 1; index++)
@@ -692,6 +716,11 @@ internal static class CriticalTableParserSupport
return false;
}
/// <summary>
/// Counts the matches that <see cref="SentenceFragmentSplitRegex"/> finds in
/// <paramref name="text"/> whose value, after being passed through
/// <c>CollapseWhitespace</c>, is neither null, empty, nor whitespace-only.
/// </summary>
/// <param name="text">The fragment text to scan for sentence-like segments.</param>
/// <returns>The number of non-blank sentence-like segments found.</returns>
private static int CountSentenceLikeSegments(string text)
{
var segmentCount = 0;
foreach (Match candidate in SentenceFragmentSplitRegex.Matches(text))
{
// Only segments that still carry content after whitespace collapsing are counted.
var collapsed = CollapseWhitespace(candidate.Value);
if (!string.IsNullOrWhiteSpace(collapsed))
{
segmentCount++;
}
}
return segmentCount;
}
private static void AddLegendMatch(
IDictionary<string, string> symbolEffects,
string value,