Fix puncture prose boundary splitting
This commit is contained in:
@@ -15,6 +15,7 @@ internal static class CriticalTableParserSupport
|
||||
internal const int TopGroupingTolerance = 2;
|
||||
|
||||
private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled);
|
||||
private static readonly Regex SentenceFragmentSplitRegex = new(@"\S.*?(?:[.!?](?:['"")\]]*)|$)", RegexOptions.Compiled);
|
||||
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled);
|
||||
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|–)\d+\)$", RegexOptions.Compiled);
|
||||
|
||||
@@ -373,7 +374,7 @@ internal static class CriticalTableParserSupport
|
||||
supportsPowerPointModifier: footerText.Contains("powerpoint modification", StringComparison.OrdinalIgnoreCase));
|
||||
}
|
||||
|
||||
internal static List<XmlTextFragment> SplitBoundaryCrossingAffixFragments(
|
||||
internal static List<XmlTextFragment> SplitBoundaryCrossingFragments(
|
||||
IReadOnlyList<XmlTextFragment> bodyFragments,
|
||||
IReadOnlyList<(string Key, double CenterX)> columnCenters,
|
||||
IReadOnlySet<string> affixLegendSymbols)
|
||||
@@ -382,7 +383,7 @@ internal static class CriticalTableParserSupport
|
||||
|
||||
foreach (var fragment in bodyFragments)
|
||||
{
|
||||
splitFragments.AddRange(SplitBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols));
|
||||
splitFragments.AddRange(SplitBoundaryCrossingFragment(fragment, columnCenters, affixLegendSymbols));
|
||||
}
|
||||
|
||||
return splitFragments;
|
||||
@@ -467,7 +468,7 @@ internal static class CriticalTableParserSupport
|
||||
!excludedFragments.Contains(item))
|
||||
.ToList();
|
||||
|
||||
return SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
|
||||
return SplitBoundaryCrossingFragments(bodyFragments, columnCenters, affixLegendSymbols);
|
||||
}
|
||||
|
||||
internal static void RepairLeadingAffixLeakage(List<ColumnarCellEntry> cellEntries, IReadOnlySet<string> affixLegendSymbols)
|
||||
@@ -612,17 +613,17 @@ internal static class CriticalTableParserSupport
|
||||
return true;
|
||||
}
|
||||
|
||||
private static IReadOnlyList<XmlTextFragment> SplitBoundaryCrossingAffixFragment(
|
||||
private static IReadOnlyList<XmlTextFragment> SplitBoundaryCrossingFragment(
|
||||
XmlTextFragment fragment,
|
||||
IReadOnlyList<(string Key, double CenterX)> columnCenters,
|
||||
IReadOnlySet<string> affixLegendSymbols)
|
||||
{
|
||||
if (!LooksLikeBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
|
||||
if (!TryGetBoundaryCrossingPattern(fragment, columnCenters, affixLegendSymbols, out var splitPattern))
|
||||
{
|
||||
return [fragment];
|
||||
}
|
||||
|
||||
var matches = MultiFragmentSplitRegex.Matches(fragment.Text);
|
||||
var matches = splitPattern.Matches(fragment.Text);
|
||||
if (matches.Count < 2)
|
||||
{
|
||||
return [fragment];
|
||||
@@ -667,17 +668,40 @@ internal static class CriticalTableParserSupport
|
||||
: [fragment];
|
||||
}
|
||||
|
||||
/// <summary>
/// Decides whether <paramref name="fragment"/> should be split and, if so, which regex to split it with.
/// </summary>
/// <param name="fragment">Fragment whose text is classified and whose extent is checked against the columns.</param>
/// <param name="columnCenters">Column centers handed to <c>CrossesColumnBoundary</c>.</param>
/// <param name="affixLegendSymbols">Legend symbols handed to <c>IsAffixLikeLine</c>.</param>
/// <param name="splitPattern">
/// When the method returns <c>true</c>: <c>MultiFragmentSplitRegex</c> for affix-like text, or
/// <c>SentenceFragmentSplitRegex</c> for non-affix text with at least two sentence-like segments.
/// When the method returns <c>false</c> the value is <c>null</c> and must not be used.
/// </param>
/// <returns><c>true</c> when a split pattern was selected; otherwise <c>false</c>.</returns>
private static bool TryGetBoundaryCrossingPattern(
    XmlTextFragment fragment,
    IReadOnlyList<(string Key, double CenterX)> columnCenters,
    IReadOnlySet<string> affixLegendSymbols,
    out Regex splitPattern)
{
    // Definite assignment for the failure paths; callers only read it after a true result.
    splitPattern = null!;

    if (!CrossesColumnBoundary(fragment, columnCenters))
    {
        return false;
    }

    // Classify the text once: the affix branch and the prose branch are mutually exclusive.
    if (IsAffixLikeLine(fragment.Text, affixLegendSymbols))
    {
        if (!fragment.Text.Contains(" ", StringComparison.Ordinal))
        {
            return false;
        }

        splitPattern = MultiFragmentSplitRegex;
        return true;
    }

    // Non-affix text is only split when it holds two or more sentence-like segments.
    if (CountSentenceLikeSegments(fragment.Text) < 2)
    {
        return false;
    }

    splitPattern = SentenceFragmentSplitRegex;
    return true;
}
|
||||
|
||||
private static bool CrossesColumnBoundary(
|
||||
XmlTextFragment fragment,
|
||||
IReadOnlyList<(string Key, double CenterX)> columnCenters)
|
||||
{
|
||||
var fragmentRight = fragment.Left + fragment.Width;
|
||||
|
||||
for (var index = 0; index < columnCenters.Count - 1; index++)
|
||||
@@ -692,6 +716,11 @@ internal static class CriticalTableParserSupport
|
||||
return false;
|
||||
}
|
||||
|
||||
/// <summary>
/// Counts the matches of <c>SentenceFragmentSplitRegex</c> in <paramref name="text"/> that are
/// not blank after being passed through <c>CollapseWhitespace</c>.
/// </summary>
/// <param name="text">Text to scan for sentence-like segments.</param>
/// <returns>The number of non-blank segments found.</returns>
private static int CountSentenceLikeSegments(string text)
{
    var segmentCount = 0;

    foreach (Match candidate in SentenceFragmentSplitRegex.Matches(text))
    {
        var collapsed = CollapseWhitespace(candidate.Value);
        if (!string.IsNullOrWhiteSpace(collapsed))
        {
            segmentCount++;
        }
    }

    return segmentCount;
}
|
||||
|
||||
private static void AddLegendMatch(
|
||||
IDictionary<string, string> symbolEffects,
|
||||
string value,
|
||||
|
||||
Reference in New Issue
Block a user