Fix mana affix column leakage
This commit is contained in:
@@ -240,6 +240,7 @@ Current phase-3 notes:
|
|||||||
- header detection now tolerates minor `top` misalignment across the `A-E` header glyphs
|
- header detection now tolerates minor `top` misalignment across the `A-E` header glyphs
|
||||||
- row boundaries can snap to the last affix-to-prose transition between adjacent roll labels when midpoint slicing would leak into the next row
|
- row boundaries can snap to the last affix-to-prose transition between adjacent roll labels when midpoint slicing would leak into the next row
|
||||||
- affix symbols are learned from the footer legend before body parsing, so symbol-only affix fragments are classified correctly
|
- affix symbols are learned from the footer legend before body parsing, so symbol-only affix fragments are classified correctly
|
||||||
|
- affix fragments that cross a column boundary in the XML can be split on hard internal spacing before column assignment, which is required for `Mana.pdf`
|
||||||
- footer page numbers are filtered out before body parsing
|
- footer page numbers are filtered out before body parsing
|
||||||
- validation allows a single contiguous affix block either before or after prose
|
- validation allows a single contiguous affix block either before or after prose
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
@@ -137,6 +137,70 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
Assert.DoesNotContain('\uF06C', row100C.DescriptionText);
|
Assert.DoesNotContain('\uF06C', row100C.DescriptionText);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[Fact]
public async Task Mana_affix_boundaries_keep_71_75_a_and_b_separate()
{
    // The affix text that must be attributed to column B only.
    const string rollBand = "71-75";
    const string affixOwnedByB = "+10H -";

    // Parse the table registered under the "mana" slug in the manifest.
    var manaEntry = LoadManifest().Tables.Single(table => string.Equals(table.Slug, "mana", StringComparison.Ordinal));
    var parsed = await LoadParseResultAsync(manaEntry);

    // Narrow to the roll band once, then pick each column; Single still fails
    // the test if a column is missing or duplicated within the band.
    var bandCells = parsed.Table.Results
        .Where(cell => string.Equals(cell.RollBandLabel, rollBand, StringComparison.Ordinal))
        .ToList();
    var cellA = bandCells.Single(cell => string.Equals(cell.ColumnKey, "A", StringComparison.Ordinal));
    var cellB = bandCells.Single(cell => string.Equals(cell.ColumnKey, "B", StringComparison.Ordinal));

    Assert.DoesNotContain(affixOwnedByB, cellA.RawAffixText, StringComparison.Ordinal);
    Assert.Contains(affixOwnedByB, cellB.RawAffixText, StringComparison.Ordinal);
}
|
||||||
|
|
||||||
|
[Fact]
public async Task Mana_affix_boundaries_keep_71_75_d_and_e_separate()
{
    // The affix text that must be attributed to column E only.
    const string rollBand = "71-75";
    const string affixOwnedByE = "+16H - 6";

    // Parse the table registered under the "mana" slug in the manifest.
    var manaEntry = LoadManifest().Tables.Single(table => string.Equals(table.Slug, "mana", StringComparison.Ordinal));
    var parsed = await LoadParseResultAsync(manaEntry);

    // Narrow to the roll band once, then pick each column; Single still fails
    // the test if a column is missing or duplicated within the band.
    var bandCells = parsed.Table.Results
        .Where(cell => string.Equals(cell.RollBandLabel, rollBand, StringComparison.Ordinal))
        .ToList();
    var cellD = bandCells.Single(cell => string.Equals(cell.ColumnKey, "D", StringComparison.Ordinal));
    var cellE = bandCells.Single(cell => string.Equals(cell.ColumnKey, "E", StringComparison.Ordinal));

    Assert.DoesNotContain(affixOwnedByE, cellD.RawAffixText, StringComparison.Ordinal);
    Assert.Contains(affixOwnedByE, cellE.RawAffixText, StringComparison.Ordinal);
}
|
||||||
|
|
||||||
|
[Fact]
public async Task Mana_affix_boundaries_keep_91_95_b_and_c_separate()
{
    // The affix text that must be attributed to column C only.
    const string rollBand = "91-95";
    const string affixOwnedByC = "+19H - 9";

    // Parse the table registered under the "mana" slug in the manifest.
    var manaEntry = LoadManifest().Tables.Single(table => string.Equals(table.Slug, "mana", StringComparison.Ordinal));
    var parsed = await LoadParseResultAsync(manaEntry);

    // Narrow to the roll band once, then pick each column; Single still fails
    // the test if a column is missing or duplicated within the band.
    var bandCells = parsed.Table.Results
        .Where(cell => string.Equals(cell.RollBandLabel, rollBand, StringComparison.Ordinal))
        .ToList();
    var cellB = bandCells.Single(cell => string.Equals(cell.ColumnKey, "B", StringComparison.Ordinal));
    var cellC = bandCells.Single(cell => string.Equals(cell.ColumnKey, "C", StringComparison.Ordinal));

    Assert.DoesNotContain(affixOwnedByC, cellB.RawAffixText, StringComparison.Ordinal);
    Assert.Contains(affixOwnedByC, cellC.RawAffixText, StringComparison.Ordinal);
}
|
||||||
|
|
||||||
|
[Fact]
public async Task Mana_affix_boundaries_keep_86_90_b_and_c_separate()
{
    // The affix text that must be attributed to column C only.
    const string rollBand = "86-90";
    const string affixOwnedByC = "+16H - 8";

    // Parse the table registered under the "mana" slug in the manifest.
    var manaEntry = LoadManifest().Tables.Single(table => string.Equals(table.Slug, "mana", StringComparison.Ordinal));
    var parsed = await LoadParseResultAsync(manaEntry);

    // Narrow to the roll band once, then pick each column; Single still fails
    // the test if a column is missing or duplicated within the band.
    var bandCells = parsed.Table.Results
        .Where(cell => string.Equals(cell.RollBandLabel, rollBand, StringComparison.Ordinal))
        .ToList();
    var cellB = bandCells.Single(cell => string.Equals(cell.ColumnKey, "B", StringComparison.Ordinal));
    var cellC = bandCells.Single(cell => string.Equals(cell.ColumnKey, "C", StringComparison.Ordinal));

    Assert.DoesNotContain(affixOwnedByC, cellB.RawAffixText, StringComparison.Ordinal);
    Assert.Contains(affixOwnedByC, cellC.RawAffixText, StringComparison.Ordinal);
}
|
||||||
|
|
||||||
private static async Task<StandardCriticalTableParseResult> LoadParseResultAsync(CriticalImportManifestEntry entry)
|
private static async Task<StandardCriticalTableParseResult> LoadParseResultAsync(CriticalImportManifestEntry entry)
|
||||||
{
|
{
|
||||||
var xmlPath = Path.Combine(GetArtifactCacheRoot(), $"{entry.Slug}.xml");
|
var xmlPath = Path.Combine(GetArtifactCacheRoot(), $"{entry.Slug}.xml");
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ public sealed class StandardCriticalTableParser
|
|||||||
private const int FooterPageNumberExclusionGap = 80;
|
private const int FooterPageNumberExclusionGap = 80;
|
||||||
private const int RowLabelDuplicateTolerance = 15;
|
private const int RowLabelDuplicateTolerance = 15;
|
||||||
private const int TopGroupingTolerance = 2;
|
private const int TopGroupingTolerance = 2;
|
||||||
|
// Matches one text segment at a time: a run that starts and ends on a
// non-whitespace character and is immediately followed by either two or more
// whitespace characters or the end of the input. Single whitespace characters
// therefore stay inside a segment; only wider gaps separate segments.
private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled);
|
||||||
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled);
|
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled);
|
||||||
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-)\d+\)$", RegexOptions.Compiled);
|
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-)\d+\)$", RegexOptions.Compiled);
|
||||||
|
|
||||||
@@ -55,6 +56,7 @@ public sealed class StandardCriticalTableParser
|
|||||||
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
|
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
|
||||||
!headerFragments.Contains(item))
|
!headerFragments.Contains(item))
|
||||||
.ToList();
|
.ToList();
|
||||||
|
bodyFragments = SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
|
||||||
var bodyLines = BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
|
var bodyLines = BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
|
||||||
|
|
||||||
var parsedRollBands = rowAnchors
|
var parsedRollBands = rowAnchors
|
||||||
@@ -460,6 +462,101 @@ public sealed class StandardCriticalTableParser
|
|||||||
return symbols;
|
return symbols;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Runs every body fragment through <c>SplitBoundaryCrossingAffixFragment</c>
/// and concatenates the results, preserving the input order.
/// </summary>
/// <param name="bodyFragments">Fragments to examine, in their existing order.</param>
/// <param name="columnCenters">Column anchors forwarded to the per-fragment splitter.</param>
/// <param name="affixLegendSymbols">Affix symbols forwarded to the per-fragment splitter.</param>
/// <returns>A new list holding each fragment either unchanged or replaced by its split segments.</returns>
private static List<XmlTextFragment> SplitBoundaryCrossingAffixFragments(
    IReadOnlyList<XmlTextFragment> bodyFragments,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    return bodyFragments
        .SelectMany(candidate => SplitBoundaryCrossingAffixFragment(candidate, columnCenters, affixLegendSymbols))
        .ToList();
}
|
||||||
|
|
||||||
|
/// <summary>
/// Attempts to break a single fragment into several fragments, one per segment
/// matched by <c>MultiFragmentSplitRegex</c>. The split result is returned only
/// when the segments resolve to more than one column, or to a column different
/// from the one the whole fragment resolves to; otherwise the original fragment
/// is returned unchanged as a single-element list.
/// </summary>
/// <param name="fragment">The fragment to examine.</param>
/// <param name="columnCenters">Column anchors used for boundary checks and column resolution.</param>
/// <param name="affixLegendSymbols">Affix symbols passed to the boundary-crossing check.</param>
/// <returns>Either the original fragment alone or the list of split segments.</returns>
private static IReadOnlyList<XmlTextFragment> SplitBoundaryCrossingAffixFragment(
    XmlTextFragment fragment,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    if (!LooksLikeBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
    {
        return [fragment];
    }

    var segmentMatches = MultiFragmentSplitRegex.Matches(fragment.Text);
    if (segmentMatches.Count < 2)
    {
        return [fragment];
    }

    // Segment geometry is estimated by spreading the fragment width evenly
    // across its characters; the divisor is clamped to 1 to avoid dividing by zero.
    var widthPerCharacter = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
    var segments = new List<XmlTextFragment>(segmentMatches.Count);

    for (var matchIndex = 0; matchIndex < segmentMatches.Count; matchIndex++)
    {
        var segmentMatch = segmentMatches[matchIndex];
        var normalizedText = CollapseWhitespace(segmentMatch.Value);
        if (normalizedText.Length == 0)
        {
            continue;
        }

        var estimatedLeft = fragment.Left + (int)Math.Round(widthPerCharacter * segmentMatch.Index);
        // Never emit a zero-width segment.
        var estimatedWidth = Math.Max(1, (int)Math.Round(widthPerCharacter * segmentMatch.Length));

        segments.Add(new XmlTextFragment(
            fragment.PageNumber,
            fragment.Top,
            estimatedLeft,
            estimatedWidth,
            fragment.Height,
            normalizedText));
    }

    if (segments.Count < 2)
    {
        return [fragment];
    }

    var wholeFragmentColumn = ResolveColumn(fragment.CenterX, columnCenters);
    var segmentColumns = segments
        .Select(segment => ResolveColumn(segment.CenterX, columnCenters))
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .ToList();

    // Splitting is only worthwhile when it changes column assignment: keep the
    // original when every segment lands in the same column as the whole fragment.
    var staysInOriginalColumn =
        segmentColumns.Count <= 1 &&
        segmentColumns.All(column => string.Equals(column, wholeFragmentColumn, StringComparison.OrdinalIgnoreCase));

    return staysInOriginalColumn
        ? [fragment]
        : segments;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Decides whether a fragment is a candidate for splitting: its text must pass
/// <c>IsAffixLikeLine</c>, contain a space, and its horizontal extent
/// (<c>Left</c> to <c>Left + Width</c>) must strictly straddle the midpoint
/// between two adjacent column centers.
/// </summary>
/// <param name="fragment">The fragment to test.</param>
/// <param name="columnCenters">Column anchors; midpoints between consecutive entries act as boundaries.</param>
/// <param name="affixLegendSymbols">Affix symbols passed to <c>IsAffixLikeLine</c>.</param>
/// <returns><c>true</c> when the fragment qualifies; otherwise <c>false</c>.</returns>
private static bool LooksLikeBoundaryCrossingAffixFragment(
    XmlTextFragment fragment,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    // NOTE(review): MultiFragmentSplitRegex only breaks on runs of two or more
    // whitespace characters, so this single-space test is a looser pre-filter
    // than the split itself — confirm that is intended.
    var isSplittableAffixText =
        IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
        fragment.Text.Contains(" ", StringComparison.Ordinal);
    if (!isSplittableAffixText)
    {
        return false;
    }

    var rightEdge = fragment.Left + fragment.Width;

    // Pair each column with its right-hand neighbour and test the midpoint between them.
    return columnCenters
        .Zip(columnCenters.Skip(1), (leftColumn, rightColumn) => (leftColumn.CenterX + rightColumn.CenterX) / 2.0)
        .Any(midpoint => fragment.Left < midpoint && rightEdge > midpoint);
}
|
||||||
|
|
||||||
private static void AddLegendMatch(HashSet<string> symbols, string value, string pattern)
|
private static void AddLegendMatch(HashSet<string> symbols, string value, string pattern)
|
||||||
{
|
{
|
||||||
foreach (Match match in Regex.Matches(value, pattern, RegexOptions.IgnoreCase))
|
foreach (Match match in Regex.Matches(value, pattern, RegexOptions.IgnoreCase))
|
||||||
|
|||||||
Reference in New Issue
Block a user