Fix critical importer row and column boundary parsing
This commit is contained in:
@@ -238,9 +238,10 @@ The currently enabled phase-3 table set is:
|
|||||||
Current phase-3 notes:
|
Current phase-3 notes:
|
||||||
|
|
||||||
- header detection now tolerates minor `top` misalignment across the `A-E` header glyphs
|
- header detection now tolerates minor `top` misalignment across the `A-E` header glyphs
|
||||||
|
- first-row body parsing can now begin slightly above the first roll-band label when the PDF places prose between the header row and the label, which prevents clipped `01-05` cells in tables such as `Mana.pdf`
|
||||||
- row boundaries can snap to the last affix-to-prose transition between adjacent roll labels when midpoint slicing would leak into the next row
|
- row boundaries can snap to the last affix-to-prose transition between adjacent roll labels when midpoint slicing would leak into the next row
|
||||||
- affix symbols are learned from the footer legend before body parsing, so symbol-only affix fragments are classified correctly
|
- affix symbols are learned from the footer legend before body parsing, so symbol-only affix fragments are classified correctly
|
||||||
- affix fragments that cross a column boundary in the XML can be split on hard internal spacing before column assignment, which is required for `Mana.pdf`
|
- cross-column text fragments can now be split at geometry-aligned whitespace boundaries before column assignment, while affix fragments still split on hard internal spacing
|
||||||
- footer page numbers are filtered out before body parsing
|
- footer page numbers are filtered out before body parsing
|
||||||
- validation allows a single contiguous affix block either before or after prose
|
- validation allows a single contiguous affix block either before or after prose
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
@@ -201,6 +201,107 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
Assert.DoesNotContain('\uF06C', row100C.DescriptionText);
|
Assert.DoesNotContain('\uF06C', row100C.DescriptionText);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Regression test: the "01-05" row of the "arcane-aether" table must expose the expected
// description text for columns C and D independently.
[Fact]
public async Task Arcane_aether_first_row_keeps_c_and_d_text_in_separate_columns()
{
    var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "arcane-aether", StringComparison.Ordinal));
    var parseResult = await LoadParseResultAsync(entry);

    // FindResult defaults groupKey to null, i.e. it selects results whose GroupKey is null,
    // which is the same filter the inline Single(...) queries applied.
    var row01C = FindResult(parseResult, "01-05", "C");
    var row01D = FindResult(parseResult, "01-05", "D");

    Assert.Equal("Ooooh. That's the way to frighten him", row01C.DescriptionText);
    Assert.Equal("That looked like it hurt. It didn't.", row01D.DescriptionText);
}
|
||||||
|
|
||||||
|
// Regression test: the "31-40" row of the "arcane-aether" table must expose the expected
// description text for columns A and B independently.
[Fact]
public async Task Arcane_aether_31_40_keeps_a_and_b_text_in_separate_columns()
{
    var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "arcane-aether", StringComparison.Ordinal));
    var parseResult = await LoadParseResultAsync(entry);

    // FindResult defaults groupKey to null, i.e. it selects results whose GroupKey is null,
    // which is the same filter the inline Single(...) queries applied.
    var row31A = FindResult(parseResult, "31-40", "A");
    var row31B = FindResult(parseResult, "31-40", "B");

    Assert.Equal("Burns cause foe to bring up his guard.", row31A.DescriptionText);
    Assert.Equal("Confused foe brings up his guard. He loses initiative for two rounds.", row31B.DescriptionText);
}
|
||||||
|
|
||||||
|
// Regression test: the "41-50" row of the "arcane-aether" table must expose the expected
// description text for columns D and E independently.
[Fact]
public async Task Arcane_aether_41_50_keeps_d_and_e_text_in_separate_columns()
{
    var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "arcane-aether", StringComparison.Ordinal));
    var parseResult = await LoadParseResultAsync(entry);

    // FindResult defaults groupKey to null, i.e. it selects results whose GroupKey is null,
    // which is the same filter the inline Single(...) queries applied.
    var row41D = FindResult(parseResult, "41-50", "D");
    var row41E = FindResult(parseResult, "41-50", "E");

    Assert.Equal("Foe is spun by a strike to his shoulder.", row41D.DescriptionText);
    Assert.Equal("Powerful blast knocks foe back three steps and cause him to drop all objects.", row41E.DescriptionText);
}
|
||||||
|
|
||||||
|
// Checks that every column A-E of the "01-05" row in the "mana" table carries its expected text.
[Fact]
public async Task Mana_first_row_keeps_all_five_columns_populated()
{
    var manaEntry = LoadManifest().Tables.Single(table => string.Equals(table.Slug, "mana", StringComparison.Ordinal));
    var parsed = await LoadParseResultAsync(manaEntry);

    // Expected description text per column, asserted in column order A through E.
    var expectations = new[]
    {
        (Column: "A", Expected: "Lots of fireworks, but little effect."),
        (Column: "B", Expected: "Somewhere a bell tolls."),
        (Column: "C", Expected: "Weak blast."),
        (Column: "D", Expected: "Foe dances around your blast."),
        (Column: "E", Expected: "Foe does damage trying to dodge."),
    };

    foreach (var (column, expected) in expectations)
    {
        Assert.Equal(expected, FindResult(parsed, "01-05", column).DescriptionText);
    }
}
|
||||||
|
|
||||||
|
// Checks that column B of the "01-05" row in the "arcane-nether" table carries its expected text.
[Fact]
public async Task Arcane_nether_first_row_keeps_b_column_populated()
{
    var manifest = LoadManifest();
    var netherEntry = manifest.Tables.Single(table => string.Equals(table.Slug, "arcane-nether", StringComparison.Ordinal));
    var parsed = await LoadParseResultAsync(netherEntry);

    var firstRowB = FindResult(parsed, "01-05", "B");

    Assert.Equal("Glancing blow.", firstRowB.DescriptionText);
}
|
||||||
|
|
||||||
|
// Checks that columns A, B and C of the "36-45" row in the "krush" table each carry their own text.
[Fact]
public async Task Krush_36_45_keeps_a_b_and_c_content_in_separate_columns()
{
    var krushEntry = LoadManifest().Tables.Single(table => string.Equals(table.Slug, "krush", StringComparison.Ordinal));
    var parsed = await LoadParseResultAsync(krushEntry);

    // Expected description text per column, asserted in column order A through C.
    var expectations = new[]
    {
        (Column: "A", Expected: "Bust foe's shin. You have initiative."),
        (Column: "B", Expected: "Blow to foe's left calf. You gain initiative."),
        (Column: "C", Expected: "Catch foe in lower leg. You gain initiative, while foe regains footing."),
    };

    foreach (var (column, expected) in expectations)
    {
        Assert.Equal(expected, FindResult(parsed, "36-45", column).DescriptionText);
    }
}
|
||||||
|
|
||||||
|
// Regression test: the HOLY_ARMS cell of the "99-100" row in the "super_large_creature_weapon"
// table must start with its own text and must not contain the "all allies get (+10)" phrase.
[Fact]
public async Task Super_large_creature_weapon_99_100_holy_arms_does_not_capture_previous_row_text()
{
    var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "super_large_creature_weapon", StringComparison.Ordinal));
    var parseResult = await LoadParseResultAsync(entry);

    // FindResult defaults groupKey to null, i.e. it selects results whose GroupKey is null,
    // which is the same filter the inline Single(...) query applied.
    var row99HolyArms = FindResult(parseResult, "99-100", "HOLY_ARMS");

    Assert.StartsWith("Strike through foe's heart kills him instantly.", row99HolyArms.DescriptionText, StringComparison.Ordinal);
    Assert.DoesNotContain("all allies get (+10)", row99HolyArms.DescriptionText, StringComparison.OrdinalIgnoreCase);
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public async Task Mana_affix_boundaries_keep_71_75_a_and_b_separate()
|
public async Task Mana_affix_boundaries_keep_71_75_a_and_b_separate()
|
||||||
{
|
{
|
||||||
@@ -551,6 +652,16 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
private static CriticalImportManifest LoadManifest() =>
|
private static CriticalImportManifest LoadManifest() =>
|
||||||
new CriticalImportManifestLoader().Load(Path.Combine(GetRepositoryRoot(), "sources", "critical-import-manifest.json"));
|
new CriticalImportManifestLoader().Load(Path.Combine(GetRepositoryRoot(), "sources", "critical-import-manifest.json"));
|
||||||
|
|
||||||
|
// Returns the single parsed result matching the given roll band label, column key and group key
// (all compared ordinally). With the default groupKey of null, only results whose GroupKey is
// null match. Enumerable.Single throws when zero or several results match.
private static ParsedCriticalResult FindResult(
    CriticalTableParseResult parseResult,
    string rollBandLabel,
    string columnKey,
    string? groupKey = null)
{
    var results = parseResult.Table.Results;

    return results.Single(candidate =>
        string.Equals(candidate.GroupKey, groupKey, StringComparison.Ordinal)
        && string.Equals(candidate.RollBandLabel, rollBandLabel, StringComparison.Ordinal)
        && string.Equals(candidate.ColumnKey, columnKey, StringComparison.Ordinal));
}
|
||||||
|
|
||||||
private static string GetArtifactCacheRoot()
|
private static string GetArtifactCacheRoot()
|
||||||
{
|
{
|
||||||
var cacheRoot = Path.Combine(Path.GetTempPath(), "RolemasterDb.ImportTool.Tests");
|
var cacheRoot = Path.Combine(Path.GetTempPath(), "RolemasterDb.ImportTool.Tests");
|
||||||
|
|||||||
@@ -9,15 +9,17 @@ namespace RolemasterDb.ImportTool.Parsing;
|
|||||||
internal static class CriticalTableParserSupport
|
internal static class CriticalTableParserSupport
|
||||||
{
|
{
|
||||||
internal const int HeaderToBodyMinimumGap = 20;
|
internal const int HeaderToBodyMinimumGap = 20;
|
||||||
|
internal const int HeaderToRowLabelMinimumGap = 10;
|
||||||
internal const int FooterLabelExclusionGap = 15;
|
internal const int FooterLabelExclusionGap = 15;
|
||||||
internal const int FooterPageNumberExclusionGap = 80;
|
internal const int FooterPageNumberExclusionGap = 80;
|
||||||
internal const int RowLabelDuplicateTolerance = 15;
|
internal const int RowLabelDuplicateTolerance = 15;
|
||||||
internal const int TopGroupingTolerance = 2;
|
internal const int TopGroupingTolerance = 2;
|
||||||
|
internal const int BoundarySplitSearchRadiusChars = 12;
|
||||||
|
|
||||||
private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled);
|
private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled);
|
||||||
private static readonly Regex SentenceFragmentSplitRegex = new(@"\S.*?(?:[.!?](?:['"")\]]*)|$)", RegexOptions.Compiled);
|
|
||||||
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled);
|
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled);
|
||||||
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|–)\d+\)$", RegexOptions.Compiled);
|
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|–)\d+\)$", RegexOptions.Compiled);
|
||||||
|
private static readonly Regex BoundaryBonusLineRegex = new(@"^(?:all allies|all foe's allies|all foes|all opponents)\b", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||||
|
|
||||||
internal static List<XmlTextFragment> LoadFragments(string xmlContent)
|
internal static List<XmlTextFragment> LoadFragments(string xmlContent)
|
||||||
{
|
{
|
||||||
@@ -405,7 +407,7 @@ internal static class CriticalTableParserSupport
|
|||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
var isAffixLike = columnTexts.Count > 0 &&
|
var isAffixLike = columnTexts.Count > 0 &&
|
||||||
columnTexts.All(text => IsAffixLikeLine(text, affixLegendSymbols));
|
columnTexts.All(text => IsAffixLikeLine(text, affixLegendSymbols) || IsBoundaryBonusLine(text));
|
||||||
|
|
||||||
bodyLines.Add((lineFragments[0].Top, isAffixLike));
|
bodyLines.Add((lineFragments[0].Top, isAffixLike));
|
||||||
}
|
}
|
||||||
@@ -448,6 +450,20 @@ internal static class CriticalTableParserSupport
|
|||||||
.Select((item, index) => new RowAnchor(NormalizeRollBandLabel(item.Text), item.Top, index + 1))
|
.Select((item, index) => new RowAnchor(NormalizeRollBandLabel(item.Text), item.Top, index + 1))
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
|
// Picks the top coordinate at which body parsing starts.
// Without row anchors the result is headerTop + HeaderToBodyMinimumGap. Otherwise the start is
// placed HeaderToRowLabelMinimumGap + TopGroupingTolerance above the first anchor, but never
// above headerTop + HeaderToRowLabelMinimumGap and never below headerTop + HeaderToBodyMinimumGap.
internal static int ResolveBodyStartTop(int headerTop, IReadOnlyList<RowAnchor> rowAnchors)
{
    var defaultStart = headerTop + HeaderToBodyMinimumGap;
    if (rowAnchors.Count == 0)
    {
        return defaultStart;
    }

    var earliestStart = headerTop + HeaderToRowLabelMinimumGap;
    var aboveFirstLabel = rowAnchors[0].Top - HeaderToRowLabelMinimumGap - TopGroupingTolerance;

    // Lower bound first, then cap at the default start.
    var raisedStart = Math.Max(earliestStart, aboveFirstLabel);
    return Math.Min(defaultStart, raisedStart);
}
|
||||||
|
|
||||||
internal static List<XmlTextFragment> BuildBodyFragments(
|
internal static List<XmlTextFragment> BuildBodyFragments(
|
||||||
IReadOnlyList<XmlTextFragment> fragments,
|
IReadOnlyList<XmlTextFragment> fragments,
|
||||||
int bodyStartTop,
|
int bodyStartTop,
|
||||||
@@ -618,12 +634,30 @@ internal static class CriticalTableParserSupport
|
|||||||
IReadOnlyList<(string Key, double CenterX)> columnCenters,
|
IReadOnlyList<(string Key, double CenterX)> columnCenters,
|
||||||
IReadOnlySet<string> affixLegendSymbols)
|
IReadOnlySet<string> affixLegendSymbols)
|
||||||
{
|
{
|
||||||
if (!TryGetBoundaryCrossingPattern(fragment, columnCenters, affixLegendSymbols, out var splitPattern))
|
if (!CrossesColumnBoundary(fragment, columnCenters))
|
||||||
{
|
{
|
||||||
return [fragment];
|
return [fragment];
|
||||||
}
|
}
|
||||||
|
|
||||||
var matches = splitPattern.Matches(fragment.Text);
|
if (IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
|
||||||
|
fragment.Text.Contains(" ", StringComparison.Ordinal))
|
||||||
|
{
|
||||||
|
return BuildSplitFragmentsFromMatches(fragment, MultiFragmentSplitRegex.Matches(fragment.Text), columnCenters);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (TrySplitProseFragmentAtBoundaries(fragment, columnCenters, out var splitFragments))
|
||||||
|
{
|
||||||
|
return splitFragments;
|
||||||
|
}
|
||||||
|
|
||||||
|
return [fragment];
|
||||||
|
}
|
||||||
|
|
||||||
|
private static IReadOnlyList<XmlTextFragment> BuildSplitFragmentsFromMatches(
|
||||||
|
XmlTextFragment fragment,
|
||||||
|
MatchCollection matches,
|
||||||
|
IReadOnlyList<(string Key, double CenterX)> columnCenters)
|
||||||
|
{
|
||||||
if (matches.Count < 2)
|
if (matches.Count < 2)
|
||||||
{
|
{
|
||||||
return [fragment];
|
return [fragment];
|
||||||
@@ -668,34 +702,158 @@ internal static class CriticalTableParserSupport
|
|||||||
: [fragment];
|
: [fragment];
|
||||||
}
|
}
|
||||||
|
|
||||||
private static bool TryGetBoundaryCrossingPattern(
|
private static bool TrySplitProseFragmentAtBoundaries(
|
||||||
XmlTextFragment fragment,
|
XmlTextFragment fragment,
|
||||||
IReadOnlyList<(string Key, double CenterX)> columnCenters,
|
IReadOnlyList<(string Key, double CenterX)> columnCenters,
|
||||||
IReadOnlySet<string> affixLegendSymbols,
|
out IReadOnlyList<XmlTextFragment> splitFragments)
|
||||||
out Regex splitPattern)
|
|
||||||
{
|
{
|
||||||
splitPattern = null!;
|
splitFragments = null!;
|
||||||
|
|
||||||
if (!CrossesColumnBoundary(fragment, columnCenters))
|
var boundaryIndexes = FindBoundarySplitIndexes(fragment, columnCenters);
|
||||||
|
if (boundaryIndexes.Count == 0)
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
|
var segments = new List<XmlTextFragment>();
|
||||||
fragment.Text.Contains(" ", StringComparison.Ordinal))
|
var segmentStart = 0;
|
||||||
|
var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
|
||||||
|
|
||||||
|
foreach (var splitIndex in boundaryIndexes)
|
||||||
{
|
{
|
||||||
splitPattern = MultiFragmentSplitRegex;
|
var segment = CreateFragmentSegment(fragment, segmentStart, splitIndex - segmentStart, characterWidth);
|
||||||
return true;
|
if (segment is not null)
|
||||||
|
{
|
||||||
|
segments.Add(segment);
|
||||||
|
}
|
||||||
|
|
||||||
|
segmentStart = splitIndex;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
|
var trailingSegment = CreateFragmentSegment(fragment, segmentStart, fragment.Text.Length - segmentStart, characterWidth);
|
||||||
CountSentenceLikeSegments(fragment.Text) >= 2)
|
if (trailingSegment is not null)
|
||||||
{
|
{
|
||||||
splitPattern = SentenceFragmentSplitRegex;
|
segments.Add(trailingSegment);
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
if (segments.Count < 2)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
splitFragments = segments;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Computes the character indexes at which the fragment text should be cut so that each piece
// stays on one side of every column boundary the fragment spans. A boundary is the midpoint
// between two adjacent column centers. Character positions are estimated from an average
// per-character width (fragment width divided by text length).
// Returns an empty list when no boundary is spanned, or when any spanned boundary has no
// usable whitespace split nearby (all-or-nothing).
private static List<int> FindBoundarySplitIndexes(
    XmlTextFragment fragment,
    IReadOnlyList<(string Key, double CenterX)> columnCenters)
{
    var averageCharWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
    var rightEdge = fragment.Left + fragment.Width;
    var cuts = new List<int>();

    // Each later cut must land strictly after the previous one; the first cut may not be at index 0.
    var earliestAllowed = 1;

    for (int leftColumn = 0, rightColumn = 1; rightColumn < columnCenters.Count; leftColumn++, rightColumn++)
    {
        var midpoint = (columnCenters[leftColumn].CenterX + columnCenters[rightColumn].CenterX) / 2.0;

        var liesOnOneSide = fragment.Left >= midpoint || rightEdge <= midpoint;
        if (liesOnOneSide)
        {
            continue;
        }

        var idealIndex = (int)Math.Round((midpoint - fragment.Left) / averageCharWidth);
        if (FindNearestWhitespaceSplitIndex(fragment.Text, idealIndex, earliestAllowed) is not int chosenIndex)
        {
            return [];
        }

        cuts.Add(chosenIndex);
        earliestAllowed = chosenIndex + 1;
    }

    return cuts;
}
|
||||||
|
|
||||||
|
// Searches a window of BoundarySplitSearchRadiusChars characters on either side of targetIndex
// (clamped to [minimumIndex, text.Length - 1]) for whitespace, and returns the index of the first
// non-whitespace character following the whitespace run whose end is closest to targetIndex.
// Candidates at or before minimumIndex, or at the end of the text, are rejected. On equal
// distance the earlier candidate wins. Returns null when no candidate qualifies.
private static int? FindNearestWhitespaceSplitIndex(string text, int targetIndex, int minimumIndex)
{
    var windowStart = Math.Max(minimumIndex, targetIndex - BoundarySplitSearchRadiusChars);
    var windowEnd = Math.Min(text.Length - 1, targetIndex + BoundarySplitSearchRadiusChars);

    int? nearest = null;
    var nearestDistance = int.MaxValue;

    for (var position = windowStart; position <= windowEnd; position++)
    {
        if (!char.IsWhiteSpace(text[position]))
        {
            continue;
        }

        // Advance past the rest of the whitespace run to the start of the next word.
        var wordStart = position + 1;
        while (wordStart < text.Length && char.IsWhiteSpace(text[wordStart]))
        {
            wordStart++;
        }

        var isUsable = wordStart > minimumIndex && wordStart < text.Length;
        if (!isUsable)
        {
            continue;
        }

        var distance = Math.Abs(wordStart - targetIndex);
        if (distance < nearestDistance)
        {
            nearestDistance = distance;
            nearest = wordStart;
        }
    }

    return nearest;
}
|
||||||
|
|
||||||
|
// Builds a new fragment from the [startIndex, startIndex + length) slice of the source text.
// Surrounding whitespace is trimmed and the remaining text is passed through CollapseWhitespace.
// Left and width are estimated from characterWidth and the trimmed slice's position and length;
// width is at least 1. Page number, top and height are copied from the source fragment.
// Returns null when length is not positive or the slice is entirely whitespace.
private static XmlTextFragment? CreateFragmentSegment(
    XmlTextFragment fragment,
    int startIndex,
    int length,
    double characterWidth)
{
    if (length <= 0)
    {
        return null;
    }

    var slice = fragment.Text.Substring(startIndex, length);
    var core = slice.Trim();
    if (core.Length == 0)
    {
        return null;
    }

    // Number of leading whitespace characters removed, used to shift the segment's start.
    var leadingWhitespace = slice.Length - slice.TrimStart().Length;
    var actualStart = startIndex + leadingWhitespace;
    var actualLength = core.Length;

    var segmentLeft = fragment.Left + (int)Math.Round(characterWidth * actualStart);
    var segmentWidth = Math.Max(1, (int)Math.Round(characterWidth * actualLength));

    return new XmlTextFragment(
        fragment.PageNumber,
        fragment.Top,
        segmentLeft,
        segmentWidth,
        fragment.Height,
        CollapseWhitespace(core));
}
|
||||||
|
|
||||||
private static bool CrossesColumnBoundary(
|
private static bool CrossesColumnBoundary(
|
||||||
@@ -716,10 +874,8 @@ internal static class CriticalTableParserSupport
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int CountSentenceLikeSegments(string text) =>
|
private static bool IsBoundaryBonusLine(string text) =>
|
||||||
SentenceFragmentSplitRegex.Matches(text)
|
BoundaryBonusLineRegex.IsMatch(text.Trim());
|
||||||
.Select(match => CollapseWhitespace(match.Value))
|
|
||||||
.Count(value => !string.IsNullOrWhiteSpace(value));
|
|
||||||
|
|
||||||
private static void AddLegendMatch(
|
private static void AddLegendMatch(
|
||||||
IDictionary<string, string> symbolEffects,
|
IDictionary<string, string> symbolEffects,
|
||||||
|
|||||||
@@ -32,10 +32,9 @@ public sealed class GroupedVariantCriticalTableParser
|
|||||||
})
|
})
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
var bodyStartTop = Math.Max(
|
var headerTop = Math.Max(
|
||||||
groupHeaders.Max(item => item.Top),
|
groupHeaders.Max(item => item.Top),
|
||||||
columnHeaders.Max(item => item.Top))
|
columnHeaders.Max(item => item.Top));
|
||||||
+ CriticalTableParserSupport.HeaderToBodyMinimumGap;
|
|
||||||
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
|
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
|
||||||
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop);
|
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop);
|
||||||
var affixLegendSymbols = affixLegend.ClassificationSymbols;
|
var affixLegendSymbols = affixLegend.ClassificationSymbols;
|
||||||
@@ -43,9 +42,10 @@ public sealed class GroupedVariantCriticalTableParser
|
|||||||
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
|
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
|
||||||
fragments,
|
fragments,
|
||||||
leftCutoff,
|
leftCutoff,
|
||||||
bodyStartTop,
|
headerTop + CriticalTableParserSupport.HeaderToRowLabelMinimumGap,
|
||||||
keyTop);
|
keyTop);
|
||||||
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
|
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
|
||||||
|
var bodyStartTop = CriticalTableParserSupport.ResolveBodyStartTop(headerTop, rowAnchors);
|
||||||
|
|
||||||
if (rowAnchors.Count == 0)
|
if (rowAnchors.Count == 0)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ public sealed class StandardCriticalTableParser
|
|||||||
.Select(item => (Key: item.Text.ToUpperInvariant(), CenterX: item.CenterX))
|
.Select(item => (Key: item.Text.ToUpperInvariant(), CenterX: item.CenterX))
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap;
|
var headerTop = headerFragments.Max(item => item.Top);
|
||||||
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
|
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
|
||||||
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop);
|
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop);
|
||||||
var affixLegendSymbols = affixLegend.ClassificationSymbols;
|
var affixLegendSymbols = affixLegend.ClassificationSymbols;
|
||||||
@@ -22,9 +22,10 @@ public sealed class StandardCriticalTableParser
|
|||||||
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
|
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
|
||||||
fragments,
|
fragments,
|
||||||
leftCutoff,
|
leftCutoff,
|
||||||
bodyStartTop,
|
headerTop + CriticalTableParserSupport.HeaderToRowLabelMinimumGap,
|
||||||
keyTop);
|
keyTop);
|
||||||
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
|
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
|
||||||
|
var bodyStartTop = CriticalTableParserSupport.ResolveBodyStartTop(headerTop, rowAnchors);
|
||||||
|
|
||||||
if (rowAnchors.Count == 0)
|
if (rowAnchors.Count == 0)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ public sealed class VariantColumnCriticalTableParser
|
|||||||
})
|
})
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap;
|
var headerTop = headerFragments.Max(item => item.Top);
|
||||||
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
|
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
|
||||||
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop);
|
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop);
|
||||||
var affixLegendSymbols = affixLegend.ClassificationSymbols;
|
var affixLegendSymbols = affixLegend.ClassificationSymbols;
|
||||||
@@ -35,9 +35,10 @@ public sealed class VariantColumnCriticalTableParser
|
|||||||
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
|
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
|
||||||
fragments,
|
fragments,
|
||||||
leftCutoff,
|
leftCutoff,
|
||||||
bodyStartTop,
|
headerTop + CriticalTableParserSupport.HeaderToRowLabelMinimumGap,
|
||||||
keyTop);
|
keyTop);
|
||||||
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
|
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
|
||||||
|
var bodyStartTop = CriticalTableParserSupport.ResolveBodyStartTop(headerTop, rowAnchors);
|
||||||
|
|
||||||
if (rowAnchors.Count == 0)
|
if (rowAnchors.Count == 0)
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user