Implement phase 3 standard critical imports
This commit is contained in:
@@ -7,14 +7,17 @@ namespace RolemasterDb.ImportTool.Parsing;
|
||||
public sealed class StandardCriticalTableParser
|
||||
{
|
||||
private const int HeaderToBodyMinimumGap = 20;
|
||||
private const int FooterLabelExclusionGap = 15;
|
||||
private const int FooterPageNumberExclusionGap = 80;
|
||||
private const int RowLabelDuplicateTolerance = 15;
|
||||
private const int TopGroupingTolerance = 2;
|
||||
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled);
|
||||
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-)\d+\)$", RegexOptions.Compiled);
|
||||
|
||||
public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
|
||||
{
|
||||
var fragments = LoadFragments(xmlContent);
|
||||
var headerFragments = FindHeaderFragments(fragments);
|
||||
var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments);
|
||||
var validationErrors = new List<string>();
|
||||
|
||||
var columnCenters = headerFragments
|
||||
@@ -22,6 +25,16 @@ public sealed class StandardCriticalTableParser
|
||||
.Select(item => new ColumnAnchor(item.Text.ToUpperInvariant(), item.CenterX))
|
||||
.ToList();
|
||||
|
||||
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
|
||||
var keyTop = fragments
|
||||
.Where(item =>
|
||||
string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
|
||||
item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) ||
|
||||
item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
|
||||
.Select(item => (int?)item.Top)
|
||||
.Min() ?? int.MaxValue;
|
||||
var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments, keyTop);
|
||||
|
||||
var rowAnchors = rowLabelFragments
|
||||
.OrderBy(item => item.Top)
|
||||
.Select((item, index) => new RowAnchor(item.Text, item.Top, index + 1))
|
||||
@@ -32,16 +45,11 @@ public sealed class StandardCriticalTableParser
|
||||
validationErrors.Add("No roll-band labels were found in the XML artifact.");
|
||||
}
|
||||
|
||||
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
|
||||
var keyTop = fragments
|
||||
.Where(item => string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase))
|
||||
.Select(item => (int?)item.Top)
|
||||
.Min() ?? int.MaxValue;
|
||||
|
||||
var bodyFragments = fragments
|
||||
.Where(item =>
|
||||
item.Top >= bodyStartTop &&
|
||||
item.Top < keyTop - 1 &&
|
||||
!IsFooterPageNumberFragment(item, keyTop) &&
|
||||
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
|
||||
!headerFragments.Contains(item))
|
||||
.ToList();
|
||||
@@ -56,11 +64,11 @@ public sealed class StandardCriticalTableParser
|
||||
{
|
||||
var rowStart = rowIndex == 0
|
||||
? bodyStartTop
|
||||
: (int)Math.Floor((rowAnchors[rowIndex - 1].Top + rowAnchors[rowIndex].Top) / 2.0);
|
||||
: (int)Math.Floor((rowAnchors[rowIndex - 1].Top + rowAnchors[rowIndex].Top) / 2.0) + 1;
|
||||
|
||||
var rowEnd = rowIndex == rowAnchors.Count - 1
|
||||
? keyTop - 1
|
||||
: (int)Math.Floor((rowAnchors[rowIndex].Top + rowAnchors[rowIndex + 1].Top) / 2.0);
|
||||
: (int)Math.Floor((rowAnchors[rowIndex].Top + rowAnchors[rowIndex + 1].Top) / 2.0) + 1;
|
||||
|
||||
var rowFragments = bodyFragments
|
||||
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
|
||||
@@ -95,26 +103,12 @@ public sealed class StandardCriticalTableParser
|
||||
|
||||
foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey))
|
||||
{
|
||||
var firstProseIndex = cellEntry.Lines.FindIndex(line => !IsAffixLikeLine(line));
|
||||
var firstAffixIndex = cellEntry.Lines.FindIndex(IsAffixLikeLine);
|
||||
var segmentCount = CountLineTypeSegments(cellEntry.Lines);
|
||||
|
||||
if (firstProseIndex > 0)
|
||||
if (segmentCount > 2)
|
||||
{
|
||||
validationErrors.Add(
|
||||
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' begins with affix-like lines before prose.");
|
||||
}
|
||||
|
||||
if (firstAffixIndex >= 0)
|
||||
{
|
||||
var proseAfterAffix = cellEntry.Lines
|
||||
.Skip(firstAffixIndex + 1)
|
||||
.Any(line => !IsAffixLikeLine(line));
|
||||
|
||||
if (proseAfterAffix)
|
||||
{
|
||||
validationErrors.Add(
|
||||
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' contains prose after affix lines.");
|
||||
}
|
||||
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
|
||||
}
|
||||
|
||||
var rawAffixLines = cellEntry.Lines.Where(IsAffixLikeLine).ToList();
|
||||
@@ -200,12 +194,13 @@ public sealed class StandardCriticalTableParser
|
||||
|
||||
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
|
||||
{
|
||||
var groupedByTop = fragments
|
||||
var headerCandidates = fragments
|
||||
.Where(item => item.Text.Length == 1 && char.IsLetter(item.Text[0]))
|
||||
.GroupBy(item => item.Top)
|
||||
.OrderBy(group => group.Key);
|
||||
.OrderBy(item => item.Top)
|
||||
.ThenBy(item => item.Left)
|
||||
.ToList();
|
||||
|
||||
foreach (var group in groupedByTop)
|
||||
foreach (var group in GroupByTop(headerCandidates))
|
||||
{
|
||||
var ordered = group.OrderBy(item => item.Left).ToList();
|
||||
var labels = ordered.Select(item => item.Text.ToUpperInvariant()).ToList();
|
||||
@@ -220,18 +215,37 @@ public sealed class StandardCriticalTableParser
|
||||
|
||||
/// <summary>
/// Collects the roll-band label fragments printed in the margin to the left of the
/// column headers, ordered from top to bottom, with near-duplicate labels collapsed.
/// </summary>
/// <param name="fragments">All text fragments extracted from the XML artifact.</param>
/// <param name="headerFragments">
/// The column header fragments. Must not be empty: the <c>Min</c>/<c>Max</c> calls below
/// throw <see cref="InvalidOperationException"/> on an empty sequence.
/// </param>
/// <param name="keyTop">
/// Top coordinate of the key/footer marker. Labels closer to it than
/// <c>FooterLabelExclusionGap</c> are ignored.
/// </param>
/// <returns>The de-duplicated label fragments in ascending <c>Top</c> order.</returns>
private static List<XmlTextFragment> FindRowLabelFragments(
    IReadOnlyList<XmlTextFragment> fragments,
    IReadOnlyList<XmlTextFragment> headerFragments,
    int keyTop)
{
    // Labels must sit clearly to the left of the left-most column header.
    var leftCutoff = headerFragments.Min(item => item.Left) - 10;

    // Anything above this line is still part of the header band.
    var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;

    var candidates = fragments
        .Where(item =>
            item.Left < leftCutoff &&
            item.Top >= bodyStartTop &&
            item.Top < keyTop - FooterLabelExclusionGap &&
            IsRollBandLabel(item.Text))
        .OrderBy(item => item.Top)
        .ToList();

    var deduped = new List<XmlTextFragment>();

    foreach (var candidate in candidates)
    {
        // Compare only against the most recently kept label: candidates are sorted by Top,
        // so a repeated label within the tolerance is treated as the same row label.
        var previous = deduped.LastOrDefault();
        if (previous is not null &&
            string.Equals(previous.Text, candidate.Text, StringComparison.OrdinalIgnoreCase) &&
            Math.Abs(previous.Top - candidate.Top) <= RowLabelDuplicateTolerance)
        {
            continue;
        }

        deduped.Add(candidate);
    }

    return deduped;
}
|
||||
|
||||
private static bool IsRollBandLabel(string value) =>
|
||||
@@ -293,7 +307,7 @@ public sealed class StandardCriticalTableParser
|
||||
return false;
|
||||
}
|
||||
|
||||
if (value == "-" || value == "\u2014")
|
||||
if (value == "-" || value == "\u2013" || value == "\u2014")
|
||||
{
|
||||
return true;
|
||||
}
|
||||
@@ -301,7 +315,10 @@ public sealed class StandardCriticalTableParser
|
||||
if (value.StartsWith("with ", StringComparison.OrdinalIgnoreCase) ||
|
||||
value.StartsWith("w/o ", StringComparison.OrdinalIgnoreCase) ||
|
||||
value.StartsWith("without ", StringComparison.OrdinalIgnoreCase) ||
|
||||
value.StartsWith("if ", StringComparison.OrdinalIgnoreCase))
|
||||
value.StartsWith("if ", StringComparison.OrdinalIgnoreCase) ||
|
||||
value.StartsWith("while ", StringComparison.OrdinalIgnoreCase) ||
|
||||
value.StartsWith("until ", StringComparison.OrdinalIgnoreCase) ||
|
||||
value.StartsWith("unless ", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
return value.Contains(':', StringComparison.Ordinal);
|
||||
}
|
||||
@@ -311,10 +328,9 @@ public sealed class StandardCriticalTableParser
|
||||
value.StartsWith("\u220F", StringComparison.Ordinal) ||
|
||||
value.StartsWith("\u03C0", StringComparison.Ordinal) ||
|
||||
value.StartsWith("\u222B", StringComparison.Ordinal) ||
|
||||
StandaloneModifierAffixLineRegex.IsMatch(value) ||
|
||||
NumericAffixLineRegex.IsMatch(value) ||
|
||||
value.Contains(" - ", StringComparison.Ordinal) ||
|
||||
value.Contains("(-", StringComparison.Ordinal) ||
|
||||
value.Contains("(+", StringComparison.Ordinal);
|
||||
value.Contains(" - ", StringComparison.Ordinal);
|
||||
}
|
||||
|
||||
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries)
|
||||
@@ -361,6 +377,55 @@ public sealed class StandardCriticalTableParser
|
||||
.Replace('\n', ' ')
|
||||
.Trim();
|
||||
|
||||
/// <summary>
/// Counts the maximal runs of consecutive lines that share the same
/// <c>IsAffixLikeLine</c> classification.
/// </summary>
/// <param name="lines">The lines of a single cell, in reading order.</param>
/// <returns>
/// The number of runs; <c>0</c> for an empty list, <c>1</c> when every line has the same
/// classification, and one more for each switch between the two classifications.
/// </returns>
private static int CountLineTypeSegments(IReadOnlyList<string> lines)
{
    var runs = 0;
    var lastWasAffix = false;

    for (var index = 0; index < lines.Count; index++)
    {
        var isAffix = IsAffixLikeLine(lines[index]);

        // The first line always opens a run; afterwards only a change of type does.
        if (index == 0 || isAffix != lastWasAffix)
        {
            runs++;
        }

        lastWasAffix = isAffix;
    }

    return runs;
}
|
||||
|
||||
/// <summary>
/// Decides whether a fragment is a bare two- or three-digit number located at or below
/// the line <c>FooterPageNumberExclusionGap</c> units above <paramref name="keyTop"/>.
/// </summary>
/// <param name="fragment">The fragment to classify.</param>
/// <param name="keyTop">
/// Top coordinate of the key marker; <see cref="int.MaxValue"/> disables the check
/// (presumably the "no key marker found" sentinel; confirm against the caller).
/// </param>
/// <returns><c>true</c> when the fragment matches both the position and the digit test.</returns>
private static bool IsFooterPageNumberFragment(XmlTextFragment fragment, int keyTop)
{
    var hasKeyAnchor = keyTop != int.MaxValue;
    if (!hasKeyAnchor)
    {
        return false;
    }

    var exclusionZoneTop = keyTop - FooterPageNumberExclusionGap;
    if (fragment.Top < exclusionZoneTop)
    {
        return false;
    }

    return Regex.IsMatch(fragment.Text, @"^\d{2,3}$");
}
|
||||
|
||||
/// <summary>
/// Splits fragments into consecutive groups whose <c>Top</c> values lie within
/// <c>TopGroupingTolerance</c> of the first fragment of the group.
/// </summary>
/// <param name="fragments">
/// The fragments to group, in the order they should be scanned. Only the most recent
/// group is ever considered, so the input order determines the grouping.
/// </param>
/// <returns>The groups in the order they were opened; empty when there are no fragments.</returns>
private static IEnumerable<List<XmlTextFragment>> GroupByTop(IReadOnlyList<XmlTextFragment> fragments)
{
    var buckets = new List<List<XmlTextFragment>>();

    foreach (var item in fragments)
    {
        // A fragment is measured against the first member of the latest bucket, not the
        // last one added, so a bucket never drifts further than the tolerance from its anchor.
        var fitsLatestBucket =
            buckets.Count > 0 &&
            Math.Abs(buckets[^1][0].Top - item.Top) <= TopGroupingTolerance;

        if (fitsLatestBucket)
        {
            buckets[^1].Add(item);
        }
        else
        {
            buckets.Add(new List<XmlTextFragment> { item });
        }
    }

    return buckets;
}
|
||||
|
||||
private sealed record ColumnAnchor(string Key, double CenterX);
|
||||
|
||||
private sealed record RowAnchor(string Label, int Top, int SortOrder);
|
||||
|
||||
Reference in New Issue
Block a user