Implement phase 4 critical table imports
This commit is contained in:
@@ -0,0 +1,276 @@
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
/// <summary>
/// Parses a critical table whose columns are the fixed set of variants listed in
/// <see cref="ExpectedColumns"/> (Normal, Magic, Mithril, Holy Arms, Slaying) from
/// positioned XML text fragments.
/// </summary>
public sealed class VariantColumnCriticalTableParser
{
    /// <summary>
    /// The variant columns, in the left-to-right order the header row must present them.
    /// </summary>
    private static readonly ColumnDefinition[] ExpectedColumns =
    [
        new("NORMAL", "Normal"),
        new("MAGIC", "Magic"),
        new("MITHRIL", "Mithril"),
        new("HOLY_ARMS", "Holy Arms"),
        new("SLAYING", "Slaying")
    ];

    /// <summary>
    /// Parses <paramref name="xmlContent"/> into a table, the per-cell artifacts and a validation report.
    /// </summary>
    /// <param name="entry">Manifest entry supplying the slug, display name, family and PDF path of the table.</param>
    /// <param name="xmlContent">XML text handed to <c>CriticalTableParserSupport.LoadFragments</c>.</param>
    /// <returns>The parsed table together with the loaded fragments, parsed cells and validation report.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="entry"/> is <see langword="null"/>.</exception>
    /// <exception cref="InvalidOperationException">The variant-column header row could not be located.</exception>
    public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
    {
        // Fail before doing any work: entry is only dereferenced at the very end of the method.
        ArgumentNullException.ThrowIfNull(entry);

        var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
        var headerFragments = FindHeaderFragments(fragments);
        var validationErrors = new List<string>();
        var validationWarnings = new List<string>();

        // One anchor per header fragment, ordered left to right.
        var columnAnchors = headerFragments
            .OrderBy(item => item.Left)
            .Select(item =>
            {
                var definition = ResolveColumnDefinition(item.Text);
                return (definition.Key, definition.Label, item.CenterX);
            })
            .ToList();

        // Vertical extent of the table body: below the header row, above the legend ("key") area.
        var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap;
        var keyTop = FindKeyTop(fragments);
        var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop);

        // Row labels are searched for to the left of the leftmost header, with a 10-unit margin.
        var leftCutoff = headerFragments.Min(item => item.Left) - 10;
        var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
            fragments,
            leftCutoff,
            bodyStartTop,
            keyTop);

        var rowAnchors = rowLabelFragments
            .OrderBy(item => item.Top)
            .Select((item, index) => new RowAnchor(CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), item.Top, index + 1))
            .ToList();

        if (rowAnchors.Count == 0)
        {
            validationErrors.Add("No roll-band labels were found in the XML artifact.");
        }

        var columnCenters = columnAnchors
            .Select(item => (item.Key, item.CenterX))
            .ToList();

        // Everything inside the body extent that is not a footer page number, a row label or a header.
        var bodyFragments = fragments
            .Where(item =>
                item.Top >= bodyStartTop &&
                item.Top < keyTop - CriticalTableParserSupport.TopGroupingTolerance &&
                !CriticalTableParserSupport.IsFooterPageNumberFragment(item, keyTop) &&
                !CriticalTableParserSupport.IsPotentialRowLabelFragment(item, leftCutoff) &&
                !rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) &&
                !headerFragments.Contains(item))
            .ToList();
        bodyFragments = CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
        var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);

        var parsedRollBands = rowAnchors
            .Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
            .ToList();

        var cellEntries = new List<CellEntry>();

        for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
        {
            var rowAnchor = rowAnchors[rowIndex];

            // A row spans [rowStart, rowEnd); the first row starts at the body top and the
            // last row runs up to just above the legend.
            var rowStart = rowIndex == 0
                ? bodyStartTop
                : ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchor, bodyLines);

            var rowEnd = rowIndex == rowAnchors.Count - 1
                ? keyTop - 1
                : ResolveRowBoundaryTop(rowAnchor, rowAnchors[rowIndex + 1], bodyLines);

            var rowFragments = bodyFragments
                .Where(item => item.Top >= rowStart && item.Top < rowEnd)
                .ToList();

            foreach (var columnAnchor in columnAnchors)
            {
                var cellFragments = rowFragments
                    .Where(item => CriticalTableParserSupport.ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key)
                    .OrderBy(item => item.Top)
                    .ThenBy(item => item.Left)
                    .ToList();

                if (cellFragments.Count == 0)
                {
                    validationErrors.Add($"Missing content for roll band '{rowAnchor.Label}', column '{columnAnchor.Key}'.");
                    continue;
                }

                cellEntries.Add(new CellEntry(
                    rowAnchor.Label,
                    rowIndex,
                    columnAnchor.Key,
                    CriticalTableParserSupport.BuildLines(cellFragments).ToList()));
            }
        }

        RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);

        var parsedCells = new List<ParsedCriticalCellArtifact>();
        var parsedResults = new List<ParsedCriticalResult>();

        foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey, StringComparer.Ordinal))
        {
            var segmentCount = CriticalTableParserSupport.CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols);
            if (segmentCount > 2)
            {
                validationErrors.Add($"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
            }

            // Partition the cell's lines in a single pass, keeping their relative order.
            var rawAffixLines = new List<string>();
            var descriptionLines = new List<string>();
            foreach (var line in cellEntry.Lines)
            {
                if (CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols))
                {
                    rawAffixLines.Add(line);
                }
                else
                {
                    descriptionLines.Add(line);
                }
            }

            var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines);
            var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', descriptionLines));
            var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);

            parsedCells.Add(new ParsedCriticalCellArtifact(
                null,
                cellEntry.RollBandLabel,
                cellEntry.ColumnKey,
                cellEntry.Lines,
                rawCellText,
                descriptionText,
                rawAffixText));

            parsedResults.Add(new ParsedCriticalResult(
                null,
                cellEntry.ColumnKey,
                cellEntry.RollBandLabel,
                rawCellText,
                descriptionText,
                rawAffixText));
        }

        // Defensive: FindHeaderFragments only returns a row that matches every expected label.
        if (columnAnchors.Count != ExpectedColumns.Length)
        {
            validationErrors.Add($"Expected {ExpectedColumns.Length} variant columns but found {columnAnchors.Count}.");
        }

        if (parsedCells.Count != rowAnchors.Count * columnAnchors.Count)
        {
            validationErrors.Add($"Expected {rowAnchors.Count * columnAnchors.Count} parsed cells but produced {parsedCells.Count}.");
        }

        var validationReport = new ImportValidationReport(
            validationErrors.Count == 0,
            validationErrors,
            validationWarnings,
            rowAnchors.Count,
            parsedCells.Count);

        var table = new ParsedCriticalTable(
            entry.Slug,
            entry.DisplayName,
            entry.Family,
            Path.GetFileName(entry.PdfPath),
            "Imported from PDF XML extraction.",
            [],
            ExpectedColumns.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Label, "variant", index + 1)).ToList(),
            parsedRollBands,
            parsedResults);

        return new CriticalTableParseResult(table, fragments, parsedCells, validationReport);
    }

    /// <summary>
    /// Returns the smallest Top among fragments that mark the start of the legend: the text
    /// "Key:" or any text containing "must parry" or "attacker gets" (all case-insensitive).
    /// Returns <see cref="int.MaxValue"/> when no such fragment exists.
    /// </summary>
    private static int FindKeyTop(IReadOnlyList<XmlTextFragment> fragments) =>
        fragments
            .Where(item =>
                string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
                item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) ||
                item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
            .Select(item => (int?)item.Top)
            .Min() ?? int.MaxValue;

    /// <summary>
    /// Locates the header row: the first group (as produced by
    /// <c>CriticalTableParserSupport.GroupByTop</c>) whose fragments, read left to right,
    /// spell out every label of <see cref="ExpectedColumns"/> in order.
    /// </summary>
    /// <returns>The header fragments ordered by their Left coordinate.</returns>
    /// <exception cref="InvalidOperationException">No group matches the expected label sequence.</exception>
    private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
    {
        // Labels are compared case-insensitively, matching ResolveColumnDefinition.
        var labelComparer = StringComparer.OrdinalIgnoreCase;
        var expectedLabels = ExpectedColumns
            .Select(item => item.Label)
            .ToList();

        var headerCandidates = fragments
            .Where(item => expectedLabels.Contains(item.Text.Trim(), labelComparer))
            .OrderBy(item => item.Top)
            .ThenBy(item => item.Left)
            .ToList();

        foreach (var group in CriticalTableParserSupport.GroupByTop(headerCandidates))
        {
            var ordered = group.OrderBy(item => item.Left).ToList();
            var labels = ordered.Select(item => item.Text.Trim());
            if (labels.SequenceEqual(expectedLabels, labelComparer))
            {
                return ordered;
            }
        }

        throw new InvalidOperationException("Could not find the variant-column header row in the XML artifact.");
    }

    /// <summary>
    /// Maps a header text to its column definition by trimmed, case-insensitive label match.
    /// </summary>
    /// <exception cref="InvalidOperationException">No expected column carries that label.</exception>
    private static ColumnDefinition ResolveColumnDefinition(string value) =>
        ExpectedColumns.SingleOrDefault(item => string.Equals(item.Label, value.Trim(), StringComparison.OrdinalIgnoreCase))
        ?? throw new InvalidOperationException($"Unsupported variant column label '{value}'.");

    /// <summary>
    /// For each pair of vertically adjacent cells in the same column, moves the run of
    /// affix-like lines at the start of the lower cell to the end of the upper cell.
    /// A lower cell that has no leading affix-like lines, or consists solely of them,
    /// is left untouched.
    /// </summary>
    /// <param name="cellEntries">Cells to repair in place; at most one entry per (row, column).</param>
    /// <param name="affixLegendSymbols">Symbols forwarded to <c>CriticalTableParserSupport.IsAffixLikeLine</c>.</param>
    private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries, ISet<string> affixLegendSymbols)
    {
        if (cellEntries.Count == 0)
        {
            return;
        }

        // Index once by position. Both the index and the distinct key list compare column
        // keys ordinally so the two can never disagree about which columns exist.
        var entriesByPosition = cellEntries.ToDictionary(item => (item.RowIndex, item.ColumnKey));
        var maxRowIndex = cellEntries.Max(item => item.RowIndex);
        var columnKeys = cellEntries
            .Select(item => item.ColumnKey)
            .Distinct(StringComparer.Ordinal)
            .ToList();

        // Rows are visited top to bottom so each cell gives up its leading affix lines
        // only after it has received any from the cell below the previous row.
        for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++)
        {
            foreach (var columnKey in columnKeys)
            {
                if (!entriesByPosition.TryGetValue((rowIndex, columnKey), out var current) ||
                    !entriesByPosition.TryGetValue((rowIndex + 1, columnKey), out var next))
                {
                    continue;
                }

                var leadingAffixCount = 0;
                while (leadingAffixCount < next.Lines.Count &&
                       CriticalTableParserSupport.IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols))
                {
                    leadingAffixCount++;
                }

                if (leadingAffixCount == 0 || leadingAffixCount == next.Lines.Count)
                {
                    continue;
                }

                current.Lines.AddRange(next.Lines.Take(leadingAffixCount));
                next.Lines.RemoveRange(0, leadingAffixCount);
            }
        }
    }

    /// <summary>
    /// Chooses the Top coordinate at which the row labelled <paramref name="current"/> ends
    /// and the row labelled <paramref name="next"/> begins.
    /// </summary>
    /// <remarks>
    /// Considers the body lines whose Top lies in [current.Top, next.Top). Scanning upwards
    /// from the bottom, the first affix-like line that is directly followed by a non-affix
    /// line marks the split, placed one past the floored midpoint of those two lines. When
    /// no such pair exists the split is one past the floored midpoint of the two labels.
    /// </remarks>
    private static int ResolveRowBoundaryTop(
        RowAnchor current,
        RowAnchor next,
        IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines)
    {
        static int BoundaryBetween(int upperTop, int lowerTop) =>
            (int)Math.Floor((upperTop + lowerTop) / 2.0) + 1;

        var linesBetweenLabels = bodyLines
            .Where(item => item.Top >= current.Top && item.Top < next.Top)
            .OrderBy(item => item.Top)
            .ToList();

        for (var index = linesBetweenLabels.Count - 2; index >= 0; index--)
        {
            var upper = linesBetweenLabels[index];
            var lower = linesBetweenLabels[index + 1];
            if (upper.IsAffixLike && !lower.IsAffixLike)
            {
                return BoundaryBetween(upper.Top, lower.Top);
            }
        }

        return BoundaryBetween(current.Top, next.Top);
    }

    /// <summary>Key/label pair describing one variant column.</summary>
    private sealed record ColumnDefinition(string Key, string Label);

    /// <summary>Normalized roll-band label, its Top coordinate and its 1-based sort order.</summary>
    private sealed record RowAnchor(string Label, int Top, int SortOrder);

    /// <summary>
    /// One table cell under construction. <see cref="Lines"/> is mutable so that
    /// <see cref="RepairLeadingAffixLeakage"/> can move lines between cells.
    /// </summary>
    private sealed class CellEntry(string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
    {
        public string RollBandLabel { get; } = rollBandLabel;

        public int RowIndex { get; } = rowIndex;

        public string ColumnKey { get; } = columnKey;

        public List<string> Lines { get; } = lines;
    }
}
|
||||
Reference in New Issue
Block a user