Implement phase 4 critical table imports

This commit is contained in:
2026-03-14 03:27:14 +01:00
parent a391a1421a
commit b2f61c3d73
17 changed files with 1280 additions and 474 deletions

View File

@@ -8,6 +8,8 @@ public sealed class CriticalImportCommandRunner
private readonly ImportArtifactWriter artifactWriter = new();
private readonly PdfXmlExtractor pdfXmlExtractor = new();
private readonly StandardCriticalTableParser standardParser = new();
private readonly VariantColumnCriticalTableParser variantColumnParser = new();
private readonly GroupedVariantCriticalTableParser groupedVariantParser = new();
public async Task<int> RunAsync(ResetOptions options)
{
@@ -96,14 +98,24 @@ public sealed class CriticalImportCommandRunner
?? throw new InvalidOperationException($"No enabled manifest entry was found for '{tableSlug}'.");
}
private StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
private CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
if (!string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase))
if (string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase))
{
throw new InvalidOperationException($"Family '{entry.Family}' is not supported by phase 2.");
return standardParser.Parse(entry, xmlContent);
}
return standardParser.Parse(entry, xmlContent);
if (string.Equals(entry.Family, "variant_column", StringComparison.OrdinalIgnoreCase))
{
return variantColumnParser.Parse(entry, xmlContent);
}
if (string.Equals(entry.Family, "grouped_variant", StringComparison.OrdinalIgnoreCase))
{
return groupedVariantParser.Parse(entry, xmlContent);
}
throw new InvalidOperationException($"Family '{entry.Family}' is not supported by the importer.");
}
private static ImportArtifactPaths CreateArtifactPaths(string slug) =>

View File

@@ -43,6 +43,15 @@ public sealed class CriticalImportLoader(string databasePath)
Notes = table.Notes
};
entity.Groups = table.Groups
.Select(item => new CriticalGroup
{
GroupKey = item.GroupKey,
Label = item.Label,
SortOrder = item.SortOrder
})
.ToList();
entity.Columns = table.Columns
.Select(item => new CriticalColumn
{
@@ -63,12 +72,14 @@ public sealed class CriticalImportLoader(string databasePath)
})
.ToList();
var groupsByKey = entity.Groups.ToDictionary(item => item.GroupKey, StringComparer.OrdinalIgnoreCase);
var columnsByKey = entity.Columns.ToDictionary(item => item.ColumnKey, StringComparer.OrdinalIgnoreCase);
var rollBandsByLabel = entity.RollBands.ToDictionary(item => item.Label, StringComparer.OrdinalIgnoreCase);
entity.Results = table.Results
.Select(item => new CriticalResult
{
CriticalGroup = item.GroupKey is null ? null : groupsByKey[item.GroupKey],
CriticalColumn = columnsByKey[item.ColumnKey],
CriticalRollBand = rollBandsByLabel[item.RollBandLabel],
RawCellText = item.RawCellText,

View File

@@ -11,7 +11,7 @@ public sealed class ImportArtifactWriter
WriteIndented = true
};
public async Task WriteAsync(ImportArtifactPaths artifactPaths, StandardCriticalTableParseResult parseResult, CancellationToken cancellationToken = default)
public async Task WriteAsync(ImportArtifactPaths artifactPaths, CriticalTableParseResult parseResult, CancellationToken cancellationToken = default)
{
Directory.CreateDirectory(artifactPaths.DirectoryPath);

View File

@@ -0,0 +1,13 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable bundle returned by a critical-table parser: the parsed table, the
/// XML text fragments it was built from, the per-cell artifacts and the
/// validation report.
/// </summary>
public sealed class CriticalTableParseResult
{
    /// <summary>Creates a result that exposes the supplied values unchanged.</summary>
    public CriticalTableParseResult(
        ParsedCriticalTable table,
        IReadOnlyList<XmlTextFragment> fragments,
        IReadOnlyList<ParsedCriticalCellArtifact> cells,
        ImportValidationReport validationReport)
    {
        Table = table;
        Fragments = fragments;
        Cells = cells;
        ValidationReport = validationReport;
    }

    /// <summary>The parsed table model.</summary>
    public ParsedCriticalTable Table { get; }

    /// <summary>The text fragments passed in at construction.</summary>
    public IReadOnlyList<XmlTextFragment> Fragments { get; }

    /// <summary>The per-cell parse artifacts.</summary>
    public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; }

    /// <summary>The validation report produced alongside the table.</summary>
    public ImportValidationReport ValidationReport { get; }
}

View File

@@ -0,0 +1,477 @@
using System.Text.RegularExpressions;
using System.Xml;
using System.Xml.Linq;
namespace RolemasterDb.ImportTool.Parsing;
internal static class CriticalTableParserSupport
{
// Added to the lowest header "top" value to obtain the first body line.
internal const int HeaderToBodyMinimumGap = 20;

// Row labels must sit at least this far above the footer key line.
internal const int FooterLabelExclusionGap = 15;

// Bare 2-3 digit fragments within this distance above the key line (or below it)
// are treated as footer page numbers.
internal const int FooterPageNumberExclusionGap = 80;

// Maximum vertical distance at which two identical row labels count as duplicates.
internal const int RowLabelDuplicateTolerance = 15;

// Fragments whose "top" differs by at most this much are grouped onto one line.
internal const int TopGroupingTolerance = 2;

// Splits a fragment's text into segments separated by runs of two or more whitespace characters.
private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled);

// Digits followed by "H", one of the legend glyphs, or a dash.
// NOTE(review): the second dash in the class is written as an explicit \u0096 escape
// (the Windows-1252 en dash that PDF extraction can emit as a C1 control character);
// the literal was not visible in the reviewed text - confirm against the repository.
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[-\u0096])", RegexOptions.Compiled);

// Optional digits followed by a signed, parenthesised number such as "(+10)" or "3(-5)".
// An empty sign alternative would also accept unsigned "(5)", so the sign is mandatory:
// "+", "-" or the \u0096 dash variant (same NOTE(review) as above).
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|\u0096)\d+\)$", RegexOptions.Compiled);
/// <summary>
/// Parses the XML text and returns one <see cref="XmlTextFragment"/> per
/// non-blank <c>text</c> element found directly under each <c>page</c> element.
/// </summary>
/// <param name="xmlContent">The XML document as a string.</param>
/// <returns>Fragments in document order; fragments whose normalized text is blank are dropped.</returns>
/// <exception cref="InvalidOperationException">A <c>text</c> element lacks a top, left, width or height attribute.</exception>
/// <exception cref="FormatException">A numeric attribute is not a valid integer.</exception>
internal static List<XmlTextFragment> LoadFragments(string xmlContent)
{
    using var stringReader = new StringReader(xmlContent);
    using var xmlReader = XmlReader.Create(
        stringReader,
        new XmlReaderSettings
        {
            // DTD declarations are skipped instead of being processed.
            DtdProcessing = DtdProcessing.Ignore
        });
    var document = XDocument.Load(xmlReader);

    return document.Descendants("page")
        .SelectMany(page =>
        {
            // A page without a "number" attribute is treated as page 1.
            var pageNumber = ParseInvariantInt(page.Attribute("number")?.Value ?? "1");
            return page.Elements("text")
                .Select(item => new XmlTextFragment(
                    pageNumber,
                    ReadRequiredInt(item, "top"),
                    ReadRequiredInt(item, "left"),
                    ReadRequiredInt(item, "width"),
                    ReadRequiredInt(item, "height"),
                    // Concatenate every descendant text node so nested inline markup is flattened.
                    NormalizeText(string.Concat(item.DescendantNodes().OfType<XText>().Select(node => node.Value)))))
                .Where(item => !string.IsNullOrWhiteSpace(item.Text));
        })
        .ToList();

    // The XML is machine-generated, so integers are parsed with the invariant culture
    // rather than whatever culture the importing process happens to run under.
    static int ParseInvariantInt(string value) =>
        int.Parse(value, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture);

    // Reads a mandatory integer attribute, keeping the original "Missing text <name> attribute." message.
    static int ReadRequiredInt(XElement element, string attributeName) =>
        ParseInvariantInt(
            element.Attribute(attributeName)?.Value
            ?? throw new InvalidOperationException($"Missing text {attributeName} attribute."));
}
/// <summary>
/// Collects the roll-band label fragments left of <paramref name="leftCutoff"/>,
/// between <paramref name="bodyStartTop"/> and the footer, merging labels that were
/// split across two fragments and dropping near-identical repeats.
/// </summary>
internal static List<XmlTextFragment> FindRowLabelFragments(
    IReadOnlyList<XmlTextFragment> fragments,
    int leftCutoff,
    int bodyStartTop,
    int keyTop)
{
    var labelBottomLimit = keyTop - FooterLabelExclusionGap;

    var ordered = fragments
        .Where(fragment =>
            fragment.Left < leftCutoff &&
            fragment.Top >= bodyStartTop &&
            fragment.Top < labelBottomLimit &&
            (IsRollBandLabel(fragment.Text) || LooksLikeSplitRollBandStart(fragment.Text)))
        .OrderBy(fragment => fragment.Top)
        .ThenBy(fragment => fragment.Left)
        .ToList();

    // Pass 1: join "NN -" + "NN" pairs; keep complete labels; drop orphaned split starts.
    var labels = new List<XmlTextFragment>();
    var position = 0;
    while (position < ordered.Count)
    {
        if (TryMergeSplitRollBand(ordered, position, out var joined))
        {
            labels.Add(joined);
            position += 2; // the following fragment was consumed by the merge
            continue;
        }

        var single = ordered[position];
        if (IsRollBandLabel(single.Text))
        {
            labels.Add(single);
        }

        position++;
    }

    // Pass 2: skip a label that repeats the previously kept one within the duplicate tolerance.
    var distinctLabels = new List<XmlTextFragment>();
    foreach (var label in labels)
    {
        var repeatsPrevious =
            distinctLabels.Count > 0 &&
            string.Equals(
                NormalizeRollBandLabel(distinctLabels[^1].Text),
                NormalizeRollBandLabel(label.Text),
                StringComparison.OrdinalIgnoreCase) &&
            Math.Abs(distinctLabels[^1].Top - label.Top) <= RowLabelDuplicateTolerance;

        if (!repeatsPrevious)
        {
            distinctLabels.Add(label);
        }
    }

    return distinctLabels;
}
/// <summary>
/// Returns true when the trimmed text is a 2-3 digit number, a range of two such
/// numbers joined by a hyphen, or a 2-3 digit number followed by "+".
/// </summary>
internal static bool IsRollBandLabel(string value)
{
    var trimmed = value.Trim();
    return Regex.IsMatch(trimmed, @"^\d{2,3}(?:\s*-\s*\d{2,3})?$|^\d{2,3}\+$");
}
/// <summary>
/// Returns true for a fragment left of <paramref name="leftCutoff"/> whose text is a
/// roll-band label or the first half of a split one.
/// </summary>
internal static bool IsPotentialRowLabelFragment(XmlTextFragment fragment, int leftCutoff)
{
    if (fragment.Left >= leftCutoff)
    {
        return false;
    }

    return IsRollBandLabel(fragment.Text) || LooksLikeSplitRollBandStart(fragment.Text);
}
/// <summary>
/// Collapses whitespace in the label and removes any whitespace around hyphens,
/// so "01 - 05" becomes "01-05".
/// </summary>
internal static string NormalizeRollBandLabel(string label)
{
    var collapsed = CollapseWhitespace(label);
    return Regex.Replace(collapsed, @"\s*-\s*", "-");
}
/// <summary>
/// Builds a roll band from a label. "NN+" gives a band with no upper bound,
/// "NN" a band whose bounds are both NN, and "NN-MM" a band from NN to MM.
/// </summary>
/// <param name="label">Raw label text; normalized before parsing.</param>
/// <param name="sortOrder">Sort position passed through to the band.</param>
internal static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder)
{
    var normalizedLabel = NormalizeRollBandLabel(label);

    if (normalizedLabel.EndsWith('+'))
    {
        var openEndedMinimum = int.Parse(normalizedLabel[..^1]);
        return new ParsedCriticalRollBand(normalizedLabel, openEndedMinimum, null, sortOrder);
    }

    var bounds = normalizedLabel.Split('-', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    var lowerBound = int.Parse(bounds[0]);
    if (bounds.Length == 1)
    {
        return new ParsedCriticalRollBand(normalizedLabel, lowerBound, lowerBound, sortOrder);
    }

    return new ParsedCriticalRollBand(normalizedLabel, lowerBound, int.Parse(bounds[1]), sortOrder);
}
/// <summary>
/// Maps a horizontal centre to a column key. Columns are scanned in list order and
/// the boundary between neighbours is the midpoint of their centres; a value exactly
/// on a boundary belongs to the right-hand column.
/// </summary>
/// <param name="centerX">Horizontal centre of the fragment being placed.</param>
/// <param name="columns">Column keys with their centres, in left-to-right order.</param>
/// <returns>The key of the first column whose right boundary lies beyond <paramref name="centerX"/>, else the last key.</returns>
internal static string ResolveColumn(double centerX, IReadOnlyList<(string Key, double CenterX)> columns)
{
    var lastIndex = columns.Count - 1;
    var index = 0;
    while (index < lastIndex)
    {
        var midpoint = (columns[index].CenterX + columns[index + 1].CenterX) / 2.0;
        if (centerX < midpoint)
        {
            return columns[index].Key;
        }

        index++;
    }

    return columns[^1].Key;
}
/// <summary>
/// Turns fragments into text lines: fragments are ordered top-to-bottom then
/// left-to-right, grouped by vertical position, joined with single spaces and
/// whitespace-collapsed. Lines that end up blank are omitted.
/// </summary>
internal static IReadOnlyList<string> BuildLines(IReadOnlyList<XmlTextFragment> fragments)
{
    var readingOrder = fragments
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    // GroupByTop applies the same "within TopGroupingTolerance of the group's first fragment" rule.
    return GroupByTop(readingOrder)
        .Select(line => CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text))))
        .Where(text => !string.IsNullOrWhiteSpace(text))
        .ToList();
}
/// <summary>
/// Heuristically decides whether a line looks like an affix (modifier/effect
/// notation) rather than descriptive prose.
/// </summary>
/// <param name="line">The line to classify; surrounding whitespace is ignored.</param>
/// <param name="affixLegendSymbols">Symbols taken from the table's footer legend; may be empty.</param>
/// <returns>True when one of the affix rules below matches; false for blank lines.</returns>
internal static bool IsAffixLikeLine(string line, ISet<string> affixLegendSymbols)
{
    var value = line.Trim();
    if (value.Length == 0)
    {
        return false;
    }

    // A lone hyphen, en dash or em dash counts as an affix line.
    if (value is "-" or "\u2013" or "\u2014")
    {
        return true;
    }

    // Lines opening with a conditional word are affixes only when they contain a colon.
    if (value.StartsWith("with ", StringComparison.OrdinalIgnoreCase) ||
        value.StartsWith("w/o ", StringComparison.OrdinalIgnoreCase) ||
        value.StartsWith("without ", StringComparison.OrdinalIgnoreCase) ||
        value.StartsWith("if ", StringComparison.OrdinalIgnoreCase) ||
        value.StartsWith("while ", StringComparison.OrdinalIgnoreCase) ||
        value.StartsWith("until ", StringComparison.OrdinalIgnoreCase) ||
        value.StartsWith("unless ", StringComparison.OrdinalIgnoreCase))
    {
        return value.Contains(':', StringComparison.Ordinal);
    }

    if (affixLegendSymbols.Count > 0 &&
        affixLegendSymbols.Any(symbol => value.Contains(symbol, StringComparison.Ordinal)))
    {
        // A legend symbol together with any digit is an affix.
        if (value.Any(char.IsDigit))
        {
            return true;
        }

        // Otherwise it is an affix only if nothing but symbols and punctuation remains.
        var remainder = value;
        foreach (var symbol in affixLegendSymbols.OrderByDescending(item => item.Length))
        {
            remainder = remainder.Replace(symbol, string.Empty, StringComparison.Ordinal);
        }

        // string.Replace throws ArgumentException for an empty search string, so the
        // second dash variant is spelled as an explicit escape.
        // NOTE(review): \u0096 (Windows-1252 en dash surfacing as a C1 control character)
        // is the presumed original literal - confirm against the repository.
        remainder = remainder
            .Replace("+", string.Empty, StringComparison.Ordinal)
            .Replace("-", string.Empty, StringComparison.Ordinal)
            .Replace("\u0096", string.Empty, StringComparison.Ordinal)
            .Replace("(", string.Empty, StringComparison.Ordinal)
            .Replace(")", string.Empty, StringComparison.Ordinal)
            .Replace("/", string.Empty, StringComparison.Ordinal);

        if (string.IsNullOrWhiteSpace(remainder))
        {
            return true;
        }
    }

    // Fallback rules: leading "+" or legend glyph, the two affix regexes, or a
    // space-delimited dash. A bare single-space test would match every multi-word
    // line, so the second dash check uses the same \u0096 escape as above.
    return value.StartsWith("+", StringComparison.Ordinal) ||
        value.StartsWith("\u2211", StringComparison.Ordinal) ||
        value.StartsWith("\u220F", StringComparison.Ordinal) ||
        value.StartsWith("\u03C0", StringComparison.Ordinal) ||
        value.StartsWith("\u222B", StringComparison.Ordinal) ||
        StandaloneModifierAffixLineRegex.IsMatch(value) ||
        NumericAffixLineRegex.IsMatch(value) ||
        value.Contains(" - ", StringComparison.Ordinal) ||
        value.Contains(" \u0096 ", StringComparison.Ordinal);
}
/// <summary>
/// Counts runs of consecutive lines that share the same classification
/// (affix-like or not). An empty list yields 0.
/// </summary>
internal static int CountLineTypeSegments(IReadOnlyList<string> lines, ISet<string> affixLegendSymbols)
{
    var segments = 0;
    bool? lastKind = null;

    for (var position = 0; position < lines.Count; position++)
    {
        var kind = IsAffixLikeLine(lines[position], affixLegendSymbols);
        if (lastKind != kind)
        {
            // Classification changed (or this is the first line): a new run starts here.
            segments++;
            lastKind = kind;
        }
    }

    return segments;
}
/// <summary>
/// Trims the value and replaces every run of whitespace with a single space.
/// </summary>
internal static string CollapseWhitespace(string value)
{
    var trimmed = value.Trim();
    return Regex.Replace(trimmed, @"\s+", " ");
}
/// <summary>
/// Normalizes extracted text: non-breaking spaces, carriage returns and line feeds
/// become plain spaces, the Windows-1252 apostrophe becomes an ASCII apostrophe,
/// and the result is trimmed.
/// </summary>
/// <remarks>
/// NOTE(review): the apostrophe source character is written as an explicit
/// <c>\u0092</c> escape because the literal was not visible in the reviewed text
/// (an empty char literal does not compile) - confirm against the repository.
/// </remarks>
internal static string NormalizeText(string value) =>
    value
        .Replace('\u00a0', ' ')
        .Replace('\r', ' ')
        .Replace('\n', ' ')
        .Replace('\u0092', '\'')
        .Trim();
/// <summary>
/// Reads the footer legend (lines at or below <paramref name="keyTop"/>, within the
/// grouping tolerance) and returns the single-character symbols that follow the
/// known "name = symbol" entries.
/// </summary>
/// <param name="fragments">All fragments of the document.</param>
/// <param name="keyTop">Top of the footer key, or <see cref="int.MaxValue"/> when no key was found.</param>
/// <returns>The detected symbols; empty when there is no footer key.</returns>
internal static HashSet<string> DetectAffixLegendSymbols(IReadOnlyList<XmlTextFragment> fragments, int keyTop)
{
    if (keyTop == int.MaxValue)
    {
        return [];
    }

    string[] legendPatterns =
    [
        @"must parry\s*=\s*(\S)",
        @"no parry\s*=\s*(\S)",
        @"stun(?:ned)?\s*=\s*(\S)",
        @"bleed\s*=\s*(\S)",
        @"powerpoint modification.*=\s*(\S)"
    ];

    var footerFragments = fragments
        .Where(item => item.Top >= keyTop - TopGroupingTolerance)
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    var symbols = new HashSet<string>(StringComparer.Ordinal);
    foreach (var footerGroup in GroupByTop(footerFragments))
    {
        var footerLine = CollapseWhitespace(
            string.Join(' ', footerGroup.OrderBy(item => item.Left).Select(item => item.Text)));

        foreach (var pattern in legendPatterns)
        {
            AddLegendMatch(symbols, footerLine, pattern);
        }
    }

    return symbols;
}
/// <summary>
/// Applies the single-fragment boundary split to every body fragment and returns
/// the flattened result in the original order.
/// </summary>
internal static List<XmlTextFragment> SplitBoundaryCrossingAffixFragments(
    IReadOnlyList<XmlTextFragment> bodyFragments,
    IReadOnlyList<(string Key, double CenterX)> columnCenters,
    ISet<string> affixLegendSymbols) =>
    bodyFragments
        .SelectMany(fragment => SplitBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
        .ToList();
/// <summary>
/// Groups body fragments into visual lines and records, for each line, its top
/// coordinate and whether every column's text on that line is affix-like.
/// </summary>
/// <returns>One entry per line, in top-to-bottom order.</returns>
internal static List<(int Top, bool IsAffixLike)> BuildBodyLines(
    IReadOnlyList<XmlTextFragment> bodyFragments,
    IReadOnlyList<(string Key, double CenterX)> columnCenters,
    ISet<string> affixLegendSymbols)
{
    var readingOrder = bodyFragments
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    return GroupByTop(readingOrder)
        .Select(lineFragments =>
        {
            // Text of the line, split per resolved column.
            var perColumnText = lineFragments
                .GroupBy(item => ResolveColumn(item.CenterX, columnCenters), StringComparer.OrdinalIgnoreCase)
                .Select(group => CollapseWhitespace(string.Join(' ', group.OrderBy(item => item.Left).Select(item => item.Text))))
                .Where(text => !string.IsNullOrWhiteSpace(text))
                .ToList();

            // A line with no usable text is never affix-like.
            var allAffix = perColumnText.Count > 0 &&
                perColumnText.All(text => IsAffixLikeLine(text, affixLegendSymbols));

            return (Top: lineFragments[0].Top, IsAffixLike: allAffix);
        })
        .ToList();
}
/// <summary>
/// Returns true when a footer key exists and the fragment is a bare 2-3 digit number
/// located no higher than <see cref="FooterPageNumberExclusionGap"/> above the key.
/// </summary>
internal static bool IsFooterPageNumberFragment(XmlTextFragment fragment, int keyTop)
{
    var hasFooterKey = keyTop != int.MaxValue;

    return hasFooterKey &&
        fragment.Top >= keyTop - FooterPageNumberExclusionGap &&
        Regex.IsMatch(fragment.Text, @"^\d{2,3}$");
}
/// <summary>
/// Splits the fragments, in the order given, into consecutive groups. A fragment
/// joins the current group while its top is within <see cref="TopGroupingTolerance"/>
/// of the group's first fragment; otherwise it starts a new group.
/// </summary>
/// <remarks>The input is not sorted here; callers pass fragments already ordered by top.</remarks>
internal static IEnumerable<List<XmlTextFragment>> GroupByTop(IReadOnlyList<XmlTextFragment> fragments)
{
    var buckets = new List<List<XmlTextFragment>>();
    List<XmlTextFragment>? openBucket = null;

    foreach (var fragment in fragments)
    {
        if (openBucket is null || Math.Abs(openBucket[0].Top - fragment.Top) > TopGroupingTolerance)
        {
            openBucket = [fragment];
            buckets.Add(openBucket);
        }
        else
        {
            openBucket.Add(fragment);
        }
    }

    return buckets;
}
/// <summary>
/// Returns true when the trimmed text is a 2-3 digit number followed by a hyphen
/// with nothing after it, i.e. the first half of a range split across fragments.
/// </summary>
private static bool LooksLikeSplitRollBandStart(string value)
{
    var trimmed = value.Trim();
    return Regex.IsMatch(trimmed, @"^\d{2,3}\s*-$");
}
/// <summary>
/// Tries to join the candidate at <paramref name="index"/> (text such as "45 -")
/// with the candidate that follows it (a bare 2-3 digit number) into one fragment
/// labelled "start-end".
/// </summary>
/// <param name="candidates">Label candidates in the order they should be paired.</param>
/// <param name="index">Position of the potential range start.</param>
/// <param name="mergedCandidate">The combined fragment on success; null (suppressed) on failure.</param>
/// <returns>True when the two candidates were merged.</returns>
private static bool TryMergeSplitRollBand(IReadOnlyList<XmlTextFragment> candidates, int index, out XmlTextFragment mergedCandidate)
{
var current = candidates[index];
// Nothing to merge unless this is a dangling "NN -" and a following candidate exists.
if (!LooksLikeSplitRollBandStart(current.Text) || index + 1 >= candidates.Count)
{
mergedCandidate = null!;
return false;
}
var next = candidates[index + 1];
// The second half must be on the same page, be a bare 2-3 digit number, sit strictly
// below the first half but within RowLabelDuplicateTolerance + 5, and start within
// 20 units of the first half's left edge.
if (current.PageNumber != next.PageNumber ||
!Regex.IsMatch(next.Text.Trim(), @"^\d{2,3}$") ||
next.Top <= current.Top ||
next.Top - current.Top > RowLabelDuplicateTolerance + 5 ||
Math.Abs(next.Left - current.Left) > 20)
{
mergedCandidate = null!;
return false;
}
var startDigits = Regex.Match(current.Text, @"\d{2,3}").Value;
var mergedLabel = $"{startDigits}-{next.Text.Trim()}";
var right = Math.Max(current.Left + current.Width, next.Left + next.Width);
// The merged fragment keeps the first half's page and top, and spans both halves horizontally.
mergedCandidate = new XmlTextFragment(
current.PageNumber,
current.Top,
Math.Min(current.Left, next.Left),
right - Math.Min(current.Left, next.Left),
Math.Max(current.Height, next.Height),
mergedLabel);
return true;
}
/// <summary>
/// Splits one affix-like fragment that straddles a column boundary into several
/// fragments, one per text segment, with positions estimated from the average
/// character width. Returns the original fragment when no useful split is possible.
/// </summary>
/// <param name="fragment">The fragment to examine.</param>
/// <param name="columnCenters">Column keys with their centres, in left-to-right order.</param>
/// <param name="affixLegendSymbols">Legend symbols used by the affix classification.</param>
/// <returns>Either a single-element list holding <paramref name="fragment"/> or the split segments.</returns>
private static IReadOnlyList<XmlTextFragment> SplitBoundaryCrossingAffixFragment(
XmlTextFragment fragment,
IReadOnlyList<(string Key, double CenterX)> columnCenters,
ISet<string> affixLegendSymbols)
{
if (!LooksLikeBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
{
return [fragment];
}
// Segments are the pieces of text separated by two or more whitespace characters.
var matches = MultiFragmentSplitRegex.Matches(fragment.Text);
if (matches.Count < 2)
{
return [fragment];
}
// Average width per character; Math.Max guards against division by zero for empty text.
var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
var splitFragments = new List<XmlTextFragment>(matches.Count);
foreach (Match match in matches)
{
var segmentText = CollapseWhitespace(match.Value);
if (segmentText.Length == 0)
{
continue;
}
// Estimate the segment's left edge and width from its character offset and length.
var segmentLeft = fragment.Left + (int)Math.Round(characterWidth * match.Index);
var segmentWidth = Math.Max(1, (int)Math.Round(characterWidth * match.Length));
splitFragments.Add(new XmlTextFragment(
fragment.PageNumber,
fragment.Top,
segmentLeft,
segmentWidth,
fragment.Height,
segmentText));
}
if (splitFragments.Count < 2)
{
return [fragment];
}
var originalColumn = ResolveColumn(fragment.CenterX, columnCenters);
var distinctColumns = splitFragments
.Select(item => ResolveColumn(item.CenterX, columnCenters))
.Distinct(StringComparer.OrdinalIgnoreCase)
.ToList();
// Keep the split only if it changes column assignment: the segments land in more
// than one column, or in a column other than the one the whole fragment resolved to.
return distinctColumns.Count > 1 || distinctColumns.Any(item => !string.Equals(item, originalColumn, StringComparison.OrdinalIgnoreCase))
? splitFragments
: [fragment];
}
/// <summary>
/// Returns true when the fragment is affix-like, contains a space, and its horizontal
/// extent strictly spans at least one midpoint between neighbouring column centres.
/// </summary>
private static bool LooksLikeBoundaryCrossingAffixFragment(
    XmlTextFragment fragment,
    IReadOnlyList<(string Key, double CenterX)> columnCenters,
    ISet<string> affixLegendSymbols)
{
    if (!IsAffixLikeLine(fragment.Text, affixLegendSymbols))
    {
        return false;
    }

    if (!fragment.Text.Contains(" ", StringComparison.Ordinal))
    {
        return false;
    }

    var rightEdge = fragment.Left + fragment.Width;

    // Pair each column with its right-hand neighbour and test the midpoint between them.
    return columnCenters
        .Zip(columnCenters.Skip(1), (leftColumn, rightColumn) => (leftColumn.CenterX + rightColumn.CenterX) / 2.0)
        .Any(boundary => fragment.Left < boundary && rightEdge > boundary);
}
/// <summary>
/// Runs <paramref name="pattern"/> case-insensitively over <paramref name="value"/>
/// and adds the first capture group of every match to <paramref name="symbols"/>.
/// </summary>
private static void AddLegendMatch(HashSet<string> symbols, string value, string pattern)
{
    var captured = Regex.Matches(value, pattern, RegexOptions.IgnoreCase)
        .Where(match => match.Groups.Count > 1)
        .Select(match => match.Groups[1].Value);

    symbols.UnionWith(captured);
}
}

View File

@@ -0,0 +1,306 @@
namespace RolemasterDb.ImportTool.Parsing;
public sealed class GroupedVariantCriticalTableParser
{
// Section headers this parser looks for, in left-to-right order.
// The label is matched against the fragment text; the key is stored on each parsed cell.
private static readonly ParsedCriticalGroup[] ExpectedGroups =
[
new("large", "Large Creatures", 1),
new("super_large", "Super Large Creatures", 2)
];
// Variant columns that repeat under every group, in left-to-right order.
private static readonly ParsedCriticalColumn[] ExpectedColumns =
[
new("NORMAL", "Normal", "variant", 1),
new("SLAYING", "Slaying", "variant", 2)
];
/// <summary>
/// Parses a grouped-variant critical table (two creature groups, each with a
/// Normal and a Slaying column) from the extracted XML.
/// </summary>
/// <param name="entry">Manifest entry supplying slug, display name, family and PDF path.</param>
/// <param name="xmlContent">The XML text to parse.</param>
/// <returns>The parsed table together with the source fragments, per-cell artifacts and validation report.</returns>
/// <exception cref="InvalidOperationException">The group header row or the column header row cannot be found.</exception>
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
var groupHeaders = FindGroupHeaders(fragments);
var columnHeaders = FindColumnHeaders(fragments);
var validationErrors = new List<string>();
var validationWarnings = new List<string>();
// The column headers, left to right, map onto (group, column) pairs:
// index / column count selects the group, index % column count the variant.
var combinedColumnAnchors = columnHeaders
.OrderBy(item => item.Left)
.Select((item, index) =>
{
var group = ExpectedGroups[index / ExpectedColumns.Length];
var column = ExpectedColumns[index % ExpectedColumns.Length];
return (group.GroupKey, column.ColumnKey, CompositeKey: $"{group.GroupKey}:{column.ColumnKey}", item.CenterX);
})
.ToList();
// The body starts a fixed gap below whichever header row is lower.
var bodyStartTop = Math.Max(
groupHeaders.Max(item => item.Top),
columnHeaders.Max(item => item.Top))
+ CriticalTableParserSupport.HeaderToBodyMinimumGap;
// Top of the footer legend; int.MaxValue signals that no legend marker was found.
var keyTop = fragments
.Where(item =>
string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) ||
item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
.Select(item => (int?)item.Top)
.Min() ?? int.MaxValue;
var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop);
// Anything starting more than 10 units left of the leftmost column header can be a row label.
var leftCutoff = columnHeaders.Min(item => item.Left) - 10;
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
fragments,
leftCutoff,
bodyStartTop,
keyTop);
var rowAnchors = rowLabelFragments
.OrderBy(item => item.Top)
.Select((item, index) => new RowAnchor(CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), item.Top, index + 1))
.ToList();
if (rowAnchors.Count == 0)
{
validationErrors.Add("No roll-band labels were found in the XML artifact.");
}
var columnCenters = combinedColumnAnchors
.Select(item => (item.CompositeKey, item.CenterX))
.ToList();
// Body fragments: everything between the headers and the footer that is not a
// page number, a row label or one of the header fragments.
var bodyFragments = fragments
.Where(item =>
item.Top >= bodyStartTop &&
item.Top < keyTop - CriticalTableParserSupport.TopGroupingTolerance &&
!CriticalTableParserSupport.IsFooterPageNumberFragment(item, keyTop) &&
!CriticalTableParserSupport.IsPotentialRowLabelFragment(item, leftCutoff) &&
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) &&
!groupHeaders.Contains(item) &&
!columnHeaders.Contains(item))
.ToList();
bodyFragments = CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
var parsedRollBands = rowAnchors
.Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
.ToList();
var cellEntries = new List<CellEntry>();
for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
{
// Vertical slice of this row: from the boundary with the previous row (or the
// body start) up to the boundary with the next row (or just above the footer).
var rowStart = rowIndex == 0
? bodyStartTop
: ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines);
var rowEnd = rowIndex == rowAnchors.Count - 1
? keyTop - 1
: ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines);
var rowFragments = bodyFragments
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
.ToList();
foreach (var anchor in combinedColumnAnchors)
{
var cellFragments = rowFragments
.Where(item => CriticalTableParserSupport.ResolveColumn(item.CenterX, columnCenters) == anchor.CompositeKey)
.OrderBy(item => item.Top)
.ThenBy(item => item.Left)
.ToList();
// An empty cell is reported as a validation error and produces no entry.
if (cellFragments.Count == 0)
{
validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', group '{anchor.GroupKey}', column '{anchor.ColumnKey}'.");
continue;
}
cellEntries.Add(new CellEntry(
anchor.GroupKey,
rowAnchors[rowIndex].Label,
rowIndex,
anchor.ColumnKey,
CriticalTableParserSupport.BuildLines(cellFragments).ToList()));
}
}
// Move affix lines that ended up at the start of a cell back to the cell above it.
RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);
var parsedCells = new List<ParsedCriticalCellArtifact>();
var parsedResults = new List<ParsedCriticalResult>();
foreach (var cellEntry in cellEntries
.OrderBy(item => item.RowIndex)
.ThenBy(item => item.GroupKey, StringComparer.Ordinal)
.ThenBy(item => item.ColumnKey, StringComparer.Ordinal))
{
// More than two runs means prose and affix lines alternate, which is flagged as an error.
var segmentCount = CriticalTableParserSupport.CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols);
if (segmentCount > 2)
{
validationErrors.Add($"Cell '{cellEntry.RollBandLabel}/{cellEntry.GroupKey}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
}
var rawAffixLines = cellEntry.Lines.Where(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var descriptionLines = cellEntry.Lines.Where(line => !CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines);
var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', descriptionLines));
// Affix text is null (not empty) when the cell has no affix lines.
var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
parsedCells.Add(new ParsedCriticalCellArtifact(
cellEntry.GroupKey,
cellEntry.RollBandLabel,
cellEntry.ColumnKey,
cellEntry.Lines,
rawCellText,
descriptionText,
rawAffixText));
parsedResults.Add(new ParsedCriticalResult(
cellEntry.GroupKey,
cellEntry.ColumnKey,
cellEntry.RollBandLabel,
rawCellText,
descriptionText,
rawAffixText));
}
// Every row should yield one cell per (group, column) combination.
var expectedCellCount = rowAnchors.Count * ExpectedGroups.Length * ExpectedColumns.Length;
if (parsedCells.Count != expectedCellCount)
{
validationErrors.Add($"Expected {expectedCellCount} parsed cells but produced {parsedCells.Count}.");
}
var validationReport = new ImportValidationReport(
validationErrors.Count == 0,
validationErrors,
validationWarnings,
rowAnchors.Count,
parsedCells.Count);
var table = new ParsedCriticalTable(
entry.Slug,
entry.DisplayName,
entry.Family,
Path.GetFileName(entry.PdfPath),
"Imported from PDF XML extraction.",
ExpectedGroups,
ExpectedColumns,
parsedRollBands,
parsedResults);
return new CriticalTableParseResult(table, fragments, parsedCells, validationReport);
}
/// <summary>
/// Locates the line on which the expected group labels appear, left to right, in
/// exactly the expected order (case-insensitive) and returns its fragments.
/// </summary>
/// <exception cref="InvalidOperationException">No such line exists.</exception>
private static List<XmlTextFragment> FindGroupHeaders(IReadOnlyList<XmlTextFragment> fragments)
{
    var expectedLabels = ExpectedGroups.Select(item => item.Label).ToList();

    var labelFragments = fragments
        .Where(item => expectedLabels.Contains(item.Text.Trim(), StringComparer.OrdinalIgnoreCase))
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    // First visual line whose labels, read left to right, equal the expected sequence.
    var headerRow = CriticalTableParserSupport.GroupByTop(labelFragments)
        .Select(row => row.OrderBy(item => item.Left).ToList())
        .FirstOrDefault(row => row
            .Select(item => item.Text.Trim())
            .SequenceEqual(expectedLabels, StringComparer.OrdinalIgnoreCase));

    return headerRow
        ?? throw new InvalidOperationException("Could not find the grouped-variant section headers in the XML artifact.");
}
/// <summary>
/// Locates the line that reads "normal, slaying, normal, slaying" from left to right
/// (compared after trimming and invariant lower-casing) and returns its fragments.
/// </summary>
/// <exception cref="InvalidOperationException">No such line exists.</exception>
private static List<XmlTextFragment> FindColumnHeaders(IReadOnlyList<XmlTextFragment> fragments)
{
    string[] expectedLabels = ["normal", "slaying", "normal", "slaying"];

    static string Normalize(XmlTextFragment fragment) => fragment.Text.Trim().ToLowerInvariant();

    var labelFragments = fragments
        .Where(item => Normalize(item) is "normal" or "slaying")
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    // First visual line whose labels, read left to right, equal the expected sequence.
    var headerRow = CriticalTableParserSupport.GroupByTop(labelFragments)
        .Select(row => row.OrderBy(item => item.Left).ToList())
        .FirstOrDefault(row => row.Select(Normalize).SequenceEqual(expectedLabels));

    return headerRow
        ?? throw new InvalidOperationException("Could not find the grouped-variant column header row in the XML artifact.");
}
/// <summary>
/// For every (group, column) pair and every pair of vertically adjacent rows, moves
/// affix-like lines found at the start of the lower cell to the end of the upper
/// cell. A lower cell that consists only of affix lines is left untouched.
/// </summary>
/// <param name="cellEntries">Cells to repair; their line lists are modified in place.</param>
/// <param name="affixLegendSymbols">Legend symbols used by the affix classification.</param>
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries, ISet<string> affixLegendSymbols)
{
    if (cellEntries.Count == 0)
    {
        return;
    }

    var lastRowIndex = cellEntries.Max(item => item.RowIndex);
    var axes = cellEntries
        .Select(item => (item.GroupKey, item.ColumnKey))
        .Distinct()
        .ToList();

    CellEntry? FindCell(int row, string group, string column) =>
        cellEntries.SingleOrDefault(item => item.RowIndex == row && item.GroupKey == group && item.ColumnKey == column);

    // Rows are processed top-down so a repaired cell is the "upper" cell of the next pair.
    for (var rowIndex = 0; rowIndex < lastRowIndex; rowIndex++)
    {
        foreach (var (groupKey, columnKey) in axes)
        {
            var upper = FindCell(rowIndex, groupKey, columnKey);
            var lower = FindCell(rowIndex + 1, groupKey, columnKey);
            if (upper is null || lower is null)
            {
                continue;
            }

            var leakedCount = lower.Lines
                .TakeWhile(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols))
                .Count();

            // Nothing leaked, or the whole lower cell is affix text: leave both cells alone.
            if (leakedCount == 0 || leakedCount == lower.Lines.Count)
            {
                continue;
            }

            upper.Lines.AddRange(lower.Lines.GetRange(0, leakedCount));
            lower.Lines.RemoveRange(0, leakedCount);
        }
    }
}
/// <summary>
/// Determines the top coordinate at which the row of <paramref name="next"/> begins.
/// Scanning upward from the next label, the boundary is placed between the lowest
/// affix-like line and the non-affix line directly below it; when no such pair exists
/// the boundary is placed between the two labels.
/// </summary>
private static int ResolveRowBoundaryTop(
    RowAnchor current,
    RowAnchor next,
    IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines)
{
    static int MidpointBelow(int upperTop, int lowerTop) =>
        (int)Math.Floor((upperTop + lowerTop) / 2.0) + 1;

    var between = bodyLines
        .Where(item => item.Top >= current.Top && item.Top < next.Top)
        .OrderBy(item => item.Top)
        .ToList();

    for (var lower = between.Count - 1; lower >= 1; lower--)
    {
        var upper = lower - 1;
        if (between[upper].IsAffixLike && !between[lower].IsAffixLike)
        {
            return MidpointBelow(between[upper].Top, between[lower].Top);
        }
    }

    return MidpointBelow(current.Top, next.Top);
}
// A roll-band row: its normalized label, the top coordinate of the label fragment,
// and its 1-based position among the rows ordered by top.
private sealed record RowAnchor(string Label, int Top, int SortOrder);
/// <summary>
/// Working representation of one table cell while parsing. The line list is
/// exposed as a mutable <see cref="List{T}"/> and is edited in place by
/// RepairLeadingAffixLeakage.
/// </summary>
private sealed class CellEntry
{
    public CellEntry(string groupKey, string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
    {
        GroupKey = groupKey;
        RollBandLabel = rollBandLabel;
        RowIndex = rowIndex;
        ColumnKey = columnKey;
        Lines = lines;
    }

    public string GroupKey { get; }

    public string RollBandLabel { get; }

    public int RowIndex { get; }

    public string ColumnKey { get; }

    public List<string> Lines { get; }
}
}

View File

@@ -1,6 +1,7 @@
namespace RolemasterDb.ImportTool.Parsing;
public sealed class ParsedCriticalCellArtifact(
string? groupKey,
string rollBandLabel,
string columnKey,
IReadOnlyList<string> lines,
@@ -8,6 +9,7 @@ public sealed class ParsedCriticalCellArtifact(
string descriptionText,
string? rawAffixText)
{
public string? GroupKey { get; } = groupKey;
public string RollBandLabel { get; } = rollBandLabel;
public string ColumnKey { get; } = columnKey;
public IReadOnlyList<string> Lines { get; } = lines;

View File

@@ -0,0 +1,8 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// A group (section) of a parsed critical table: a key, a display label and a sort position.
/// </summary>
public sealed class ParsedCriticalGroup
{
    /// <summary>Creates a group that exposes the supplied values unchanged.</summary>
    public ParsedCriticalGroup(string groupKey, string label, int sortOrder)
    {
        GroupKey = groupKey;
        Label = label;
        SortOrder = sortOrder;
    }

    /// <summary>Key identifying the group.</summary>
    public string GroupKey { get; }

    /// <summary>Display label of the group.</summary>
    public string Label { get; }

    /// <summary>Sort position of the group.</summary>
    public int SortOrder { get; }
}

View File

@@ -1,12 +1,14 @@
namespace RolemasterDb.ImportTool.Parsing;
public sealed class ParsedCriticalResult(
string? groupKey,
string columnKey,
string rollBandLabel,
string rawCellText,
string descriptionText,
string? rawAffixText)
{
public string? GroupKey { get; } = groupKey;
public string ColumnKey { get; } = columnKey;
public string RollBandLabel { get; } = rollBandLabel;
public string RawCellText { get; } = rawCellText;

View File

@@ -6,6 +6,7 @@ public sealed class ParsedCriticalTable(
string family,
string sourceDocument,
string? notes,
IReadOnlyList<ParsedCriticalGroup> groups,
IReadOnlyList<ParsedCriticalColumn> columns,
IReadOnlyList<ParsedCriticalRollBand> rollBands,
IReadOnlyList<ParsedCriticalResult> results)
@@ -15,6 +16,7 @@ public sealed class ParsedCriticalTable(
public string Family { get; } = family;
public string SourceDocument { get; } = sourceDocument;
public string? Notes { get; } = notes;
public IReadOnlyList<ParsedCriticalGroup> Groups { get; } = groups;
public IReadOnlyList<ParsedCriticalColumn> Columns { get; } = columns;
public IReadOnlyList<ParsedCriticalRollBand> RollBands { get; } = rollBands;
public IReadOnlyList<ParsedCriticalResult> Results { get; } = results;

View File

@@ -1,33 +1,20 @@
using System.Text.RegularExpressions;
using System.Xml;
using System.Xml.Linq;
namespace RolemasterDb.ImportTool.Parsing;
public sealed class StandardCriticalTableParser
{
private const int HeaderToBodyMinimumGap = 20;
private const int FooterLabelExclusionGap = 15;
private const int FooterPageNumberExclusionGap = 80;
private const int RowLabelDuplicateTolerance = 15;
private const int TopGroupingTolerance = 2;
private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled);
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[-])", RegexOptions.Compiled);
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-)\d+\)$", RegexOptions.Compiled);
public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
var fragments = LoadFragments(xmlContent);
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
var headerFragments = FindHeaderFragments(fragments);
var validationErrors = new List<string>();
var validationWarnings = new List<string>();
var columnCenters = headerFragments
.OrderBy(item => item.Left)
.Select(item => new ColumnAnchor(item.Text.ToUpperInvariant(), item.CenterX))
.Select(item => (Key: item.Text.ToUpperInvariant(), CenterX: item.CenterX))
.ToList();
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap;
var keyTop = fragments
.Where(item =>
string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
@@ -35,12 +22,17 @@ public sealed class StandardCriticalTableParser
item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
.Select(item => (int?)item.Top)
.Min() ?? int.MaxValue;
var affixLegendSymbols = DetectAffixLegendSymbols(fragments, keyTop);
var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments, keyTop);
var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop);
var leftCutoff = headerFragments.Min(item => item.Left) - 10;
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
fragments,
leftCutoff,
bodyStartTop,
keyTop);
var rowAnchors = rowLabelFragments
.OrderBy(item => item.Top)
.Select((item, index) => new RowAnchor(item.Text, item.Top, index + 1))
.Select((item, index) => new RowAnchor(CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), item.Top, index + 1))
.ToList();
if (rowAnchors.Count == 0)
@@ -51,16 +43,17 @@ public sealed class StandardCriticalTableParser
var bodyFragments = fragments
.Where(item =>
item.Top >= bodyStartTop &&
item.Top < keyTop - TopGroupingTolerance &&
!IsFooterPageNumberFragment(item, keyTop) &&
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
item.Top < keyTop - CriticalTableParserSupport.TopGroupingTolerance &&
!CriticalTableParserSupport.IsFooterPageNumberFragment(item, keyTop) &&
!CriticalTableParserSupport.IsPotentialRowLabelFragment(item, leftCutoff) &&
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) &&
!headerFragments.Contains(item))
.ToList();
bodyFragments = SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
var bodyLines = BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
bodyFragments = CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
var parsedRollBands = rowAnchors
.Select(anchor => CreateRollBand(anchor.Label, anchor.SortOrder))
.Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
.ToList();
var cellEntries = new List<CellEntry>();
@@ -82,7 +75,7 @@ public sealed class StandardCriticalTableParser
foreach (var columnAnchor in columnCenters)
{
var cellFragments = rowFragments
.Where(item => ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key)
.Where(item => CriticalTableParserSupport.ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key)
.OrderBy(item => item.Top)
.ThenBy(item => item.Left)
.ToList();
@@ -97,7 +90,7 @@ public sealed class StandardCriticalTableParser
rowAnchors[rowIndex].Label,
rowIndex,
columnAnchor.Key,
BuildLines(cellFragments).ToList()));
CriticalTableParserSupport.BuildLines(cellFragments).ToList()));
}
}
@@ -108,7 +101,7 @@ public sealed class StandardCriticalTableParser
foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey))
{
var segmentCount = CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols);
var segmentCount = CriticalTableParserSupport.CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols);
if (segmentCount > 2)
{
@@ -116,13 +109,14 @@ public sealed class StandardCriticalTableParser
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
}
var rawAffixLines = cellEntry.Lines.Where(line => IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var descriptionLines = cellEntry.Lines.Where(line => !IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var rawAffixLines = cellEntry.Lines.Where(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var descriptionLines = cellEntry.Lines.Where(line => !CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines);
var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines));
var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', descriptionLines));
var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
parsedCells.Add(new ParsedCriticalCellArtifact(
null,
cellEntry.RollBandLabel,
cellEntry.ColumnKey,
cellEntry.Lines,
@@ -131,6 +125,7 @@ public sealed class StandardCriticalTableParser
rawAffixText));
parsedResults.Add(new ParsedCriticalResult(
null,
cellEntry.ColumnKey,
cellEntry.RollBandLabel,
rawCellText,
@@ -162,40 +157,12 @@ public sealed class StandardCriticalTableParser
entry.Family,
Path.GetFileName(entry.PdfPath),
"Imported from PDF XML extraction.",
[],
columnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(),
parsedRollBands,
parsedResults);
return new StandardCriticalTableParseResult(table, fragments, parsedCells, validationReport);
}
private static List<XmlTextFragment> LoadFragments(string xmlContent)
{
using var stringReader = new StringReader(xmlContent);
using var xmlReader = XmlReader.Create(
stringReader,
new XmlReaderSettings
{
DtdProcessing = DtdProcessing.Ignore
});
var document = XDocument.Load(xmlReader);
return document.Descendants("page")
.SelectMany(page =>
{
var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1");
return page.Elements("text")
.Select(item => new XmlTextFragment(
pageNumber,
int.Parse(item.Attribute("top")?.Value ?? throw new InvalidOperationException("Missing text top attribute.")),
int.Parse(item.Attribute("left")?.Value ?? throw new InvalidOperationException("Missing text left attribute.")),
int.Parse(item.Attribute("width")?.Value ?? throw new InvalidOperationException("Missing text width attribute.")),
int.Parse(item.Attribute("height")?.Value ?? throw new InvalidOperationException("Missing text height attribute.")),
NormalizeText(string.Concat(item.DescendantNodes().OfType<XText>().Select(node => node.Value)))))
.Where(item => !string.IsNullOrWhiteSpace(item.Text));
})
.ToList();
return new CriticalTableParseResult(table, fragments, parsedCells, validationReport);
}
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
@@ -206,7 +173,7 @@ public sealed class StandardCriticalTableParser
.ThenBy(item => item.Left)
.ToList();
foreach (var group in GroupByTop(headerCandidates))
foreach (var group in CriticalTableParserSupport.GroupByTop(headerCandidates))
{
var ordered = group.OrderBy(item => item.Left).ToList();
var labels = ordered.Select(item => item.Text.ToUpperInvariant()).ToList();
@@ -219,156 +186,6 @@ public sealed class StandardCriticalTableParser
throw new InvalidOperationException("Could not find the standard-table A-E header row in the XML artifact.");
}
/// <summary>
/// Collects the roll-band label fragments located left of the header columns, below the
/// header row and above the footer, skipping labels that immediately repeat the previous one.
/// </summary>
/// <param name="fragments">All text fragments to search.</param>
/// <param name="headerFragments">The header-row fragments; their left and top extents bound the search area.</param>
/// <param name="keyTop">Top coordinate of the footer key line.</param>
/// <returns>The accepted label fragments ordered by top coordinate.</returns>
private static List<XmlTextFragment> FindRowLabelFragments(
    IReadOnlyList<XmlTextFragment> fragments,
    IReadOnlyList<XmlTextFragment> headerFragments,
    int keyTop)
{
    var labelRightLimit = headerFragments.Min(header => header.Left) - 10;
    var labelTopLimit = headerFragments.Max(header => header.Top) + HeaderToBodyMinimumGap;
    var labelBottomLimit = keyTop - FooterLabelExclusionGap;

    bool IsLabelCandidate(XmlTextFragment fragment) =>
        fragment.Left < labelRightLimit &&
        fragment.Top >= labelTopLimit &&
        fragment.Top < labelBottomLimit &&
        IsRollBandLabel(fragment.Text);

    var uniqueLabels = new List<XmlTextFragment>();
    foreach (var label in fragments.Where(IsLabelCandidate).OrderBy(fragment => fragment.Top))
    {
        if (uniqueLabels.Count > 0)
        {
            // Only the most recently accepted label is compared, so a repeat must be adjacent to count as a duplicate.
            var last = uniqueLabels[^1];
            var repeatsLast =
                string.Equals(last.Text, label.Text, StringComparison.OrdinalIgnoreCase) &&
                Math.Abs(last.Top - label.Top) <= RowLabelDuplicateTolerance;
            if (repeatsLast)
            {
                continue;
            }
        }

        uniqueLabels.Add(label);
    }

    return uniqueLabels;
}
/// <summary>
/// Returns true when <paramref name="value"/>, ignoring surrounding whitespace, is a roll-band
/// label: a two- or three-digit number ("66"), a range of two such numbers ("01-05"), or a
/// number followed by a plus sign ("251+").
/// </summary>
/// <remarks>
/// Only ASCII digits are accepted. <c>\d</c> would also match other Unicode decimal digits,
/// which <see cref="int.Parse(string)"/> rejects when the label is later converted to numbers.
/// </remarks>
private static bool IsRollBandLabel(string value) =>
    Regex.IsMatch(value.Trim(), @"^[0-9]{2,3}(?:-[0-9]{2,3})?$|^[0-9]{2,3}\+$");
/// <summary>
/// Builds a roll band from its label: "N+" yields a lower bound with no upper bound,
/// "N" yields identical lower and upper bounds, and "N-M" yields both bounds.
/// </summary>
/// <param name="label">The roll-band label text.</param>
/// <param name="sortOrder">The sort order passed through to the created roll band.</param>
private static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder)
{
    if (!label.EndsWith('+'))
    {
        var bounds = label.Split('-', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
        var lower = int.Parse(bounds[0]);
        var upper = bounds.Length == 1 ? lower : int.Parse(bounds[1]);
        return new ParsedCriticalRollBand(label, lower, upper, sortOrder);
    }

    // Open-ended band: everything before the trailing '+' is the lower bound.
    return new ParsedCriticalRollBand(label, int.Parse(label[..^1]), null, sortOrder);
}
/// <summary>
/// Returns the key of the column that a horizontal centre position belongs to. Columns are
/// separated at the midpoint between neighbouring column centres; positions at or beyond the
/// last midpoint fall into the last column.
/// </summary>
/// <param name="centerX">Horizontal centre of the fragment being placed.</param>
/// <param name="columns">Column anchors in left-to-right order.</param>
private static string ResolveColumn(double centerX, IReadOnlyList<ColumnAnchor> columns)
{
    var lastIndex = columns.Count - 1;
    var index = 0;
    while (index < lastIndex)
    {
        var midpoint = (columns[index].CenterX + columns[index + 1].CenterX) / 2.0;
        if (centerX < midpoint)
        {
            break;
        }

        index++;
    }

    return columns[index].Key;
}
/// <summary>
/// Turns the fragments of one cell into text lines. Fragments are sorted by top then left,
/// grouped into a line while their top stays within <c>TopGroupingTolerance</c> of the line's
/// first fragment, joined left-to-right with single spaces, and blank lines are dropped.
/// </summary>
private static IReadOnlyList<string> BuildLines(IReadOnlyList<XmlTextFragment> fragments)
{
    var groupedLines = new List<List<XmlTextFragment>>();
    foreach (var fragment in fragments.OrderBy(item => item.Top).ThenBy(item => item.Left))
    {
        var startsNewLine =
            groupedLines.Count == 0 ||
            Math.Abs(groupedLines[^1][0].Top - fragment.Top) > TopGroupingTolerance;
        if (startsNewLine)
        {
            groupedLines.Add(new List<XmlTextFragment>());
        }

        groupedLines[^1].Add(fragment);
    }

    var texts = new List<string>(groupedLines.Count);
    foreach (var lineFragments in groupedLines)
    {
        var joined = string.Join(' ', lineFragments.OrderBy(item => item.Left).Select(item => item.Text));
        var text = CollapseWhitespace(joined);
        if (!string.IsNullOrWhiteSpace(text))
        {
            texts.Add(text);
        }
    }

    return texts;
}
// Heuristically decides whether a text line is an affix line rather than descriptive prose
// (callers use the answer to separate a cell's affix lines from its description lines).
// `affixLegendSymbols` holds the symbols detected in the table footer legend; it may be empty.
// The checks run in order and the first one that applies decides the result.
private static bool IsAffixLikeLine(string line, ISet<string> affixLegendSymbols)
{
var value = line.Trim();
// Blank lines are never affix lines.
if (value.Length == 0)
{
return false;
}
// A lone hyphen, en dash (U+2013) or em dash (U+2014) counts as an affix line.
if (value == "-" || value == "\u2013" || value == "\u2014")
{
return true;
}
// Lines opening with a conditional word are affix lines only if they also contain a colon;
// this branch returns either way, so none of the later checks run for such lines.
if (value.StartsWith("with ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("w/o ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("without ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("if ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("while ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("until ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("unless ", StringComparison.OrdinalIgnoreCase))
{
return value.Contains(':', StringComparison.Ordinal);
}
// Lines that contain at least one legend symbol.
if (affixLegendSymbols.Count > 0 &&
affixLegendSymbols.Any(symbol => value.Contains(symbol, StringComparison.Ordinal)))
{
// A legend symbol together with any digit is enough.
if (value.Any(char.IsDigit))
{
return true;
}
// Otherwise strip the legend symbols (longest first) and the punctuation + - ( ) /;
// if only whitespace remains, the line consisted of affix notation alone.
var remainder = value;
foreach (var symbol in affixLegendSymbols.OrderByDescending(item => item.Length))
{
remainder = remainder.Replace(symbol, string.Empty, StringComparison.Ordinal);
}
remainder = remainder
.Replace("+", string.Empty, StringComparison.Ordinal)
.Replace("-", string.Empty, StringComparison.Ordinal)
.Replace("(", string.Empty, StringComparison.Ordinal)
.Replace(")", string.Empty, StringComparison.Ordinal)
.Replace("/", string.Empty, StringComparison.Ordinal);
if (string.IsNullOrWhiteSpace(remainder))
{
return true;
}
}
// Fallback checks that do not depend on the legend: a leading "+" or one of the symbols
// ∑ (U+2211), ∏ (U+220F), π (U+03C0), ∫ (U+222B), a standalone parenthesised modifier,
// a numeric affix prefix, or a " - " separator anywhere in the line.
return value.StartsWith("+", StringComparison.Ordinal) ||
value.StartsWith("\u2211", StringComparison.Ordinal) ||
value.StartsWith("\u220F", StringComparison.Ordinal) ||
value.StartsWith("\u03C0", StringComparison.Ordinal) ||
value.StartsWith("\u222B", StringComparison.Ordinal) ||
StandaloneModifierAffixLineRegex.IsMatch(value) ||
NumericAffixLineRegex.IsMatch(value) ||
value.Contains(" - ", StringComparison.Ordinal);
}
/// <summary>
/// Convenience overload that runs the affix-leakage repair with an empty set of legend symbols.
/// </summary>
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries)
{
    var noLegendSymbols = new HashSet<string>(StringComparer.Ordinal);
    RepairLeadingAffixLeakage(cellEntries, noLegendSymbols);
}
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries, ISet<string> affixLegendSymbols)
{
var maxRowIndex = cellEntries.Count == 0 ? -1 : cellEntries.Max(item => item.RowIndex);
@@ -380,14 +197,13 @@ public sealed class StandardCriticalTableParser
{
var current = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex && item.ColumnKey == columnKey);
var next = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex + 1 && item.ColumnKey == columnKey);
if (current is null || next is null)
{
continue;
}
var leadingAffixCount = 0;
while (leadingAffixCount < next.Lines.Count && IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols))
while (leadingAffixCount < next.Lines.Count && CriticalTableParserSupport.IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols))
{
leadingAffixCount++;
}
@@ -403,199 +219,10 @@ public sealed class StandardCriticalTableParser
}
}
/// <summary>
/// Trims the value and replaces every run of whitespace inside it with a single space.
/// </summary>
private static string CollapseWhitespace(string value)
{
    var trimmed = value.Trim();
    return Regex.Replace(trimmed, @"\s+", " ");
}
/// <summary>
/// Replaces each non-breaking space (U+00A0), carriage return and line feed with a regular
/// space, then trims the result. Runs of spaces are not collapsed.
/// </summary>
private static string NormalizeText(string value)
{
    var characters = value
        .Select(character => character is '\u00a0' or '\r' or '\n' ? ' ' : character)
        .ToArray();
    return new string(characters).Trim();
}
/// <summary>
/// Counts the runs of consecutive lines that share the same classification (affix-like or
/// not). An empty list yields 0; a cell with prose followed by affix lines yields 2.
/// </summary>
private static int CountLineTypeSegments(IReadOnlyList<string> lines, ISet<string> affixLegendSymbols)
{
    var lineKinds = lines
        .Select(line => IsAffixLikeLine(line, affixLegendSymbols))
        .ToList();

    var segments = 0;
    for (var index = 0; index < lineKinds.Count; index++)
    {
        // A new segment starts at the first line and wherever the classification flips.
        if (index == 0 || lineKinds[index] != lineKinds[index - 1])
        {
            segments++;
        }
    }

    return segments;
}
/// <summary>
/// Reads the footer legend (every line at or below the key line, within the grouping
/// tolerance) and collects the single-character symbols that the legend assigns to
/// "must parry", "no parry", "stun"/"stunned", "bleed" and "powerpoint modification".
/// </summary>
/// <param name="fragments">All text fragments of the table.</param>
/// <param name="keyTop">Top of the footer key line, or <see cref="int.MaxValue"/> when no footer was found.</param>
/// <returns>The detected symbols; empty when there is no footer.</returns>
private static HashSet<string> DetectAffixLegendSymbols(IReadOnlyList<XmlTextFragment> fragments, int keyTop)
{
    if (keyTop == int.MaxValue)
    {
        return [];
    }

    string[] legendPatterns =
    [
        @"must parry\s*=\s*(\S)",
        @"no parry\s*=\s*(\S)",
        @"stun(?:ned)?\s*=\s*(\S)",
        @"bleed\s*=\s*(\S)",
        @"powerpoint modification.*=\s*(\S)"
    ];

    var footerFragments = fragments
        .Where(item => item.Top >= keyTop - TopGroupingTolerance)
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    var legendSymbols = new HashSet<string>(StringComparer.Ordinal);
    foreach (var lineFragments in GroupByTop(footerFragments))
    {
        var lineText = CollapseWhitespace(
            string.Join(' ', lineFragments.OrderBy(item => item.Left).Select(item => item.Text)));
        foreach (var pattern in legendPatterns)
        {
            AddLegendMatch(legendSymbols, lineText, pattern);
        }
    }

    return legendSymbols;
}
/// <summary>
/// Applies <c>SplitBoundaryCrossingAffixFragment</c> to every body fragment and returns the
/// flattened result in the original order.
/// </summary>
private static List<XmlTextFragment> SplitBoundaryCrossingAffixFragments(
    IReadOnlyList<XmlTextFragment> bodyFragments,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols) =>
    bodyFragments
        .SelectMany(fragment => SplitBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
        .ToList();
// Splits one affix-like fragment that spans a column boundary into several fragments, one per
// widely-spaced text segment, so that each segment can be assigned to its own column.
// Returns a single-element list holding the original fragment whenever splitting does not apply
// or would not change the column assignment.
private static IReadOnlyList<XmlTextFragment> SplitBoundaryCrossingAffixFragment(
XmlTextFragment fragment,
IReadOnlyList<ColumnAnchor> columnCenters,
ISet<string> affixLegendSymbols)
{
// Only affix-like fragments that physically straddle a column boundary are candidates.
if (!LooksLikeBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
{
return [fragment];
}
// Segments are delimited by runs of two or more whitespace characters.
var matches = MultiFragmentSplitRegex.Matches(fragment.Text);
if (matches.Count < 2)
{
return [fragment];
}
// Estimate positions by assuming every character has the same width; the Math.Max guards
// against dividing by zero for an empty text.
var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
var splitFragments = new List<XmlTextFragment>(matches.Count);
foreach (Match match in matches)
{
var segmentText = CollapseWhitespace(match.Value);
if (segmentText.Length == 0)
{
continue;
}
// Left edge and width are derived from the segment's character offset and length;
// the width is kept at 1 or more.
var segmentLeft = fragment.Left + (int)Math.Round(characterWidth * match.Index);
var segmentWidth = Math.Max(1, (int)Math.Round(characterWidth * match.Length));
splitFragments.Add(new XmlTextFragment(
fragment.PageNumber,
fragment.Top,
segmentLeft,
segmentWidth,
fragment.Height,
segmentText));
}
if (splitFragments.Count < 2)
{
return [fragment];
}
// Keep the split only if it changes anything: the segments land in more than one column,
// or in a column different from the one the unsplit fragment resolved to.
var originalColumn = ResolveColumn(fragment.CenterX, columnCenters);
var distinctColumns = splitFragments
.Select(item => ResolveColumn(item.CenterX, columnCenters))
.Distinct(StringComparer.OrdinalIgnoreCase)
.ToList();
return distinctColumns.Count > 1 || distinctColumns.Any(item => !string.Equals(item, originalColumn, StringComparison.OrdinalIgnoreCase))
? splitFragments
: [fragment];
}
/// <summary>
/// Returns true when the fragment is affix-like, contains a space, and its horizontal extent
/// strictly straddles at least one midpoint between neighbouring column centres.
/// </summary>
private static bool LooksLikeBoundaryCrossingAffixFragment(
    XmlTextFragment fragment,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    var isSplittableAffix =
        IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
        fragment.Text.Contains(" ", StringComparison.Ordinal);
    if (!isSplittableAffix)
    {
        return false;
    }

    var leftEdge = fragment.Left;
    var rightEdge = fragment.Left + fragment.Width;

    // One boundary per adjacent column pair; none when fewer than two columns exist.
    return Enumerable.Range(0, Math.Max(columnCenters.Count - 1, 0))
        .Select(index => (columnCenters[index].CenterX + columnCenters[index + 1].CenterX) / 2.0)
        .Any(boundary => leftEdge < boundary && rightEdge > boundary);
}
/// <summary>
/// Adds the first capture group of every case-insensitive match of <paramref name="pattern"/>
/// in <paramref name="value"/> to <paramref name="symbols"/>.
/// </summary>
private static void AddLegendMatch(HashSet<string> symbols, string value, string pattern)
{
    var capturedSymbols = Regex.Matches(value, pattern, RegexOptions.IgnoreCase)
        .Where(match => match.Groups.Count > 1)
        .Select(match => match.Groups[1].Value);

    foreach (var symbol in capturedSymbols)
    {
        symbols.Add(symbol);
    }
}
/// <summary>
/// Groups the body fragments into text lines by top coordinate and records, for each line,
/// its top and whether every non-empty per-column text on it is affix-like.
/// </summary>
private static List<BodyLine> BuildBodyLines(
    IReadOnlyList<XmlTextFragment> bodyFragments,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    var orderedFragments = bodyFragments
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    return GroupByTop(orderedFragments)
        .Select(lineFragments =>
        {
            var nonEmptyColumnTexts = lineFragments
                .GroupBy(item => ResolveColumn(item.CenterX, columnCenters), StringComparer.OrdinalIgnoreCase)
                .Select(group => CollapseWhitespace(string.Join(' ', group.OrderBy(item => item.Left).Select(item => item.Text))))
                .Where(text => !string.IsNullOrWhiteSpace(text))
                .ToList();

            // A line with no text at all is not considered affix-like.
            var allAffixLike =
                nonEmptyColumnTexts.Count > 0 &&
                nonEmptyColumnTexts.All(text => IsAffixLikeLine(text, affixLegendSymbols));

            return new BodyLine(lineFragments[0].Top, allAffixLike);
        })
        .ToList();
}
private static int ResolveRowBoundaryTop(
RowAnchor current,
RowAnchor next,
IReadOnlyList<BodyLine> bodyLines)
IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines)
{
var linesBetweenLabels = bodyLines
.Where(item => item.Top >= current.Top && item.Top < next.Top)
@@ -613,41 +240,8 @@ public sealed class StandardCriticalTableParser
return (int)Math.Floor((current.Top + next.Top) / 2.0) + 1;
}
/// <summary>
/// Returns true when a footer exists, the fragment's top is no more than
/// <c>FooterPageNumberExclusionGap</c> above the key line (or anywhere below it), and its
/// text is exactly two or three digits.
/// </summary>
private static bool IsFooterPageNumberFragment(XmlTextFragment fragment, int keyTop)
{
    var isInFooterZone =
        keyTop != int.MaxValue &&
        fragment.Top >= keyTop - FooterPageNumberExclusionGap;

    return isInFooterZone && Regex.IsMatch(fragment.Text, @"^\d{2,3}$");
}
/// <summary>
/// Splits the fragments, in the order given, into consecutive groups. A fragment joins the
/// current group while its top is within <c>TopGroupingTolerance</c> of the group's first
/// fragment; otherwise it starts a new group. Callers are expected to pass fragments already
/// sorted by top.
/// </summary>
private static IEnumerable<List<XmlTextFragment>> GroupByTop(IReadOnlyList<XmlTextFragment> fragments)
{
    var lineGroups = new List<List<XmlTextFragment>>();
    for (var index = 0; index < fragments.Count; index++)
    {
        var fragment = fragments[index];
        var startsNewGroup =
            lineGroups.Count == 0 ||
            Math.Abs(lineGroups[^1][0].Top - fragment.Top) > TopGroupingTolerance;
        if (startsNewGroup)
        {
            lineGroups.Add(new List<XmlTextFragment>());
        }

        lineGroups[^1].Add(fragment);
    }

    return lineGroups;
}
// A table column: its key and the horizontal centre of its header fragment.
private sealed record ColumnAnchor(string Key, double CenterX);
// A table row: its roll-band label, the top coordinate of that label, and its 1-based sort order.
private sealed record RowAnchor(string Label, int Top, int SortOrder);
// One grouped body text line: its top coordinate and whether all of its column texts are affix-like.
private sealed record BodyLine(int Top, bool IsAffixLike);
private sealed class CellEntry(string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
{
public string RollBandLabel { get; } = rollBandLabel;

View File

@@ -0,0 +1,276 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Parses the XML text extraction of a critical table whose header row is labelled
/// Normal, Magic, Mithril, Holy Arms and Slaying, producing one parsed cell per
/// roll band and column together with a validation report.
/// </summary>
public sealed class VariantColumnCriticalTableParser
{
    /// <summary>
    /// The header labels, in the left-to-right order the header row must have, paired with
    /// the column keys they map to.
    /// </summary>
    private static readonly ColumnDefinition[] ExpectedColumns =
    [
        new("NORMAL", "Normal"),
        new("MAGIC", "Magic"),
        new("MITHRIL", "Mithril"),
        new("HOLY_ARMS", "Holy Arms"),
        new("SLAYING", "Slaying")
    ];

    /// <summary>
    /// Parses one variant-column table.
    /// </summary>
    /// <param name="entry">Manifest entry supplying the slug, display name, family and PDF path recorded on the result.</param>
    /// <param name="xmlContent">The XML text extraction of the table's PDF.</param>
    /// <returns>The parsed table, the raw fragments, the per-cell artifacts and the validation report.</returns>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    /// <exception cref="InvalidOperationException">The header row could not be located.</exception>
    public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
    {
        ArgumentNullException.ThrowIfNull(entry);
        ArgumentNullException.ThrowIfNull(xmlContent);

        var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
        var headerFragments = FindHeaderFragments(fragments);
        var validationErrors = new List<string>();
        var validationWarnings = new List<string>();

        var columnAnchors = headerFragments
            .OrderBy(item => item.Left)
            .Select(item =>
            {
                var definition = ResolveColumnDefinition(item.Text);
                return (definition.Key, definition.Label, item.CenterX);
            })
            .ToList();

        var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap;

        // The footer starts at the topmost fragment that looks like part of the key/legend;
        // int.MaxValue means no footer was found.
        var keyTop = fragments
            .Where(item =>
                string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
                item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) ||
                item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
            .Select(item => (int?)item.Top)
            .Min() ?? int.MaxValue;

        var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop);
        var leftCutoff = headerFragments.Min(item => item.Left) - 10;
        var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
            fragments,
            leftCutoff,
            bodyStartTop,
            keyTop);

        var rowAnchors = rowLabelFragments
            .OrderBy(item => item.Top)
            .Select((item, index) => new RowAnchor(CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), item.Top, index + 1))
            .ToList();

        if (rowAnchors.Count == 0)
        {
            validationErrors.Add("No roll-band labels were found in the XML artifact.");
        }

        var columnCenters = columnAnchors
            .Select(item => (item.Key, item.CenterX))
            .ToList();

        // Body fragments: everything between header and footer that is neither a page number,
        // a row label, nor part of the header itself.
        var bodyFragments = fragments
            .Where(item =>
                item.Top >= bodyStartTop &&
                item.Top < keyTop - CriticalTableParserSupport.TopGroupingTolerance &&
                !CriticalTableParserSupport.IsFooterPageNumberFragment(item, keyTop) &&
                !CriticalTableParserSupport.IsPotentialRowLabelFragment(item, leftCutoff) &&
                !rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) &&
                !headerFragments.Contains(item))
            .ToList();

        bodyFragments = CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
        var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);

        var parsedRollBands = rowAnchors
            .Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
            .ToList();

        var cellEntries = new List<CellEntry>();
        for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
        {
            // The first row starts at the body start and the last row ends just above the
            // footer; interior boundaries are resolved between neighbouring labels.
            var rowStart = rowIndex == 0
                ? bodyStartTop
                : ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines);
            var rowEnd = rowIndex == rowAnchors.Count - 1
                ? keyTop - 1
                : ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines);

            // Resolve each fragment's column once per row instead of once per column.
            var rowFragmentsByColumn = bodyFragments
                .Where(item => item.Top >= rowStart && item.Top < rowEnd)
                .ToLookup(
                    item => CriticalTableParserSupport.ResolveColumn(item.CenterX, columnCenters),
                    StringComparer.Ordinal);

            foreach (var columnAnchor in columnAnchors)
            {
                var cellFragments = rowFragmentsByColumn[columnAnchor.Key]
                    .OrderBy(item => item.Top)
                    .ThenBy(item => item.Left)
                    .ToList();

                if (cellFragments.Count == 0)
                {
                    validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', column '{columnAnchor.Key}'.");
                    continue;
                }

                cellEntries.Add(new CellEntry(
                    rowAnchors[rowIndex].Label,
                    rowIndex,
                    columnAnchor.Key,
                    CriticalTableParserSupport.BuildLines(cellFragments).ToList()));
            }
        }

        RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);

        var parsedCells = new List<ParsedCriticalCellArtifact>();
        var parsedResults = new List<ParsedCriticalResult>();
        foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey, StringComparer.Ordinal))
        {
            // A well-formed cell is at most one run of prose plus one run of affix lines.
            var segmentCount = CriticalTableParserSupport.CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols);
            if (segmentCount > 2)
            {
                validationErrors.Add($"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
            }

            var rawAffixLines = cellEntry.Lines.Where(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
            var descriptionLines = cellEntry.Lines.Where(line => !CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
            var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines);
            var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', descriptionLines));
            var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);

            parsedCells.Add(new ParsedCriticalCellArtifact(
                null,
                cellEntry.RollBandLabel,
                cellEntry.ColumnKey,
                cellEntry.Lines,
                rawCellText,
                descriptionText,
                rawAffixText));

            parsedResults.Add(new ParsedCriticalResult(
                null,
                cellEntry.ColumnKey,
                cellEntry.RollBandLabel,
                rawCellText,
                descriptionText,
                rawAffixText));
        }

        if (columnAnchors.Count != ExpectedColumns.Length)
        {
            validationErrors.Add($"Expected {ExpectedColumns.Length} variant columns but found {columnAnchors.Count}.");
        }

        if (parsedCells.Count != rowAnchors.Count * columnAnchors.Count)
        {
            validationErrors.Add($"Expected {rowAnchors.Count * columnAnchors.Count} parsed cells but produced {parsedCells.Count}.");
        }

        var validationReport = new ImportValidationReport(
            validationErrors.Count == 0,
            validationErrors,
            validationWarnings,
            rowAnchors.Count,
            parsedCells.Count);

        var table = new ParsedCriticalTable(
            entry.Slug,
            entry.DisplayName,
            entry.Family,
            Path.GetFileName(entry.PdfPath),
            "Imported from PDF XML extraction.",
            [],
            ExpectedColumns.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Label, "variant", index + 1)).ToList(),
            parsedRollBands,
            parsedResults);

        return new CriticalTableParseResult(table, fragments, parsedCells, validationReport);
    }

    /// <summary>
    /// Finds the text line whose fragments, read left to right, are exactly the expected
    /// column labels (compared case-insensitively after trimming).
    /// </summary>
    /// <exception cref="InvalidOperationException">No such line exists.</exception>
    private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
    {
        var expectedLabels = ExpectedColumns
            .Select(item => item.Label)
            .ToList();

        var headerCandidates = fragments
            .Where(item => expectedLabels.Contains(item.Text.Trim(), StringComparer.OrdinalIgnoreCase))
            .OrderBy(item => item.Top)
            .ThenBy(item => item.Left)
            .ToList();

        foreach (var group in CriticalTableParserSupport.GroupByTop(headerCandidates))
        {
            var ordered = group.OrderBy(item => item.Left).ToList();
            var labels = ordered.Select(item => item.Text.Trim());
            if (labels.SequenceEqual(expectedLabels, StringComparer.OrdinalIgnoreCase))
            {
                return ordered;
            }
        }

        throw new InvalidOperationException("Could not find the variant-column header row in the XML artifact.");
    }

    /// <summary>
    /// Maps a header label (trimmed, case-insensitive) to its column definition.
    /// </summary>
    /// <exception cref="InvalidOperationException">The label is not one of the expected columns.</exception>
    private static ColumnDefinition ResolveColumnDefinition(string value) =>
        ExpectedColumns.SingleOrDefault(item => string.Equals(item.Label, value.Trim(), StringComparison.OrdinalIgnoreCase))
        ?? throw new InvalidOperationException($"Unsupported variant column label '{value}'.");

    /// <summary>
    /// Moves affix-like lines found at the start of a cell up into the cell directly above it
    /// in the same column. A cell that consists only of affix-like lines is left untouched.
    /// </summary>
    private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries, ISet<string> affixLegendSymbols)
    {
        if (cellEntries.Count == 0)
        {
            return;
        }

        // Index the cells once so each (row, column) lookup is O(1) instead of a list scan.
        var cellsByPosition = cellEntries.ToDictionary(item => (item.RowIndex, item.ColumnKey));
        var maxRowIndex = cellEntries.Max(item => item.RowIndex);
        var columnKeys = cellEntries.Select(item => item.ColumnKey).Distinct(StringComparer.Ordinal).ToList();

        // Rows are visited top-down, so lines moved into a cell become part of it before that
        // cell is itself examined as the "next" cell of the row above.
        for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++)
        {
            foreach (var columnKey in columnKeys)
            {
                if (!cellsByPosition.TryGetValue((rowIndex, columnKey), out var current) ||
                    !cellsByPosition.TryGetValue((rowIndex + 1, columnKey), out var next))
                {
                    continue;
                }

                var leadingAffixCount = 0;
                while (leadingAffixCount < next.Lines.Count && CriticalTableParserSupport.IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols))
                {
                    leadingAffixCount++;
                }

                if (leadingAffixCount == 0 || leadingAffixCount == next.Lines.Count)
                {
                    continue;
                }

                current.Lines.AddRange(next.Lines.Take(leadingAffixCount));
                next.Lines.RemoveRange(0, leadingAffixCount);
            }
        }
    }

    /// <summary>
    /// Chooses the top coordinate at which the row of <paramref name="next"/> begins. The
    /// body lines between the two labels are scanned from the bottom for an affix-like line
    /// directly followed by a non-affix line, and the boundary is placed just past the midpoint
    /// of that pair. Without such a pair, the boundary is just past the midpoint of the two labels.
    /// </summary>
    private static int ResolveRowBoundaryTop(
        RowAnchor current,
        RowAnchor next,
        IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines)
    {
        var linesBetweenLabels = bodyLines
            .Where(item => item.Top >= current.Top && item.Top < next.Top)
            .OrderBy(item => item.Top)
            .ToList();

        for (var index = linesBetweenLabels.Count - 2; index >= 0; index--)
        {
            if (linesBetweenLabels[index].IsAffixLike && !linesBetweenLabels[index + 1].IsAffixLike)
            {
                return (int)Math.Floor((linesBetweenLabels[index].Top + linesBetweenLabels[index + 1].Top) / 2.0) + 1;
            }
        }

        return (int)Math.Floor((current.Top + next.Top) / 2.0) + 1;
    }

    /// <summary>A column key paired with the header label that identifies it.</summary>
    private sealed record ColumnDefinition(string Key, string Label);

    /// <summary>A table row: its normalized roll-band label, the label's top coordinate and its 1-based sort order.</summary>
    private sealed record RowAnchor(string Label, int Top, int SortOrder);

    /// <summary>The text lines collected for one (row, column) cell; <see cref="Lines"/> is mutated during affix repair.</summary>
    private sealed class CellEntry(string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
    {
        public string RollBandLabel { get; } = rollBandLabel;

        public int RowIndex { get; } = rowIndex;

        public string ColumnKey { get; } = columnKey;

        public List<string> Lines { get; } = lines;
    }
}