Use XML geometry for critical PDF import

This commit is contained in:
2026-03-14 01:25:43 +01:00
parent f70d610c92
commit 719355da90
10 changed files with 335 additions and 201 deletions

View File

@@ -0,0 +1,13 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable summary of a validation pass: an overall verdict, the collected
/// error messages, and the row and cell counts reported by the caller.
/// </summary>
public sealed class ImportValidationReport
{
    /// <summary>
    /// Creates a report from values the caller has already computed. The values
    /// are stored as given; no consistency check between <paramref name="isValid"/>
    /// and <paramref name="errors"/> is performed here.
    /// </summary>
    /// <param name="isValid">Overall verdict exposed through <see cref="IsValid"/>.</param>
    /// <param name="errors">Error messages exposed through <see cref="Errors"/>.</param>
    /// <param name="rowCount">Row count exposed through <see cref="RowCount"/>.</param>
    /// <param name="cellCount">Cell count exposed through <see cref="CellCount"/>.</param>
    public ImportValidationReport(
        bool isValid,
        IReadOnlyList<string> errors,
        int rowCount,
        int cellCount)
    {
        IsValid = isValid;
        Errors = errors;
        RowCount = rowCount;
        CellCount = cellCount;
    }

    /// <summary>Overall verdict supplied at construction.</summary>
    public bool IsValid { get; }

    /// <summary>Error messages supplied at construction (same list instance).</summary>
    public IReadOnlyList<string> Errors { get; }

    /// <summary>Row count supplied at construction.</summary>
    public int RowCount { get; }

    /// <summary>Cell count supplied at construction.</summary>
    public int CellCount { get; }
}

View File

@@ -0,0 +1,17 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable record of one parsed table cell: the roll band and column it
/// belongs to, its text lines, and the raw, description and affix texts
/// derived from those lines by the caller.
/// </summary>
public sealed class ParsedCriticalCellArtifact
{
    /// <summary>Creates a cell artifact; all values are stored exactly as given.</summary>
    /// <param name="rollBandLabel">Label of the roll band (row) the cell belongs to.</param>
    /// <param name="columnKey">Key of the column the cell belongs to.</param>
    /// <param name="lines">Text lines that make up the cell.</param>
    /// <param name="rawCellText">Full cell text.</param>
    /// <param name="descriptionText">Description portion of the cell text.</param>
    /// <param name="rawAffixText">Affix portion of the cell text, or <c>null</c> when there is none.</param>
    public ParsedCriticalCellArtifact(
        string rollBandLabel,
        string columnKey,
        IReadOnlyList<string> lines,
        string rawCellText,
        string descriptionText,
        string? rawAffixText)
    {
        RollBandLabel = rollBandLabel;
        ColumnKey = columnKey;
        Lines = lines;
        RawCellText = rawCellText;
        DescriptionText = descriptionText;
        RawAffixText = rawAffixText;
    }

    /// <summary>Label of the roll band (row) the cell belongs to.</summary>
    public string RollBandLabel { get; }

    /// <summary>Key of the column the cell belongs to.</summary>
    public string ColumnKey { get; }

    /// <summary>Text lines that make up the cell (same list instance as supplied).</summary>
    public IReadOnlyList<string> Lines { get; }

    /// <summary>Full cell text as supplied.</summary>
    public string RawCellText { get; }

    /// <summary>Description portion of the cell text as supplied.</summary>
    public string DescriptionText { get; }

    /// <summary>Affix portion of the cell text, or <c>null</c> when none was supplied.</summary>
    public string? RawAffixText { get; }
}

View File

@@ -0,0 +1,13 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable bundle of everything one parse produces: the parsed table, the
/// text fragments it was built from, the per-cell artifacts, and the
/// validation report.
/// </summary>
public sealed class StandardCriticalTableParseResult
{
    /// <summary>Creates a parse result; all values are stored exactly as given.</summary>
    /// <param name="table">The parsed table.</param>
    /// <param name="fragments">Text fragments associated with the parse.</param>
    /// <param name="cells">Per-cell artifacts produced by the parse.</param>
    /// <param name="validationReport">Validation outcome for the parse.</param>
    public StandardCriticalTableParseResult(
        ParsedCriticalTable table,
        IReadOnlyList<XmlTextFragment> fragments,
        IReadOnlyList<ParsedCriticalCellArtifact> cells,
        ImportValidationReport validationReport)
    {
        Table = table;
        Fragments = fragments;
        Cells = cells;
        ValidationReport = validationReport;
    }

    /// <summary>The parsed table supplied at construction.</summary>
    public ParsedCriticalTable Table { get; }

    /// <summary>Text fragments supplied at construction.</summary>
    public IReadOnlyList<XmlTextFragment> Fragments { get; }

    /// <summary>Per-cell artifacts supplied at construction.</summary>
    public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; }

    /// <summary>Validation report supplied at construction.</summary>
    public ImportValidationReport ValidationReport { get; }
}

View File

@@ -1,208 +1,206 @@
using System.Text.RegularExpressions;
using System.Xml;
using System.Xml.Linq;
namespace RolemasterDb.ImportTool.Parsing;
public sealed class StandardCriticalTableParser
{
private static readonly Regex ColumnRegex = new(@"\b([A-E])\b", RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex RollBandRegex = new(@"^\s*(?<label>\d{2,3}(?:-\d{2,3})?|\d{2,3}\+)\s*$", RegexOptions.Compiled);
private static readonly Regex RollBandLineRegex = new(@"^\s*(?<label>\d{2,3}(?:-\d{2,3})?|\d{2,3}\+)(?<rest>\s+.*)?$", RegexOptions.Compiled);
private const int HeaderToBodyMinimumGap = 20;
private const int TopGroupingTolerance = 2;
public ParsedCriticalTable Parse(CriticalImportManifestEntry entry, string extractedText)
public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
var lines = extractedText.Replace("\r\n", "\n", StringComparison.Ordinal)
.Replace('\f', '\n')
.Split('\n');
var fragments = LoadFragments(xmlContent);
var headerFragments = FindHeaderFragments(fragments);
var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments);
var validationErrors = new List<string>();
var headerIndex = Array.FindIndex(lines, IsColumnHeaderLine);
if (headerIndex < 0)
{
throw new InvalidOperationException("The standard table header could not be found in the extracted text.");
}
var columnStarts = GetColumnStarts(lines[headerIndex]);
var boundaries = GetColumnBoundaries(columnStarts);
var columns = columnStarts
.Select((item, index) => new ParsedCriticalColumn(item.Label, item.Label, "severity", index + 1))
var columnCenters = headerFragments
.OrderBy(item => item.Left)
.Select(item => new ColumnAnchor(item.Text.ToUpperInvariant(), item.CenterX))
.ToList();
var firstRollBandIndex = FindNextRollBandIndex(lines, headerIndex + 1);
if (firstRollBandIndex < 0)
{
throw new InvalidOperationException("No roll bands were found in the extracted text.");
}
var keyLineIndex = Array.FindIndex(lines, firstRollBandIndex, item => item.TrimStart().StartsWith("Key:", StringComparison.OrdinalIgnoreCase));
if (keyLineIndex < 0)
{
keyLineIndex = lines.Length;
}
var leadingLines = lines[(headerIndex + 1)..firstRollBandIndex]
.Where(item => !string.IsNullOrWhiteSpace(item))
var rowAnchors = rowLabelFragments
.OrderBy(item => item.Top)
.Select((item, index) => new RowAnchor(item.Text, item.Top, index + 1))
.ToList();
var rollBands = new List<ParsedCriticalRollBand>();
var results = new List<ParsedCriticalResult>();
var currentLabel = string.Empty;
var currentRowLines = new List<string>();
var rowIndex = 0;
void FlushCurrentRow()
if (rowAnchors.Count == 0)
{
if (string.IsNullOrEmpty(currentLabel))
{
return;
}
rowIndex++;
var rollBand = CreateRollBand(currentLabel, rowIndex);
rollBands.Add(rollBand);
var cellLines = SplitRowLines(currentRowLines, boundaries, columns.Count);
for (var columnIndex = 0; columnIndex < columns.Count; columnIndex++)
{
var rawCellLines = cellLines[columnIndex]
.Where(item => !string.IsNullOrWhiteSpace(item))
.ToList();
var rawAffixLines = rawCellLines
.Where(IsAffixLikeLine)
.ToList();
var descriptionLines = rawCellLines
.Where(item => !IsAffixLikeLine(item))
.ToList();
results.Add(new ParsedCriticalResult(
columns[columnIndex].ColumnKey,
rollBand.Label,
string.Join(Environment.NewLine, rawCellLines),
CollapseWhitespace(string.Join(' ', descriptionLines)),
rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines)));
}
currentLabel = string.Empty;
currentRowLines = new List<string>();
validationErrors.Add("No roll-band labels were found in the XML artifact.");
}
for (var lineIndex = firstRollBandIndex; lineIndex < keyLineIndex; lineIndex++)
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
var keyTop = fragments
.Where(item => string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase))
.Select(item => (int?)item.Top)
.Min() ?? int.MaxValue;
var bodyFragments = fragments
.Where(item =>
item.Top >= bodyStartTop &&
item.Top < keyTop - 1 &&
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
!headerFragments.Contains(item))
.ToList();
var parsedRollBands = rowAnchors
.Select(anchor => CreateRollBand(anchor.Label, anchor.SortOrder))
.ToList();
var parsedCells = new List<ParsedCriticalCellArtifact>();
var parsedResults = new List<ParsedCriticalResult>();
for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
{
if (TryParseRollBandLine(lines[lineIndex], out var label, out var trailingText))
var rowStart = rowIndex == 0
? bodyStartTop
: (int)Math.Floor((rowAnchors[rowIndex - 1].Top + rowAnchors[rowIndex].Top) / 2.0);
var rowEnd = rowIndex == rowAnchors.Count - 1
? keyTop - 1
: (int)Math.Floor((rowAnchors[rowIndex].Top + rowAnchors[rowIndex + 1].Top) / 2.0);
var rowFragments = bodyFragments
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
.ToList();
foreach (var columnAnchor in columnCenters)
{
var trailingTextBelongsToCurrentRow = IsAffixLikeLine(trailingText);
var cellFragments = rowFragments
.Where(item => ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key)
.OrderBy(item => item.Top)
.ThenBy(item => item.Left)
.ToList();
if (!string.IsNullOrWhiteSpace(trailingText) &&
!string.IsNullOrEmpty(currentLabel) &&
!trailingTextBelongsToCurrentRow)
if (cellFragments.Count == 0)
{
currentRowLines.Add(trailingText);
validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', column '{columnAnchor.Key}'.");
continue;
}
FlushCurrentRow();
currentLabel = label;
if (rowIndex == 0)
{
currentRowLines.AddRange(leadingLines);
}
var lines = BuildLines(cellFragments);
var rawAffixLines = lines.Where(IsAffixLikeLine).ToList();
var descriptionLines = lines.Where(line => !IsAffixLikeLine(line)).ToList();
var rawCellText = string.Join(Environment.NewLine, lines);
var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines));
var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
if (!string.IsNullOrWhiteSpace(trailingText) && trailingTextBelongsToCurrentRow)
{
currentRowLines.Add(trailingText);
}
parsedCells.Add(new ParsedCriticalCellArtifact(
rowAnchors[rowIndex].Label,
columnAnchor.Key,
lines,
rawCellText,
descriptionText,
rawAffixText));
continue;
}
if (!string.IsNullOrWhiteSpace(lines[lineIndex]))
{
currentRowLines.Add(lines[lineIndex]);
parsedResults.Add(new ParsedCriticalResult(
columnAnchor.Key,
rowAnchors[rowIndex].Label,
rawCellText,
descriptionText,
rawAffixText));
}
}
FlushCurrentRow();
if (columnCenters.Count != 5)
{
validationErrors.Add($"Expected 5 standard-table columns but found {columnCenters.Count}.");
}
return new ParsedCriticalTable(
if (parsedCells.Count != rowAnchors.Count * columnCenters.Count)
{
validationErrors.Add(
$"Expected {rowAnchors.Count * columnCenters.Count} parsed cells but produced {parsedCells.Count}.");
}
var validationReport = new ImportValidationReport(
validationErrors.Count == 0,
validationErrors,
rowAnchors.Count,
parsedCells.Count);
var table = new ParsedCriticalTable(
entry.Slug,
entry.DisplayName,
entry.Family,
Path.GetFileName(entry.PdfPath),
"Imported from PDF text extraction.",
columns,
rollBands,
results);
"Imported from PDF XML extraction.",
columnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(),
parsedRollBands,
parsedResults);
return new StandardCriticalTableParseResult(table, fragments, parsedCells, validationReport);
}
private static bool IsColumnHeaderLine(string line)
private static List<XmlTextFragment> LoadFragments(string xmlContent)
{
var matches = ColumnRegex.Matches(line);
return matches.Count == 5;
}
using var stringReader = new StringReader(xmlContent);
using var xmlReader = XmlReader.Create(
stringReader,
new XmlReaderSettings
{
DtdProcessing = DtdProcessing.Ignore
});
private static List<(string Label, int Start)> GetColumnStarts(string headerLine)
{
var matches = ColumnRegex.Matches(headerLine);
return matches
.Select(match => (match.Groups[1].Value.ToUpperInvariant(), match.Index))
var document = XDocument.Load(xmlReader);
return document.Descendants("page")
.SelectMany(page =>
{
var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1");
return page.Elements("text")
.Select(item => new XmlTextFragment(
pageNumber,
int.Parse(item.Attribute("top")?.Value ?? throw new InvalidOperationException("Missing text top attribute.")),
int.Parse(item.Attribute("left")?.Value ?? throw new InvalidOperationException("Missing text left attribute.")),
int.Parse(item.Attribute("width")?.Value ?? throw new InvalidOperationException("Missing text width attribute.")),
int.Parse(item.Attribute("height")?.Value ?? throw new InvalidOperationException("Missing text height attribute.")),
NormalizeText(string.Concat(item.DescendantNodes().OfType<XText>().Select(node => node.Value)))))
.Where(item => !string.IsNullOrWhiteSpace(item.Text));
})
.ToList();
}
private static int[] GetColumnBoundaries(IReadOnlyList<(string Label, int Start)> columns)
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
{
var boundaries = new int[columns.Count - 1];
for (var index = 0; index < boundaries.Length; index++)
{
boundaries[index] = (columns[index].Start + columns[index + 1].Start) / 2;
}
var groupedByTop = fragments
.Where(item => item.Text.Length == 1 && char.IsLetter(item.Text[0]))
.GroupBy(item => item.Top)
.OrderBy(group => group.Key);
return boundaries;
}
private static int FindNextRollBandIndex(IReadOnlyList<string> lines, int startIndex)
{
for (var index = startIndex; index < lines.Count; index++)
foreach (var group in groupedByTop)
{
if (TryParseRollBandLine(lines[index], out _, out _))
var ordered = group.OrderBy(item => item.Left).ToList();
var labels = ordered.Select(item => item.Text.ToUpperInvariant()).ToList();
if (labels.SequenceEqual(["A", "B", "C", "D", "E"]))
{
return index;
return ordered;
}
}
return -1;
throw new InvalidOperationException("Could not find the standard-table A-E header row in the XML artifact.");
}
private static bool TryParseRollBandLabel(string line, out string label)
private static List<XmlTextFragment> FindRowLabelFragments(
IReadOnlyList<XmlTextFragment> fragments,
IReadOnlyList<XmlTextFragment> headerFragments)
{
var match = RollBandRegex.Match(line);
if (!match.Success)
{
label = string.Empty;
return false;
}
var leftCutoff = headerFragments.Min(item => item.Left) - 10;
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
label = match.Groups[1].Value.Replace(" ", string.Empty, StringComparison.Ordinal);
return true;
return fragments
.Where(item =>
item.Left < leftCutoff &&
item.Top >= bodyStartTop &&
IsRollBandLabel(item.Text))
.OrderBy(item => item.Top)
.ToList();
}
private static bool TryParseRollBandLine(string line, out string label, out string trailingText)
{
var match = RollBandLineRegex.Match(line);
if (!match.Success)
{
label = string.Empty;
trailingText = string.Empty;
return false;
}
label = match.Groups["label"].Value.Replace(" ", string.Empty, StringComparison.Ordinal);
var restGroup = match.Groups["rest"];
trailingText = restGroup.Success
? string.Concat(new string(' ', restGroup.Index), restGroup.Value.TrimEnd())
: string.Empty;
return true;
}
/// <summary>
/// Tells whether <paramref name="value"/>, once trimmed, consists solely of a
/// roll-band label: a 2-3 digit number, a range of two such numbers joined by
/// '-', or a 2-3 digit number followed by '+'.
/// </summary>
private static bool IsRollBandLabel(string value)
{
    var candidate = value.Trim();
    return Regex.IsMatch(candidate, @"^\d{2,3}(?:-\d{2,3})?$|^\d{2,3}\+$");
}
private static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder)
{
@@ -217,35 +215,39 @@ public sealed class StandardCriticalTableParser
: new ParsedCriticalRollBand(label, int.Parse(parts[0]), int.Parse(parts[1]), sortOrder);
}
private static List<string>[] SplitRowLines(IReadOnlyList<string> rowLines, int[] boundaries, int columnCount)
private static string ResolveColumn(double centerX, IReadOnlyList<ColumnAnchor> columns)
{
var result = Enumerable.Range(0, columnCount)
.Select(_ => new List<string>())
.ToArray();
foreach (var line in rowLines)
for (var index = 0; index < columns.Count - 1; index++)
{
for (var columnIndex = 0; columnIndex < columnCount; columnIndex++)
var boundary = (columns[index].CenterX + columns[index + 1].CenterX) / 2.0;
if (centerX < boundary)
{
var start = columnIndex == 0 ? 0 : boundaries[columnIndex - 1];
var end = columnIndex == columnCount - 1
? line.Length
: Math.Min(boundaries[columnIndex], line.Length);
if (start >= line.Length || end <= start)
{
continue;
}
var segment = line[start..end].Trim();
if (!string.IsNullOrWhiteSpace(segment))
{
result[columnIndex].Add(segment);
}
return columns[index].Key;
}
}
return result;
return columns[^1].Key;
}
/// <summary>
/// Groups fragments into visual text lines and renders each line as a single
/// whitespace-collapsed string.
/// </summary>
/// <param name="fragments">Fragments belonging to one cell, in any order.</param>
/// <returns>
/// One string per line, top to bottom, with the fragments of each line joined
/// left to right by a single space. Lines that end up blank are omitted.
/// </returns>
private static IReadOnlyList<string> BuildLines(IReadOnlyList<XmlTextFragment> fragments)
{
    var groups = new List<List<XmlTextFragment>>();
    List<XmlTextFragment>? current = null;

    foreach (var piece in fragments.OrderBy(item => item.Top).ThenBy(item => item.Left))
    {
        // A fragment opens a new line when its Top differs from the Top of the
        // FIRST fragment of the current line by more than the tolerance.
        if (current is null || Math.Abs(current[0].Top - piece.Top) > TopGroupingTolerance)
        {
            current = new List<XmlTextFragment>();
            groups.Add(current);
        }

        current.Add(piece);
    }

    var rendered = new List<string>();
    foreach (var group in groups)
    {
        // Within a line, reading order is purely left-to-right.
        var texts = group.OrderBy(item => item.Left).Select(item => item.Text);
        var joined = CollapseWhitespace(string.Join(' ', texts));
        if (!string.IsNullOrWhiteSpace(joined))
        {
            rendered.Add(joined);
        }
    }

    return rendered;
}
private static bool IsAffixLikeLine(string line)
@@ -256,7 +258,7 @@ public sealed class StandardCriticalTableParser
return false;
}
if (value == "")
if (value == "-" || value == "\u2014")
{
return true;
}
@@ -270,16 +272,27 @@ public sealed class StandardCriticalTableParser
}
return value.StartsWith("+", StringComparison.Ordinal) ||
value.StartsWith('∑') ||
value.StartsWith('∏') ||
value.StartsWith('π') ||
value.StartsWith('∫') ||
value.StartsWith("\u2211", StringComparison.Ordinal) ||
value.StartsWith("\u220F", StringComparison.Ordinal) ||
value.StartsWith("\u03C0", StringComparison.Ordinal) ||
value.StartsWith("\u222B", StringComparison.Ordinal) ||
char.IsDigit(value[0]) ||
value.Contains(" ", StringComparison.Ordinal) ||
value.Contains(" - ", StringComparison.Ordinal) ||
value.Contains("(-", StringComparison.Ordinal) ||
value.Contains("(+", StringComparison.Ordinal);
}
/// <summary>
/// Trims <paramref name="value"/> and replaces every run of whitespace inside
/// it with a single space.
/// </summary>
private static string CollapseWhitespace(string value)
{
    var trimmed = value.Trim();
    return Regex.Replace(trimmed, @"\s+", " ");
}
/// <summary>
/// Replaces each non-breaking space, carriage return and line feed in
/// <paramref name="value"/> with an ordinary space, then trims the result.
/// Interior runs of spaces are left as they are.
/// </summary>
private static string NormalizeText(string value)
{
    var text = value.Replace('\u00a0', ' ');
    text = text.Replace('\r', ' ');
    text = text.Replace('\n', ' ');
    return text.Trim();
}
// Pairs a column key with the horizontal center (CenterX) used to decide which
// column a text fragment falls into.
private sealed record ColumnAnchor(string Key, double CenterX);
// Pairs a roll-band label with the Top coordinate of its label fragment and a
// sort order for the row.
private sealed record RowAnchor(string Label, int Top, int SortOrder);
}

View File

@@ -0,0 +1,18 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable description of one positioned piece of text: the page it is on,
/// its bounding box (top, left, width, height) and its text content.
/// </summary>
public sealed class XmlTextFragment
{
    /// <summary>Creates a fragment; all values are stored exactly as given.</summary>
    /// <param name="pageNumber">Number of the page the fragment is on.</param>
    /// <param name="top">Top coordinate of the fragment's box.</param>
    /// <param name="left">Left coordinate of the fragment's box.</param>
    /// <param name="width">Width of the fragment's box.</param>
    /// <param name="height">Height of the fragment's box.</param>
    /// <param name="text">Text content of the fragment.</param>
    public XmlTextFragment(
        int pageNumber,
        int top,
        int left,
        int width,
        int height,
        string text)
    {
        PageNumber = pageNumber;
        Top = top;
        Left = left;
        Width = width;
        Height = height;
        Text = text;
    }

    /// <summary>Number of the page the fragment is on.</summary>
    public int PageNumber { get; }

    /// <summary>Top coordinate of the fragment's box.</summary>
    public int Top { get; }

    /// <summary>Left coordinate of the fragment's box.</summary>
    public int Left { get; }

    /// <summary>Width of the fragment's box.</summary>
    public int Width { get; }

    /// <summary>Height of the fragment's box.</summary>
    public int Height { get; }

    /// <summary>Text content of the fragment.</summary>
    public string Text { get; }

    /// <summary>
    /// Horizontal midpoint of the box: <see cref="Left"/> plus half of
    /// <see cref="Width"/>, computed in floating point so odd widths keep the .5.
    /// </summary>
    public double CenterX
    {
        get { return Left + (Width / 2.0); }
    }
}