From 4979cf87f7a7959afb6fbfaf6ae79424f0869cda Mon Sep 17 00:00:00 2001 From: Frank Tovar Date: Tue, 17 Mar 2026 22:27:57 +0100 Subject: [PATCH] Track critical cell source bounds --- .../Parsing/ColumnarCellEntry.cs | 9 ++- .../Parsing/ColumnarCellLine.cs | 7 ++ .../Parsing/CriticalTableParseResult.cs | 2 + .../Parsing/CriticalTableParserSupport.cs | 68 ++++++++++++++++--- .../GroupedVariantCriticalTableParser.cs | 3 +- .../Parsing/ParsedCriticalCellArtifact.cs | 6 +- .../Parsing/ParsedCriticalResult.cs | 6 +- .../Parsing/ParsedCriticalSourceRect.cs | 15 ++++ .../Parsing/ParsedPdfPageGeometry.cs | 8 +++ .../Parsing/StandardCriticalTableParser.cs | 3 +- .../VariantColumnCriticalTableParser.cs | 3 +- 11 files changed, 115 insertions(+), 15 deletions(-) create mode 100644 src/RolemasterDb.ImportTool/Parsing/ColumnarCellLine.cs create mode 100644 src/RolemasterDb.ImportTool/Parsing/ParsedCriticalSourceRect.cs create mode 100644 src/RolemasterDb.ImportTool/Parsing/ParsedPdfPageGeometry.cs diff --git a/src/RolemasterDb.ImportTool/Parsing/ColumnarCellEntry.cs b/src/RolemasterDb.ImportTool/Parsing/ColumnarCellEntry.cs index a1823dd..4035aca 100644 --- a/src/RolemasterDb.ImportTool/Parsing/ColumnarCellEntry.cs +++ b/src/RolemasterDb.ImportTool/Parsing/ColumnarCellEntry.cs @@ -1,10 +1,15 @@ namespace RolemasterDb.ImportTool.Parsing; -internal sealed class ColumnarCellEntry(string? groupKey, string rollBandLabel, int rowIndex, string columnKey, List lines) +internal sealed class ColumnarCellEntry( + string? groupKey, + string rollBandLabel, + int rowIndex, + string columnKey, + List lines) { public string? GroupKey { get; } = groupKey; public string RollBandLabel { get; } = rollBandLabel; public int RowIndex { get; } = rowIndex; public string ColumnKey { get; } = columnKey; - public List Lines { get; } = lines; + public List Lines { get; } = lines; } diff --git a/src/RolemasterDb.ImportTool/Parsing/ColumnarCellLine.cs b/src/RolemasterDb.ImportTool/Parsing/ColumnarCellLine.cs new file mode 100644 index 0000000..b0f7914 --- /dev/null +++ b/src/RolemasterDb.ImportTool/Parsing/ColumnarCellLine.cs @@ -0,0 +1,7 @@ +namespace RolemasterDb.ImportTool.Parsing; + +internal sealed class ColumnarCellLine(string text, List fragments) +{ + public string Text { get; } = text; + public List Fragments { get; } = fragments; +} diff --git a/src/RolemasterDb.ImportTool/Parsing/CriticalTableParseResult.cs b/src/RolemasterDb.ImportTool/Parsing/CriticalTableParseResult.cs index 79bbdcb..1d45e98 100644 --- a/src/RolemasterDb.ImportTool/Parsing/CriticalTableParseResult.cs +++ b/src/RolemasterDb.ImportTool/Parsing/CriticalTableParseResult.cs @@ -2,11 +2,13 @@ namespace RolemasterDb.ImportTool.Parsing; public sealed class CriticalTableParseResult( ParsedCriticalTable table, + IReadOnlyList pageGeometries, IReadOnlyList fragments, IReadOnlyList cells, ImportValidationReport validationReport) { public ParsedCriticalTable Table { get; } = table; + public IReadOnlyList PageGeometries { get; } = pageGeometries; public IReadOnlyList Fragments { get; } = fragments; public IReadOnlyList Cells { get; } = cells; public ImportValidationReport ValidationReport { get; } = validationReport; diff --git a/src/RolemasterDb.ImportTool/Parsing/CriticalTableParserSupport.cs b/src/RolemasterDb.ImportTool/Parsing/CriticalTableParserSupport.cs index f3e24b8..4143554 100644 --- a/src/RolemasterDb.ImportTool/Parsing/CriticalTableParserSupport.cs +++ b/src/RolemasterDb.ImportTool/Parsing/CriticalTableParserSupport.cs @@ -53,6 +53,26 @@ internal static class CriticalTableParserSupport return RemoveRedundantContainedFragments(fragments); } + internal static List LoadPageGeometries(string xmlContent) + { + using var stringReader = new StringReader(xmlContent); + using var xmlReader = XmlReader.Create( + stringReader, + new XmlReaderSettings + { + DtdProcessing = DtdProcessing.Ignore + }); + + var document = XDocument.Load(xmlReader); + + return document.Descendants("page") + .Select(page => new ParsedPdfPageGeometry( + int.Parse(page.Attribute("number")?.Value ?? "1"), + int.Parse(page.Attribute("width")?.Value ?? throw new InvalidOperationException("Missing page width attribute.")), + int.Parse(page.Attribute("height")?.Value ?? throw new InvalidOperationException("Missing page height attribute.")))) + .ToList(); + } + internal static List FindRowLabelFragments( IReadOnlyList fragments, int leftCutoff, @@ -143,7 +163,7 @@ internal static class CriticalTableParserSupport return columns[^1].Key; } - internal static IReadOnlyList BuildLines(IReadOnlyList fragments) + internal static IReadOnlyList BuildLines(IReadOnlyList fragments) { var lines = new List>(); @@ -159,8 +179,10 @@ internal static class CriticalTableParserSupport } return lines - .Select(line => CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text)))) - .Where(item => !string.IsNullOrWhiteSpace(item)) + .Select(line => new ColumnarCellLine( + CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text))), + line.OrderBy(item => item.Left).ToList())) + .Where(item => !string.IsNullOrWhiteSpace(item.Text)) .ToList(); } @@ -516,7 +538,7 @@ internal static class CriticalTableParserSupport } var leadingAffixCount = 0; - while (leadingAffixCount < next.Lines.Count && IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols)) + while (leadingAffixCount < next.Lines.Count && IsAffixLikeLine(next.Lines[leadingAffixCount].Text, affixLegendSymbols)) { leadingAffixCount++; } @@ -564,7 +586,9 @@ internal static class CriticalTableParserSupport foreach (var cellEntry in cellEntries) { - var content = SharedParsing.CriticalCellTextParser.Parse(cellEntry.Lines, sharedLegend); + var lineTexts = cellEntry.Lines.Select(line => line.Text).ToList(); + var content = SharedParsing.CriticalCellTextParser.Parse(lineTexts, sharedLegend); + var sourceBounds = BuildSourceBounds(cellEntry.Lines.SelectMany(line => line.Fragments).ToList()); validationErrors.AddRange(content.ValidationErrors.Select(error => $"Cell '{BuildCellIdentifier(cellEntry)}': {error}")); @@ -575,13 +599,14 @@ internal static class CriticalTableParserSupport cellEntry.GroupKey, cellEntry.RollBandLabel, cellEntry.ColumnKey, - cellEntry.Lines.ToList(), + lineTexts, content.BaseLines, content.RawCellText, content.DescriptionText, content.RawAffixText, effects, - branches)); + branches, + sourceBounds)); parsedResults.Add(new ParsedCriticalResult( cellEntry.GroupKey, @@ -591,10 +616,37 @@ internal static class CriticalTableParserSupport content.DescriptionText, content.RawAffixText, effects, - branches)); + branches, + sourceBounds)); } } + private static ParsedCriticalSourceRect BuildSourceBounds(IReadOnlyList fragments) + { + if (fragments.Count == 0) + { + throw new InvalidOperationException("Cannot build source bounds for an empty fragment set."); + } + + var pageNumber = fragments[0].PageNumber; + if (fragments.Any(fragment => fragment.PageNumber != pageNumber)) + { + throw new InvalidOperationException("A parsed cell spans multiple PDF pages, which cannot be cropped reliably."); + } + + var left = fragments.Min(fragment => fragment.Left); + var top = fragments.Min(fragment => fragment.Top); + var right = fragments.Max(fragment => fragment.Left + fragment.Width); + var bottom = fragments.Max(fragment => fragment.Top + fragment.Height); + + return new ParsedCriticalSourceRect( + pageNumber, + left, + top, + Math.Max(1, right - left), + Math.Max(1, bottom - top)); + } + private static SharedParsing.AffixLegend ToSharedAffixLegend(AffixLegend affixLegend) => new( affixLegend.SymbolEffects, diff --git a/src/RolemasterDb.ImportTool/Parsing/GroupedVariantCriticalTableParser.cs b/src/RolemasterDb.ImportTool/Parsing/GroupedVariantCriticalTableParser.cs index e521d9e..9ad7e43 100644 --- a/src/RolemasterDb.ImportTool/Parsing/GroupedVariantCriticalTableParser.cs +++ b/src/RolemasterDb.ImportTool/Parsing/GroupedVariantCriticalTableParser.cs @@ -17,6 +17,7 @@ public sealed class GroupedVariantCriticalTableParser public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) { var fragments = CriticalTableParserSupport.LoadFragments(xmlContent); + var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent); var groupHeaders = FindGroupHeaders(fragments); var columnHeaders = FindColumnHeaders(fragments); var validationErrors = new List(); @@ -141,7 +142,7 @@ public sealed class GroupedVariantCriticalTableParser parsedRollBands, parsedResults); - return new CriticalTableParseResult(table, fragments, parsedCells, validationReport); + return new CriticalTableParseResult(table, pageGeometries, fragments, parsedCells, validationReport); } private static List FindGroupHeaders(IReadOnlyList fragments) diff --git a/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalCellArtifact.cs b/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalCellArtifact.cs index c96fe20..ad33de0 100644 --- a/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalCellArtifact.cs +++ b/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalCellArtifact.cs @@ -10,7 +10,8 @@ public sealed class ParsedCriticalCellArtifact( string descriptionText, string? rawAffixText, IReadOnlyList effects, - IReadOnlyList branches) + IReadOnlyList branches, + ParsedCriticalSourceRect sourceBounds) { public string? GroupKey { get; } = groupKey; public string RollBandLabel { get; } = rollBandLabel; @@ -22,4 +23,7 @@ public sealed class ParsedCriticalCellArtifact( public string? RawAffixText { get; } = rawAffixText; public IReadOnlyList Effects { get; } = effects; public IReadOnlyList Branches { get; } = branches; + public ParsedCriticalSourceRect SourceBounds { get; } = sourceBounds; + public string? SourceImagePath { get; set; } + public CriticalSourceImageCrop? SourceImageCrop { get; set; } } diff --git a/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalResult.cs b/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalResult.cs index 78e31bd..8ddef64 100644 --- a/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalResult.cs +++ b/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalResult.cs @@ -8,7 +8,8 @@ public sealed class ParsedCriticalResult( string descriptionText, string? rawAffixText, IReadOnlyList effects, - IReadOnlyList branches) + IReadOnlyList branches, + ParsedCriticalSourceRect sourceBounds) { public string? GroupKey { get; } = groupKey; public string ColumnKey { get; } = columnKey; @@ -18,4 +19,7 @@ public sealed class ParsedCriticalResult( public string? RawAffixText { get; } = rawAffixText; public IReadOnlyList Effects { get; } = effects; public IReadOnlyList Branches { get; } = branches; + public ParsedCriticalSourceRect SourceBounds { get; } = sourceBounds; + public string? SourceImagePath { get; set; } + public CriticalSourceImageCrop? SourceImageCrop { get; set; } } diff --git a/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalSourceRect.cs b/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalSourceRect.cs new file mode 100644 index 0000000..26e7068 --- /dev/null +++ b/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalSourceRect.cs @@ -0,0 +1,15 @@ +namespace RolemasterDb.ImportTool.Parsing; + +public sealed class ParsedCriticalSourceRect( + int pageNumber, + int left, + int top, + int width, + int height) +{ + public int PageNumber { get; } = pageNumber; + public int Left { get; } = left; + public int Top { get; } = top; + public int Width { get; } = width; + public int Height { get; } = height; +} diff --git a/src/RolemasterDb.ImportTool/Parsing/ParsedPdfPageGeometry.cs b/src/RolemasterDb.ImportTool/Parsing/ParsedPdfPageGeometry.cs new file mode 100644 index 0000000..27663d2 --- /dev/null +++ b/src/RolemasterDb.ImportTool/Parsing/ParsedPdfPageGeometry.cs @@ -0,0 +1,8 @@ +namespace RolemasterDb.ImportTool.Parsing; + +public sealed class ParsedPdfPageGeometry(int pageNumber, int width, int height) +{ + public int PageNumber { get; } = pageNumber; + public int Width { get; } = width; + public int Height { get; } = height; +} diff --git a/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs b/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs index 50c975f..8560cac 100644 --- a/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs +++ b/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs @@ -5,6 +5,7 @@ public sealed class StandardCriticalTableParser public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) { var fragments = CriticalTableParserSupport.LoadFragments(xmlContent); + var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent); var headerFragments = FindHeaderFragments(fragments); var validationErrors = new List(); var validationWarnings = new List(); @@ -121,7 +122,7 @@ public sealed class StandardCriticalTableParser parsedRollBands, parsedResults); - return new CriticalTableParseResult(table, fragments, parsedCells, validationReport); + return new CriticalTableParseResult(table, pageGeometries, fragments, parsedCells, validationReport); } private static List FindHeaderFragments(IReadOnlyList fragments) diff --git a/src/RolemasterDb.ImportTool/Parsing/VariantColumnCriticalTableParser.cs b/src/RolemasterDb.ImportTool/Parsing/VariantColumnCriticalTableParser.cs index c2c53bf..ba9e3b0 100644 --- a/src/RolemasterDb.ImportTool/Parsing/VariantColumnCriticalTableParser.cs +++ b/src/RolemasterDb.ImportTool/Parsing/VariantColumnCriticalTableParser.cs @@ -14,6 +14,7 @@ public sealed class VariantColumnCriticalTableParser public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) { var fragments = CriticalTableParserSupport.LoadFragments(xmlContent); + var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent); var headerFragments = FindHeaderFragments(fragments); var validationErrors = new List(); var validationWarnings = new List(); @@ -137,7 +138,7 @@ public sealed class VariantColumnCriticalTableParser parsedRollBands, parsedResults); - return new CriticalTableParseResult(table, fragments, parsedCells, validationReport); + return new CriticalTableParseResult(table, pageGeometries, fragments, parsedCells, validationReport); } private static List FindHeaderFragments(IReadOnlyList fragments)