Track critical cell source bounds

This commit is contained in:
2026-03-17 22:27:57 +01:00
parent 99e7da0d21
commit 4979cf87f7
11 changed files with 115 additions and 15 deletions

View File

@@ -1,10 +1,15 @@
namespace RolemasterDb.ImportTool.Parsing; namespace RolemasterDb.ImportTool.Parsing;
internal sealed class ColumnarCellEntry(string? groupKey, string rollBandLabel, int rowIndex, string columnKey, List<string> lines) internal sealed class ColumnarCellEntry(
string? groupKey,
string rollBandLabel,
int rowIndex,
string columnKey,
List<ColumnarCellLine> lines)
{ {
public string? GroupKey { get; } = groupKey; public string? GroupKey { get; } = groupKey;
public string RollBandLabel { get; } = rollBandLabel; public string RollBandLabel { get; } = rollBandLabel;
public int RowIndex { get; } = rowIndex; public int RowIndex { get; } = rowIndex;
public string ColumnKey { get; } = columnKey; public string ColumnKey { get; } = columnKey;
public List<string> Lines { get; } = lines; public List<ColumnarCellLine> Lines { get; } = lines;
} }

View File

@@ -0,0 +1,7 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// One line of a columnar cell: the line's text together with the XML text fragments
/// that were supplied alongside it.
/// </summary>
internal sealed class ColumnarCellLine
{
    /// <summary>Creates a line from its text and its associated fragments.</summary>
    /// <param name="text">The text of the line.</param>
    /// <param name="fragments">The fragments associated with the line; stored as-is, not copied.</param>
    public ColumnarCellLine(string text, List<XmlTextFragment> fragments)
    {
        Text = text;
        Fragments = fragments;
    }

    /// <summary>The text handed to the constructor.</summary>
    public string Text { get; }

    /// <summary>The fragment list handed to the constructor (same instance).</summary>
    public List<XmlTextFragment> Fragments { get; }
}

View File

@@ -2,11 +2,13 @@ namespace RolemasterDb.ImportTool.Parsing;
public sealed class CriticalTableParseResult( public sealed class CriticalTableParseResult(
ParsedCriticalTable table, ParsedCriticalTable table,
IReadOnlyList<ParsedPdfPageGeometry> pageGeometries,
IReadOnlyList<XmlTextFragment> fragments, IReadOnlyList<XmlTextFragment> fragments,
IReadOnlyList<ParsedCriticalCellArtifact> cells, IReadOnlyList<ParsedCriticalCellArtifact> cells,
ImportValidationReport validationReport) ImportValidationReport validationReport)
{ {
public ParsedCriticalTable Table { get; } = table; public ParsedCriticalTable Table { get; } = table;
public IReadOnlyList<ParsedPdfPageGeometry> PageGeometries { get; } = pageGeometries;
public IReadOnlyList<XmlTextFragment> Fragments { get; } = fragments; public IReadOnlyList<XmlTextFragment> Fragments { get; } = fragments;
public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; } = cells; public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; } = cells;
public ImportValidationReport ValidationReport { get; } = validationReport; public ImportValidationReport ValidationReport { get; } = validationReport;

View File

@@ -53,6 +53,26 @@ internal static class CriticalTableParserSupport
return RemoveRedundantContainedFragments(fragments); return RemoveRedundantContainedFragments(fragments);
} }
/// <summary>
/// Reads the number, width and height of every <c>page</c> element found in the given XML.
/// </summary>
/// <param name="xmlContent">The XML document text to scan.</param>
/// <returns>One <see cref="ParsedPdfPageGeometry"/> per <c>page</c> element, in document order.</returns>
/// <exception cref="InvalidOperationException">
/// A <c>page</c> element has no <c>width</c> or no <c>height</c> attribute.
/// </exception>
/// <exception cref="FormatException">An attribute value is not a valid integer.</exception>
internal static List<ParsedPdfPageGeometry> LoadPageGeometries(string xmlContent)
{
    using var stringReader = new StringReader(xmlContent);
    using var xmlReader = XmlReader.Create(
        stringReader,
        new XmlReaderSettings
        {
            // Any DOCTYPE in the input is skipped rather than processed.
            DtdProcessing = DtdProcessing.Ignore
        });

    var document = XDocument.Load(xmlReader);

    // Attribute values are machine-written, so parse them culture-independently
    // instead of relying on whatever culture the current thread happens to use.
    var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

    return document.Descendants("page")
        .Select(page => new ParsedPdfPageGeometry(
            // A page without a "number" attribute is treated as page 1.
            int.Parse(page.Attribute("number")?.Value ?? "1", invariantCulture),
            int.Parse(
                page.Attribute("width")?.Value ?? throw new InvalidOperationException("Missing page width attribute."),
                invariantCulture),
            int.Parse(
                page.Attribute("height")?.Value ?? throw new InvalidOperationException("Missing page height attribute."),
                invariantCulture)))
        .ToList();
}
internal static List<XmlTextFragment> FindRowLabelFragments( internal static List<XmlTextFragment> FindRowLabelFragments(
IReadOnlyList<XmlTextFragment> fragments, IReadOnlyList<XmlTextFragment> fragments,
int leftCutoff, int leftCutoff,
@@ -143,7 +163,7 @@ internal static class CriticalTableParserSupport
return columns[^1].Key; return columns[^1].Key;
} }
internal static IReadOnlyList<string> BuildLines(IReadOnlyList<XmlTextFragment> fragments) internal static IReadOnlyList<ColumnarCellLine> BuildLines(IReadOnlyList<XmlTextFragment> fragments)
{ {
var lines = new List<List<XmlTextFragment>>(); var lines = new List<List<XmlTextFragment>>();
@@ -159,8 +179,10 @@ internal static class CriticalTableParserSupport
} }
return lines return lines
.Select(line => CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text)))) .Select(line => new ColumnarCellLine(
.Where(item => !string.IsNullOrWhiteSpace(item)) CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text))),
line.OrderBy(item => item.Left).ToList()))
.Where(item => !string.IsNullOrWhiteSpace(item.Text))
.ToList(); .ToList();
} }
@@ -516,7 +538,7 @@ internal static class CriticalTableParserSupport
} }
var leadingAffixCount = 0; var leadingAffixCount = 0;
while (leadingAffixCount < next.Lines.Count && IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols)) while (leadingAffixCount < next.Lines.Count && IsAffixLikeLine(next.Lines[leadingAffixCount].Text, affixLegendSymbols))
{ {
leadingAffixCount++; leadingAffixCount++;
} }
@@ -564,7 +586,9 @@ internal static class CriticalTableParserSupport
foreach (var cellEntry in cellEntries) foreach (var cellEntry in cellEntries)
{ {
var content = SharedParsing.CriticalCellTextParser.Parse(cellEntry.Lines, sharedLegend); var lineTexts = cellEntry.Lines.Select(line => line.Text).ToList();
var content = SharedParsing.CriticalCellTextParser.Parse(lineTexts, sharedLegend);
var sourceBounds = BuildSourceBounds(cellEntry.Lines.SelectMany(line => line.Fragments).ToList());
validationErrors.AddRange(content.ValidationErrors.Select(error => validationErrors.AddRange(content.ValidationErrors.Select(error =>
$"Cell '{BuildCellIdentifier(cellEntry)}': {error}")); $"Cell '{BuildCellIdentifier(cellEntry)}': {error}"));
@@ -575,13 +599,14 @@ internal static class CriticalTableParserSupport
cellEntry.GroupKey, cellEntry.GroupKey,
cellEntry.RollBandLabel, cellEntry.RollBandLabel,
cellEntry.ColumnKey, cellEntry.ColumnKey,
cellEntry.Lines.ToList(), lineTexts,
content.BaseLines, content.BaseLines,
content.RawCellText, content.RawCellText,
content.DescriptionText, content.DescriptionText,
content.RawAffixText, content.RawAffixText,
effects, effects,
branches)); branches,
sourceBounds));
parsedResults.Add(new ParsedCriticalResult( parsedResults.Add(new ParsedCriticalResult(
cellEntry.GroupKey, cellEntry.GroupKey,
@@ -591,10 +616,37 @@ internal static class CriticalTableParserSupport
content.DescriptionText, content.DescriptionText,
content.RawAffixText, content.RawAffixText,
effects, effects,
branches)); branches,
sourceBounds));
} }
} }
/// <summary>
/// Computes the smallest rectangle that encloses every given fragment.
/// </summary>
/// <param name="fragments">The fragments to enclose; must be non-empty and all on one page.</param>
/// <returns>
/// A rectangle on the fragments' page whose width and height are each at least 1.
/// </returns>
/// <exception cref="InvalidOperationException">
/// <paramref name="fragments"/> is empty, or its fragments are not all on the same page.
/// </exception>
private static ParsedCriticalSourceRect BuildSourceBounds(IReadOnlyList<XmlTextFragment> fragments)
{
    if (fragments.Count == 0)
    {
        throw new InvalidOperationException("Cannot build source bounds for an empty fragment set.");
    }

    // Seed the running extents from the first fragment, then widen them with the rest.
    var anchor = fragments[0];
    var page = anchor.PageNumber;
    var minLeft = anchor.Left;
    var minTop = anchor.Top;
    var maxRight = anchor.Left + anchor.Width;
    var maxBottom = anchor.Top + anchor.Height;

    foreach (var current in fragments.Skip(1))
    {
        if (current.PageNumber != page)
        {
            throw new InvalidOperationException("A parsed cell spans multiple PDF pages, which cannot be cropped reliably.");
        }

        minLeft = Math.Min(minLeft, current.Left);
        minTop = Math.Min(minTop, current.Top);
        maxRight = Math.Max(maxRight, current.Left + current.Width);
        maxBottom = Math.Max(maxBottom, current.Top + current.Height);
    }

    // Clamp so a degenerate (zero-extent) fragment set still yields a 1x1 rectangle.
    var boundsWidth = Math.Max(1, maxRight - minLeft);
    var boundsHeight = Math.Max(1, maxBottom - minTop);

    return new ParsedCriticalSourceRect(page, minLeft, minTop, boundsWidth, boundsHeight);
}
private static SharedParsing.AffixLegend ToSharedAffixLegend(AffixLegend affixLegend) => private static SharedParsing.AffixLegend ToSharedAffixLegend(AffixLegend affixLegend) =>
new( new(
affixLegend.SymbolEffects, affixLegend.SymbolEffects,

View File

@@ -17,6 +17,7 @@ public sealed class GroupedVariantCriticalTableParser
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{ {
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent); var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
var groupHeaders = FindGroupHeaders(fragments); var groupHeaders = FindGroupHeaders(fragments);
var columnHeaders = FindColumnHeaders(fragments); var columnHeaders = FindColumnHeaders(fragments);
var validationErrors = new List<string>(); var validationErrors = new List<string>();
@@ -141,7 +142,7 @@ public sealed class GroupedVariantCriticalTableParser
parsedRollBands, parsedRollBands,
parsedResults); parsedResults);
return new CriticalTableParseResult(table, fragments, parsedCells, validationReport); return new CriticalTableParseResult(table, pageGeometries, fragments, parsedCells, validationReport);
} }
private static List<XmlTextFragment> FindGroupHeaders(IReadOnlyList<XmlTextFragment> fragments) private static List<XmlTextFragment> FindGroupHeaders(IReadOnlyList<XmlTextFragment> fragments)

View File

@@ -10,7 +10,8 @@ public sealed class ParsedCriticalCellArtifact(
string descriptionText, string descriptionText,
string? rawAffixText, string? rawAffixText,
IReadOnlyList<ParsedCriticalEffect> effects, IReadOnlyList<ParsedCriticalEffect> effects,
IReadOnlyList<ParsedCriticalBranch> branches) IReadOnlyList<ParsedCriticalBranch> branches,
ParsedCriticalSourceRect sourceBounds)
{ {
public string? GroupKey { get; } = groupKey; public string? GroupKey { get; } = groupKey;
public string RollBandLabel { get; } = rollBandLabel; public string RollBandLabel { get; } = rollBandLabel;
@@ -22,4 +23,7 @@ public sealed class ParsedCriticalCellArtifact(
public string? RawAffixText { get; } = rawAffixText; public string? RawAffixText { get; } = rawAffixText;
public IReadOnlyList<ParsedCriticalEffect> Effects { get; } = effects; public IReadOnlyList<ParsedCriticalEffect> Effects { get; } = effects;
public IReadOnlyList<ParsedCriticalBranch> Branches { get; } = branches; public IReadOnlyList<ParsedCriticalBranch> Branches { get; } = branches;
public ParsedCriticalSourceRect SourceBounds { get; } = sourceBounds;
public string? SourceImagePath { get; set; }
public CriticalSourceImageCrop? SourceImageCrop { get; set; }
} }

View File

@@ -8,7 +8,8 @@ public sealed class ParsedCriticalResult(
string descriptionText, string descriptionText,
string? rawAffixText, string? rawAffixText,
IReadOnlyList<ParsedCriticalEffect> effects, IReadOnlyList<ParsedCriticalEffect> effects,
IReadOnlyList<ParsedCriticalBranch> branches) IReadOnlyList<ParsedCriticalBranch> branches,
ParsedCriticalSourceRect sourceBounds)
{ {
public string? GroupKey { get; } = groupKey; public string? GroupKey { get; } = groupKey;
public string ColumnKey { get; } = columnKey; public string ColumnKey { get; } = columnKey;
@@ -18,4 +19,7 @@ public sealed class ParsedCriticalResult(
public string? RawAffixText { get; } = rawAffixText; public string? RawAffixText { get; } = rawAffixText;
public IReadOnlyList<ParsedCriticalEffect> Effects { get; } = effects; public IReadOnlyList<ParsedCriticalEffect> Effects { get; } = effects;
public IReadOnlyList<ParsedCriticalBranch> Branches { get; } = branches; public IReadOnlyList<ParsedCriticalBranch> Branches { get; } = branches;
public ParsedCriticalSourceRect SourceBounds { get; } = sourceBounds;
public string? SourceImagePath { get; set; }
public CriticalSourceImageCrop? SourceImageCrop { get; set; }
} }

View File

@@ -0,0 +1,15 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// An integer rectangle on a numbered page, described by its left/top corner and its size.
/// </summary>
public sealed class ParsedCriticalSourceRect
{
    /// <summary>Creates a rectangle from its page number, corner and size.</summary>
    /// <param name="pageNumber">The page the rectangle belongs to.</param>
    /// <param name="left">The left coordinate.</param>
    /// <param name="top">The top coordinate.</param>
    /// <param name="width">The width.</param>
    /// <param name="height">The height.</param>
    public ParsedCriticalSourceRect(int pageNumber, int left, int top, int width, int height)
    {
        PageNumber = pageNumber;
        Left = left;
        Top = top;
        Width = width;
        Height = height;
    }

    /// <summary>The page number given at construction.</summary>
    public int PageNumber { get; }

    /// <summary>The left coordinate given at construction.</summary>
    public int Left { get; }

    /// <summary>The top coordinate given at construction.</summary>
    public int Top { get; }

    /// <summary>The width given at construction.</summary>
    public int Width { get; }

    /// <summary>The height given at construction.</summary>
    public int Height { get; }
}

View File

@@ -0,0 +1,8 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// The number and integer dimensions of a single page.
/// </summary>
public sealed class ParsedPdfPageGeometry
{
    /// <summary>Creates a page geometry from its number and dimensions.</summary>
    /// <param name="pageNumber">The page number.</param>
    /// <param name="width">The page width.</param>
    /// <param name="height">The page height.</param>
    public ParsedPdfPageGeometry(int pageNumber, int width, int height)
    {
        PageNumber = pageNumber;
        Width = width;
        Height = height;
    }

    /// <summary>The page number given at construction.</summary>
    public int PageNumber { get; }

    /// <summary>The width given at construction.</summary>
    public int Width { get; }

    /// <summary>The height given at construction.</summary>
    public int Height { get; }
}

View File

@@ -5,6 +5,7 @@ public sealed class StandardCriticalTableParser
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{ {
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent); var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
var headerFragments = FindHeaderFragments(fragments); var headerFragments = FindHeaderFragments(fragments);
var validationErrors = new List<string>(); var validationErrors = new List<string>();
var validationWarnings = new List<string>(); var validationWarnings = new List<string>();
@@ -121,7 +122,7 @@ public sealed class StandardCriticalTableParser
parsedRollBands, parsedRollBands,
parsedResults); parsedResults);
return new CriticalTableParseResult(table, fragments, parsedCells, validationReport); return new CriticalTableParseResult(table, pageGeometries, fragments, parsedCells, validationReport);
} }
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments) private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)

View File

@@ -14,6 +14,7 @@ public sealed class VariantColumnCriticalTableParser
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{ {
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent); var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
var headerFragments = FindHeaderFragments(fragments); var headerFragments = FindHeaderFragments(fragments);
var validationErrors = new List<string>(); var validationErrors = new List<string>();
var validationWarnings = new List<string>(); var validationWarnings = new List<string>();
@@ -137,7 +138,7 @@ public sealed class VariantColumnCriticalTableParser
parsedRollBands, parsedRollBands,
parsedResults); parsedResults);
return new CriticalTableParseResult(table, fragments, parsedCells, validationReport); return new CriticalTableParseResult(table, pageGeometries, fragments, parsedCells, validationReport);
} }
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments) private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)