Add OCR import support for void critical table

This commit is contained in:
2026-03-19 23:16:09 +01:00
parent b4c8f8c142
commit 7bb0c1b8d1
35 changed files with 4379 additions and 285 deletions

View File

@@ -167,6 +167,15 @@
"extractionMethod": "xml",
"pdfPath": "sources/Unbalance.pdf",
"enabled": true
},
{
"slug": "void",
"displayName": "Void Critical Strike Table",
"family": "standard",
"extractionMethod": "ocr",
"axisTemplateSlug": "mana-standard-19",
"pdfPath": "sources/Void.pdf",
"enabled": true
}
]
}

Binary file not shown.

View File

@@ -478,12 +478,12 @@ public sealed class CriticalCellReparseIntegrationTests
initialResponse.Branches));
Assert.NotNull(saveResponse);
Assert.Contains(saveResponse!.Effects, effect => effect.EffectCode == AppCriticalEffectCodes.PowerPointModifier && effect.ValueExpression == "2d10-16");
Assert.Contains(saveResponse!.Effects, effect => effect.EffectCode == AppCriticalEffectCodes.PowerPointModifier && effect.ValueExpression == "+2d10-16");
var reopenedResponse = await lookupService.GetCriticalCellEditorAsync("mana", resultId);
Assert.NotNull(reopenedResponse);
Assert.Contains("-2d10-16pp", reopenedResponse!.QuickParseInput, StringComparison.Ordinal);
Assert.Contains(reopenedResponse.Effects, effect => effect.EffectCode == AppCriticalEffectCodes.PowerPointModifier && effect.ValueExpression == "2d10-16");
Assert.Contains(reopenedResponse.Effects, effect => effect.EffectCode == AppCriticalEffectCodes.PowerPointModifier && effect.ValueExpression == "+2d10-16");
var reparsed = await lookupService.ReparseCriticalCellAsync(
"mana",
@@ -643,20 +643,5 @@ public sealed class CriticalCellReparseIntegrationTests
await RolemasterDbSchemaUpgrader.EnsureLatestAsync(dbContext);
}
private static string GetRepositoryRoot()
{
var probe = new DirectoryInfo(AppContext.BaseDirectory);
while (probe is not null)
{
if (File.Exists(Path.Combine(probe.FullName, "RolemasterDB.slnx")))
{
return probe.FullName;
}
probe = probe.Parent;
}
throw new InvalidOperationException("Could not find the repository root for integration tests.");
}
private static string GetRepositoryRoot() => TestRepositoryPaths.GetRepositoryRoot();
}

View File

@@ -6,6 +6,7 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
{
private static readonly PdfXmlExtractor Extractor = new();
private static readonly StandardCriticalTableParser StandardParser = new();
private static readonly StandardOcrBootstrapper StandardOcrBootstrapper = new();
[Fact]
public async Task Generated_artifacts_include_page_and_cell_source_images()
@@ -32,6 +33,34 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
Assert.True(File.Exists(artifactPaths.ResolveRelativePath(result.SourceImagePath!)));
}
/// <summary>
/// Checks that, for the void table built from the OCR fixture, the generated crop metadata
/// uses the OCR render profile (scale factor 1, 3600x5070 page) and that the page and cell
/// images exist on disk after artifact generation.
/// </summary>
[Fact]
public async Task Generated_ocr_artifacts_preserve_pixel_space_crop_metadata()
{
    const string rollBand = "96-99";
    const string column = "D";

    var (parsed, paths) = await LoadPreparedVoidParseResultAsync();
    var voidResult = FindResult(parsed, rollBand, column);
    var matchingCell = parsed.Cells.Single(cell =>
        cell.GroupKey is null
        && cell.RollBandLabel == rollBand
        && cell.ColumnKey == column);

    // The parsed result must carry usable source bounds.
    Assert.True(voidResult.SourceBounds.PageNumber > 0);
    Assert.True(voidResult.SourceBounds.Width > 0);
    Assert.True(voidResult.SourceBounds.Height > 0);

    // Crop metadata is expected to be expressed directly in rendered pixels.
    Assert.NotNull(voidResult.SourceImagePath);
    Assert.NotNull(voidResult.SourceImageCrop);
    var crop = voidResult.SourceImageCrop!;
    Assert.Equal(1, crop.ScaleFactor);
    Assert.Equal(PdfXmlExtractor.ScaledRenderDpi, crop.RenderDpi);
    Assert.Equal(3600, crop.PageWidth);
    Assert.Equal(5070, crop.PageHeight);
    Assert.Equal(voidResult.SourceBounds.Width, crop.BoundsWidth);
    Assert.Equal(voidResult.SourceBounds.Height, crop.BoundsHeight);

    // The cell artifact mirrors the result, and both images were written.
    Assert.Equal(voidResult.SourceImagePath, matchingCell.SourceImagePath);
    Assert.NotNull(matchingCell.SourceImageCrop);
    Assert.True(File.Exists(paths.GetPageImagePath(voidResult.SourceBounds.PageNumber)));
    Assert.True(File.Exists(paths.ResolveRelativePath(voidResult.SourceImagePath!)));
}
private static async Task<(CriticalTableParseResult ParseResult, ImportArtifactPaths ArtifactPaths)> LoadPreparedSlashParseResultAsync()
{
var entry = LoadManifest().Tables.Single(item => item.Slug == "slash");
@@ -51,6 +80,25 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
return (parseResult, artifactPaths);
}
/// <summary>
/// Builds the void table parse result from the checked-in OCR TSV fixture and generates its
/// source image artifacts into a fresh, uniquely named directory under the artifact cache root.
/// </summary>
/// <returns>The parse result together with the artifact paths the images were written to.</returns>
private static async Task<(CriticalTableParseResult ParseResult, ImportArtifactPaths ArtifactPaths)> LoadPreparedVoidParseResultAsync()
{
    var voidEntry = LoadManifest().Tables.Single(table => table.Slug == "void");

    // Assemble the OCR source from the fixture instead of running OCR.
    var ocrSource = new ExtractedCriticalSource(
        "ocr",
        "Imported from PDF OCR extraction.",
        SourceRenderProfile.OcrPixels(PdfXmlExtractor.ScaledRenderDpi),
        [new ParsedPdfPageGeometry(1, 3600, 5070)],
        OcrCriticalSourceExtractor.ParseTsv(await File.ReadAllTextAsync(GetVoidFixturePath())));

    var axisTemplate = StandardTableAxisTemplateCatalog.Resolve(voidEntry.AxisTemplateSlug);
    var bootstrappedLayout = StandardOcrBootstrapper.Bootstrap(ocrSource, axisTemplate);
    var parsed = StandardParser.Parse(voidEntry, ocrSource, bootstrappedLayout);

    // A GUID-named directory keeps artifacts from different runs apart.
    var uniqueRoot = Path.Combine(GetArtifactCacheRoot(), Guid.NewGuid().ToString("N"));
    var paths = ImportArtifactPaths.Create(uniqueRoot, voidEntry.Slug);
    var pdfPath = Path.Combine(GetRepositoryRoot(), voidEntry.PdfPath);

    var imageGenerator = new CriticalSourceImageArtifactGenerator(new PdfXmlExtractor());
    await imageGenerator.GenerateAsync(pdfPath, paths, parsed);

    return (parsed, paths);
}
private static ParsedCriticalResult FindResult(CriticalTableParseResult parseResult, string rollBandLabel, string columnKey) =>
parseResult.Table.Results.Single(item =>
item.GroupKey is null &&
@@ -60,6 +108,9 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
private static CriticalImportManifest LoadManifest() =>
new CriticalImportManifestLoader().Load(Path.Combine(GetRepositoryRoot(), "sources", "critical-import-manifest.json"));
/// <summary>
/// Returns the path of the void table OCR TSV fixture inside the test project.
/// </summary>
private static string GetVoidFixturePath()
{
    string[] segments =
    [
        GetRepositoryRoot(),
        "src",
        "RolemasterDb.ImportTool.Tests",
        "Fixtures",
        "Void",
        "source.ocr.tsv"
    ];

    return Path.Combine(segments);
}
private static string GetArtifactCacheRoot()
{
var cacheRoot = Path.Combine(Path.GetTempPath(), "RolemasterDb.ImportTool.MergeTests");
@@ -67,20 +118,5 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
return cacheRoot;
}
private static string GetRepositoryRoot()
{
var probe = new DirectoryInfo(AppContext.BaseDirectory);
while (probe is not null)
{
if (File.Exists(Path.Combine(probe.FullName, "RolemasterDB.slnx")))
{
return probe.FullName;
}
probe = probe.Parent;
}
throw new InvalidOperationException("Could not find the repository root for integration tests.");
}
private static string GetRepositoryRoot() => TestRepositoryPaths.GetRepositoryRoot();
}

View File

@@ -315,20 +315,5 @@ public sealed class CriticalImportMergeIntegrationTests
return cacheRoot;
}
private static string GetRepositoryRoot()
{
var probe = new DirectoryInfo(AppContext.BaseDirectory);
while (probe is not null)
{
if (File.Exists(Path.Combine(probe.FullName, "RolemasterDB.slnx")))
{
return probe.FullName;
}
probe = probe.Parent;
}
throw new InvalidOperationException("Could not find the repository root for integration tests.");
}
private static string GetRepositoryRoot() => TestRepositoryPaths.GetRepositoryRoot();
}

File diff suppressed because it is too large Load Diff

View File

@@ -33,11 +33,13 @@ public sealed class StandardCriticalTableParserIntegrationTests
"subdual",
"super_large_creature_weapon",
"tiny",
"unbalance"
"unbalance",
"void"
];
private static readonly PdfXmlExtractor Extractor = new();
private static readonly StandardCriticalTableParser StandardParser = new();
private static readonly StandardOcrBootstrapper StandardOcrBootstrapper = new();
private static readonly VariantColumnCriticalTableParser VariantColumnParser = new();
private static readonly GroupedVariantCriticalTableParser GroupedVariantParser = new();
@@ -57,6 +59,7 @@ public sealed class StandardCriticalTableParserIntegrationTests
yield return new object[] { "mana", null!, "96-99", "E", "momentarily transformed" };
yield return new object[] { "mana", null!, "100", "E", "Mana consumes everything" };
yield return new object[] { "tiny", null!, "100", "E", "Vein and artery severed" };
yield return new object[] { "void", null!, "96-99", "D", "Foe inhales the void" };
yield return new object[] { "large_creature_weapon", null!, "01-05", "NORMAL", "Weapon shatters" };
yield return new object[] { "super_large_creature_weapon", null!, "31-40", "SLAYING", "Boom! Solid without question" };
yield return new object[] { "large_creature_magic", "large", "251+", "NORMAL", "Foe lowers his eyes within your reach" };
@@ -75,13 +78,16 @@ public sealed class StandardCriticalTableParserIntegrationTests
Assert.Equal(ExpectedEnabledSlugs, enabledTables.Select(item => item.Slug));
Assert.All(enabledTables, entry =>
{
Assert.Equal("xml", entry.ExtractionMethod);
Assert.True(
new[] { "xml", "ocr" }.Contains(entry.ExtractionMethod, StringComparer.Ordinal),
$"Unexpected extraction method '{entry.ExtractionMethod}' for '{entry.Slug}'.");
Assert.True(File.Exists(Path.Combine(GetRepositoryRoot(), entry.PdfPath)), $"Missing source PDF for '{entry.Slug}'.");
});
Assert.Equal("variant_column", enabledTables.Single(item => item.Slug == "large_creature_weapon").Family);
Assert.Equal("variant_column", enabledTables.Single(item => item.Slug == "super_large_creature_weapon").Family);
Assert.Equal("grouped_variant", enabledTables.Single(item => item.Slug == "large_creature_magic").Family);
Assert.Equal("ocr", enabledTables.Single(item => item.Slug == "void").ExtractionMethod);
}
[Theory]
@@ -604,6 +610,25 @@ public sealed class StandardCriticalTableParserIntegrationTests
Assert.StartsWith("Strike to foe's hip.", result.RawCellText, StringComparison.Ordinal);
}
/// <summary>
/// Loads the parsed void table into a temporary database copy and checks that
/// exactly 95 critical results are stored for it.
/// </summary>
[Fact]
public async Task Loader_persists_void_table_from_fixture()
{
    const string slug = "void";

    var voidEntry = LoadManifest().Tables.Single(table => string.Equals(table.Slug, slug, StringComparison.Ordinal));
    var parsed = await LoadParseResultAsync(voidEntry);

    var temporaryDatabasePath = CreateTemporaryDatabaseCopy();
    var importLoader = new CriticalImportLoader(temporaryDatabasePath);
    await importLoader.LoadAsync(parsed.Table);

    await using var context = CreateDbContext(temporaryDatabasePath);
    var persistedCount = await context.CriticalResults
        .Include(row => row.CriticalTable)
        .Where(row => row.CriticalTable.Slug == slug)
        .CountAsync();

    Assert.Equal(95, persistedCount);
}
[Fact]
public async Task Lookup_service_returns_effects_for_results_and_branches()
{
@@ -632,6 +657,25 @@ public sealed class StandardCriticalTableParserIntegrationTests
private static async Task<CriticalTableParseResult> LoadParseResultAsync(CriticalImportManifestEntry entry)
{
if (string.Equals(entry.ExtractionMethod, "ocr", StringComparison.OrdinalIgnoreCase))
{
var tsvContent = await File.ReadAllTextAsync(Path.Combine(GetRepositoryRoot(), "src", "RolemasterDb.ImportTool.Tests", "Fixtures", "Void", "source.ocr.tsv"));
var source = new ExtractedCriticalSource(
"ocr",
"Imported from PDF OCR extraction.",
SourceRenderProfile.OcrPixels(PdfXmlExtractor.ScaledRenderDpi),
[new ParsedPdfPageGeometry(1, 3600, 5070)],
OcrCriticalSourceExtractor.ParseTsv(tsvContent));
return entry.Family switch
{
"standard" => StandardParser.Parse(entry, source, StandardOcrBootstrapper.Bootstrap(source, StandardTableAxisTemplateCatalog.Resolve(entry.AxisTemplateSlug))),
"variant_column" => VariantColumnParser.Parse(entry, source),
"grouped_variant" => GroupedVariantParser.Parse(entry, source),
_ => throw new InvalidOperationException($"Unsupported manifest family '{entry.Family}'.")
};
}
var xmlPath = Path.Combine(GetArtifactCacheRoot(), $"{entry.Slug}.xml");
if (!File.Exists(xmlPath))
@@ -701,20 +745,5 @@ public sealed class StandardCriticalTableParserIntegrationTests
await RolemasterDbSchemaUpgrader.EnsureLatestAsync(dbContext);
}
private static string GetRepositoryRoot()
{
var probe = new DirectoryInfo(AppContext.BaseDirectory);
while (probe is not null)
{
if (File.Exists(Path.Combine(probe.FullName, "RolemasterDB.slnx")))
{
return probe.FullName;
}
probe = probe.Parent;
}
throw new InvalidOperationException("Could not find the repository root for integration tests.");
}
private static string GetRepositoryRoot() => TestRepositoryPaths.GetRepositoryRoot();
}

View File

@@ -0,0 +1,39 @@
namespace RolemasterDb.ImportTool.Tests;
/// <summary>
/// Locates the repository root for integration tests by looking for <c>RolemasterDB.slnx</c>.
/// </summary>
internal static class TestRepositoryPaths
{
    private const string RepositoryRootEnvironmentVariable = "ROLEMASTERDB_REPOSITORY_ROOT";

    /// <summary>
    /// Resolves the repository root directory.
    /// </summary>
    /// <remarks>
    /// Lookup order: the directory named by the <c>ROLEMASTERDB_REPOSITORY_ROOT</c> environment
    /// variable (only when it contains the solution file), then the ancestors of
    /// <see cref="AppContext.BaseDirectory"/>, then the ancestors of the current directory.
    /// </remarks>
    /// <returns>The full path of the first directory found that contains the solution file.</returns>
    /// <exception cref="InvalidOperationException">No candidate directory contains the solution file.</exception>
    public static string GetRepositoryRoot()
    {
        var overridePath = Environment.GetEnvironmentVariable(RepositoryRootEnvironmentVariable);
        if (!string.IsNullOrWhiteSpace(overridePath))
        {
            var candidate = Path.GetFullPath(overridePath);
            if (HasSolutionFile(candidate))
            {
                return candidate;
            }

            // An override without the solution file is ignored; probing continues below.
        }

        DirectoryInfo[] startingPoints =
        [
            new DirectoryInfo(AppContext.BaseDirectory),
            new DirectoryInfo(Directory.GetCurrentDirectory())
        ];

        foreach (var start in startingPoints)
        {
            var directory = start;
            while (directory is not null)
            {
                if (HasSolutionFile(directory.FullName))
                {
                    return directory.FullName;
                }

                directory = directory.Parent;
            }
        }

        throw new InvalidOperationException(
            $"Could not find the repository root for integration tests. Set {RepositoryRootEnvironmentVariable} to the repository path.");
    }

    // True when the given directory directly contains the solution file.
    private static bool HasSolutionFile(string directoryPath) =>
        File.Exists(Path.Combine(directoryPath, "RolemasterDB.slnx"));
}

View File

@@ -7,6 +7,7 @@ public sealed class CriticalImportCommandRunner
private readonly CriticalImportManifestLoader manifestLoader = new();
private readonly ImportArtifactWriter artifactWriter = new();
private readonly PdfXmlExtractor pdfXmlExtractor = new();
private readonly StandardOcrBootstrapper standardOcrBootstrapper = new();
private readonly CriticalSourceImageArtifactGenerator sourceImageArtifactGenerator;
private readonly StandardCriticalTableParser standardParser = new();
private readonly VariantColumnCriticalTableParser variantColumnParser = new();
@@ -35,8 +36,9 @@ public sealed class CriticalImportCommandRunner
{
var entry = GetManifestEntry(options.Table);
var artifactPaths = CreateArtifactPaths(entry.Slug);
await pdfXmlExtractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths.XmlPath);
Console.WriteLine($"Extracted {entry.Slug} to {artifactPaths.XmlPath}");
var extractor = CreateSourceExtractor(entry);
await extractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths, CancellationToken.None);
Console.WriteLine($"Extracted {entry.Slug} to {artifactPaths.GetSourceArtifactPath(entry.ExtractionMethod)}");
return 0;
}
@@ -44,15 +46,8 @@ public sealed class CriticalImportCommandRunner
{
var entry = GetManifestEntry(options.Table);
var artifactPaths = CreateArtifactPaths(entry.Slug);
if (!File.Exists(artifactPaths.XmlPath))
{
Console.Error.WriteLine($"Missing XML artifact: {artifactPaths.XmlPath}");
return 1;
}
var xmlContent = await File.ReadAllTextAsync(artifactPaths.XmlPath);
var parseResult = Parse(entry, xmlContent);
var extractedSource = await LoadExtractedSourceAsync(entry, artifactPaths);
var parseResult = Parse(entry, extractedSource);
await sourceImageArtifactGenerator.GenerateAsync(
ResolveRepositoryPath(entry.PdfPath),
artifactPaths,
@@ -104,14 +99,14 @@ public sealed class CriticalImportCommandRunner
{
var entry = GetManifestEntry(options.Table);
var artifactPaths = CreateArtifactPaths(entry.Slug);
if (!File.Exists(artifactPaths.XmlPath))
var extractor = CreateSourceExtractor(entry);
if (!File.Exists(artifactPaths.GetSourceArtifactPath(entry.ExtractionMethod)))
{
await pdfXmlExtractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths.XmlPath);
await extractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths, CancellationToken.None);
}
var xmlContent = await File.ReadAllTextAsync(artifactPaths.XmlPath);
var parseResult = Parse(entry, xmlContent);
var extractedSource = await extractor.LoadAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths, CancellationToken.None);
var parseResult = Parse(entry, extractedSource);
await sourceImageArtifactGenerator.GenerateAsync(
ResolveRepositoryPath(entry.PdfPath),
artifactPaths,
@@ -143,26 +138,61 @@ public sealed class CriticalImportCommandRunner
?? throw new InvalidOperationException($"No enabled manifest entry was found for '{tableSlug}'.");
}
private CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
/// <summary>
/// Loads the previously extracted source for a manifest entry using the extractor that
/// matches its extraction method.
/// </summary>
/// <param name="entry">The manifest entry whose source artifact should be loaded.</param>
/// <param name="artifactPaths">The artifact locations for the entry's table.</param>
/// <returns>The extracted source returned by the extractor.</returns>
/// <exception cref="FileNotFoundException">
/// The source artifact does not exist; the same message is also written to standard error.
/// </exception>
private async Task<ExtractedCriticalSource> LoadExtractedSourceAsync(CriticalImportManifestEntry entry, ImportArtifactPaths artifactPaths)
{
    // Resolve the extractor first so an unsupported method fails before the file check.
    var sourceExtractor = CreateSourceExtractor(entry);
    var artifactPath = artifactPaths.GetSourceArtifactPath(entry.ExtractionMethod);

    if (File.Exists(artifactPath))
    {
        var pdfPath = ResolveRepositoryPath(entry.PdfPath);
        return await sourceExtractor.LoadAsync(pdfPath, artifactPaths, CancellationToken.None);
    }

    var message = $"Missing source artifact: {artifactPath}";
    Console.Error.WriteLine(message);
    throw new FileNotFoundException(message, artifactPath);
}
private CriticalTableParseResult Parse(CriticalImportManifestEntry entry, ExtractedCriticalSource source)
{
if (string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase))
{
return standardParser.Parse(entry, xmlContent);
if (string.Equals(entry.ExtractionMethod, "ocr", StringComparison.OrdinalIgnoreCase))
{
var template = StandardTableAxisTemplateCatalog.Resolve(entry.AxisTemplateSlug);
var layout = standardOcrBootstrapper.Bootstrap(source, template);
return standardParser.Parse(entry, source, layout);
}
return standardParser.Parse(entry, source);
}
if (string.Equals(entry.Family, "variant_column", StringComparison.OrdinalIgnoreCase))
{
return variantColumnParser.Parse(entry, xmlContent);
return variantColumnParser.Parse(entry, source);
}
if (string.Equals(entry.Family, "grouped_variant", StringComparison.OrdinalIgnoreCase))
{
return groupedVariantParser.Parse(entry, xmlContent);
return groupedVariantParser.Parse(entry, source);
}
throw new InvalidOperationException($"Family '{entry.Family}' is not supported by the importer.");
}
/// <summary>
/// Creates the source extractor for a manifest entry's extraction method
/// (<c>xml</c> or <c>ocr</c>, compared case-insensitively).
/// </summary>
/// <param name="entry">The manifest entry to create an extractor for.</param>
/// <returns>A new extractor wrapping the shared <c>pdfXmlExtractor</c>.</returns>
/// <exception cref="InvalidOperationException">The extraction method is neither <c>xml</c> nor <c>ocr</c>.</exception>
private ICriticalSourceExtractor CreateSourceExtractor(CriticalImportManifestEntry entry)
{
    bool UsesMethod(string name) =>
        string.Equals(entry.ExtractionMethod, name, StringComparison.OrdinalIgnoreCase);

    ICriticalSourceExtractor? extractor = null;
    if (UsesMethod("xml"))
    {
        extractor = new XmlCriticalSourceExtractor(pdfXmlExtractor);
    }
    else if (UsesMethod("ocr"))
    {
        extractor = new OcrCriticalSourceExtractor(pdfXmlExtractor);
    }

    return extractor
        ?? throw new InvalidOperationException($"Extraction method '{entry.ExtractionMethod}' is not supported by the importer.");
}
private static ImportArtifactPaths CreateArtifactPaths(string slug) =>
ImportArtifactPaths.Create(RepositoryPaths.Discover().ArtifactsRootPath, slug);

View File

@@ -6,6 +6,7 @@ public sealed class CriticalImportManifestEntry
public string DisplayName { get; set; } = string.Empty;
public string Family { get; set; } = string.Empty;
public string ExtractionMethod { get; set; } = string.Empty;
public string? AxisTemplateSlug { get; set; }
public string PdfPath { get; set; } = string.Empty;
public bool Enabled { get; set; } = true;
}

View File

@@ -23,6 +23,7 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
pdfPath,
pageGeometry.PageNumber,
artifactPaths.GetPageImagePath(pageGeometry.PageNumber),
parseResult.RenderProfile.RenderDpi,
cancellationToken);
}
@@ -38,7 +39,7 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
$"Missing page geometry for page {result.SourceBounds.PageNumber} in table '{parseResult.Table.Slug}'.");
}
var crop = CreateCrop(result.SourceBounds, pageGeometry);
var crop = CreateCrop(result.SourceBounds, pageGeometry, parseResult.RenderProfile);
var relativePath = artifactPaths.GetRelativeCellImagePath(result.GroupKey, result.ColumnKey, result.RollBandLabel);
var fullPath = artifactPaths.ResolveRelativePath(relativePath);
@@ -50,6 +51,7 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
crop.CropWidth,
crop.CropHeight,
fullPath,
parseResult.RenderProfile.RenderDpi,
cancellationToken);
result.SourceImagePath = relativePath;
@@ -66,7 +68,8 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
private static CriticalSourceImageCrop CreateCrop(
ParsedCriticalSourceRect sourceBounds,
ParsedPdfPageGeometry pageGeometry)
ParsedPdfPageGeometry pageGeometry,
SourceRenderProfile renderProfile)
{
var cropLeft = Math.Max(0, sourceBounds.Left - CropPaddingX);
var cropTop = Math.Max(0, sourceBounds.Top - CropPaddingY);
@@ -75,18 +78,18 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
return new CriticalSourceImageCrop(
sourceBounds.PageNumber,
PdfXmlExtractor.ScaleCoordinate(pageGeometry.Width),
PdfXmlExtractor.ScaleCoordinate(pageGeometry.Height),
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Left),
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Top),
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Width),
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Height),
PdfXmlExtractor.ScaleCoordinate(cropLeft),
PdfXmlExtractor.ScaleCoordinate(cropTop),
PdfXmlExtractor.ScaleCoordinate(Math.Max(1, cropRight - cropLeft)),
PdfXmlExtractor.ScaleCoordinate(Math.Max(1, cropBottom - cropTop)),
PdfXmlExtractor.ScaledRenderDpi,
PdfXmlExtractor.RenderScaleFactor);
renderProfile.ScaleCoordinate(pageGeometry.Width),
renderProfile.ScaleCoordinate(pageGeometry.Height),
renderProfile.ScaleCoordinate(sourceBounds.Left),
renderProfile.ScaleCoordinate(sourceBounds.Top),
renderProfile.ScaleCoordinate(sourceBounds.Width),
renderProfile.ScaleCoordinate(sourceBounds.Height),
renderProfile.ScaleCoordinate(cropLeft),
renderProfile.ScaleCoordinate(cropTop),
renderProfile.ScaleCoordinate(Math.Max(1, cropRight - cropLeft)),
renderProfile.ScaleCoordinate(Math.Max(1, cropBottom - cropTop)),
renderProfile.RenderDpi,
renderProfile.ScaleFactor);
}
private static string CreateCellKey(string? groupKey, string rollBandLabel, string columnKey) =>

View File

@@ -2,7 +2,7 @@ using CommandLine;
namespace RolemasterDb.ImportTool;
[Verb("extract", HelpText = "Extract a critical table PDF into a text artifact.")]
[Verb("extract", HelpText = "Extract a critical table PDF into its source artifact.")]
public sealed class ExtractOptions
{
[Value(0, MetaName = "table", Required = true, HelpText = "The manifest slug of the critical table to extract.")]

View File

@@ -0,0 +1,17 @@
using RolemasterDb.ImportTool.Parsing;
namespace RolemasterDb.ImportTool;
/// <summary>
/// Immutable bundle describing an extracted critical-table source: how it was extracted,
/// the import notes, the render profile, the page geometries and the positioned text fragments.
/// </summary>
public sealed class ExtractedCriticalSource
{
    /// <summary>
    /// Stores the supplied values unchanged; no validation or copying is performed.
    /// </summary>
    public ExtractedCriticalSource(
        string extractionMethod,
        string importNotes,
        SourceRenderProfile renderProfile,
        IReadOnlyList<ParsedPdfPageGeometry> pageGeometries,
        IReadOnlyList<PositionedTextFragment> fragments)
    {
        ExtractionMethod = extractionMethod;
        ImportNotes = importNotes;
        RenderProfile = renderProfile;
        PageGeometries = pageGeometries;
        Fragments = fragments;
    }

    /// <summary>The extraction method label supplied at construction (e.g. "ocr").</summary>
    public string ExtractionMethod { get; }

    /// <summary>The import notes supplied at construction.</summary>
    public string ImportNotes { get; }

    /// <summary>The render profile supplied at construction.</summary>
    public SourceRenderProfile RenderProfile { get; }

    /// <summary>The page geometries supplied at construction.</summary>
    public IReadOnlyList<ParsedPdfPageGeometry> PageGeometries { get; }

    /// <summary>The positioned text fragments supplied at construction.</summary>
    public IReadOnlyList<PositionedTextFragment> Fragments { get; }
}

View File

@@ -0,0 +1,11 @@
namespace RolemasterDb.ImportTool;
/// <summary>
/// Abstraction over the ways a critical table PDF can be turned into an
/// <see cref="ExtractedCriticalSource"/>; the command runner picks an implementation
/// from the manifest entry's extraction method.
/// </summary>
public interface ICriticalSourceExtractor
{
/// <summary>
/// Extracts the source artifact for <paramref name="pdfPath"/> into the locations described
/// by <paramref name="artifactPaths"/>.
/// </summary>
/// <param name="pdfPath">Path of the source PDF.</param>
/// <param name="artifactPaths">Artifact locations for the table being extracted.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
Task ExtractAsync(string pdfPath, ImportArtifactPaths artifactPaths, CancellationToken cancellationToken = default);
/// <summary>
/// Loads an extracted source and returns it as an <see cref="ExtractedCriticalSource"/>.
/// </summary>
/// <remarks>
/// NOTE(review): the OCR implementation throws <see cref="System.IO.FileNotFoundException"/>
/// when its artifacts are missing; confirm other implementations behave the same way.
/// </remarks>
/// <param name="pdfPath">Path of the source PDF.</param>
/// <param name="artifactPaths">Artifact locations for the table being loaded.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
Task<ExtractedCriticalSource> LoadAsync(
string pdfPath,
ImportArtifactPaths artifactPaths,
CancellationToken cancellationToken = default);
}

View File

@@ -9,9 +9,11 @@ public sealed class ImportArtifactPaths
string tableSlug,
string directoryPath,
string xmlPath,
string ocrTsvPath,
string fragmentsJsonPath,
string parsedCellsJsonPath,
string validationReportPath,
string ocrPagesDirectoryPath,
string pagesDirectoryPath,
string cellsDirectoryPath)
{
@@ -19,9 +21,11 @@ public sealed class ImportArtifactPaths
TableSlug = tableSlug;
DirectoryPath = directoryPath;
XmlPath = xmlPath;
OcrTsvPath = ocrTsvPath;
FragmentsJsonPath = fragmentsJsonPath;
ParsedCellsJsonPath = parsedCellsJsonPath;
ValidationReportPath = validationReportPath;
OcrPagesDirectoryPath = ocrPagesDirectoryPath;
PagesDirectoryPath = pagesDirectoryPath;
CellsDirectoryPath = cellsDirectoryPath;
}
@@ -30,15 +34,18 @@ public sealed class ImportArtifactPaths
public string TableSlug { get; }
public string DirectoryPath { get; }
public string XmlPath { get; }
public string OcrTsvPath { get; }
public string FragmentsJsonPath { get; }
public string ParsedCellsJsonPath { get; }
public string ValidationReportPath { get; }
public string OcrPagesDirectoryPath { get; }
public string PagesDirectoryPath { get; }
public string CellsDirectoryPath { get; }
public static ImportArtifactPaths Create(string artifactsRootPath, string tableSlug)
{
var directoryPath = Path.Combine(artifactsRootPath, tableSlug);
var ocrPagesDirectoryPath = Path.Combine(directoryPath, "ocr-pages");
var pagesDirectoryPath = Path.Combine(directoryPath, "pages");
var cellsDirectoryPath = Path.Combine(directoryPath, "cells");
@@ -47,13 +54,23 @@ public sealed class ImportArtifactPaths
tableSlug,
directoryPath,
Path.Combine(directoryPath, "source.xml"),
Path.Combine(directoryPath, "source.ocr.tsv"),
Path.Combine(directoryPath, "fragments.json"),
Path.Combine(directoryPath, "parsed-cells.json"),
Path.Combine(directoryPath, "validation-report.json"),
ocrPagesDirectoryPath,
pagesDirectoryPath,
cellsDirectoryPath);
}
/// <summary>
/// Returns the path of the source artifact for the given extraction method.
/// </summary>
/// <param name="extractionMethod">The extraction method; "ocr" is matched case-insensitively.</param>
/// <returns><see cref="OcrTsvPath"/> for "ocr"; <see cref="XmlPath"/> for every other value.</returns>
public string GetSourceArtifactPath(string extractionMethod)
{
    var isOcr = string.Equals(extractionMethod, "ocr", StringComparison.OrdinalIgnoreCase);
    if (isOcr)
    {
        return OcrTsvPath;
    }

    return XmlPath;
}
/// <summary>
/// Returns the PNG path for an OCR page image: <c>page-NNN.png</c> (page number padded to at
/// least three digits) inside <see cref="OcrPagesDirectoryPath"/>.
/// </summary>
/// <param name="pageNumber">The page number used in the file name.</param>
public string GetOcrPageImagePath(int pageNumber)
{
    var fileName = $"page-{pageNumber:000}.png";
    return Path.Combine(OcrPagesDirectoryPath, fileName);
}
/// <summary>
/// Returns the PNG path for a rendered page image: <c>page-NNN.png</c> (page number padded to
/// at least three digits) inside <see cref="PagesDirectoryPath"/>.
/// </summary>
/// <param name="pageNumber">The page number used in the file name.</param>
public string GetPageImagePath(int pageNumber)
{
    var fileName = $"page-{pageNumber:000}.png";
    return Path.Combine(PagesDirectoryPath, fileName);
}

View File

@@ -2,7 +2,7 @@ using CommandLine;
namespace RolemasterDb.ImportTool;
[Verb("load", HelpText = "Load a parsed critical table from its extracted text artifact.")]
[Verb("load", HelpText = "Load a parsed critical table from its extracted source artifact.")]
public sealed class LoadOptions
{
[Value(0, MetaName = "table", Required = true, HelpText = "The manifest slug of the critical table to load.")]

View File

@@ -0,0 +1,204 @@
using System.Globalization;
using System.Text;
using RolemasterDb.ImportTool.Parsing;
namespace RolemasterDb.ImportTool;
public sealed class OcrCriticalSourceExtractor(PdfXmlExtractor pdfXmlExtractor) : ICriticalSourceExtractor
{
private const int OcrRenderDpi = PdfXmlExtractor.ScaledRenderDpi;
private const string TesseractExeDefaultPath = @"C:\Program Files\Sejda PDF Desktop\resources\vendor\tesseract-windows-x64\tesseract.exe";
private const string TessdataDefaultPath = @"C:\Program Files\Sejda PDF Desktop\resources\vendor\tessdata";
/// <summary>
/// Renders page 1 of the PDF to a PNG, runs tesseract on it and writes the resulting TSV
/// to <see cref="ImportArtifactPaths.OcrTsvPath"/>.
/// </summary>
/// <param name="pdfPath">Path of the source PDF.</param>
/// <param name="artifactPaths">Artifact locations; the table and OCR page directories are created if needed.</param>
/// <param name="cancellationToken">Token passed on to every asynchronous step.</param>
/// <exception cref="InvalidOperationException">The PDF does not have exactly one page.</exception>
public async Task ExtractAsync(string pdfPath, ImportArtifactPaths artifactPaths, CancellationToken cancellationToken = default)
{
    const int onlySupportedPage = 1;

    foreach (var directory in new[] { artifactPaths.DirectoryPath, artifactPaths.OcrPagesDirectoryPath })
    {
        Directory.CreateDirectory(directory);
    }

    var documentInfo = await pdfXmlExtractor.ReadDocumentInfoAsync(pdfPath, cancellationToken);
    var isSinglePage = documentInfo.PageCount == 1;
    if (!isSinglePage)
    {
        throw new InvalidOperationException("The OCR extractor currently supports only single-page critical tables.");
    }

    var renderedPagePath = artifactPaths.GetOcrPageImagePath(onlySupportedPage);
    await pdfXmlExtractor.RenderPagePngAsync(pdfPath, onlySupportedPage, renderedPagePath, OcrRenderDpi, cancellationToken);

    var recognizedTsv = await RunTesseractAsync(renderedPagePath, cancellationToken);
    await File.WriteAllTextAsync(artifactPaths.OcrTsvPath, recognizedTsv, cancellationToken);
}
/// <summary>
/// Builds an <see cref="ExtractedCriticalSource"/> from the OCR TSV artifact and the
/// dimensions of the page 1 OCR image.
/// </summary>
/// <param name="pdfPath">Path of the source PDF; not read by this implementation.</param>
/// <param name="artifactPaths">Artifact locations holding the OCR TSV and page image.</param>
/// <param name="cancellationToken">Token used when reading the TSV file.</param>
/// <returns>An "ocr" source with a single page geometry sized from the page image.</returns>
/// <exception cref="FileNotFoundException">The OCR TSV or the page 1 OCR image is missing.</exception>
public async Task<ExtractedCriticalSource> LoadAsync(
    string pdfPath,
    ImportArtifactPaths artifactPaths,
    CancellationToken cancellationToken = default)
{
    var tsvPath = artifactPaths.OcrTsvPath;
    if (!File.Exists(tsvPath))
    {
        throw new FileNotFoundException($"Missing OCR artifact: {tsvPath}", tsvPath);
    }

    var renderedPagePath = artifactPaths.GetOcrPageImagePath(1);
    if (!File.Exists(renderedPagePath))
    {
        throw new FileNotFoundException($"Missing OCR page image artifact: {renderedPagePath}", renderedPagePath);
    }

    var rawTsv = await File.ReadAllTextAsync(tsvPath, cancellationToken);
    var dimensions = ReadPngDimensions(renderedPagePath);

    // The page geometry is the pixel size of the OCR page image.
    return new ExtractedCriticalSource(
        "ocr",
        "Imported from PDF OCR extraction.",
        SourceRenderProfile.OcrPixels(OcrRenderDpi),
        [new ParsedPdfPageGeometry(1, dimensions.Width, dimensions.Height)],
        ParseTsv(rawTsv));
}
/// <summary>
/// Converts tesseract TSV output into positioned text fragments.
/// </summary>
/// <remarks>
/// The first non-empty line is skipped as the header. A row is kept only when it has at
/// least 12 columns, its first column is "5", and its text (column 12 onwards, re-joined
/// with tabs) is non-blank after normalization.
/// </remarks>
/// <param name="tsvContent">The TSV text, with "\r\n" or "\n" line endings.</param>
/// <returns>One fragment per kept row, in file order; empty when there are no lines.</returns>
/// <exception cref="FormatException">A kept row has a non-integer value in columns 2, 7, 8, 9 or 10.</exception>
internal static IReadOnlyList<PositionedTextFragment> ParseTsv(string tsvContent)
{
    var rows = tsvContent.Split(["\r\n", "\n"], StringSplitOptions.RemoveEmptyEntries);
    if (rows.Length == 0)
    {
        return [];
    }

    var words = new List<PositionedTextFragment>();
    for (var rowIndex = 1; rowIndex < rows.Length; rowIndex++)
    {
        var fields = rows[rowIndex].Split('\t');
        var isCandidateRow = fields.Length >= 12 && fields[0] == "5";
        if (!isCandidateRow)
        {
            continue;
        }

        // Text may itself contain tabs, so everything from column 12 on is re-joined.
        var text = CriticalTableParserSupport.NormalizeText(string.Join('\t', fields.Skip(11)));
        if (string.IsNullOrWhiteSpace(text))
        {
            continue;
        }

        int Field(int index) => int.Parse(fields[index], CultureInfo.InvariantCulture);

        words.Add(new PositionedTextFragment(
            Field(1),
            Field(7),
            Field(6),
            Field(8),
            Field(9),
            text,
            ParseConfidence(fields[10])));
    }

    return words;
}
/// <summary>
/// Parses the confidence column of a tesseract TSV row.
/// </summary>
/// <remarks>
/// Accepts both whole numbers ("96") and decimals ("96.063751"). Decimals are truncated
/// toward zero, so whole-number inputs give the same result as before. Parsing uses the
/// invariant culture.
/// </remarks>
/// <param name="value">The raw confidence text.</param>
/// <returns>
/// The confidence as an integer, or <c>null</c> when the text is not a number, is negative,
/// or does not fit in an <see cref="int"/>.
/// </returns>
private static int? ParseConfidence(string value)
{
    if (!double.TryParse(value, NumberStyles.Float, CultureInfo.InvariantCulture, out var confidence))
    {
        return null;
    }

    // The range check also rejects NaN (all comparisons false) and infinities.
    if (!(confidence >= 0 && confidence <= int.MaxValue))
    {
        return null;
    }

    return (int)confidence;
}
/// <summary>
/// Reads the pixel width and height from a PNG file's IHDR header without decoding the image.
/// </summary>
/// <param name="path">Path of the PNG file.</param>
/// <returns>The width and height stored in the IHDR chunk.</returns>
/// <exception cref="InvalidOperationException">
/// The file does not start with the PNG signature, or its first chunk is not IHDR.
/// </exception>
private static (int Width, int Height) ReadPngDimensions(string path)
{
    byte[] pngSignature = [137, 80, 78, 71, 13, 10, 26, 10];

    using var stream = File.OpenRead(path);
    using var reader = new BinaryReader(stream, Encoding.UTF8, leaveOpen: false);

    var header = reader.ReadBytes(8);
    if (!header.SequenceEqual(pngSignature))
    {
        throw new InvalidOperationException($"'{path}' is not a PNG file.");
    }

    // The chunk length precedes the chunk type; its value is not needed here.
    _ = ReadBigEndianInt32(reader);

    var firstChunkType = Encoding.ASCII.GetString(reader.ReadBytes(4));
    if (!string.Equals(firstChunkType, "IHDR", StringComparison.Ordinal))
    {
        throw new InvalidOperationException($"'{path}' is missing a PNG IHDR header.");
    }

    var pixelWidth = ReadBigEndianInt32(reader);
    var pixelHeight = ReadBigEndianInt32(reader);
    return (pixelWidth, pixelHeight);
}
/// <summary>
/// Reads four bytes from the reader and interprets them as a big-endian signed 32-bit integer.
/// </summary>
/// <param name="reader">The reader to consume four bytes from.</param>
/// <returns>The decoded integer.</returns>
/// <exception cref="EndOfStreamException">Fewer than four bytes were available.</exception>
private static int ReadBigEndianInt32(BinaryReader reader)
{
    var raw = reader.ReadBytes(4);
    if (raw.Length != 4)
    {
        throw new EndOfStreamException("Unexpected end of stream.");
    }

    // Most significant byte first; the result does not depend on host byte order.
    return (raw[0] << 24) | (raw[1] << 16) | (raw[2] << 8) | raw[3];
}
/// <summary>
/// Runs tesseract on an image with <c>--psm 11</c> and TSV output, and returns what it
/// writes to standard output.
/// </summary>
/// <param name="imagePath">Path of the image to recognize.</param>
/// <param name="cancellationToken">Token used to cancel reading the output and waiting for exit.</param>
/// <returns>The TSV text produced by tesseract.</returns>
/// <exception cref="InvalidOperationException">
/// tesseract exited with a non-zero code; the message includes its standard error output.
/// </exception>
private static async Task<string> RunTesseractAsync(string imagePath, CancellationToken cancellationToken)
{
    var startInfo = new System.Diagnostics.ProcessStartInfo
    {
        FileName = ResolveTesseractExecutable(),
        RedirectStandardError = true,
        RedirectStandardOutput = true,
        UseShellExecute = false,
        CreateNoWindow = true
    };

    // Only override TESSDATA_PREFIX when a directory was resolved. Assigning an empty
    // string would replace the value the child process otherwise inherits.
    var tessdataPath = ResolveTessdataPath();
    if (!string.IsNullOrEmpty(tessdataPath))
    {
        startInfo.Environment["TESSDATA_PREFIX"] = tessdataPath;
    }

    startInfo.ArgumentList.Add(imagePath);
    startInfo.ArgumentList.Add("stdout");
    startInfo.ArgumentList.Add("--psm");
    startInfo.ArgumentList.Add("11");
    startInfo.ArgumentList.Add("tsv");

    using var process = new System.Diagnostics.Process { StartInfo = startInfo };
    process.Start();

    // Drain both redirected streams concurrently. Reading stderr only after stdout has
    // ended can deadlock once the child fills the stderr pipe buffer.
    var outputTask = process.StandardOutput.ReadToEndAsync(cancellationToken);
    var errorTask = process.StandardError.ReadToEndAsync(cancellationToken);
    await Task.WhenAll(outputTask, errorTask);
    await process.WaitForExitAsync(cancellationToken);

    if (process.ExitCode != 0)
    {
        var error = await errorTask;
        throw new InvalidOperationException($"tesseract failed for '{imagePath}': {error}");
    }

    return await outputTask;
}
/// <summary>
/// Chooses the tesseract executable to launch.
/// </summary>
/// <returns>
/// The file named by <c>ROLEMASTERDB_TESSERACT_PATH</c> when it is set and exists; otherwise
/// <c>TesseractExeDefaultPath</c> when that file exists; otherwise the bare command name
/// "tesseract".
/// </returns>
private static string ResolveTesseractExecutable()
{
    if (Environment.GetEnvironmentVariable("ROLEMASTERDB_TESSERACT_PATH") is { } overrideExecutable
        && !string.IsNullOrWhiteSpace(overrideExecutable)
        && File.Exists(overrideExecutable))
    {
        return overrideExecutable;
    }

    return File.Exists(TesseractExeDefaultPath) ? TesseractExeDefaultPath : "tesseract";
}
/// <summary>
/// Chooses the tessdata directory to hand to tesseract.
/// </summary>
/// <returns>
/// The directory named by <c>ROLEMASTERDB_TESSDATA_PREFIX</c> when it is set and exists;
/// otherwise <c>TessdataDefaultPath</c> when that directory exists; otherwise an empty string.
/// </returns>
private static string ResolveTessdataPath()
{
    if (Environment.GetEnvironmentVariable("ROLEMASTERDB_TESSDATA_PREFIX") is { } overrideDirectory
        && !string.IsNullOrWhiteSpace(overrideDirectory)
        && Directory.Exists(overrideDirectory))
    {
        return overrideDirectory;
    }

    return Directory.Exists(TessdataDefaultPath) ? TessdataDefaultPath : string.Empty;
}
}

View File

@@ -1,7 +1,7 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Pairs the text of one line with the list of fragments supplied alongside it.
/// </summary>
// NOTE(review): this span comes from a diff hunk, so the class header and the Fragments
// property each appear twice (XmlTextFragment and PositionedTextFragment variants).
internal sealed class ColumnarCellLine(string text, List<XmlTextFragment> fragments)
internal sealed class ColumnarCellLine(string text, List<PositionedTextFragment> fragments)
{
// Line text exactly as passed to the constructor.
public string Text { get; } = text;
// Fragment list exactly as passed to the constructor (same list instance, not copied).
public List<XmlTextFragment> Fragments { get; } = fragments;
public List<PositionedTextFragment> Fragments { get; } = fragments;
}

View File

@@ -3,13 +3,15 @@ namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable bundle of everything a table parse hands back: the parsed table, the page
/// geometries, the text fragments, the render profile, the per-cell artifacts and the
/// validation report. Each property simply exposes the matching constructor argument.
/// </summary>
// NOTE(review): this span comes from a diff hunk, so the fragments parameter and the
// Fragments property each appear twice (XmlTextFragment and PositionedTextFragment variants).
public sealed class CriticalTableParseResult(
ParsedCriticalTable table,
IReadOnlyList<ParsedPdfPageGeometry> pageGeometries,
IReadOnlyList<XmlTextFragment> fragments,
IReadOnlyList<PositionedTextFragment> fragments,
SourceRenderProfile renderProfile,
IReadOnlyList<ParsedCriticalCellArtifact> cells,
ImportValidationReport validationReport)
{
public ParsedCriticalTable Table { get; } = table;
public IReadOnlyList<ParsedPdfPageGeometry> PageGeometries { get; } = pageGeometries;
public IReadOnlyList<XmlTextFragment> Fragments { get; } = fragments;
public IReadOnlyList<PositionedTextFragment> Fragments { get; } = fragments;
// Render profile supplied by the caller; the parsers in this change pass source.RenderProfile.
public SourceRenderProfile RenderProfile { get; } = renderProfile;
public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; } = cells;
public ImportValidationReport ValidationReport { get; } = validationReport;
}

View File

@@ -22,7 +22,7 @@ internal static class CriticalTableParserSupport
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|)\d+\)$", RegexOptions.Compiled);
private static readonly Regex BoundaryBonusLineRegex = new(@"^(?:all allies|all foe's allies|all foes|all opponents)\b", RegexOptions.IgnoreCase | RegexOptions.Compiled);
internal static List<XmlTextFragment> LoadFragments(string xmlContent)
internal static List<PositionedTextFragment> LoadFragments(string xmlContent)
{
using var stringReader = new StringReader(xmlContent);
using var xmlReader = XmlReader.Create(
@@ -39,7 +39,7 @@ internal static class CriticalTableParserSupport
{
var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1");
return page.Elements("text")
.Select(item => new XmlTextFragment(
.Select(item => new PositionedTextFragment(
pageNumber,
int.Parse(item.Attribute("top")?.Value ?? throw new InvalidOperationException("Missing text top attribute.")),
int.Parse(item.Attribute("left")?.Value ?? throw new InvalidOperationException("Missing text left attribute.")),
@@ -73,8 +73,8 @@ internal static class CriticalTableParserSupport
.ToList();
}
internal static List<XmlTextFragment> FindRowLabelFragments(
IReadOnlyList<XmlTextFragment> fragments,
internal static List<PositionedTextFragment> FindRowLabelFragments(
IReadOnlyList<PositionedTextFragment> fragments,
int leftCutoff,
int bodyStartTop,
int keyTop)
@@ -89,7 +89,7 @@ internal static class CriticalTableParserSupport
.ThenBy(item => item.Left)
.ToList();
var merged = new List<XmlTextFragment>();
var merged = new List<PositionedTextFragment>();
for (var index = 0; index < candidates.Count; index++)
{
@@ -107,7 +107,7 @@ internal static class CriticalTableParserSupport
}
}
var deduped = new List<XmlTextFragment>();
var deduped = new List<PositionedTextFragment>();
foreach (var candidate in merged)
{
@@ -128,7 +128,7 @@ internal static class CriticalTableParserSupport
internal static bool IsRollBandLabel(string value) =>
Regex.IsMatch(value.Trim(), @"^\d{2,3}(?:\s*-\s*\d{2,3})?$|^\d{2,3}\+$");
internal static bool IsPotentialRowLabelFragment(XmlTextFragment fragment, int leftCutoff) =>
internal static bool IsPotentialRowLabelFragment(PositionedTextFragment fragment, int leftCutoff) =>
fragment.Left < leftCutoff &&
(IsRollBandLabel(fragment.Text) || LooksLikeSplitRollBandStart(fragment.Text));
@@ -163,9 +163,9 @@ internal static class CriticalTableParserSupport
return columns[^1].Key;
}
internal static IReadOnlyList<ColumnarCellLine> BuildLines(IReadOnlyList<XmlTextFragment> fragments)
internal static IReadOnlyList<ColumnarCellLine> BuildLines(IReadOnlyList<PositionedTextFragment> fragments)
{
var lines = new List<List<XmlTextFragment>>();
var lines = new List<List<PositionedTextFragment>>();
foreach (var fragment in fragments.OrderBy(item => item.Top).ThenBy(item => item.Left))
{
@@ -292,9 +292,9 @@ internal static class CriticalTableParserSupport
.Replace('', '\'')
.Trim();
private static List<XmlTextFragment> RemoveRedundantContainedFragments(IReadOnlyList<XmlTextFragment> fragments)
private static List<PositionedTextFragment> RemoveRedundantContainedFragments(IReadOnlyList<PositionedTextFragment> fragments)
{
var redundant = new HashSet<XmlTextFragment>();
var redundant = new HashSet<PositionedTextFragment>();
foreach (var group in fragments.GroupBy(item => (item.PageNumber, item.Top, item.Height)))
{
@@ -331,7 +331,7 @@ internal static class CriticalTableParserSupport
.ToList();
}
private static bool IsHorizontallyContained(XmlTextFragment candidate, XmlTextFragment container)
private static bool IsHorizontallyContained(PositionedTextFragment candidate, PositionedTextFragment container)
{
const int containmentTolerance = 1;
@@ -353,7 +353,7 @@ internal static class CriticalTableParserSupport
return normalized.Length == 0 ? null : normalized;
}
internal static int FindKeyTop(IReadOnlyList<XmlTextFragment> fragments) =>
internal static int FindKeyTop(IReadOnlyList<PositionedTextFragment> fragments) =>
fragments
.Where(item =>
string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
@@ -362,7 +362,7 @@ internal static class CriticalTableParserSupport
.Select(item => (int?)item.Top)
.Min() ?? int.MaxValue;
internal static AffixLegend ParseAffixLegend(IReadOnlyList<XmlTextFragment> fragments, int keyTop)
internal static AffixLegend ParseAffixLegend(IReadOnlyList<PositionedTextFragment> fragments, int keyTop)
{
if (keyTop == int.MaxValue)
{
@@ -401,12 +401,12 @@ internal static class CriticalTableParserSupport
supportsPowerPointModifier: footerText.Contains("powerpoint modification", StringComparison.OrdinalIgnoreCase));
}
internal static List<XmlTextFragment> SplitBoundaryCrossingFragments(
IReadOnlyList<XmlTextFragment> bodyFragments,
internal static List<PositionedTextFragment> SplitBoundaryCrossingFragments(
IReadOnlyList<PositionedTextFragment> bodyFragments,
IReadOnlyList<(string Key, double CenterX)> columnCenters,
IReadOnlySet<string> affixLegendSymbols)
{
var splitFragments = new List<XmlTextFragment>(bodyFragments.Count);
var splitFragments = new List<PositionedTextFragment>(bodyFragments.Count);
foreach (var fragment in bodyFragments)
{
@@ -417,7 +417,7 @@ internal static class CriticalTableParserSupport
}
internal static List<(int Top, bool IsAffixLike)> BuildBodyLines(
IReadOnlyList<XmlTextFragment> bodyFragments,
IReadOnlyList<PositionedTextFragment> bodyFragments,
IReadOnlyList<(string Key, double CenterX)> columnCenters,
IReadOnlySet<string> affixLegendSymbols)
{
@@ -440,7 +440,7 @@ internal static class CriticalTableParserSupport
return bodyLines;
}
internal static bool IsFooterPageNumberFragment(XmlTextFragment fragment, int keyTop)
internal static bool IsFooterPageNumberFragment(PositionedTextFragment fragment, int keyTop)
{
if (keyTop == int.MaxValue)
{
@@ -451,9 +451,9 @@ internal static class CriticalTableParserSupport
Regex.IsMatch(fragment.Text, @"^\d{2,3}$");
}
internal static IEnumerable<List<XmlTextFragment>> GroupByTop(IReadOnlyList<XmlTextFragment> fragments)
internal static IEnumerable<List<PositionedTextFragment>> GroupByTop(IReadOnlyList<PositionedTextFragment> fragments)
{
var groups = new List<List<XmlTextFragment>>();
var groups = new List<List<PositionedTextFragment>>();
foreach (var fragment in fragments)
{
@@ -469,7 +469,7 @@ internal static class CriticalTableParserSupport
return groups;
}
internal static List<RowAnchor> CreateRowAnchors(IReadOnlyList<XmlTextFragment> rowLabelFragments) =>
internal static List<RowAnchor> CreateRowAnchors(IReadOnlyList<PositionedTextFragment> rowLabelFragments) =>
rowLabelFragments
.OrderBy(item => item.Top)
.Select((item, index) => new RowAnchor(NormalizeRollBandLabel(item.Text), item.Top, index + 1))
@@ -489,13 +489,13 @@ internal static class CriticalTableParserSupport
rowAnchors[0].Top - HeaderToRowLabelMinimumGap - TopGroupingTolerance));
}
internal static List<XmlTextFragment> BuildBodyFragments(
IReadOnlyList<XmlTextFragment> fragments,
internal static List<PositionedTextFragment> BuildBodyFragments(
IReadOnlyList<PositionedTextFragment> fragments,
int bodyStartTop,
int keyTop,
int leftCutoff,
IReadOnlyList<RowAnchor> rowAnchors,
IReadOnlyCollection<XmlTextFragment> excludedFragments,
IReadOnlyCollection<PositionedTextFragment> excludedFragments,
IReadOnlyList<(string Key, double CenterX)> columnCenters,
IReadOnlySet<string> affixLegendSymbols)
{
@@ -580,7 +580,9 @@ internal static class CriticalTableParserSupport
AffixLegend affixLegend,
List<ParsedCriticalCellArtifact> parsedCells,
List<ParsedCriticalResult> parsedResults,
List<string> validationErrors)
List<string> validationErrors,
List<string>? validationWarnings = null,
bool downgradeCellContentValidationToWarnings = false)
{
var sharedLegend = ToSharedAffixLegend(affixLegend);
@@ -589,8 +591,16 @@ internal static class CriticalTableParserSupport
var lineTexts = cellEntry.Lines.Select(line => line.Text).ToList();
var content = SharedParsing.CriticalCellTextParser.Parse(lineTexts, sharedLegend);
var sourceBounds = BuildSourceBounds(cellEntry.Lines.SelectMany(line => line.Fragments).ToList());
validationErrors.AddRange(content.ValidationErrors.Select(error =>
$"Cell '{BuildCellIdentifier(cellEntry)}': {error}"));
var contentIssues = content.ValidationErrors.Select(error =>
$"Cell '{BuildCellIdentifier(cellEntry)}': {error}");
if (downgradeCellContentValidationToWarnings)
{
validationWarnings?.AddRange(contentIssues);
}
else
{
validationErrors.AddRange(contentIssues);
}
var effects = content.Effects.Select(ToImportToolEffect).ToList();
var branches = content.Branches.Select(ToImportToolBranch).ToList();
@@ -621,7 +631,7 @@ internal static class CriticalTableParserSupport
}
}
private static ParsedCriticalSourceRect BuildSourceBounds(IReadOnlyList<XmlTextFragment> fragments)
private static ParsedCriticalSourceRect BuildSourceBounds(IReadOnlyList<PositionedTextFragment> fragments)
{
if (fragments.Count == 0)
{
@@ -688,7 +698,7 @@ internal static class CriticalTableParserSupport
private static bool LooksLikeSplitRollBandStart(string value) =>
Regex.IsMatch(value.Trim(), @"^\d{2,3}\s*-$");
private static bool TryMergeSplitRollBand(IReadOnlyList<XmlTextFragment> candidates, int index, out XmlTextFragment mergedCandidate)
private static bool TryMergeSplitRollBand(IReadOnlyList<PositionedTextFragment> candidates, int index, out PositionedTextFragment mergedCandidate)
{
var current = candidates[index];
if (!LooksLikeSplitRollBandStart(current.Text) || index + 1 >= candidates.Count)
@@ -712,7 +722,7 @@ internal static class CriticalTableParserSupport
var mergedLabel = $"{startDigits}-{next.Text.Trim()}";
var right = Math.Max(current.Left + current.Width, next.Left + next.Width);
mergedCandidate = new XmlTextFragment(
mergedCandidate = new PositionedTextFragment(
current.PageNumber,
current.Top,
Math.Min(current.Left, next.Left),
@@ -722,8 +732,8 @@ internal static class CriticalTableParserSupport
return true;
}
private static IReadOnlyList<XmlTextFragment> SplitBoundaryCrossingFragment(
XmlTextFragment fragment,
private static IReadOnlyList<PositionedTextFragment> SplitBoundaryCrossingFragment(
PositionedTextFragment fragment,
IReadOnlyList<(string Key, double CenterX)> columnCenters,
IReadOnlySet<string> affixLegendSymbols)
{
@@ -746,8 +756,8 @@ internal static class CriticalTableParserSupport
return [fragment];
}
private static IReadOnlyList<XmlTextFragment> BuildSplitFragmentsFromMatches(
XmlTextFragment fragment,
private static IReadOnlyList<PositionedTextFragment> BuildSplitFragmentsFromMatches(
PositionedTextFragment fragment,
MatchCollection matches,
IReadOnlyList<(string Key, double CenterX)> columnCenters)
{
@@ -757,7 +767,7 @@ internal static class CriticalTableParserSupport
}
var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
var splitFragments = new List<XmlTextFragment>(matches.Count);
var splitFragments = new List<PositionedTextFragment>(matches.Count);
foreach (Match match in matches)
{
@@ -770,7 +780,7 @@ internal static class CriticalTableParserSupport
var segmentLeft = fragment.Left + (int)Math.Round(characterWidth * match.Index);
var segmentWidth = Math.Max(1, (int)Math.Round(characterWidth * match.Length));
splitFragments.Add(new XmlTextFragment(
splitFragments.Add(new PositionedTextFragment(
fragment.PageNumber,
fragment.Top,
segmentLeft,
@@ -796,9 +806,9 @@ internal static class CriticalTableParserSupport
}
private static bool TrySplitProseFragmentAtBoundaries(
XmlTextFragment fragment,
PositionedTextFragment fragment,
IReadOnlyList<(string Key, double CenterX)> columnCenters,
out IReadOnlyList<XmlTextFragment> splitFragments)
out IReadOnlyList<PositionedTextFragment> splitFragments)
{
splitFragments = null!;
@@ -808,7 +818,7 @@ internal static class CriticalTableParserSupport
return false;
}
var segments = new List<XmlTextFragment>();
var segments = new List<PositionedTextFragment>();
var segmentStart = 0;
var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
@@ -839,7 +849,7 @@ internal static class CriticalTableParserSupport
}
private static List<int> FindBoundarySplitIndexes(
XmlTextFragment fragment,
PositionedTextFragment fragment,
IReadOnlyList<(string Key, double CenterX)> columnCenters)
{
var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
@@ -907,8 +917,8 @@ internal static class CriticalTableParserSupport
return bestIndex;
}
private static XmlTextFragment? CreateFragmentSegment(
XmlTextFragment fragment,
private static PositionedTextFragment? CreateFragmentSegment(
PositionedTextFragment fragment,
int startIndex,
int length,
double characterWidth)
@@ -940,7 +950,7 @@ internal static class CriticalTableParserSupport
var actualLength = trimmedEnd - trimmedStart + 1;
var segmentText = CollapseWhitespace(fragment.Text.Substring(actualStart, actualLength));
return new XmlTextFragment(
return new PositionedTextFragment(
fragment.PageNumber,
fragment.Top,
fragment.Left + (int)Math.Round(characterWidth * actualStart),
@@ -950,7 +960,7 @@ internal static class CriticalTableParserSupport
}
private static bool CrossesColumnBoundary(
XmlTextFragment fragment,
PositionedTextFragment fragment,
IReadOnlyList<(string Key, double CenterX)> columnCenters)
{
var fragmentRight = fragment.Left + fragment.Width;

View File

@@ -14,10 +14,10 @@ public sealed class GroupedVariantCriticalTableParser
new("SLAYING", "Slaying", "variant", 2)
];
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, ExtractedCriticalSource source)
{
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
var fragments = source.Fragments;
var pageGeometries = source.PageGeometries;
var groupHeaders = FindGroupHeaders(fragments);
var columnHeaders = FindColumnHeaders(fragments);
var validationErrors = new List<string>();
@@ -50,7 +50,7 @@ public sealed class GroupedVariantCriticalTableParser
if (rowAnchors.Count == 0)
{
validationErrors.Add("No roll-band labels were found in the XML artifact.");
validationErrors.Add("No roll-band labels were found in the source artifact.");
}
var columnCenters = combinedColumnAnchors
@@ -136,16 +136,28 @@ public sealed class GroupedVariantCriticalTableParser
entry.DisplayName,
entry.Family,
Path.GetFileName(entry.PdfPath),
"Imported from PDF XML extraction.",
source.ImportNotes,
ExpectedGroups,
ExpectedColumns,
parsedRollBands,
parsedResults);
return new CriticalTableParseResult(table, pageGeometries, fragments, parsedCells, validationReport);
return new CriticalTableParseResult(table, pageGeometries, fragments, source.RenderProfile, parsedCells, validationReport);
}
private static List<XmlTextFragment> FindGroupHeaders(IReadOnlyList<XmlTextFragment> fragments)
/// <summary>
/// Convenience overload for XML input: loads page geometries and fragments from
/// <paramref name="xmlContent"/>, wraps them in an <c>ExtractedCriticalSource</c> tagged
/// <c>"xml"</c>, and forwards to the source-based overload.
/// </summary>
/// <param name="entry">Manifest entry describing the table being imported.</param>
/// <param name="xmlContent">XML text handed to the <c>CriticalTableParserSupport</c> loaders.</param>
/// <returns>The result produced by the source-based <c>Parse</c> overload.</returns>
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
    var renderProfile = SourceRenderProfile.XmlAligned();
    var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
    var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);

    var xmlSource = new ExtractedCriticalSource(
        "xml",
        "Imported from PDF XML extraction.",
        renderProfile,
        pageGeometries,
        fragments);

    return Parse(entry, xmlSource);
}
private static List<PositionedTextFragment> FindGroupHeaders(IReadOnlyList<PositionedTextFragment> fragments)
{
var expectedLabels = ExpectedGroups.Select(item => item.Label).ToList();
var headerCandidates = fragments
@@ -164,10 +176,10 @@ public sealed class GroupedVariantCriticalTableParser
}
}
throw new InvalidOperationException("Could not find the grouped-variant section headers in the XML artifact.");
throw new InvalidOperationException("Could not find the grouped-variant section headers in the source artifact.");
}
private static List<XmlTextFragment> FindColumnHeaders(IReadOnlyList<XmlTextFragment> fragments)
private static List<PositionedTextFragment> FindColumnHeaders(IReadOnlyList<PositionedTextFragment> fragments)
{
var expectedLabels = new[] { "normal", "slaying", "normal", "slaying" };
var headerCandidates = fragments
@@ -190,6 +202,6 @@ public sealed class GroupedVariantCriticalTableParser
}
}
throw new InvalidOperationException("Could not find the grouped-variant column header row in the XML artifact.");
throw new InvalidOperationException("Could not find the grouped-variant column header row in the source artifact.");
}
}

View File

@@ -0,0 +1,20 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// A piece of text together with the page number and the top/left/width/height box it was
/// found at, plus an optional confidence value.
/// </summary>
public class PositionedTextFragment
{
    /// <summary>Creates a fragment; every argument is stored unchanged in the matching property.</summary>
    /// <param name="pageNumber">Page the fragment belongs to.</param>
    /// <param name="top">Top coordinate of the fragment box.</param>
    /// <param name="left">Left coordinate of the fragment box.</param>
    /// <param name="width">Width of the fragment box.</param>
    /// <param name="height">Height of the fragment box.</param>
    /// <param name="text">Text content of the fragment.</param>
    /// <param name="confidence">Optional confidence value; <c>null</c> when none is supplied.</param>
    public PositionedTextFragment(
        int pageNumber,
        int top,
        int left,
        int width,
        int height,
        string text,
        int? confidence = null)
    {
        PageNumber = pageNumber;
        Top = top;
        Left = left;
        Width = width;
        Height = height;
        Text = text;
        Confidence = confidence;
    }

    public int PageNumber { get; }

    public int Top { get; }

    public int Left { get; }

    public int Width { get; }

    public int Height { get; }

    public string Text { get; }

    public int? Confidence { get; }

    /// <summary>Horizontal midpoint of the box: <see cref="Left"/> plus half of <see cref="Width"/>.</summary>
    public double CenterX
    {
        get { return Left + (Width * 0.5); }
    }
}

View File

@@ -2,12 +2,14 @@ namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable bundle of a standard-table parse: the parsed table, the text fragments, the
/// render profile, the per-cell artifacts and the validation report. Each property simply
/// exposes the matching constructor argument.
/// </summary>
// NOTE(review): this span comes from a diff hunk, so the fragments parameter and the
// Fragments property each appear twice (XmlTextFragment and PositionedTextFragment variants).
public sealed class StandardCriticalTableParseResult(
ParsedCriticalTable table,
IReadOnlyList<XmlTextFragment> fragments,
IReadOnlyList<PositionedTextFragment> fragments,
SourceRenderProfile renderProfile,
IReadOnlyList<ParsedCriticalCellArtifact> cells,
ImportValidationReport validationReport)
{
public ParsedCriticalTable Table { get; } = table;
public IReadOnlyList<XmlTextFragment> Fragments { get; } = fragments;
public IReadOnlyList<PositionedTextFragment> Fragments { get; } = fragments;
public SourceRenderProfile RenderProfile { get; } = renderProfile;
public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; } = cells;
public ImportValidationReport ValidationReport { get; } = validationReport;
}

View File

@@ -2,23 +2,140 @@ namespace RolemasterDb.ImportTool.Parsing;
public sealed class StandardCriticalTableParser
{
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
/// <summary>
/// Parses a standard critical table from an already-extracted source. Fragments are cut
/// into one cell per (roll band, column) pair, each cell is parsed into artifacts and
/// results, and the outcome is returned together with a validation report.
/// </summary>
/// <param name="entry">Manifest entry; supplies the slug, display name, family and PDF path stored on the table.</param>
/// <param name="source">Extracted source; supplies fragments, page geometries, import notes, render profile and extraction method.</param>
/// <param name="layout">Optional precomputed layout. When <c>null</c>, <c>BuildLayout</c> derives one from the fragments.</param>
/// <returns>The parse result holding table, page geometries, fragments, render profile, cells and validation report.</returns>
// NOTE(review): this span comes from a diff hunk; the three statements that reference
// xmlContent / FindHeaderFragments directly below the opening brace are removed-side lines.
internal CriticalTableParseResult Parse(CriticalImportManifestEntry entry, ExtractedCriticalSource source, StandardTableLayout? layout = null)
{
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
var headerFragments = FindHeaderFragments(fragments);
var fragments = source.Fragments;
var pageGeometries = source.PageGeometries;
var validationErrors = new List<string>();
var validationWarnings = new List<string>();
// Fall back to a layout derived from the fragments; BuildLayout may append to validationErrors.
layout ??= BuildLayout(fragments, validationErrors);
// Warnings carried by the layout are surfaced in the final validation report.
validationWarnings.AddRange(layout.Warnings);
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, layout.KeyTop);
var affixLegendSymbols = affixLegend.ClassificationSymbols;
var bodyFragments = CriticalTableParserSupport.BuildBodyFragments(
fragments,
layout.BodyStartTop,
layout.KeyTop,
layout.LeftCutoff,
layout.RowAnchors,
layout.ExcludedFragments,
layout.ColumnCenters,
affixLegendSymbols);
var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, layout.ColumnCenters, affixLegendSymbols);
// One roll band per row anchor, using the anchor's label and sort order.
var parsedRollBands = layout.RowAnchors
.Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
.ToList();
var cellEntries = new List<ColumnarCellEntry>();
for (var rowIndex = 0; rowIndex < layout.RowAnchors.Count; rowIndex++)
{
// The first row starts at the body start; later rows start at the boundary resolved
// between the previous anchor and this one.
var rowStart = rowIndex == 0
? layout.BodyStartTop
: CriticalTableParserSupport.ResolveRowBoundaryTop(layout.RowAnchors[rowIndex - 1], layout.RowAnchors[rowIndex], bodyLines);
// The last row ends one unit above KeyTop; earlier rows end at the boundary resolved
// between this anchor and the next one.
var rowEnd = rowIndex == layout.RowAnchors.Count - 1
? layout.KeyTop - 1
: CriticalTableParserSupport.ResolveRowBoundaryTop(layout.RowAnchors[rowIndex], layout.RowAnchors[rowIndex + 1], bodyLines);
// Half-open vertical range: rowStart is included, rowEnd is excluded.
var rowFragments = bodyFragments
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
.ToList();
foreach (var columnAnchor in layout.ColumnCenters)
{
// A fragment belongs to the column that ResolveColumn picks for its horizontal center.
var cellFragments = rowFragments
.Where(item => CriticalTableParserSupport.ResolveColumn(item.CenterX, layout.ColumnCenters) == columnAnchor.Key)
.OrderBy(item => item.Top)
.ThenBy(item => item.Left)
.ToList();
if (cellFragments.Count == 0)
{
// An empty cell is recorded as a validation error and produces no cell entry.
validationErrors.Add($"Missing content for roll band '{layout.RowAnchors[rowIndex].Label}', column '{columnAnchor.Key}'.");
continue;
}
cellEntries.Add(new ColumnarCellEntry(
null,
layout.RowAnchors[rowIndex].Label,
rowIndex,
columnAnchor.Key,
CriticalTableParserSupport.BuildLines(cellFragments).ToList()));
}
}
CriticalTableParserSupport.RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);
var parsedCells = new List<ParsedCriticalCellArtifact>();
var parsedResults = new List<ParsedCriticalResult>();
// For sources whose extraction method is "ocr" (case-insensitive), cell-content validation
// issues are routed to validationWarnings instead of validationErrors.
CriticalTableParserSupport.BuildParsedArtifacts(
cellEntries,
affixLegend,
parsedCells,
parsedResults,
validationErrors,
validationWarnings,
downgradeCellContentValidationToWarnings: string.Equals(source.ExtractionMethod, "ocr", StringComparison.OrdinalIgnoreCase));
// Structural checks: exactly five columns, and one parsed cell per (row, column) pair.
if (layout.ColumnCenters.Count != 5)
{
validationErrors.Add($"Expected 5 standard-table columns but found {layout.ColumnCenters.Count}.");
}
if (parsedCells.Count != layout.RowAnchors.Count * layout.ColumnCenters.Count)
{
validationErrors.Add(
$"Expected {layout.RowAnchors.Count * layout.ColumnCenters.Count} parsed cells but produced {parsedCells.Count}.");
}
// The first argument is true only when no validation errors were collected; warnings do not affect it.
var validationReport = new ImportValidationReport(
validationErrors.Count == 0,
validationErrors,
validationWarnings,
layout.RowAnchors.Count,
parsedCells.Count);
var table = new ParsedCriticalTable(
entry.Slug,
entry.DisplayName,
entry.Family,
Path.GetFileName(entry.PdfPath),
source.ImportNotes,
[],
layout.ColumnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(),
parsedRollBands,
parsedResults);
return new CriticalTableParseResult(table, pageGeometries, fragments, source.RenderProfile, parsedCells, validationReport);
}
/// <summary>
/// Convenience overload for XML input: loads page geometries and fragments from
/// <paramref name="xmlContent"/>, wraps them in an <c>ExtractedCriticalSource</c> tagged
/// <c>"xml"</c>, and forwards to the source-based overload without a precomputed layout.
/// </summary>
/// <param name="entry">Manifest entry describing the table being imported.</param>
/// <param name="xmlContent">XML text handed to the <c>CriticalTableParserSupport</c> loaders.</param>
/// <returns>The result produced by the source-based <c>Parse</c> overload.</returns>
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
    var renderProfile = SourceRenderProfile.XmlAligned();
    var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
    var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);

    var xmlSource = new ExtractedCriticalSource(
        "xml",
        "Imported from PDF XML extraction.",
        renderProfile,
        pageGeometries,
        fragments);

    return Parse(entry, xmlSource);
}
private static StandardTableLayout BuildLayout(
IReadOnlyList<PositionedTextFragment> fragments,
ICollection<string> validationErrors)
{
var headerFragments = FindHeaderFragments(fragments);
var columnCenters = headerFragments
.OrderBy(item => item.Left)
.Select(item => (Key: item.Text.ToUpperInvariant(), CenterX: item.CenterX))
.ToList();
var headerTop = headerFragments.Max(item => item.Top);
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop);
var affixLegendSymbols = affixLegend.ClassificationSymbols;
var leftCutoff = headerFragments.Min(item => item.Left) - 10;
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
fragments,
@@ -30,102 +147,13 @@ public sealed class StandardCriticalTableParser
if (rowAnchors.Count == 0)
{
validationErrors.Add("No roll-band labels were found in the XML artifact.");
validationErrors.Add("No roll-band labels were found in the source artifact.");
}
var bodyFragments = CriticalTableParserSupport.BuildBodyFragments(
fragments,
bodyStartTop,
keyTop,
leftCutoff,
rowAnchors,
headerFragments,
columnCenters,
affixLegendSymbols);
var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
var parsedRollBands = rowAnchors
.Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
.ToList();
var cellEntries = new List<ColumnarCellEntry>();
for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
{
var rowStart = rowIndex == 0
? bodyStartTop
: CriticalTableParserSupport.ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines);
var rowEnd = rowIndex == rowAnchors.Count - 1
? keyTop - 1
: CriticalTableParserSupport.ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines);
var rowFragments = bodyFragments
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
.ToList();
foreach (var columnAnchor in columnCenters)
{
var cellFragments = rowFragments
.Where(item => CriticalTableParserSupport.ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key)
.OrderBy(item => item.Top)
.ThenBy(item => item.Left)
.ToList();
if (cellFragments.Count == 0)
{
validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', column '{columnAnchor.Key}'.");
continue;
}
cellEntries.Add(new ColumnarCellEntry(
null,
rowAnchors[rowIndex].Label,
rowIndex,
columnAnchor.Key,
CriticalTableParserSupport.BuildLines(cellFragments).ToList()));
}
}
CriticalTableParserSupport.RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);
var parsedCells = new List<ParsedCriticalCellArtifact>();
var parsedResults = new List<ParsedCriticalResult>();
CriticalTableParserSupport.BuildParsedArtifacts(cellEntries, affixLegend, parsedCells, parsedResults, validationErrors);
if (columnCenters.Count != 5)
{
validationErrors.Add($"Expected 5 standard-table columns but found {columnCenters.Count}.");
}
if (parsedCells.Count != rowAnchors.Count * columnCenters.Count)
{
validationErrors.Add(
$"Expected {rowAnchors.Count * columnCenters.Count} parsed cells but produced {parsedCells.Count}.");
}
var validationReport = new ImportValidationReport(
validationErrors.Count == 0,
validationErrors,
validationWarnings,
rowAnchors.Count,
parsedCells.Count);
var table = new ParsedCriticalTable(
entry.Slug,
entry.DisplayName,
entry.Family,
Path.GetFileName(entry.PdfPath),
"Imported from PDF XML extraction.",
[],
columnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(),
parsedRollBands,
parsedResults);
return new CriticalTableParseResult(table, pageGeometries, fragments, parsedCells, validationReport);
return new StandardTableLayout(headerFragments, columnCenters, rowAnchors, headerTop, bodyStartTop, keyTop, leftCutoff);
}
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
private static List<PositionedTextFragment> FindHeaderFragments(IReadOnlyList<PositionedTextFragment> fragments)
{
var headerCandidates = fragments
.Where(item => item.Text.Length == 1 && char.IsLetter(item.Text[0]))
@@ -143,6 +171,6 @@ public sealed class StandardCriticalTableParser
}
}
throw new InvalidOperationException("Could not find the standard-table A-E header row in the XML artifact.");
throw new InvalidOperationException("Could not find the standard-table A-E header row in the source artifact.");
}
}

View File

@@ -0,0 +1,150 @@
namespace RolemasterDb.ImportTool.Parsing;
internal sealed class StandardOcrBootstrapper
{
private const int AnchorConfidenceWarningThreshold = 85;
private const int HeaderTopTolerance = 12;
public StandardTableLayout Bootstrap(ExtractedCriticalSource source, StandardTableAxisTemplate template)
{
var fragments = source.Fragments;
var headerFragments = FindHeaderFragments(fragments, template);
var columnCenters = headerFragments
.OrderBy(item => item.Left)
.Select(item => (Key: NormalizeHeaderText(item.Text), CenterX: item.CenterX))
.ToList();
var headerTop = headerFragments.Max(item => item.Top);
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
var leftCutoff = ResolveRowLabelLeftCutoff(headerFragments);
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
fragments,
leftCutoff,
headerTop + CriticalTableParserSupport.HeaderToRowLabelMinimumGap,
keyTop);
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
var bodyStartTop = CriticalTableParserSupport.ResolveBodyStartTop(headerTop, rowAnchors);
var warnings = new List<string>();
if (rowAnchors.Count != template.RollBandLabels.Count)
{
throw new InvalidOperationException(
$"OCR bootstrap found {rowAnchors.Count} row anchors but template '{template.Slug}' expects {template.RollBandLabels.Count}.");
}
var actualLabels = rowAnchors.Select(item => item.Label).ToList();
if (!actualLabels.SequenceEqual(template.RollBandLabels, StringComparer.Ordinal))
{
throw new InvalidOperationException(
$"OCR bootstrap row anchors do not match template '{template.Slug}'.");
}
var fuzzyHeaders = headerFragments
.Where(item => !string.Equals(item.Text, NormalizeHeaderText(item.Text), StringComparison.Ordinal))
.ToList();
if (fuzzyHeaders.Count > 0)
{
warnings.Add(
$"OCR header normalization was applied for: {string.Join(", ", fuzzyHeaders.Select(item => $"'{item.Text}' -> '{NormalizeHeaderText(item.Text)}'"))}.");
}
var lowConfidenceAnchors = headerFragments
.Concat(rowLabelFragments)
.Where(item => item.Confidence is int confidence && confidence < AnchorConfidenceWarningThreshold)
.Select(item => $"'{item.Text}' ({item.Confidence})")
.ToList();
if (lowConfidenceAnchors.Count > 0)
{
warnings.Add($"Low-confidence OCR anchors: {string.Join(", ", lowConfidenceAnchors)}.");
}
return new StandardTableLayout(
headerFragments,
columnCenters,
rowAnchors,
headerTop,
bodyStartTop,
keyTop,
leftCutoff,
warnings);
}
private static List<PositionedTextFragment> FindHeaderFragments(
IReadOnlyList<PositionedTextFragment> fragments,
StandardTableAxisTemplate template)
{
var headerCandidates = fragments
.Where(item => TryNormalizeHeaderText(item.Text, out _))
.OrderBy(item => item.Top)
.ThenBy(item => item.Left)
.ToList();
foreach (var group in GroupHeaderCandidates(headerCandidates))
{
var ordered = group.OrderBy(item => item.Left).ToList();
var labels = ordered.Select(item => NormalizeHeaderText(item.Text)).ToList();
if (labels.SequenceEqual(template.ColumnKeys, StringComparer.Ordinal))
{
return ordered;
}
}
throw new InvalidOperationException("Could not find the OCR standard-table A-E header row.");
}
private static string NormalizeHeaderText(string value)
{
if (!TryNormalizeHeaderText(value, out var normalized))
{
throw new InvalidOperationException($"Unsupported OCR header fragment '{value}'.");
}
return normalized;
}
private static bool TryNormalizeHeaderText(string value, out string normalized)
{
normalized = value.Trim().ToUpperInvariant();
if (normalized is "A" or "B" or "D" or "E")
{
return true;
}
if (normalized is "C" or "CC")
{
normalized = "C";
return true;
}
return false;
}
/// <summary>
/// Splits header candidates into consecutive runs that share (approximately) the same top coordinate.
/// </summary>
/// <param name="fragments">Candidates in the order they should be scanned.</param>
/// <returns>
/// The runs in scan order. A fragment joins the current run when its <c>Top</c> is within
/// <c>HeaderTopTolerance</c> of the run's first fragment; otherwise it starts a new run.
/// </returns>
private static IEnumerable<List<PositionedTextFragment>> GroupHeaderCandidates(IReadOnlyList<PositionedTextFragment> fragments)
{
    var rows = new List<List<PositionedTextFragment>>();
    List<PositionedTextFragment>? currentRow = null;

    foreach (var candidate in fragments)
    {
        // Distance is always measured against the first fragment of the run, not the previous one.
        if (currentRow is null || Math.Abs(currentRow[0].Top - candidate.Top) > HeaderTopTolerance)
        {
            currentRow = [candidate];
            rows.Add(currentRow);
        }
        else
        {
            currentRow.Add(candidate);
        }
    }

    return rows;
}
/// <summary>
/// Computes the left cutoff derived from the header fragments' left edges.
/// </summary>
/// <param name="headerFragments">
/// Header fragments; the first (leftmost) element is always read, so an empty list is not supported.
/// </param>
/// <returns>
/// The leftmost header edge minus half the distance to the second header edge (integer division),
/// or minus 10 when only one header exists; never less than 0.
/// </returns>
private static int ResolveRowLabelLeftCutoff(IReadOnlyList<PositionedTextFragment> headerFragments)
{
    var leftEdges = headerFragments
        .Select(fragment => fragment.Left)
        .OrderBy(left => left)
        .ToList();

    var firstLeft = leftEdges[0];

    // With a single header there is no column gap to measure, so fall back to a fixed inset.
    var inset = leftEdges.Count < 2
        ? 10
        : (leftEdges[1] - firstLeft) / 2;

    return Math.Max(0, firstLeft - inset);
}
}

View File

@@ -0,0 +1,11 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable description of a standard table's axes: an identifying slug, the column keys,
/// and the roll-band labels.
/// </summary>
internal sealed class StandardTableAxisTemplate
{
    /// <summary>Creates a template from its slug, column keys and roll-band labels.</summary>
    public StandardTableAxisTemplate(
        string slug,
        IReadOnlyList<string> columnKeys,
        IReadOnlyList<string> rollBandLabels)
    {
        Slug = slug;
        ColumnKeys = columnKeys;
        RollBandLabels = rollBandLabels;
    }

    /// <summary>Identifier of the template.</summary>
    public string Slug { get; }

    /// <summary>Column keys, in the order supplied at construction.</summary>
    public IReadOnlyList<string> ColumnKeys { get; }

    /// <summary>Roll-band labels, in the order supplied at construction.</summary>
    public IReadOnlyList<string> RollBandLabels { get; }
}

View File

@@ -0,0 +1,17 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Lookup of the known standard-table axis templates by slug.
/// </summary>
internal static class StandardTableAxisTemplateCatalog
{
    /// <summary>
    /// Returns the axis template registered for <paramref name="slug"/>.
    /// </summary>
    /// <param name="slug">Template slug, compared case-insensitively (ordinal).</param>
    /// <returns>A newly created template instance for the slug.</returns>
    /// <exception cref="InvalidOperationException">The slug is null or not a known template.</exception>
    internal static StandardTableAxisTemplate Resolve(string? slug)
    {
        // "mana-standard-19" is currently the only supported template.
        if (!string.Equals(slug, "mana-standard-19", StringComparison.OrdinalIgnoreCase))
        {
            throw new InvalidOperationException($"Unsupported standard-table axis template '{slug ?? "<null>"}'.");
        }

        IReadOnlyList<string> columnKeys = ["A", "B", "C", "D", "E"];
        IReadOnlyList<string> rollBandLabels =
        [
            "01-05", "06-10", "11-15", "16-20", "21-35", "36-45", "46-50", "51-55", "56-60", "61-65",
            "66", "67-70", "71-75", "76-80", "81-85", "86-90", "91-95", "96-99", "100"
        ];

        return new StandardTableAxisTemplate("mana-standard-19", columnKeys, rollBandLabels);
    }
}

View File

@@ -0,0 +1,21 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable result of resolving a standard table's layout: column centers, row anchors,
/// the vertical and horizontal boundary coordinates, and any warnings raised while resolving it.
/// </summary>
internal sealed class StandardTableLayout
{
    /// <summary>Creates a layout from its resolved parts.</summary>
    /// <param name="warnings">Optional warnings; <c>null</c> is stored as an empty list.</param>
    public StandardTableLayout(
        IReadOnlyList<PositionedTextFragment> excludedFragments,
        IReadOnlyList<(string Key, double CenterX)> columnCenters,
        IReadOnlyList<RowAnchor> rowAnchors,
        int headerTop,
        int bodyStartTop,
        int keyTop,
        int leftCutoff,
        IReadOnlyList<string>? warnings = null)
    {
        ExcludedFragments = excludedFragments;
        ColumnCenters = columnCenters;
        RowAnchors = rowAnchors;
        HeaderTop = headerTop;
        BodyStartTop = bodyStartTop;
        KeyTop = keyTop;
        LeftCutoff = leftCutoff;
        Warnings = warnings ?? [];
    }

    /// <summary>Fragments marked as excluded by whoever built the layout.</summary>
    public IReadOnlyList<PositionedTextFragment> ExcludedFragments { get; }

    /// <summary>Column key paired with the column's horizontal center.</summary>
    public IReadOnlyList<(string Key, double CenterX)> ColumnCenters { get; }

    /// <summary>Row anchors of the table.</summary>
    public IReadOnlyList<RowAnchor> RowAnchors { get; }

    /// <summary>Top coordinate of the header.</summary>
    public int HeaderTop { get; }

    /// <summary>Top coordinate where the body starts.</summary>
    public int BodyStartTop { get; }

    /// <summary>Top coordinate of the key area.</summary>
    public int KeyTop { get; }

    /// <summary>Left cutoff coordinate.</summary>
    public int LeftCutoff { get; }

    /// <summary>Warnings attached to the layout; never <c>null</c>.</summary>
    public IReadOnlyList<string> Warnings { get; }
}

View File

@@ -11,10 +11,10 @@ public sealed class VariantColumnCriticalTableParser
new("SLAYING", "Slaying")
];
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, ExtractedCriticalSource source)
{
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
var fragments = source.Fragments;
var pageGeometries = source.PageGeometries;
var headerFragments = FindHeaderFragments(fragments);
var validationErrors = new List<string>();
var validationWarnings = new List<string>();
@@ -43,7 +43,7 @@ public sealed class VariantColumnCriticalTableParser
if (rowAnchors.Count == 0)
{
validationErrors.Add("No roll-band labels were found in the XML artifact.");
validationErrors.Add("No roll-band labels were found in the source artifact.");
}
var columnCenters = columnAnchors
@@ -132,16 +132,28 @@ public sealed class VariantColumnCriticalTableParser
entry.DisplayName,
entry.Family,
Path.GetFileName(entry.PdfPath),
"Imported from PDF XML extraction.",
source.ImportNotes,
[],
ExpectedColumns.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Label, "variant", index + 1)).ToList(),
parsedRollBands,
parsedResults);
return new CriticalTableParseResult(table, pageGeometries, fragments, parsedCells, validationReport);
return new CriticalTableParseResult(table, pageGeometries, fragments, source.RenderProfile, parsedCells, validationReport);
}
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
/// <summary>
/// Convenience overload that parses raw PDF-XML content by wrapping it in an
/// <c>ExtractedCriticalSource</c> and delegating to the source-based overload.
/// </summary>
/// <param name="entry">Manifest entry describing the table being imported.</param>
/// <param name="xmlContent">XML content from which page geometries and fragments are loaded.</param>
/// <returns>The parse result produced by the source-based overload.</returns>
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
    var renderProfile = SourceRenderProfile.XmlAligned();
    var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
    var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);

    var xmlSource = new ExtractedCriticalSource(
        "xml",
        "Imported from PDF XML extraction.",
        renderProfile,
        pageGeometries,
        fragments);

    return Parse(entry, xmlSource);
}
private static List<PositionedTextFragment> FindHeaderFragments(IReadOnlyList<PositionedTextFragment> fragments)
{
var expectedLabels = ExpectedColumns
.Select(item => item.Label.ToLowerInvariant())
@@ -163,7 +175,7 @@ public sealed class VariantColumnCriticalTableParser
}
}
throw new InvalidOperationException("Could not find the variant-column header row in the XML artifact.");
throw new InvalidOperationException("Could not find the variant-column header row in the source artifact.");
}
private static ColumnDefinition ResolveColumnDefinition(string value) =>

View File

@@ -7,12 +7,6 @@ public sealed class XmlTextFragment(
int width,
int height,
string text)
: PositionedTextFragment(pageNumber, top, left, width, height, text)
{
public int PageNumber { get; } = pageNumber;
public int Top { get; } = top;
public int Left { get; } = left;
public int Width { get; } = width;
public int Height { get; } = height;
public string Text { get; } = text;
public double CenterX => Left + (Width / 2.0);
}

View File

@@ -0,0 +1,8 @@
namespace RolemasterDb.ImportTool;
/// <summary>
/// Immutable summary of a PDF document: its page count and page dimensions in points.
/// </summary>
public sealed class PdfDocumentInfo
{
    /// <summary>Creates the summary from a page count and page dimensions in points.</summary>
    public PdfDocumentInfo(int pageCount, double pageWidthPoints, double pageHeightPoints)
    {
        PageCount = pageCount;
        PageWidthPoints = pageWidthPoints;
        PageHeightPoints = pageHeightPoints;
    }

    /// <summary>Number of pages in the document.</summary>
    public int PageCount { get; }

    /// <summary>Page width, in points.</summary>
    public double PageWidthPoints { get; }

    /// <summary>Page height, in points.</summary>
    public double PageHeightPoints { get; }
}

View File

@@ -1,4 +1,6 @@
using System.Diagnostics;
using System.Globalization;
using System.Text.RegularExpressions;
namespace RolemasterDb.ImportTool;
@@ -7,6 +9,7 @@ public sealed class PdfXmlExtractor
public const int RenderScaleFactor = 4;
public const int XmlAlignedRenderDpi = 108;
public const int ScaledRenderDpi = XmlAlignedRenderDpi * RenderScaleFactor;
private const string PortableMiKTeXPath = @"D:\Code\miktex-portable\texmfs\install\miktex\bin\x64";
public static int ScaleCoordinate(int value) => checked(value * RenderScaleFactor);
@@ -16,7 +19,7 @@ public sealed class PdfXmlExtractor
var startInfo = new ProcessStartInfo
{
FileName = "pdftohtml",
FileName = ResolveExecutable("ROLEMASTERDB_PDFTOHTML_PATH", "pdftohtml.exe"),
RedirectStandardError = true,
RedirectStandardOutput = true,
UseShellExecute = false,
@@ -40,12 +43,57 @@ public sealed class PdfXmlExtractor
}
}
/// <summary>
/// Runs <c>pdfinfo</c> on a PDF and parses the page count and page size (in points) from its output.
/// </summary>
/// <param name="pdfPath">Path of the PDF, passed to <c>pdfinfo</c> as its only argument.</param>
/// <param name="cancellationToken">Cancels reading the process output and waiting for exit.</param>
/// <returns>The page count and page dimensions reported by <c>pdfinfo</c>.</returns>
/// <exception cref="InvalidOperationException">
/// <c>pdfinfo</c> exited with a non-zero code, or its output lacked a parsable
/// "Pages:" or "Page size: W x H pts" line.
/// </exception>
public async Task<PdfDocumentInfo> ReadDocumentInfoAsync(string pdfPath, CancellationToken cancellationToken = default)
{
    var startInfo = new ProcessStartInfo
    {
        FileName = ResolveExecutable("ROLEMASTERDB_PDFINFO_PATH", "pdfinfo.exe"),
        RedirectStandardError = true,
        RedirectStandardOutput = true,
        UseShellExecute = false,
        CreateNoWindow = true
    };
    startInfo.ArgumentList.Add(pdfPath);

    using var process = new Process { StartInfo = startInfo };
    process.Start();

    // Both pipes are redirected, so both must be drained while the process runs. Reading stdout
    // to the end before touching stderr can deadlock once the child fills the stderr pipe buffer.
    var outputTask = process.StandardOutput.ReadToEndAsync(cancellationToken);
    var errorTask = process.StandardError.ReadToEndAsync(cancellationToken);
    await Task.WhenAll(outputTask, errorTask);
    await process.WaitForExitAsync(cancellationToken);

    var output = await outputTask;
    if (process.ExitCode != 0)
    {
        var error = await errorTask;
        throw new InvalidOperationException($"pdfinfo failed for '{pdfPath}': {error}");
    }

    var pageCountMatch = Regex.Match(output, @"Pages:\s*(\d+)", RegexOptions.Multiline);
    var sizeMatch = Regex.Match(output, @"Page size:\s*([0-9.]+)\s*x\s*([0-9.]+)\s*pts", RegexOptions.Multiline);
    if (!pageCountMatch.Success || !sizeMatch.Success)
    {
        throw new InvalidOperationException($"pdfinfo output for '{pdfPath}' could not be parsed.");
    }

    // pdfinfo prints machine-readable numbers, so parse them culture-independently.
    return new PdfDocumentInfo(
        int.Parse(pageCountMatch.Groups[1].Value, CultureInfo.InvariantCulture),
        double.Parse(sizeMatch.Groups[1].Value, CultureInfo.InvariantCulture),
        double.Parse(sizeMatch.Groups[2].Value, CultureInfo.InvariantCulture));
}
public Task RenderPagePngAsync(
string pdfPath,
int pageNumber,
string outputPath,
CancellationToken cancellationToken = default) =>
RenderPngAsync(pdfPath, pageNumber, outputPath, null, null, null, null, cancellationToken);
RenderPagePngAsync(pdfPath, pageNumber, outputPath, ScaledRenderDpi, cancellationToken);
/// <summary>
/// Renders a whole PDF page to a PNG at the requested DPI.
/// </summary>
/// <param name="pdfPath">Path of the source PDF.</param>
/// <param name="pageNumber">Page to render.</param>
/// <param name="outputPath">Path of the PNG to create.</param>
/// <param name="renderDpi">Render resolution handed to the shared render routine.</param>
/// <param name="cancellationToken">Forwarded to the shared render routine.</param>
public Task RenderPagePngAsync(
    string pdfPath,
    int pageNumber,
    string outputPath,
    int renderDpi,
    CancellationToken cancellationToken = default)
{
    // The four nulls are the crop rectangle arguments: a full-page render has no crop.
    return RenderPngAsync(
        pdfPath,
        pageNumber,
        outputPath,
        renderDpi,
        null,
        null,
        null,
        null,
        cancellationToken);
}
public Task RenderCropPngAsync(
string pdfPath,
@@ -56,12 +104,25 @@ public sealed class PdfXmlExtractor
int height,
string outputPath,
CancellationToken cancellationToken = default) =>
RenderPngAsync(pdfPath, pageNumber, outputPath, left, top, width, height, cancellationToken);
RenderCropPngAsync(pdfPath, pageNumber, left, top, width, height, outputPath, ScaledRenderDpi, cancellationToken);
/// <summary>
/// Renders a rectangular crop of a PDF page to a PNG at the requested DPI.
/// </summary>
/// <param name="pdfPath">Path of the source PDF.</param>
/// <param name="pageNumber">Page to render.</param>
/// <param name="left">Left edge of the crop rectangle.</param>
/// <param name="top">Top edge of the crop rectangle.</param>
/// <param name="width">Width of the crop rectangle.</param>
/// <param name="height">Height of the crop rectangle.</param>
/// <param name="outputPath">Path of the PNG to create.</param>
/// <param name="renderDpi">Render resolution handed to the shared render routine.</param>
/// <param name="cancellationToken">Forwarded to the shared render routine.</param>
public Task RenderCropPngAsync(
    string pdfPath,
    int pageNumber,
    int left,
    int top,
    int width,
    int height,
    string outputPath,
    int renderDpi,
    CancellationToken cancellationToken = default)
{
    // Note the argument reordering: the shared routine takes outputPath and renderDpi
    // before the crop rectangle.
    return RenderPngAsync(
        pdfPath,
        pageNumber,
        outputPath,
        renderDpi,
        left,
        top,
        width,
        height,
        cancellationToken);
}
private static async Task RenderPngAsync(
string pdfPath,
int pageNumber,
string outputPath,
int renderDpi,
int? left,
int? top,
int? width,
@@ -72,7 +133,7 @@ public sealed class PdfXmlExtractor
var startInfo = new ProcessStartInfo
{
FileName = "pdftoppm",
FileName = ResolveExecutable("ROLEMASTERDB_PDFTOPPM_PATH", "pdftoppm.exe"),
RedirectStandardError = true,
RedirectStandardOutput = true,
UseShellExecute = false,
@@ -81,7 +142,7 @@ public sealed class PdfXmlExtractor
startInfo.ArgumentList.Add("-png");
startInfo.ArgumentList.Add("-r");
startInfo.ArgumentList.Add(ScaledRenderDpi.ToString());
startInfo.ArgumentList.Add(renderDpi.ToString(CultureInfo.InvariantCulture));
startInfo.ArgumentList.Add("-f");
startInfo.ArgumentList.Add(pageNumber.ToString());
startInfo.ArgumentList.Add("-l");
@@ -118,4 +179,21 @@ public sealed class PdfXmlExtractor
throw new InvalidOperationException($"pdftoppm completed but did not create '{outputPath}'.");
}
}
/// <summary>
/// Picks the executable to launch for an external PDF tool.
/// </summary>
/// <param name="environmentVariableName">Environment variable that may hold an explicit path to the tool.</param>
/// <param name="executableName">Executable file name, including extension (for example <c>pdfinfo.exe</c>).</param>
/// <returns>
/// The path from the environment variable when it is set and the file exists; otherwise the
/// executable under <c>PortableMiKTeXPath</c> when that file exists; otherwise the bare
/// executable name without its extension.
/// </returns>
private static string ResolveExecutable(string environmentVariableName, string executableName)
{
    // 1. Explicit override through the environment, honored only if it points at a real file.
    if (Environment.GetEnvironmentVariable(environmentVariableName) is { } overridePath
        && !string.IsNullOrWhiteSpace(overridePath)
        && File.Exists(overridePath))
    {
        return overridePath;
    }

    // 2. The portable install location, then 3. the extension-less command name.
    var portableCandidate = Path.Combine(PortableMiKTeXPath, executableName);
    return File.Exists(portableCandidate)
        ? portableCandidate
        : Path.GetFileNameWithoutExtension(executableName);
}
}

View File

@@ -0,0 +1,3 @@
using System.Runtime.CompilerServices;
[assembly: InternalsVisibleTo("RolemasterDb.ImportTool.Tests")]

View File

@@ -0,0 +1,15 @@
namespace RolemasterDb.ImportTool;
/// <summary>
/// Pairs a render DPI with the factor used to scale source coordinates.
/// </summary>
public sealed class SourceRenderProfile
{
    /// <summary>Creates a profile from a render DPI and a coordinate scale factor.</summary>
    public SourceRenderProfile(int renderDpi, int scaleFactor)
    {
        RenderDpi = renderDpi;
        ScaleFactor = scaleFactor;
    }

    /// <summary>Render resolution in DPI.</summary>
    public int RenderDpi { get; }

    /// <summary>Multiplier applied by <see cref="ScaleCoordinate"/>.</summary>
    public int ScaleFactor { get; }

    /// <summary>
    /// Multiplies <paramref name="value"/> by <see cref="ScaleFactor"/> with overflow checking.
    /// </summary>
    /// <exception cref="OverflowException">The product does not fit in an <see cref="int"/>.</exception>
    public int ScaleCoordinate(int value)
    {
        return checked(value * ScaleFactor);
    }

    /// <summary>Profile built from <c>PdfXmlExtractor</c>'s scaled render DPI and render scale factor.</summary>
    public static SourceRenderProfile XmlAligned()
    {
        return new SourceRenderProfile(PdfXmlExtractor.ScaledRenderDpi, PdfXmlExtractor.RenderScaleFactor);
    }

    /// <summary>Profile for the given DPI with a scale factor of 1 (coordinates are used unscaled).</summary>
    public static SourceRenderProfile OcrPixels(int renderDpi)
    {
        return new SourceRenderProfile(renderDpi, 1);
    }
}

View File

@@ -0,0 +1,28 @@
using RolemasterDb.ImportTool.Parsing;
namespace RolemasterDb.ImportTool;
/// <summary>
/// <c>ICriticalSourceExtractor</c> implementation backed by the PDF-to-XML extractor.
/// </summary>
public sealed class XmlCriticalSourceExtractor : ICriticalSourceExtractor
{
    private readonly PdfXmlExtractor _pdfXmlExtractor;

    /// <summary>Creates the extractor around the given PDF-to-XML extractor.</summary>
    public XmlCriticalSourceExtractor(PdfXmlExtractor pdfXmlExtractor)
    {
        _pdfXmlExtractor = pdfXmlExtractor;
    }

    /// <summary>
    /// Extracts the PDF into the XML artifact at <c>artifactPaths.XmlPath</c>.
    /// </summary>
    public async Task ExtractAsync(string pdfPath, ImportArtifactPaths artifactPaths, CancellationToken cancellationToken = default)
    {
        await _pdfXmlExtractor.ExtractAsync(pdfPath, artifactPaths.XmlPath, cancellationToken);
    }

    /// <summary>
    /// Loads the previously extracted XML artifact and wraps its page geometries and fragments
    /// in an <c>ExtractedCriticalSource</c>.
    /// </summary>
    /// <param name="pdfPath">Not read by this implementation.</param>
    /// <param name="artifactPaths">Supplies the XML artifact path to read.</param>
    /// <param name="cancellationToken">Cancels reading the XML file.</param>
    /// <exception cref="FileNotFoundException">The XML artifact does not exist.</exception>
    public async Task<ExtractedCriticalSource> LoadAsync(
        string pdfPath,
        ImportArtifactPaths artifactPaths,
        CancellationToken cancellationToken = default)
    {
        var xmlPath = artifactPaths.XmlPath;
        if (!File.Exists(xmlPath))
        {
            throw new FileNotFoundException($"Missing XML artifact: {artifactPaths.XmlPath}", artifactPaths.XmlPath);
        }

        var xmlContent = await File.ReadAllTextAsync(xmlPath, cancellationToken);

        var renderProfile = SourceRenderProfile.XmlAligned();
        var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
        var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);

        return new ExtractedCriticalSource(
            "xml",
            "Imported from PDF XML extraction.",
            renderProfile,
            pageGeometries,
            fragments);
    }
}