From 8cbcf666959a8911df7116ee95c00401d97cff12 Mon Sep 17 00:00:00 2001 From: Frank Tovar Date: Wed, 18 Mar 2026 00:44:58 +0100 Subject: [PATCH] Add high-res critical image refresh import --- docs/critical_import_tool.md | 28 ++++++++++-- ...mportArtifactGenerationIntegrationTests.cs | 4 ++ .../CriticalImportMergeIntegrationTests.cs | 42 ++++++++++++++++++ .../CriticalImportCommandRunner.cs | 34 ++++++++++++++ .../CriticalImportLoader.cs | 44 +++++++++++++++++++ .../CriticalSourceImageArtifactGenerator.cs | 23 +++++----- .../Parsing/CriticalSourceImageCrop.cs | 4 +- .../PdfXmlExtractor.cs | 6 ++- src/RolemasterDb.ImportTool/Program.cs | 3 +- .../ReimportImagesOptions.cs | 13 ++++++ 10 files changed, 183 insertions(+), 18 deletions(-) create mode 100644 src/RolemasterDb.ImportTool/ReimportImagesOptions.cs diff --git a/docs/critical_import_tool.md b/docs/critical_import_tool.md index 85020ce..b570ed4 100644 --- a/docs/critical_import_tool.md +++ b/docs/critical_import_tool.md @@ -33,7 +33,7 @@ The current implementation supports: - `variant_column` critical tables with non-severity columns - `grouped_variant` critical tables with a group axis plus variant columns - XML-based extraction using `pdftohtml -xml` -- XML-aligned page rendering and per-cell PNG crops using `pdftoppm -png -r 108` +- XML-aligned page rendering and per-cell PNG crops using `pdftoppm -png -r 432` - geometry-based parsing across the currently enabled table set: - `arcane-aether` - `arcane-nether` @@ -359,6 +359,22 @@ Example: dotnet run --project .\src\RolemasterDb.ImportTool\RolemasterDb.ImportTool.csproj -- import slash ``` +### `reimport-images <table>` + +Reuses `source.xml`, regenerates page PNGs and cell PNGs, rewrites the JSON artifacts, and refreshes only source-image metadata in SQLite. 
+ +Use this when: + +- crop resolution or render settings changed +- you want better source images without reloading result text +- you want to keep curated and uncurated content untouched while refreshing artifacts + +Example: + +```powershell +dotnet run --project .\src\RolemasterDb.ImportTool\RolemasterDb.ImportTool.csproj -- reimport-images slash +``` + ## Manifest The importer manifest is stored at: @@ -433,7 +449,7 @@ Each parsed cell now includes: ### `pages/page-001.png` -Rendered PDF page images at `108 DPI`, which matches the coordinate space emitted by `pdftohtml -xml`. +Rendered PDF page images at `432 DPI`, using a central render scale factor of `4` over the XML coordinate space emitted by `pdftohtml -xml`. Use this when: @@ -607,10 +623,14 @@ The importer now uses two Poppler tools: - `pdftohtml -xml -i -noframes` - extracts geometry-aware XML text -- `pdftoppm -png -r 108` +- `pdftoppm -png -r 432` - renders page PNGs and per-cell crop PNGs -The `108 DPI` render setting is deliberate: for the current PDFs and Poppler output, it produces page images whose pixel dimensions match the XML `page width` and `page height`, so crop coordinates can be applied directly without an extra scale-conversion step. +The importer keeps a central render scale factor of `4`. The XML still defines bounds in its original coordinate space, but rendered PNGs and stored crop metadata now use the scaled coordinate space and a `432 DPI` render setting. 
In practice: + +- XML coordinates are multiplied by `4` before crop extraction +- page and crop metadata stored with each result reflect the scaled PNG coordinate space +- crop alignment remains deterministic without changing the parsing pipeline ## Interaction With Web App Startup diff --git a/src/RolemasterDb.ImportTool.Tests/CriticalImportArtifactGenerationIntegrationTests.cs b/src/RolemasterDb.ImportTool.Tests/CriticalImportArtifactGenerationIntegrationTests.cs index 3fb09e1..5e79d8c 100644 --- a/src/RolemasterDb.ImportTool.Tests/CriticalImportArtifactGenerationIntegrationTests.cs +++ b/src/RolemasterDb.ImportTool.Tests/CriticalImportArtifactGenerationIntegrationTests.cs @@ -22,6 +22,10 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests Assert.True(result.SourceBounds.Height > 0); Assert.NotNull(result.SourceImagePath); Assert.NotNull(result.SourceImageCrop); + Assert.Equal(PdfXmlExtractor.RenderScaleFactor, result.SourceImageCrop!.ScaleFactor); + Assert.Equal(PdfXmlExtractor.ScaledRenderDpi, result.SourceImageCrop.RenderDpi); + Assert.Equal(result.SourceBounds.Width * PdfXmlExtractor.RenderScaleFactor, result.SourceImageCrop.BoundsWidth); + Assert.Equal(result.SourceBounds.Height * PdfXmlExtractor.RenderScaleFactor, result.SourceImageCrop.BoundsHeight); Assert.Equal(result.SourceImagePath, cellArtifact.SourceImagePath); Assert.NotNull(cellArtifact.SourceImageCrop); Assert.True(File.Exists(artifactPaths.GetPageImagePath(result.SourceBounds.PageNumber))); diff --git a/src/RolemasterDb.ImportTool.Tests/CriticalImportMergeIntegrationTests.cs b/src/RolemasterDb.ImportTool.Tests/CriticalImportMergeIntegrationTests.cs index 96490a4..5293b72 100644 --- a/src/RolemasterDb.ImportTool.Tests/CriticalImportMergeIntegrationTests.cs +++ b/src/RolemasterDb.ImportTool.Tests/CriticalImportMergeIntegrationTests.cs @@ -191,6 +191,48 @@ public sealed class CriticalImportMergeIntegrationTests } } + [Fact] + public async Task 
Reimport_images_only_refreshes_provenance_without_touching_curated_content() + { + var (parseResult, _) = await LoadPreparedSlashParseResultAsync(); + var databasePath = CreateEmptyDatabasePath(); + var loader = new CriticalImportLoader(databasePath); + + await loader.LoadAsync(parseResult.Table); + + await using (var dbContext = CreateDbContext(databasePath)) + { + var result = await LoadResultAsync(dbContext, "36-45", "B"); + + result.IsCurated = true; + result.RawCellText = "Curated raw text"; + result.DescriptionText = "Curated description"; + result.RawAffixText = "+12H"; + result.ParseStatus = "manually_curated"; + result.SourcePageNumber = null; + result.SourceImagePath = null; + result.SourceImageCropJson = null; + + await dbContext.SaveChangesAsync(); + } + + await loader.RefreshImageArtifactsAsync(parseResult.Table); + + await using (var dbContext = CreateDbContext(databasePath)) + { + var result = await LoadResultAsync(dbContext, "36-45", "B"); + + Assert.True(result.IsCurated); + Assert.Equal("Curated raw text", result.RawCellText); + Assert.Equal("Curated description", result.DescriptionText); + Assert.Equal("+12H", result.RawAffixText); + Assert.Equal("manually_curated", result.ParseStatus); + Assert.NotNull(result.SourcePageNumber); + Assert.False(string.IsNullOrWhiteSpace(result.SourceImagePath)); + Assert.False(string.IsNullOrWhiteSpace(result.SourceImageCropJson)); + } + } + private static ParsedCriticalTable CreateTrimmedTable( ParsedCriticalTable table, params (string RollBandLabel, string ColumnKey)[] excludedResults) diff --git a/src/RolemasterDb.ImportTool/CriticalImportCommandRunner.cs b/src/RolemasterDb.ImportTool/CriticalImportCommandRunner.cs index 5bdae92..4fc0f90 100644 --- a/src/RolemasterDb.ImportTool/CriticalImportCommandRunner.cs +++ b/src/RolemasterDb.ImportTool/CriticalImportCommandRunner.cs @@ -100,6 +100,40 @@ public sealed class CriticalImportCommandRunner }); } + public async Task RunAsync(ReimportImagesOptions options) + { + 
var entry = GetManifestEntry(options.Table); + var artifactPaths = CreateArtifactPaths(entry.Slug); + + if (!File.Exists(artifactPaths.XmlPath)) + { + await pdfXmlExtractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths.XmlPath); + } + + var xmlContent = await File.ReadAllTextAsync(artifactPaths.XmlPath); + var parseResult = Parse(entry, xmlContent); + await sourceImageArtifactGenerator.GenerateAsync( + ResolveRepositoryPath(entry.PdfPath), + artifactPaths, + parseResult, + CancellationToken.None); + await artifactWriter.WriteAsync(artifactPaths, parseResult, CancellationToken.None); + + if (!parseResult.ValidationReport.IsValid) + { + throw new InvalidOperationException( + $"Validation failed for '{entry.Slug}'. See {artifactPaths.ValidationReportPath} for details."); + } + + var loader = new CriticalImportLoader(ResolveDatabasePath(options.DatabasePath)); + var refreshedCount = await loader.RefreshImageArtifactsAsync(parseResult.Table); + + Console.WriteLine( + $"Refreshed image artifacts for {entry.Slug}: {refreshedCount} results updated."); + + return 0; + } + private CriticalImportManifestEntry GetManifestEntry(string tableSlug) { var manifest = manifestLoader.Load(RepositoryPaths.Discover().ManifestPath); diff --git a/src/RolemasterDb.ImportTool/CriticalImportLoader.cs b/src/RolemasterDb.ImportTool/CriticalImportLoader.cs index 7ca83c5..6947fbb 100644 --- a/src/RolemasterDb.ImportTool/CriticalImportLoader.cs +++ b/src/RolemasterDb.ImportTool/CriticalImportLoader.cs @@ -128,6 +128,50 @@ public sealed class CriticalImportLoader(string databasePath) return new ImportCommandResult(entity.Slug, entity.Columns.Count, entity.RollBands.Count, entity.Results.Count); } + public async Task RefreshImageArtifactsAsync(ParsedCriticalTable table, CancellationToken cancellationToken = default) + { + await using var dbContext = CreateDbContext(); + await dbContext.Database.EnsureCreatedAsync(cancellationToken); + await 
RolemasterDbSchemaUpgrader.EnsureLatestAsync(dbContext, cancellationToken); + await using var transaction = await dbContext.Database.BeginTransactionAsync(cancellationToken); + + var entity = await dbContext.CriticalTables + .AsSplitQuery() + .Include(item => item.Results) + .ThenInclude(result => result.CriticalGroup) + .Include(item => item.Results) + .ThenInclude(result => result.CriticalColumn) + .Include(item => item.Results) + .ThenInclude(result => result.CriticalRollBand) + .SingleOrDefaultAsync(item => item.Slug == table.Slug, cancellationToken); + + if (entity is null) + { + throw new InvalidOperationException($"Critical table '{table.Slug}' does not exist in the target database."); + } + + var existingResultsByKey = entity.Results.ToDictionary( + item => CreateResultKey(item.CriticalGroup?.GroupKey, item.CriticalColumn.ColumnKey, item.CriticalRollBand.Label), + StringComparer.Ordinal); + + var refreshedCount = 0; + foreach (var item in table.Results) + { + var resultKey = CreateResultKey(item.GroupKey, item.ColumnKey, item.RollBandLabel); + if (!existingResultsByKey.TryGetValue(resultKey, out var existingResult)) + { + continue; + } + + ApplyImporterProvenance(existingResult, item); + refreshedCount++; + } + + await dbContext.SaveChangesAsync(cancellationToken); + await transaction.CommitAsync(cancellationToken); + return refreshedCount; + } + private RolemasterDbContext CreateDbContext() { var options = new DbContextOptionsBuilder() diff --git a/src/RolemasterDb.ImportTool/CriticalSourceImageArtifactGenerator.cs b/src/RolemasterDb.ImportTool/CriticalSourceImageArtifactGenerator.cs index 6320258..b6caaad 100644 --- a/src/RolemasterDb.ImportTool/CriticalSourceImageArtifactGenerator.cs +++ b/src/RolemasterDb.ImportTool/CriticalSourceImageArtifactGenerator.cs @@ -75,17 +75,18 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE return new CriticalSourceImageCrop( sourceBounds.PageNumber, - pageGeometry.Width, - 
pageGeometry.Height, - sourceBounds.Left, - sourceBounds.Top, - sourceBounds.Width, - sourceBounds.Height, - cropLeft, - cropTop, - Math.Max(1, cropRight - cropLeft), - Math.Max(1, cropBottom - cropTop), - PdfXmlExtractor.XmlAlignedRenderDpi); + PdfXmlExtractor.ScaleCoordinate(pageGeometry.Width), + PdfXmlExtractor.ScaleCoordinate(pageGeometry.Height), + PdfXmlExtractor.ScaleCoordinate(sourceBounds.Left), + PdfXmlExtractor.ScaleCoordinate(sourceBounds.Top), + PdfXmlExtractor.ScaleCoordinate(sourceBounds.Width), + PdfXmlExtractor.ScaleCoordinate(sourceBounds.Height), + PdfXmlExtractor.ScaleCoordinate(cropLeft), + PdfXmlExtractor.ScaleCoordinate(cropTop), + PdfXmlExtractor.ScaleCoordinate(Math.Max(1, cropRight - cropLeft)), + PdfXmlExtractor.ScaleCoordinate(Math.Max(1, cropBottom - cropTop)), + PdfXmlExtractor.ScaledRenderDpi, + PdfXmlExtractor.RenderScaleFactor); } private static string CreateCellKey(string? groupKey, string rollBandLabel, string columnKey) => diff --git a/src/RolemasterDb.ImportTool/Parsing/CriticalSourceImageCrop.cs b/src/RolemasterDb.ImportTool/Parsing/CriticalSourceImageCrop.cs index 169dec9..1c7f13d 100644 --- a/src/RolemasterDb.ImportTool/Parsing/CriticalSourceImageCrop.cs +++ b/src/RolemasterDb.ImportTool/Parsing/CriticalSourceImageCrop.cs @@ -12,7 +12,8 @@ public sealed class CriticalSourceImageCrop( int cropTop, int cropWidth, int cropHeight, - int renderDpi) + int renderDpi, + int scaleFactor) { public int PageNumber { get; } = pageNumber; public int PageWidth { get; } = pageWidth; @@ -26,4 +27,5 @@ public sealed class CriticalSourceImageCrop( public int CropWidth { get; } = cropWidth; public int CropHeight { get; } = cropHeight; public int RenderDpi { get; } = renderDpi; + public int ScaleFactor { get; } = scaleFactor; } diff --git a/src/RolemasterDb.ImportTool/PdfXmlExtractor.cs b/src/RolemasterDb.ImportTool/PdfXmlExtractor.cs index b6d23d8..b81a330 100644 --- a/src/RolemasterDb.ImportTool/PdfXmlExtractor.cs +++ 
b/src/RolemasterDb.ImportTool/PdfXmlExtractor.cs @@ -4,7 +4,11 @@ namespace RolemasterDb.ImportTool; public sealed class PdfXmlExtractor { + public const int RenderScaleFactor = 4; public const int XmlAlignedRenderDpi = 108; + public const int ScaledRenderDpi = XmlAlignedRenderDpi * RenderScaleFactor; + + public static int ScaleCoordinate(int value) => checked(value * RenderScaleFactor); public async Task ExtractAsync(string pdfPath, string outputPath, CancellationToken cancellationToken = default) { @@ -77,7 +81,7 @@ public sealed class PdfXmlExtractor startInfo.ArgumentList.Add("-png"); startInfo.ArgumentList.Add("-r"); - startInfo.ArgumentList.Add(XmlAlignedRenderDpi.ToString()); + startInfo.ArgumentList.Add(ScaledRenderDpi.ToString()); startInfo.ArgumentList.Add("-f"); startInfo.ArgumentList.Add(pageNumber.ToString()); startInfo.ArgumentList.Add("-l"); diff --git a/src/RolemasterDb.ImportTool/Program.cs b/src/RolemasterDb.ImportTool/Program.cs index 6d2a3ec..dd3b4e5 100644 --- a/src/RolemasterDb.ImportTool/Program.cs +++ b/src/RolemasterDb.ImportTool/Program.cs @@ -4,12 +4,13 @@ using RolemasterDb.ImportTool; var runner = new CriticalImportCommandRunner(); -var exitCode = await Parser.Default.ParseArguments(args) +var exitCode = await Parser.Default.ParseArguments(args) .MapResult( (ResetOptions options) => ExecuteAsync(() => runner.RunAsync(options)), (ExtractOptions options) => ExecuteAsync(() => runner.RunAsync(options)), (LoadOptions options) => ExecuteAsync(() => runner.RunAsync(options)), (ImportOptions options) => ExecuteAsync(() => runner.RunAsync(options)), + (ReimportImagesOptions options) => ExecuteAsync(() => runner.RunAsync(options)), _ => Task.FromResult(1)); return exitCode; diff --git a/src/RolemasterDb.ImportTool/ReimportImagesOptions.cs b/src/RolemasterDb.ImportTool/ReimportImagesOptions.cs new file mode 100644 index 0000000..04e70f2 --- /dev/null +++ b/src/RolemasterDb.ImportTool/ReimportImagesOptions.cs @@ -0,0 +1,13 @@ +using CommandLine; + 
+namespace RolemasterDb.ImportTool; + +[Verb("reimport-images", HelpText = "Regenerate critical table page and cell images and refresh only image metadata in SQLite.")] +public sealed class ReimportImagesOptions +{ + [Value(0, MetaName = "table", Required = true, HelpText = "The manifest slug of the critical table to refresh.")] + public string Table { get; set; } = string.Empty; + + [Option('d', "db", HelpText = "Optional SQLite database path.")] + public string? DatabasePath { get; set; } +}