Add high-res critical image refresh import

This commit is contained in:
2026-03-18 00:44:58 +01:00
parent 30fd257ea5
commit 8cbcf66695
10 changed files with 183 additions and 18 deletions

View File

@@ -33,7 +33,7 @@ The current implementation supports:
- `variant_column` critical tables with non-severity columns - `variant_column` critical tables with non-severity columns
- `grouped_variant` critical tables with a group axis plus variant columns - `grouped_variant` critical tables with a group axis plus variant columns
- XML-based extraction using `pdftohtml -xml` - XML-based extraction using `pdftohtml -xml`
- XML-aligned page rendering and per-cell PNG crops using `pdftoppm -png -r 108` - XML-aligned page rendering and per-cell PNG crops using `pdftoppm -png -r 432`
- geometry-based parsing across the currently enabled table set: - geometry-based parsing across the currently enabled table set:
- `arcane-aether` - `arcane-aether`
- `arcane-nether` - `arcane-nether`
@@ -359,6 +359,22 @@ Example:
dotnet run --project .\src\RolemasterDb.ImportTool\RolemasterDb.ImportTool.csproj -- import slash dotnet run --project .\src\RolemasterDb.ImportTool\RolemasterDb.ImportTool.csproj -- import slash
``` ```
### `reimport-images <table>`
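Reuses the existing `source.xml` (extracting it from the PDF first if it is missing), regenerates page PNGs and cell PNGs, rewrites the JSON artifacts, and refreshes only source-image metadata in SQLite. If validation fails, the artifacts are still rewritten but the database is left unchanged.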
Use this when:
- crop resolution or render settings changed
- you want better source images without reloading result text
- you want to keep curated and uncurated content untouched while refreshing artifacts
Example:
```powershell
dotnet run --project .\src\RolemasterDb.ImportTool\RolemasterDb.ImportTool.csproj -- reimport-images slash
```
## Manifest ## Manifest
The importer manifest is stored at: The importer manifest is stored at:
@@ -433,7 +449,7 @@ Each parsed cell now includes:
### `pages/page-001.png` ### `pages/page-001.png`
Rendered PDF page images at `108 DPI`, which matches the coordinate space emitted by `pdftohtml -xml`. Rendered PDF page images at `432 DPI`, using a central render scale factor of `4` over the XML coordinate space emitted by `pdftohtml -xml`.
Use this when: Use this when:
@@ -607,10 +623,14 @@ The importer now uses two Poppler tools:
- `pdftohtml -xml -i -noframes` - `pdftohtml -xml -i -noframes`
- extracts geometry-aware XML text - extracts geometry-aware XML text
- `pdftoppm -png -r 108` - `pdftoppm -png -r 432`
- renders page PNGs and per-cell crop PNGs - renders page PNGs and per-cell crop PNGs
The `108 DPI` render setting is deliberate: for the current PDFs and Poppler output, it produces page images whose pixel dimensions match the XML `page width` and `page height`, so crop coordinates can be applied directly without an extra scale-conversion step. The importer keeps a central render scale factor of `4`. The XML still defines bounds in its original coordinate space, but rendered PNGs and stored crop metadata now use the scaled coordinate space and a `432 DPI` render setting. In practice:
- XML coordinates are multiplied by `4` before crop extraction
- page and crop metadata stored with each result reflect the scaled PNG coordinate space
- crop alignment remains deterministic without changing the parsing pipeline
## Interaction With Web App Startup ## Interaction With Web App Startup

View File

@@ -22,6 +22,10 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
Assert.True(result.SourceBounds.Height > 0); Assert.True(result.SourceBounds.Height > 0);
Assert.NotNull(result.SourceImagePath); Assert.NotNull(result.SourceImagePath);
Assert.NotNull(result.SourceImageCrop); Assert.NotNull(result.SourceImageCrop);
Assert.Equal(PdfXmlExtractor.RenderScaleFactor, result.SourceImageCrop!.ScaleFactor);
Assert.Equal(PdfXmlExtractor.ScaledRenderDpi, result.SourceImageCrop.RenderDpi);
Assert.Equal(result.SourceBounds.Width * PdfXmlExtractor.RenderScaleFactor, result.SourceImageCrop.BoundsWidth);
Assert.Equal(result.SourceBounds.Height * PdfXmlExtractor.RenderScaleFactor, result.SourceImageCrop.BoundsHeight);
Assert.Equal(result.SourceImagePath, cellArtifact.SourceImagePath); Assert.Equal(result.SourceImagePath, cellArtifact.SourceImagePath);
Assert.NotNull(cellArtifact.SourceImageCrop); Assert.NotNull(cellArtifact.SourceImageCrop);
Assert.True(File.Exists(artifactPaths.GetPageImagePath(result.SourceBounds.PageNumber))); Assert.True(File.Exists(artifactPaths.GetPageImagePath(result.SourceBounds.PageNumber)));

View File

@@ -191,6 +191,48 @@ public sealed class CriticalImportMergeIntegrationTests
} }
} }
[Fact]
public async Task Reimport_images_only_refreshes_provenance_without_touching_curated_content()
{
    // Arrange: load the prepared slash table into a fresh database.
    var (parseResult, _) = await LoadPreparedSlashParseResultAsync();
    var table = parseResult.Table;
    var databasePath = CreateEmptyDatabasePath();
    var loader = new CriticalImportLoader(databasePath);
    await loader.LoadAsync(table);

    // Arrange: mark one result as curated, overwrite its text fields, and blank out
    // its source-image fields so the refresh has something to restore.
    await using (var arrangeContext = CreateDbContext(databasePath))
    {
        var curated = await LoadResultAsync(arrangeContext, "36-45", "B");

        curated.IsCurated = true;
        curated.RawCellText = "Curated raw text";
        curated.DescriptionText = "Curated description";
        curated.RawAffixText = "+12H";
        curated.ParseStatus = "manually_curated";

        curated.SourcePageNumber = null;
        curated.SourceImagePath = null;
        curated.SourceImageCropJson = null;

        await arrangeContext.SaveChangesAsync();
    }

    // Act
    await loader.RefreshImageArtifactsAsync(table);

    // Assert: curated text is unchanged, source-image fields are populated again.
    await using (var assertContext = CreateDbContext(databasePath))
    {
        var refreshed = await LoadResultAsync(assertContext, "36-45", "B");

        Assert.True(refreshed.IsCurated);
        Assert.Equal("Curated raw text", refreshed.RawCellText);
        Assert.Equal("Curated description", refreshed.DescriptionText);
        Assert.Equal("+12H", refreshed.RawAffixText);
        Assert.Equal("manually_curated", refreshed.ParseStatus);

        Assert.NotNull(refreshed.SourcePageNumber);
        Assert.False(string.IsNullOrWhiteSpace(refreshed.SourceImagePath));
        Assert.False(string.IsNullOrWhiteSpace(refreshed.SourceImageCropJson));
    }
}
private static ParsedCriticalTable CreateTrimmedTable( private static ParsedCriticalTable CreateTrimmedTable(
ParsedCriticalTable table, ParsedCriticalTable table,
params (string RollBandLabel, string ColumnKey)[] excludedResults) params (string RollBandLabel, string ColumnKey)[] excludedResults)

View File

@@ -100,6 +100,40 @@ public sealed class CriticalImportCommandRunner
}); });
} }
/// <summary>
/// Handles the <c>reimport-images</c> verb for a single critical table: parses the table's
/// XML, regenerates its source images, rewrites its artifacts, and then asks
/// <see cref="CriticalImportLoader.RefreshImageArtifactsAsync"/> to update the database.
/// </summary>
/// <param name="options">Parsed command-line options (table slug and optional database path).</param>
/// <returns><c>0</c> once the refresh has completed.</returns>
/// <exception cref="InvalidOperationException">
/// The parse result's validation report is not valid. Artifacts have already been written at
/// that point; the database has not been touched.
/// </exception>
public async Task<int> RunAsync(ReimportImagesOptions options)
{
    var manifestEntry = GetManifestEntry(options.Table);
    var paths = CreateArtifactPaths(manifestEntry.Slug);

    // The XML is only extracted from the PDF when no XML file exists yet;
    // an existing file is reused as-is.
    var xmlAlreadyExtracted = File.Exists(paths.XmlPath);
    if (!xmlAlreadyExtracted)
    {
        await pdfXmlExtractor.ExtractAsync(ResolveRepositoryPath(manifestEntry.PdfPath), paths.XmlPath);
    }

    var parsed = Parse(manifestEntry, await File.ReadAllTextAsync(paths.XmlPath));

    var pdfPath = ResolveRepositoryPath(manifestEntry.PdfPath);
    await sourceImageArtifactGenerator.GenerateAsync(pdfPath, paths, parsed, CancellationToken.None);
    await artifactWriter.WriteAsync(paths, parsed, CancellationToken.None);

    // Validation is checked only after the artifacts are written, so the report path
    // named in the error message refers to freshly written output
    // (presumably the writer emits the validation report — confirm).
    var report = parsed.ValidationReport;
    if (!report.IsValid)
    {
        throw new InvalidOperationException(
            $"Validation failed for '{manifestEntry.Slug}'. See {paths.ValidationReportPath} for details.");
    }

    var databasePath = ResolveDatabasePath(options.DatabasePath);
    var importLoader = new CriticalImportLoader(databasePath);
    var updatedResults = await importLoader.RefreshImageArtifactsAsync(parsed.Table);

    Console.WriteLine(
        $"Refreshed image artifacts for {manifestEntry.Slug}: {updatedResults} results updated.");

    return 0;
}
private CriticalImportManifestEntry GetManifestEntry(string tableSlug) private CriticalImportManifestEntry GetManifestEntry(string tableSlug)
{ {
var manifest = manifestLoader.Load(RepositoryPaths.Discover().ManifestPath); var manifest = manifestLoader.Load(RepositoryPaths.Discover().ManifestPath);

View File

@@ -128,6 +128,50 @@ public sealed class CriticalImportLoader(string databasePath)
return new ImportCommandResult(entity.Slug, entity.Columns.Count, entity.RollBands.Count, entity.Results.Count); return new ImportCommandResult(entity.Slug, entity.Columns.Count, entity.RollBands.Count, entity.Results.Count);
} }
/// <summary>
/// Re-applies importer provenance from <paramref name="table"/> to the results already
/// stored for the critical table with the same slug.
/// </summary>
/// <remarks>
/// Stored and parsed results are matched on the key built by <c>CreateResultKey</c>
/// (group key, column key, roll-band label). Parsed results without a stored counterpart
/// are skipped; no rows are inserted or removed by this method. Matched rows are passed to
/// <c>ApplyImporterProvenance</c> — presumably that helper only touches source-image /
/// provenance fields; confirm against its definition.
/// </remarks>
/// <param name="table">The freshly parsed table whose results carry the new provenance.</param>
/// <param name="cancellationToken">Flows to every database call.</param>
/// <returns>The number of stored results that were matched and refreshed.</returns>
/// <exception cref="InvalidOperationException">
/// No critical table with <c>table.Slug</c> exists in the target database.
/// </exception>
public async Task<int> RefreshImageArtifactsAsync(ParsedCriticalTable table, CancellationToken cancellationToken = default)
{
    await using var dbContext = CreateDbContext();
    await dbContext.Database.EnsureCreatedAsync(cancellationToken);
    await RolemasterDbSchemaUpgrader.EnsureLatestAsync(dbContext, cancellationToken);

    await using var transaction = await dbContext.Database.BeginTransactionAsync(cancellationToken);

    // Group, column and roll band are loaded with each result because all three feed the lookup key.
    var storedTable = await dbContext.CriticalTables
        .AsSplitQuery()
        .Include(t => t.Results)
        .ThenInclude(r => r.CriticalGroup)
        .Include(t => t.Results)
        .ThenInclude(r => r.CriticalColumn)
        .Include(t => t.Results)
        .ThenInclude(r => r.CriticalRollBand)
        .SingleOrDefaultAsync(t => t.Slug == table.Slug, cancellationToken)
        ?? throw new InvalidOperationException($"Critical table '{table.Slug}' does not exist in the target database.");

    var storedByKey = storedTable.Results.ToDictionary(
        stored => CreateResultKey(stored.CriticalGroup?.GroupKey, stored.CriticalColumn.ColumnKey, stored.CriticalRollBand.Label),
        StringComparer.Ordinal);

    var updated = 0;
    foreach (var parsedResult in table.Results)
    {
        var key = CreateResultKey(parsedResult.GroupKey, parsedResult.ColumnKey, parsedResult.RollBandLabel);
        if (storedByKey.TryGetValue(key, out var storedResult))
        {
            ApplyImporterProvenance(storedResult, parsedResult);
            updated++;
        }
    }

    await dbContext.SaveChangesAsync(cancellationToken);
    await transaction.CommitAsync(cancellationToken);

    return updated;
}
private RolemasterDbContext CreateDbContext() private RolemasterDbContext CreateDbContext()
{ {
var options = new DbContextOptionsBuilder<RolemasterDbContext>() var options = new DbContextOptionsBuilder<RolemasterDbContext>()

View File

@@ -75,17 +75,18 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
return new CriticalSourceImageCrop( return new CriticalSourceImageCrop(
sourceBounds.PageNumber, sourceBounds.PageNumber,
pageGeometry.Width, PdfXmlExtractor.ScaleCoordinate(pageGeometry.Width),
pageGeometry.Height, PdfXmlExtractor.ScaleCoordinate(pageGeometry.Height),
sourceBounds.Left, PdfXmlExtractor.ScaleCoordinate(sourceBounds.Left),
sourceBounds.Top, PdfXmlExtractor.ScaleCoordinate(sourceBounds.Top),
sourceBounds.Width, PdfXmlExtractor.ScaleCoordinate(sourceBounds.Width),
sourceBounds.Height, PdfXmlExtractor.ScaleCoordinate(sourceBounds.Height),
cropLeft, PdfXmlExtractor.ScaleCoordinate(cropLeft),
cropTop, PdfXmlExtractor.ScaleCoordinate(cropTop),
Math.Max(1, cropRight - cropLeft), PdfXmlExtractor.ScaleCoordinate(Math.Max(1, cropRight - cropLeft)),
Math.Max(1, cropBottom - cropTop), PdfXmlExtractor.ScaleCoordinate(Math.Max(1, cropBottom - cropTop)),
PdfXmlExtractor.XmlAlignedRenderDpi); PdfXmlExtractor.ScaledRenderDpi,
PdfXmlExtractor.RenderScaleFactor);
} }
private static string CreateCellKey(string? groupKey, string rollBandLabel, string columnKey) => private static string CreateCellKey(string? groupKey, string rollBandLabel, string columnKey) =>

View File

@@ -12,7 +12,8 @@ public sealed class CriticalSourceImageCrop(
int cropTop, int cropTop,
int cropWidth, int cropWidth,
int cropHeight, int cropHeight,
int renderDpi) int renderDpi,
int scaleFactor)
{ {
public int PageNumber { get; } = pageNumber; public int PageNumber { get; } = pageNumber;
public int PageWidth { get; } = pageWidth; public int PageWidth { get; } = pageWidth;
@@ -26,4 +27,5 @@ public sealed class CriticalSourceImageCrop(
public int CropWidth { get; } = cropWidth; public int CropWidth { get; } = cropWidth;
public int CropHeight { get; } = cropHeight; public int CropHeight { get; } = cropHeight;
public int RenderDpi { get; } = renderDpi; public int RenderDpi { get; } = renderDpi;
public int ScaleFactor { get; } = scaleFactor;
} }

View File

@@ -4,7 +4,11 @@ namespace RolemasterDb.ImportTool;
public sealed class PdfXmlExtractor public sealed class PdfXmlExtractor
{ {
public const int RenderScaleFactor = 4;
public const int XmlAlignedRenderDpi = 108; public const int XmlAlignedRenderDpi = 108;
public const int ScaledRenderDpi = XmlAlignedRenderDpi * RenderScaleFactor;
/// <summary>
/// Multiplies <paramref name="value"/> by <see cref="RenderScaleFactor"/>.
/// </summary>
/// <param name="value">The value to scale.</param>
/// <returns><paramref name="value"/> times <see cref="RenderScaleFactor"/>.</returns>
/// <exception cref="OverflowException">The product does not fit in an <see cref="int"/>.</exception>
public static int ScaleCoordinate(int value)
{
    // Overflow is checked explicitly so an out-of-range product throws instead of wrapping.
    checked
    {
        return value * RenderScaleFactor;
    }
}
public async Task ExtractAsync(string pdfPath, string outputPath, CancellationToken cancellationToken = default) public async Task ExtractAsync(string pdfPath, string outputPath, CancellationToken cancellationToken = default)
{ {
@@ -77,7 +81,7 @@ public sealed class PdfXmlExtractor
startInfo.ArgumentList.Add("-png"); startInfo.ArgumentList.Add("-png");
startInfo.ArgumentList.Add("-r"); startInfo.ArgumentList.Add("-r");
startInfo.ArgumentList.Add(XmlAlignedRenderDpi.ToString()); startInfo.ArgumentList.Add(ScaledRenderDpi.ToString());
startInfo.ArgumentList.Add("-f"); startInfo.ArgumentList.Add("-f");
startInfo.ArgumentList.Add(pageNumber.ToString()); startInfo.ArgumentList.Add(pageNumber.ToString());
startInfo.ArgumentList.Add("-l"); startInfo.ArgumentList.Add("-l");

View File

@@ -4,12 +4,13 @@ using RolemasterDb.ImportTool;
var runner = new CriticalImportCommandRunner(); var runner = new CriticalImportCommandRunner();
var exitCode = await Parser.Default.ParseArguments<ResetOptions, ExtractOptions, LoadOptions, ImportOptions>(args) var exitCode = await Parser.Default.ParseArguments<ResetOptions, ExtractOptions, LoadOptions, ImportOptions, ReimportImagesOptions>(args)
.MapResult( .MapResult(
(ResetOptions options) => ExecuteAsync(() => runner.RunAsync(options)), (ResetOptions options) => ExecuteAsync(() => runner.RunAsync(options)),
(ExtractOptions options) => ExecuteAsync(() => runner.RunAsync(options)), (ExtractOptions options) => ExecuteAsync(() => runner.RunAsync(options)),
(LoadOptions options) => ExecuteAsync(() => runner.RunAsync(options)), (LoadOptions options) => ExecuteAsync(() => runner.RunAsync(options)),
(ImportOptions options) => ExecuteAsync(() => runner.RunAsync(options)), (ImportOptions options) => ExecuteAsync(() => runner.RunAsync(options)),
(ReimportImagesOptions options) => ExecuteAsync(() => runner.RunAsync(options)),
_ => Task.FromResult(1)); _ => Task.FromResult(1));
return exitCode; return exitCode;

View File

@@ -0,0 +1,13 @@
using CommandLine;
namespace RolemasterDb.ImportTool;
/// <summary>
/// Command-line options for the <c>reimport-images</c> verb.
/// </summary>
[Verb("reimport-images", HelpText = "Regenerate critical table page and cell images and refresh only image metadata in SQLite.")]
public sealed class ReimportImagesOptions
{
/// <summary>
/// Required positional argument (index 0): the manifest slug of the critical table to refresh.
/// Initialised to an empty string.
/// </summary>
[Value(0, MetaName = "table", Required = true, HelpText = "The manifest slug of the critical table to refresh.")]
public string Table { get; set; } = string.Empty;
/// <summary>
/// Optional SQLite database path, supplied with <c>-d</c> or <c>--db</c>.
/// No default value is assigned here.
/// </summary>
[Option('d', "db", HelpText = "Optional SQLite database path.")]
public string? DatabasePath { get; set; }
}