Add high-res critical image refresh import
This commit is contained in:
@@ -33,7 +33,7 @@ The current implementation supports:
|
|||||||
- `variant_column` critical tables with non-severity columns
|
- `variant_column` critical tables with non-severity columns
|
||||||
- `grouped_variant` critical tables with a group axis plus variant columns
|
- `grouped_variant` critical tables with a group axis plus variant columns
|
||||||
- XML-based extraction using `pdftohtml -xml`
|
- XML-based extraction using `pdftohtml -xml`
|
||||||
- XML-aligned page rendering and per-cell PNG crops using `pdftoppm -png -r 108`
|
- XML-aligned page rendering and per-cell PNG crops using `pdftoppm -png -r 432`
|
||||||
- geometry-based parsing across the currently enabled table set:
|
- geometry-based parsing across the currently enabled table set:
|
||||||
- `arcane-aether`
|
- `arcane-aether`
|
||||||
- `arcane-nether`
|
- `arcane-nether`
|
||||||
@@ -359,6 +359,22 @@ Example:
|
|||||||
dotnet run --project .\src\RolemasterDb.ImportTool\RolemasterDb.ImportTool.csproj -- import slash
|
dotnet run --project .\src\RolemasterDb.ImportTool\RolemasterDb.ImportTool.csproj -- import slash
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### `reimport-images <table>`
|
||||||
|
|
||||||
|
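Reuses the existing `source.xml` (extracting it from the PDF first if it is missing), regenerates page PNGs and cell PNGs, rewrites the JSON artifacts, and refreshes only source-image metadata in SQLite. The table must already exist in the target database; the command fails otherwise.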
|
||||||
|
|
||||||
|
Use this when:
|
||||||
|
|
||||||
|
- crop resolution or render settings changed
|
||||||
|
- you want better source images without reloading result text
|
||||||
|
- you want to keep curated and uncurated content untouched while refreshing artifacts
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
dotnet run --project .\src\RolemasterDb.ImportTool\RolemasterDb.ImportTool.csproj -- reimport-images slash
|
||||||
|
```
|
||||||
|
|
||||||
## Manifest
|
## Manifest
|
||||||
|
|
||||||
The importer manifest is stored at:
|
The importer manifest is stored at:
|
||||||
@@ -433,7 +449,7 @@ Each parsed cell now includes:
|
|||||||
|
|
||||||
### `pages/page-001.png`
|
### `pages/page-001.png`
|
||||||
|
|
||||||
Rendered PDF page images at `108 DPI`, which matches the coordinate space emitted by `pdftohtml -xml`.
|
Rendered PDF page images at `432 DPI`, using a central render scale factor of `4` over the XML coordinate space emitted by `pdftohtml -xml`.
|
||||||
|
|
||||||
Use this when:
|
Use this when:
|
||||||
|
|
||||||
@@ -607,10 +623,14 @@ The importer now uses two Poppler tools:
|
|||||||
|
|
||||||
- `pdftohtml -xml -i -noframes`
|
- `pdftohtml -xml -i -noframes`
|
||||||
- extracts geometry-aware XML text
|
- extracts geometry-aware XML text
|
||||||
- `pdftoppm -png -r 108`
|
- `pdftoppm -png -r 432`
|
||||||
- renders page PNGs and per-cell crop PNGs
|
- renders page PNGs and per-cell crop PNGs
|
||||||
|
|
||||||
The `108 DPI` render setting is deliberate: for the current PDFs and Poppler output, it produces page images whose pixel dimensions match the XML `page width` and `page height`, so crop coordinates can be applied directly without an extra scale-conversion step.
|
The importer keeps a central render scale factor of `4`. The XML still defines bounds in its original coordinate space, but rendered PNGs and stored crop metadata now use the scaled coordinate space and a `432 DPI` render setting (the XML-aligned `108 DPI` multiplied by the scale factor of `4`). In practice:
|
||||||
|
|
||||||
|
- XML coordinates are multiplied by `4` before crop extraction
|
||||||
|
- page and crop metadata stored with each result reflect the scaled PNG coordinate space
|
||||||
|
- crop alignment remains deterministic without changing the parsing pipeline
|
||||||
|
|
||||||
## Interaction With Web App Startup
|
## Interaction With Web App Startup
|
||||||
|
|
||||||
|
|||||||
@@ -22,6 +22,10 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
|
|||||||
Assert.True(result.SourceBounds.Height > 0);
|
Assert.True(result.SourceBounds.Height > 0);
|
||||||
Assert.NotNull(result.SourceImagePath);
|
Assert.NotNull(result.SourceImagePath);
|
||||||
Assert.NotNull(result.SourceImageCrop);
|
Assert.NotNull(result.SourceImageCrop);
|
||||||
|
Assert.Equal(PdfXmlExtractor.RenderScaleFactor, result.SourceImageCrop!.ScaleFactor);
|
||||||
|
Assert.Equal(PdfXmlExtractor.ScaledRenderDpi, result.SourceImageCrop.RenderDpi);
|
||||||
|
Assert.Equal(result.SourceBounds.Width * PdfXmlExtractor.RenderScaleFactor, result.SourceImageCrop.BoundsWidth);
|
||||||
|
Assert.Equal(result.SourceBounds.Height * PdfXmlExtractor.RenderScaleFactor, result.SourceImageCrop.BoundsHeight);
|
||||||
Assert.Equal(result.SourceImagePath, cellArtifact.SourceImagePath);
|
Assert.Equal(result.SourceImagePath, cellArtifact.SourceImagePath);
|
||||||
Assert.NotNull(cellArtifact.SourceImageCrop);
|
Assert.NotNull(cellArtifact.SourceImageCrop);
|
||||||
Assert.True(File.Exists(artifactPaths.GetPageImagePath(result.SourceBounds.PageNumber)));
|
Assert.True(File.Exists(artifactPaths.GetPageImagePath(result.SourceBounds.PageNumber)));
|
||||||
|
|||||||
@@ -191,6 +191,48 @@ public sealed class CriticalImportMergeIntegrationTests
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Verifies that an image-only refresh repopulates the source-image provenance fields of a
/// curated result while leaving its curated text, affix, and parse status exactly as saved.
/// </summary>
[Fact]
public async Task Reimport_images_only_refreshes_provenance_without_touching_curated_content()
{
    const string rollBandLabel = "36-45";
    const string columnKey = "B";
    const string curatedRawText = "Curated raw text";
    const string curatedDescription = "Curated description";
    const string curatedAffix = "+12H";
    const string curatedStatus = "manually_curated";

    var (prepared, _) = await LoadPreparedSlashParseResultAsync();
    var sqlitePath = CreateEmptyDatabasePath();
    var importLoader = new CriticalImportLoader(sqlitePath);

    await importLoader.LoadAsync(prepared.Table);

    // Arrange: curate one result by hand and blank out its image provenance so the
    // refresh has something to restore.
    await using (var seedContext = CreateDbContext(sqlitePath))
    {
        var curated = await LoadResultAsync(seedContext, rollBandLabel, columnKey);

        curated.IsCurated = true;
        curated.RawCellText = curatedRawText;
        curated.DescriptionText = curatedDescription;
        curated.RawAffixText = curatedAffix;
        curated.ParseStatus = curatedStatus;
        curated.SourcePageNumber = null;
        curated.SourceImagePath = null;
        curated.SourceImageCropJson = null;

        await seedContext.SaveChangesAsync();
    }

    // Act: refresh image artifacts only.
    await importLoader.RefreshImageArtifactsAsync(prepared.Table);

    // Assert: curated content is unchanged; provenance fields are populated again.
    await using (var verifyContext = CreateDbContext(sqlitePath))
    {
        var refreshed = await LoadResultAsync(verifyContext, rollBandLabel, columnKey);

        Assert.True(refreshed.IsCurated);
        Assert.Equal(curatedRawText, refreshed.RawCellText);
        Assert.Equal(curatedDescription, refreshed.DescriptionText);
        Assert.Equal(curatedAffix, refreshed.RawAffixText);
        Assert.Equal(curatedStatus, refreshed.ParseStatus);
        Assert.NotNull(refreshed.SourcePageNumber);
        Assert.False(string.IsNullOrWhiteSpace(refreshed.SourceImagePath));
        Assert.False(string.IsNullOrWhiteSpace(refreshed.SourceImageCropJson));
    }
}
|
||||||
|
|
||||||
private static ParsedCriticalTable CreateTrimmedTable(
|
private static ParsedCriticalTable CreateTrimmedTable(
|
||||||
ParsedCriticalTable table,
|
ParsedCriticalTable table,
|
||||||
params (string RollBandLabel, string ColumnKey)[] excludedResults)
|
params (string RollBandLabel, string ColumnKey)[] excludedResults)
|
||||||
|
|||||||
@@ -100,6 +100,40 @@ public sealed class CriticalImportCommandRunner
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Handles the <c>reimport-images</c> verb: re-parses the table's XML, regenerates the source
/// image artifacts, rewrites the JSON artifacts, and refreshes the stored image artifacts of
/// the rows already present in the database.
/// </summary>
/// <param name="options">Command-line options naming the table and, optionally, the database path.</param>
/// <returns><c>0</c> on success.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when the parsed table fails validation. The artifacts (including the validation
/// report referenced by the message) are written before this is thrown.
/// </exception>
public async Task<int> RunAsync(ReimportImagesOptions options)
{
    var manifestEntry = GetManifestEntry(options.Table);
    var paths = CreateArtifactPaths(manifestEntry.Slug);

    // Reuse a previously extracted XML file when there is one; extract only if it is missing.
    var xmlAlreadyExtracted = File.Exists(paths.XmlPath);
    if (!xmlAlreadyExtracted)
    {
        await pdfXmlExtractor.ExtractAsync(ResolveRepositoryPath(manifestEntry.PdfPath), paths.XmlPath);
    }

    var parsed = Parse(manifestEntry, await File.ReadAllTextAsync(paths.XmlPath));

    await sourceImageArtifactGenerator.GenerateAsync(
        ResolveRepositoryPath(manifestEntry.PdfPath),
        paths,
        parsed,
        CancellationToken.None);
    await artifactWriter.WriteAsync(paths, parsed, CancellationToken.None);

    // Validation is checked only after the artifacts are on disk, so the report path in the
    // error message points at a file that has just been written.
    if (!parsed.ValidationReport.IsValid)
    {
        throw new InvalidOperationException(
            $"Validation failed for '{manifestEntry.Slug}'. See {paths.ValidationReportPath} for details.");
    }

    var databaseLoader = new CriticalImportLoader(ResolveDatabasePath(options.DatabasePath));
    var updatedResults = await databaseLoader.RefreshImageArtifactsAsync(parsed.Table);

    Console.WriteLine(
        $"Refreshed image artifacts for {manifestEntry.Slug}: {updatedResults} results updated.");

    return 0;
}
|
||||||
|
|
||||||
private CriticalImportManifestEntry GetManifestEntry(string tableSlug)
|
private CriticalImportManifestEntry GetManifestEntry(string tableSlug)
|
||||||
{
|
{
|
||||||
var manifest = manifestLoader.Load(RepositoryPaths.Discover().ManifestPath);
|
var manifest = manifestLoader.Load(RepositoryPaths.Discover().ManifestPath);
|
||||||
|
|||||||
@@ -128,6 +128,50 @@ public sealed class CriticalImportLoader(string databasePath)
|
|||||||
return new ImportCommandResult(entity.Slug, entity.Columns.Count, entity.RollBands.Count, entity.Results.Count);
|
return new ImportCommandResult(entity.Slug, entity.Columns.Count, entity.RollBands.Count, entity.Results.Count);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Re-applies importer provenance from a freshly parsed table to the results already stored
/// for that table, matching rows by group key, column key, and roll-band label.
/// </summary>
/// <param name="table">The parsed table whose results carry the refreshed provenance.</param>
/// <param name="cancellationToken">Token forwarded to every database operation.</param>
/// <returns>
/// The number of parsed results that matched a stored result and had provenance applied.
/// Parsed results with no stored counterpart are skipped and not counted.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="table"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// Thrown when no critical table with the parsed table's slug exists in the target database.
/// </exception>
public async Task<int> RefreshImageArtifactsAsync(ParsedCriticalTable table, CancellationToken cancellationToken = default)
{
    // Fail fast on a missing table, before the database is created or upgraded as a side effect.
    ArgumentNullException.ThrowIfNull(table);

    await using var dbContext = CreateDbContext();
    await dbContext.Database.EnsureCreatedAsync(cancellationToken);
    await RolemasterDbSchemaUpgrader.EnsureLatestAsync(dbContext, cancellationToken);
    await using var transaction = await dbContext.Database.BeginTransactionAsync(cancellationToken);

    // Load the stored table with the navigation properties needed to rebuild each result's key.
    var entity = await dbContext.CriticalTables
        .AsSplitQuery()
        .Include(item => item.Results)
        .ThenInclude(result => result.CriticalGroup)
        .Include(item => item.Results)
        .ThenInclude(result => result.CriticalColumn)
        .Include(item => item.Results)
        .ThenInclude(result => result.CriticalRollBand)
        .SingleOrDefaultAsync(item => item.Slug == table.Slug, cancellationToken);

    if (entity is null)
    {
        throw new InvalidOperationException($"Critical table '{table.Slug}' does not exist in the target database.");
    }

    var existingResultsByKey = entity.Results.ToDictionary(
        item => CreateResultKey(item.CriticalGroup?.GroupKey, item.CriticalColumn.ColumnKey, item.CriticalRollBand.Label),
        StringComparer.Ordinal);

    var refreshedCount = 0;
    foreach (var item in table.Results)
    {
        var resultKey = CreateResultKey(item.GroupKey, item.ColumnKey, item.RollBandLabel);
        if (!existingResultsByKey.TryGetValue(resultKey, out var existingResult))
        {
            // No stored row for this parsed result: this method only updates rows, it never adds them.
            continue;
        }

        // NOTE(review): ApplyImporterProvenance is defined elsewhere; presumably it only touches
        // source/provenance fields and leaves curated content alone — confirm against its body.
        ApplyImporterProvenance(existingResult, item);
        refreshedCount++;
    }

    await dbContext.SaveChangesAsync(cancellationToken);
    await transaction.CommitAsync(cancellationToken);
    return refreshedCount;
}
|
||||||
|
|
||||||
private RolemasterDbContext CreateDbContext()
|
private RolemasterDbContext CreateDbContext()
|
||||||
{
|
{
|
||||||
var options = new DbContextOptionsBuilder<RolemasterDbContext>()
|
var options = new DbContextOptionsBuilder<RolemasterDbContext>()
|
||||||
|
|||||||
@@ -75,17 +75,18 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
|
|||||||
|
|
||||||
return new CriticalSourceImageCrop(
|
return new CriticalSourceImageCrop(
|
||||||
sourceBounds.PageNumber,
|
sourceBounds.PageNumber,
|
||||||
pageGeometry.Width,
|
PdfXmlExtractor.ScaleCoordinate(pageGeometry.Width),
|
||||||
pageGeometry.Height,
|
PdfXmlExtractor.ScaleCoordinate(pageGeometry.Height),
|
||||||
sourceBounds.Left,
|
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Left),
|
||||||
sourceBounds.Top,
|
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Top),
|
||||||
sourceBounds.Width,
|
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Width),
|
||||||
sourceBounds.Height,
|
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Height),
|
||||||
cropLeft,
|
PdfXmlExtractor.ScaleCoordinate(cropLeft),
|
||||||
cropTop,
|
PdfXmlExtractor.ScaleCoordinate(cropTop),
|
||||||
Math.Max(1, cropRight - cropLeft),
|
PdfXmlExtractor.ScaleCoordinate(Math.Max(1, cropRight - cropLeft)),
|
||||||
Math.Max(1, cropBottom - cropTop),
|
PdfXmlExtractor.ScaleCoordinate(Math.Max(1, cropBottom - cropTop)),
|
||||||
PdfXmlExtractor.XmlAlignedRenderDpi);
|
PdfXmlExtractor.ScaledRenderDpi,
|
||||||
|
PdfXmlExtractor.RenderScaleFactor);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static string CreateCellKey(string? groupKey, string rollBandLabel, string columnKey) =>
|
private static string CreateCellKey(string? groupKey, string rollBandLabel, string columnKey) =>
|
||||||
|
|||||||
@@ -12,7 +12,8 @@ public sealed class CriticalSourceImageCrop(
|
|||||||
int cropTop,
|
int cropTop,
|
||||||
int cropWidth,
|
int cropWidth,
|
||||||
int cropHeight,
|
int cropHeight,
|
||||||
int renderDpi)
|
int renderDpi,
|
||||||
|
int scaleFactor)
|
||||||
{
|
{
|
||||||
public int PageNumber { get; } = pageNumber;
|
public int PageNumber { get; } = pageNumber;
|
||||||
public int PageWidth { get; } = pageWidth;
|
public int PageWidth { get; } = pageWidth;
|
||||||
@@ -26,4 +27,5 @@ public sealed class CriticalSourceImageCrop(
|
|||||||
public int CropWidth { get; } = cropWidth;
|
public int CropWidth { get; } = cropWidth;
|
||||||
public int CropHeight { get; } = cropHeight;
|
public int CropHeight { get; } = cropHeight;
|
||||||
public int RenderDpi { get; } = renderDpi;
|
public int RenderDpi { get; } = renderDpi;
|
||||||
|
public int ScaleFactor { get; } = scaleFactor;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,7 +4,11 @@ namespace RolemasterDb.ImportTool;
|
|||||||
|
|
||||||
public sealed class PdfXmlExtractor
|
public sealed class PdfXmlExtractor
|
||||||
{
|
{
|
||||||
|
public const int RenderScaleFactor = 4;
|
||||||
public const int XmlAlignedRenderDpi = 108;
|
public const int XmlAlignedRenderDpi = 108;
|
||||||
|
public const int ScaledRenderDpi = XmlAlignedRenderDpi * RenderScaleFactor;
|
||||||
|
|
||||||
|
/// <summary>
/// Multiplies a coordinate by <see cref="RenderScaleFactor"/>.
/// </summary>
/// <param name="value">The unscaled coordinate or extent.</param>
/// <returns><paramref name="value"/> multiplied by <see cref="RenderScaleFactor"/>.</returns>
/// <exception cref="OverflowException">Thrown when the product does not fit in an <see cref="int"/>.</exception>
public static int ScaleCoordinate(int value)
{
    // Checked arithmetic: an overflowing coordinate should throw rather than wrap silently.
    checked
    {
        return value * RenderScaleFactor;
    }
}
|
||||||
|
|
||||||
public async Task ExtractAsync(string pdfPath, string outputPath, CancellationToken cancellationToken = default)
|
public async Task ExtractAsync(string pdfPath, string outputPath, CancellationToken cancellationToken = default)
|
||||||
{
|
{
|
||||||
@@ -77,7 +81,7 @@ public sealed class PdfXmlExtractor
|
|||||||
|
|
||||||
startInfo.ArgumentList.Add("-png");
|
startInfo.ArgumentList.Add("-png");
|
||||||
startInfo.ArgumentList.Add("-r");
|
startInfo.ArgumentList.Add("-r");
|
||||||
startInfo.ArgumentList.Add(XmlAlignedRenderDpi.ToString());
|
startInfo.ArgumentList.Add(ScaledRenderDpi.ToString());
|
||||||
startInfo.ArgumentList.Add("-f");
|
startInfo.ArgumentList.Add("-f");
|
||||||
startInfo.ArgumentList.Add(pageNumber.ToString());
|
startInfo.ArgumentList.Add(pageNumber.ToString());
|
||||||
startInfo.ArgumentList.Add("-l");
|
startInfo.ArgumentList.Add("-l");
|
||||||
|
|||||||
@@ -4,12 +4,13 @@ using RolemasterDb.ImportTool;
|
|||||||
|
|
||||||
var runner = new CriticalImportCommandRunner();
|
var runner = new CriticalImportCommandRunner();
|
||||||
|
|
||||||
var exitCode = await Parser.Default.ParseArguments<ResetOptions, ExtractOptions, LoadOptions, ImportOptions>(args)
|
var exitCode = await Parser.Default.ParseArguments<ResetOptions, ExtractOptions, LoadOptions, ImportOptions, ReimportImagesOptions>(args)
|
||||||
.MapResult(
|
.MapResult(
|
||||||
(ResetOptions options) => ExecuteAsync(() => runner.RunAsync(options)),
|
(ResetOptions options) => ExecuteAsync(() => runner.RunAsync(options)),
|
||||||
(ExtractOptions options) => ExecuteAsync(() => runner.RunAsync(options)),
|
(ExtractOptions options) => ExecuteAsync(() => runner.RunAsync(options)),
|
||||||
(LoadOptions options) => ExecuteAsync(() => runner.RunAsync(options)),
|
(LoadOptions options) => ExecuteAsync(() => runner.RunAsync(options)),
|
||||||
(ImportOptions options) => ExecuteAsync(() => runner.RunAsync(options)),
|
(ImportOptions options) => ExecuteAsync(() => runner.RunAsync(options)),
|
||||||
|
(ReimportImagesOptions options) => ExecuteAsync(() => runner.RunAsync(options)),
|
||||||
_ => Task.FromResult(1));
|
_ => Task.FromResult(1));
|
||||||
|
|
||||||
return exitCode;
|
return exitCode;
|
||||||
|
|||||||
13
src/RolemasterDb.ImportTool/ReimportImagesOptions.cs
Normal file
13
src/RolemasterDb.ImportTool/ReimportImagesOptions.cs
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
using CommandLine;
|
||||||
|
|
||||||
|
namespace RolemasterDb.ImportTool;
|
||||||
|
|
||||||
|
/// <summary>
/// Command-line options for the <c>reimport-images</c> verb.
/// </summary>
[Verb("reimport-images", HelpText = "Regenerate critical table page and cell images and refresh only image metadata in SQLite.")]
public sealed class ReimportImagesOptions
{
    /// <summary>
    /// Manifest slug of the critical table to refresh (first positional argument, required).
    /// </summary>
    [Value(0, Required = true, MetaName = "table", HelpText = "The manifest slug of the critical table to refresh.")]
    public string Table { get; set; } = "";

    /// <summary>
    /// Optional SQLite database path (<c>-d</c> / <c>--db</c>); null when not supplied.
    /// </summary>
    [Option('d', "db", HelpText = "Optional SQLite database path.")]
    public string? DatabasePath { get; set; }
}
|
||||||
Reference in New Issue
Block a user