diff --git a/docs/critical_import_tool.md b/docs/critical_import_tool.md index eb8b74d..85020ce 100644 --- a/docs/critical_import_tool.md +++ b/docs/critical_import_tool.md @@ -33,6 +33,7 @@ The current implementation supports: - `variant_column` critical tables with non-severity columns - `grouped_variant` critical tables with a group axis plus variant columns - XML-based extraction using `pdftohtml -xml` +- XML-aligned page rendering and per-cell PNG crops using `pdftoppm -png -r 108` - geometry-based parsing across the currently enabled table set: - `arcane-aether` - `arcane-nether` @@ -60,6 +61,11 @@ The current implementation supports: - conditional branch extraction into `critical_branch` - footer/page-number filtering during body parsing - transactional loading into SQLite +- importer-managed source provenance for each parsed result: + - source page number + - source crop bounds + - deterministic crop-image path +- non-destructive merge loading that preserves curated rows - conditional branch display through the web critical lookup The current implementation does not yet support: @@ -75,8 +81,9 @@ The importer workflow is: 2. Extract the source PDF into an artifact format. 3. Parse the extracted artifact into an in-memory table model. 4. Write debug artifacts to disk. -5. Validate the parsed result. -6. If validation succeeds, load the parsed data into SQLite in a transaction. +5. Render page and cell reference PNGs. +6. Validate the parsed result. +7. If validation succeeds, merge the parsed data into SQLite in a transaction. The importer uses the same EF Core context and domain model as the web app, but it owns the critical-data population flow. 
@@ -413,6 +420,36 @@ Use this when: - validating a specific row and column - checking whether a fragment was assigned to the correct cell - confirming description and affix splitting +- confirming page and crop provenance for a specific result + +Each parsed cell now includes: + +- `sourceBounds` + - XML-aligned page number and bounding rectangle for the final repaired cell content +- `sourceImagePath` + - importer-managed relative PNG path when image generation succeeded +- `sourceImageCrop` + - the final crop rectangle written to disk + +### `pages/page-001.png` + +Rendered PDF page images at `108 DPI`, which matches the coordinate space emitted by `pdftohtml -xml`. + +Use this when: + +- visually checking page-level alignment +- comparing XML coordinates against the rendered source page +- confirming crop placement without re-running the importer + +### `cells/____.png` + +One deterministic PNG crop per parsed critical result. + +Use this when: + +- curating a result in the web editor +- verifying the importer matched the intended source cell +- debugging crop padding or page-boundary issues ### `validation-report.json` @@ -547,17 +584,33 @@ The current load path: 1. ensures the SQLite database exists 2. upgrades older SQLite files to the current importer-owned critical schema where needed -3. deletes the existing subtree for the targeted critical table -4. inserts: - - `critical_table` - - `critical_column` - - `critical_roll_band` - - `critical_result` - - `critical_branch` - - `critical_effect` -5. commits only after the full table is saved +3. reconciles the targeted table, axes, and existing results by logical identity +4. inserts newly discovered rows +5. updates uncurated rows in place +6. preserves curated rows and their edited child rows +7. refreshes importer-managed source provenance and crop-image metadata +8. deletes unmatched rows only when they are still uncurated +9. 
commits only after the full merge is saved -This means importer iterations can target one table without resetting unrelated database content. +Result identity is keyed by: + +- table slug +- optional group key +- column key +- roll-band label + +This means importer iterations can target one table without resetting unrelated database content, while still protecting manually curated rows from later parser changes. + +## Image Toolchain + +The importer now uses two Poppler tools: + +- `pdftohtml -xml -i -noframes` + - extracts geometry-aware XML text +- `pdftoppm -png -r 108` + - renders page PNGs and per-cell crop PNGs + +The `108 DPI` render setting is deliberate: for the current PDFs and Poppler output, it produces page images whose pixel dimensions match the XML `page width` and `page height`, so crop coordinates can be applied directly without an extra scale-conversion step. ## Interaction With Web App Startup diff --git a/src/RolemasterDb.App/Data/RolemasterDbSchemaUpgrader.cs b/src/RolemasterDb.App/Data/RolemasterDbSchemaUpgrader.cs index 6dc400d..bbd29f7 100644 --- a/src/RolemasterDb.App/Data/RolemasterDbSchemaUpgrader.cs +++ b/src/RolemasterDb.App/Data/RolemasterDbSchemaUpgrader.cs @@ -94,6 +94,11 @@ public static class RolemasterDbSchemaUpgrader private static async Task EnsureCriticalResultCurationColumnsAsync(RolemasterDbContext dbContext, CancellationToken cancellationToken) { + /* Skip the column checks below when the CriticalResults table itself is absent. */ if (!await TableExistsAsync(dbContext, "CriticalResults", cancellationToken)) + { + return; + } + if (!await ColumnExistsAsync(dbContext, "CriticalResults", "IsCurated", cancellationToken)) { await dbContext.Database.ExecuteSqlRawAsync( @@ -195,4 +200,38 @@ public static class RolemasterDbSchemaUpgrader } } } + + /* Returns true when sqlite_master lists a table named tableName. The name is bound as the $tableName parameter rather than concatenated into the SQL. The connection is opened only if it is not already open, and closed again only in that case, so a connection the caller already opened is left open. The return type is Task<bool> because the body returns a bool and the caller negates the awaited result. */ private static async Task<bool> TableExistsAsync( + RolemasterDbContext dbContext, + string tableName, + CancellationToken cancellationToken) + { + var connection = dbContext.Database.GetDbConnection(); + var shouldClose = connection.State != System.Data.ConnectionState.Open; +
if (shouldClose) + { + await connection.OpenAsync(cancellationToken); + } + + try + { + await using var command = connection.CreateCommand(); + command.CommandText = "SELECT COUNT(*) FROM sqlite_master WHERE type = 'table' AND name = $tableName;"; + + var parameter = command.CreateParameter(); + parameter.ParameterName = "$tableName"; + parameter.Value = tableName; + command.Parameters.Add(parameter); + + var result = await command.ExecuteScalarAsync(cancellationToken); + return Convert.ToInt32(result) > 0; + } + finally + { + if (shouldClose) + { + await connection.CloseAsync(); + } + } + } } diff --git a/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs b/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs index d8c29dc..a869205 100644 --- a/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs +++ b/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs @@ -691,9 +691,16 @@ public sealed class StandardCriticalTableParserIntegrationTests { var databasePath = Path.Combine(GetArtifactCacheRoot(), $"rolemaster-{Guid.NewGuid():N}.db"); File.Copy(Path.Combine(GetRepositoryRoot(), "src", "RolemasterDb.App", "rolemaster.db"), databasePath, true); + UpgradeDatabase(databasePath).GetAwaiter().GetResult(); return databasePath; } + /* Creates a context for the database file at databasePath via CreateDbContext, awaits RolemasterDbSchemaUpgrader.EnsureLatestAsync on it, and disposes the context asynchronously when done. */ private static async Task UpgradeDatabase(string databasePath) + { + await using var dbContext = CreateDbContext(databasePath); + await RolemasterDbSchemaUpgrader.EnsureLatestAsync(dbContext); + } + private static string GetRepositoryRoot() { var probe = new DirectoryInfo(AppContext.BaseDirectory);