Document and harden curated critical imports
This commit is contained in:
@@ -33,6 +33,7 @@ The current implementation supports:
|
||||
- `variant_column` critical tables with non-severity columns
|
||||
- `grouped_variant` critical tables with a group axis plus variant columns
|
||||
- XML-based extraction using `pdftohtml -xml`
|
||||
- XML-aligned page rendering and per-cell PNG crops using `pdftoppm -png -r 108`
|
||||
- geometry-based parsing across the currently enabled table set:
|
||||
- `arcane-aether`
|
||||
- `arcane-nether`
|
||||
@@ -60,6 +61,11 @@ The current implementation supports:
|
||||
- conditional branch extraction into `critical_branch`
|
||||
- footer/page-number filtering during body parsing
|
||||
- transactional loading into SQLite
|
||||
- importer-managed source provenance for each parsed result:
|
||||
- source page number
|
||||
- source crop bounds
|
||||
- deterministic crop-image path
|
||||
- non-destructive merge loading that preserves curated rows
|
||||
- conditional branch display through the web critical lookup
|
||||
|
||||
The current implementation does not yet support:
|
||||
@@ -75,8 +81,9 @@ The importer workflow is:
|
||||
2. Extract the source PDF into an artifact format.
|
||||
3. Parse the extracted artifact into an in-memory table model.
|
||||
4. Write debug artifacts to disk.
|
||||
5. Validate the parsed result.
|
||||
6. If validation succeeds, load the parsed data into SQLite in a transaction.
|
||||
5. Render page and cell reference PNGs.
|
||||
6. Validate the parsed result.
|
||||
7. If validation succeeds, merge the parsed data into SQLite in a transaction.
|
||||
|
||||
The importer uses the same EF Core context and domain model as the web app, but it owns the critical-data population flow.
|
||||
|
||||
@@ -413,6 +420,36 @@ Use this when:
|
||||
- validating a specific row and column
|
||||
- checking whether a fragment was assigned to the correct cell
|
||||
- confirming description and affix splitting
|
||||
- confirming page and crop provenance for a specific result
|
||||
|
||||
Each parsed cell now includes:
|
||||
|
||||
- `sourceBounds`
|
||||
- XML-aligned page number and bounding rectangle for the final repaired cell content
|
||||
- `sourceImagePath`
|
||||
- importer-managed relative PNG path when image generation succeeded
|
||||
- `sourceImageCrop`
|
||||
- the final crop rectangle written to disk
|
||||
|
||||
### `pages/page-001.png`
|
||||
|
||||
Rendered PDF page images at `108 DPI`, which matches the coordinate space emitted by `pdftohtml -xml`.
|
||||
|
||||
Use this when:
|
||||
|
||||
- visually checking page-level alignment
|
||||
- comparing XML coordinates against the rendered source page
|
||||
- confirming crop placement without re-running the importer
|
||||
|
||||
### `cells/<group>__<column>__<roll-band>.png`
|
||||
|
||||
One deterministic PNG crop per parsed critical result.
|
||||
|
||||
Use this when:
|
||||
|
||||
- curating a result in the web editor
|
||||
- verifying the importer matched the intended source cell
|
||||
- debugging crop padding or page-boundary issues
|
||||
|
||||
### `validation-report.json`
|
||||
|
||||
@@ -547,17 +584,33 @@ The current load path:
|
||||
|
||||
1. ensures the SQLite database exists
|
||||
2. upgrades older SQLite files to the current importer-owned critical schema where needed
|
||||
3. deletes the existing subtree for the targeted critical table
|
||||
4. inserts:
|
||||
- `critical_table`
|
||||
- `critical_column`
|
||||
- `critical_roll_band`
|
||||
- `critical_result`
|
||||
- `critical_branch`
|
||||
- `critical_effect`
|
||||
5. commits only after the full table is saved
|
||||
3. reconciles the targeted table, axes, and existing results by logical identity
|
||||
4. inserts newly discovered rows
|
||||
5. updates uncurated rows in place
|
||||
6. preserves curated rows and their edited child rows
|
||||
7. refreshes importer-managed source provenance and crop-image metadata
|
||||
8. deletes unmatched rows only when they are still uncurated
|
||||
9. commits only after the full merge is saved
|
||||
|
||||
This means importer iterations can target one table without resetting unrelated database content.
|
||||
Result identity is keyed by:
|
||||
|
||||
- table slug
|
||||
- optional group key
|
||||
- column key
|
||||
- roll-band label
|
||||
|
||||
This means importer iterations can target one table without resetting unrelated database content, while still protecting manually curated rows from later parser changes.
|
||||
|
||||
## Image Toolchain
|
||||
|
||||
The importer now uses two Poppler tools:
|
||||
|
||||
- `pdftohtml -xml -i -noframes`
|
||||
- extracts geometry-aware XML text
|
||||
- `pdftoppm -png -r 108`
|
||||
- renders page PNGs and per-cell crop PNGs
|
||||
|
||||
The `108 DPI` render setting is deliberate: for the current PDFs and Poppler output, it produces page images whose pixel dimensions match the `width` and `height` attributes of each XML `<page>` element, so crop coordinates can be applied directly without an extra scale-conversion step.
|
||||
|
||||
## Interaction With Web App Startup
|
||||
|
||||
|
||||
@@ -94,6 +94,11 @@ public static class RolemasterDbSchemaUpgrader
|
||||
|
||||
private static async Task EnsureCriticalResultCurationColumnsAsync(RolemasterDbContext dbContext, CancellationToken cancellationToken)
|
||||
{
|
||||
if (!await TableExistsAsync(dbContext, "CriticalResults", cancellationToken))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (!await ColumnExistsAsync(dbContext, "CriticalResults", "IsCurated", cancellationToken))
|
||||
{
|
||||
await dbContext.Database.ExecuteSqlRawAsync(
|
||||
@@ -195,4 +200,38 @@ public static class RolemasterDbSchemaUpgrader
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Checks whether a table with the given name is listed in <c>sqlite_master</c>.
/// </summary>
/// <param name="dbContext">Context whose underlying database connection is used for the lookup.</param>
/// <param name="tableName">Table name bound to the <c>$tableName</c> query parameter.</param>
/// <param name="cancellationToken">Token forwarded to the connection-open and query-execute calls.</param>
/// <returns><c>true</c> when the count query reports at least one matching table; otherwise <c>false</c>.</returns>
private static async Task<bool> TableExistsAsync(
    RolemasterDbContext dbContext,
    string tableName,
    CancellationToken cancellationToken)
{
    var dbConnection = dbContext.Database.GetDbConnection();

    // Track whether this method opened the connection so that it is only
    // closed again when it was not already open on entry.
    var openedHere = dbConnection.State != System.Data.ConnectionState.Open;
    if (openedHere)
    {
        await dbConnection.OpenAsync(cancellationToken);
    }

    try
    {
        await using (var lookup = dbConnection.CreateCommand())
        {
            lookup.CommandText = "SELECT COUNT(*) FROM sqlite_master WHERE type = 'table' AND name = $tableName;";

            // The table name is passed as a bound parameter rather than concatenated into the SQL text.
            var nameParameter = lookup.CreateParameter();
            nameParameter.ParameterName = "$tableName";
            nameParameter.Value = tableName;
            lookup.Parameters.Add(nameParameter);

            var matchCount = Convert.ToInt32(await lookup.ExecuteScalarAsync(cancellationToken));
            return matchCount > 0;
        }
    }
    finally
    {
        if (openedHere)
        {
            await dbConnection.CloseAsync();
        }
    }
}
|
||||
}
|
||||
|
||||
@@ -691,9 +691,16 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
||||
{
|
||||
var databasePath = Path.Combine(GetArtifactCacheRoot(), $"rolemaster-{Guid.NewGuid():N}.db");
|
||||
File.Copy(Path.Combine(GetRepositoryRoot(), "src", "RolemasterDb.App", "rolemaster.db"), databasePath, true);
|
||||
UpgradeDatabase(databasePath).GetAwaiter().GetResult();
|
||||
return databasePath;
|
||||
}
|
||||
|
||||
/// <summary>
/// Creates a context for the database file at <paramref name="databasePath"/> via
/// <c>CreateDbContext</c> and passes it to <c>RolemasterDbSchemaUpgrader.EnsureLatestAsync</c>.
/// The context is disposed asynchronously once the upgrade call has completed.
/// </summary>
/// <param name="databasePath">Path of the database file handed to <c>CreateDbContext</c>.</param>
private static async Task UpgradeDatabase(string databasePath)
{
    await using (var context = CreateDbContext(databasePath))
    {
        await RolemasterDbSchemaUpgrader.EnsureLatestAsync(context);
    }
}
|
||||
|
||||
private static string GetRepositoryRoot()
|
||||
{
|
||||
var probe = new DirectoryInfo(AppContext.BaseDirectory);
|
||||
|
||||
Reference in New Issue
Block a user