Document and harden curated critical imports
This commit is contained in:
@@ -33,6 +33,7 @@ The current implementation supports:
|
|||||||
- `variant_column` critical tables with non-severity columns
|
- `variant_column` critical tables with non-severity columns
|
||||||
- `grouped_variant` critical tables with a group axis plus variant columns
|
- `grouped_variant` critical tables with a group axis plus variant columns
|
||||||
- XML-based extraction using `pdftohtml -xml`
|
- XML-based extraction using `pdftohtml -xml`
|
||||||
|
- XML-aligned page rendering and per-cell PNG crops using `pdftoppm -png -r 108`
|
||||||
- geometry-based parsing across the currently enabled table set:
|
- geometry-based parsing across the currently enabled table set:
|
||||||
- `arcane-aether`
|
- `arcane-aether`
|
||||||
- `arcane-nether`
|
- `arcane-nether`
|
||||||
@@ -60,6 +61,11 @@ The current implementation supports:
|
|||||||
- conditional branch extraction into `critical_branch`
|
- conditional branch extraction into `critical_branch`
|
||||||
- footer/page-number filtering during body parsing
|
- footer/page-number filtering during body parsing
|
||||||
- transactional loading into SQLite
|
- transactional loading into SQLite
|
||||||
|
- importer-managed source provenance for each parsed result:
|
||||||
|
- source page number
|
||||||
|
- source crop bounds
|
||||||
|
- deterministic crop-image path
|
||||||
|
- non-destructive merge loading that preserves curated rows
|
||||||
- conditional branch display through the web critical lookup
|
- conditional branch display through the web critical lookup
|
||||||
|
|
||||||
The current implementation does not yet support:
|
The current implementation does not yet support:
|
||||||
@@ -75,8 +81,9 @@ The importer workflow is:
|
|||||||
2. Extract the source PDF into an artifact format.
|
2. Extract the source PDF into an artifact format.
|
||||||
3. Parse the extracted artifact into an in-memory table model.
|
3. Parse the extracted artifact into an in-memory table model.
|
||||||
4. Write debug artifacts to disk.
|
4. Write debug artifacts to disk.
|
||||||
5. Validate the parsed result.
|
5. Render page and cell reference PNGs.
|
||||||
6. If validation succeeds, load the parsed data into SQLite in a transaction.
|
6. Validate the parsed result.
|
||||||
|
7. If validation succeeds, merge the parsed data into SQLite in a transaction.
|
||||||
|
|
||||||
The importer uses the same EF Core context and domain model as the web app, but it owns the critical-data population flow.
|
The importer uses the same EF Core context and domain model as the web app, but it owns the critical-data population flow.
|
||||||
|
|
||||||
@@ -413,6 +420,36 @@ Use this when:
|
|||||||
- validating a specific row and column
|
- validating a specific row and column
|
||||||
- checking whether a fragment was assigned to the correct cell
|
- checking whether a fragment was assigned to the correct cell
|
||||||
- confirming description and affix splitting
|
- confirming description and affix splitting
|
||||||
|
- confirming page and crop provenance for a specific result
|
||||||
|
|
||||||
|
Each parsed cell now includes:
|
||||||
|
|
||||||
|
- `sourceBounds`
|
||||||
|
- source page number and XML-aligned bounding rectangle for the final repaired cell content
|
||||||
|
- `sourceImagePath`
|
||||||
|
- importer-managed relative PNG path when image generation succeeded
|
||||||
|
- `sourceImageCrop`
|
||||||
|
- the final crop rectangle written to disk
|
||||||
|
|
||||||
|
### `pages/page-001.png`
|
||||||
|
|
||||||
|
Rendered PDF page images at `108 DPI`, which matches the coordinate space emitted by `pdftohtml -xml`.
|
||||||
|
|
||||||
|
Use this when:
|
||||||
|
|
||||||
|
- visually checking page-level alignment
|
||||||
|
- comparing XML coordinates against the rendered source page
|
||||||
|
- confirming crop placement without re-running the importer
|
||||||
|
|
||||||
|
### `cells/<group>__<column>__<roll-band>.png`
|
||||||
|
|
||||||
|
One deterministic PNG crop per parsed critical result.
|
||||||
|
|
||||||
|
Use this when:
|
||||||
|
|
||||||
|
- curating a result in the web editor
|
||||||
|
- verifying the importer matched the intended source cell
|
||||||
|
- debugging crop padding or page-boundary issues
|
||||||
|
|
||||||
### `validation-report.json`
|
### `validation-report.json`
|
||||||
|
|
||||||
@@ -547,17 +584,33 @@ The current load path:
|
|||||||
|
|
||||||
1. ensures the SQLite database exists
|
1. ensures the SQLite database exists
|
||||||
2. upgrades older SQLite files to the current importer-owned critical schema where needed
|
2. upgrades older SQLite files to the current importer-owned critical schema where needed
|
||||||
3. deletes the existing subtree for the targeted critical table
|
3. reconciles the targeted table, axes, and existing results by logical identity
|
||||||
4. inserts:
|
4. inserts newly discovered rows
|
||||||
- `critical_table`
|
5. updates uncurated rows in place
|
||||||
- `critical_column`
|
6. preserves curated rows and their edited child rows
|
||||||
- `critical_roll_band`
|
7. refreshes importer-managed source provenance and crop-image metadata
|
||||||
- `critical_result`
|
8. deletes unmatched rows only when they are still uncurated
|
||||||
- `critical_branch`
|
9. commits only after the full merge is saved
|
||||||
- `critical_effect`
|
|
||||||
5. commits only after the full table is saved
|
|
||||||
|
|
||||||
This means importer iterations can target one table without resetting unrelated database content.
|
Result identity is keyed by:
|
||||||
|
|
||||||
|
- table slug
|
||||||
|
- optional group key
|
||||||
|
- column key
|
||||||
|
- roll-band label
|
||||||
|
|
||||||
|
This means importer iterations can target one table without resetting unrelated database content, while still protecting manually curated rows from later parser changes.
|
||||||
|
|
||||||
|
## Image Toolchain
|
||||||
|
|
||||||
|
The importer now uses two Poppler tools:
|
||||||
|
|
||||||
|
- `pdftohtml -xml -i -noframes`
|
||||||
|
- extracts geometry-aware XML text
|
||||||
|
- `pdftoppm -png -r 108`
|
||||||
|
- renders page PNGs and per-cell crop PNGs
|
||||||
|
|
||||||
|
The `108 DPI` render setting is deliberate: for the current PDFs and Poppler output, it produces page images whose pixel dimensions match the XML `page width` and `page height`, so crop coordinates can be applied directly without an extra scale-conversion step.
|
||||||
|
|
||||||
## Interaction With Web App Startup
|
## Interaction With Web App Startup
|
||||||
|
|
||||||
|
|||||||
@@ -94,6 +94,11 @@ public static class RolemasterDbSchemaUpgrader
|
|||||||
|
|
||||||
private static async Task EnsureCriticalResultCurationColumnsAsync(RolemasterDbContext dbContext, CancellationToken cancellationToken)
|
private static async Task EnsureCriticalResultCurationColumnsAsync(RolemasterDbContext dbContext, CancellationToken cancellationToken)
|
||||||
{
|
{
|
||||||
|
if (!await TableExistsAsync(dbContext, "CriticalResults", cancellationToken))
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (!await ColumnExistsAsync(dbContext, "CriticalResults", "IsCurated", cancellationToken))
|
if (!await ColumnExistsAsync(dbContext, "CriticalResults", "IsCurated", cancellationToken))
|
||||||
{
|
{
|
||||||
await dbContext.Database.ExecuteSqlRawAsync(
|
await dbContext.Database.ExecuteSqlRawAsync(
|
||||||
@@ -195,4 +200,38 @@ public static class RolemasterDbSchemaUpgrader
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Determines whether a table with the given name is listed in <c>sqlite_master</c>.
/// </summary>
/// <param name="dbContext">Context whose underlying database connection is queried.</param>
/// <param name="tableName">Name of the table to look for; bound as a command parameter.</param>
/// <param name="cancellationToken">Token passed to the open and query calls.</param>
/// <returns><c>true</c> when the count of matching <c>sqlite_master</c> rows is greater than zero.</returns>
private static async Task<bool> TableExistsAsync(
    RolemasterDbContext dbContext,
    string tableName,
    CancellationToken cancellationToken)
{
    var connection = dbContext.Database.GetDbConnection();

    // Remember whether this method had to open the connection, so that a
    // connection the caller already opened is left open on the way out.
    var openedHere = connection.State != System.Data.ConnectionState.Open;
    if (openedHere)
    {
        await connection.OpenAsync(cancellationToken);
    }

    try
    {
        await using var command = connection.CreateCommand();
        command.CommandText = "SELECT COUNT(*) FROM sqlite_master WHERE type = 'table' AND name = $tableName;";

        var nameParameter = command.CreateParameter();
        nameParameter.ParameterName = "$tableName";
        nameParameter.Value = tableName;
        command.Parameters.Add(nameParameter);

        var matchCount = Convert.ToInt32(await command.ExecuteScalarAsync(cancellationToken));
        return matchCount > 0;
    }
    finally
    {
        if (openedHere)
        {
            await connection.CloseAsync();
        }
    }
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -691,9 +691,16 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
{
|
{
|
||||||
var databasePath = Path.Combine(GetArtifactCacheRoot(), $"rolemaster-{Guid.NewGuid():N}.db");
|
var databasePath = Path.Combine(GetArtifactCacheRoot(), $"rolemaster-{Guid.NewGuid():N}.db");
|
||||||
File.Copy(Path.Combine(GetRepositoryRoot(), "src", "RolemasterDb.App", "rolemaster.db"), databasePath, true);
|
File.Copy(Path.Combine(GetRepositoryRoot(), "src", "RolemasterDb.App", "rolemaster.db"), databasePath, true);
|
||||||
|
UpgradeDatabase(databasePath).GetAwaiter().GetResult();
|
||||||
return databasePath;
|
return databasePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Opens a context on the database file at <paramref name="databasePath"/> and runs
/// <c>RolemasterDbSchemaUpgrader.EnsureLatestAsync</c> against it.
/// </summary>
/// <param name="databasePath">Path of the database file handed to <c>CreateDbContext</c>.</param>
private static async Task UpgradeDatabase(string databasePath)
{
    // The context is disposed asynchronously once the upgrade call has completed.
    await using (var upgradeContext = CreateDbContext(databasePath))
    {
        await RolemasterDbSchemaUpgrader.EnsureLatestAsync(upgradeContext);
    }
}
|
||||||
|
|
||||||
private static string GetRepositoryRoot()
|
private static string GetRepositoryRoot()
|
||||||
{
|
{
|
||||||
var probe = new DirectoryInfo(AppContext.BaseDirectory);
|
var probe = new DirectoryInfo(AppContext.BaseDirectory);
|
||||||
|
|||||||
Reference in New Issue
Block a user