Add OCR import support for void critical table

This commit is contained in:
2026-03-19 23:16:09 +01:00
parent b4c8f8c142
commit 7bb0c1b8d1
35 changed files with 4379 additions and 285 deletions

View File

@@ -167,6 +167,15 @@
"extractionMethod": "xml", "extractionMethod": "xml",
"pdfPath": "sources/Unbalance.pdf", "pdfPath": "sources/Unbalance.pdf",
"enabled": true "enabled": true
},
{
"slug": "void",
"displayName": "Void Critical Strike Table",
"family": "standard",
"extractionMethod": "ocr",
"axisTemplateSlug": "mana-standard-19",
"pdfPath": "sources/Void.pdf",
"enabled": true
} }
] ]
} }

Binary file not shown.

View File

@@ -478,12 +478,12 @@ public sealed class CriticalCellReparseIntegrationTests
initialResponse.Branches)); initialResponse.Branches));
Assert.NotNull(saveResponse); Assert.NotNull(saveResponse);
Assert.Contains(saveResponse!.Effects, effect => effect.EffectCode == AppCriticalEffectCodes.PowerPointModifier && effect.ValueExpression == "2d10-16"); Assert.Contains(saveResponse!.Effects, effect => effect.EffectCode == AppCriticalEffectCodes.PowerPointModifier && effect.ValueExpression == "+2d10-16");
var reopenedResponse = await lookupService.GetCriticalCellEditorAsync("mana", resultId); var reopenedResponse = await lookupService.GetCriticalCellEditorAsync("mana", resultId);
Assert.NotNull(reopenedResponse); Assert.NotNull(reopenedResponse);
Assert.Contains("-2d10-16pp", reopenedResponse!.QuickParseInput, StringComparison.Ordinal); Assert.Contains("-2d10-16pp", reopenedResponse!.QuickParseInput, StringComparison.Ordinal);
Assert.Contains(reopenedResponse.Effects, effect => effect.EffectCode == AppCriticalEffectCodes.PowerPointModifier && effect.ValueExpression == "2d10-16"); Assert.Contains(reopenedResponse.Effects, effect => effect.EffectCode == AppCriticalEffectCodes.PowerPointModifier && effect.ValueExpression == "+2d10-16");
var reparsed = await lookupService.ReparseCriticalCellAsync( var reparsed = await lookupService.ReparseCriticalCellAsync(
"mana", "mana",
@@ -643,20 +643,5 @@ public sealed class CriticalCellReparseIntegrationTests
await RolemasterDbSchemaUpgrader.EnsureLatestAsync(dbContext); await RolemasterDbSchemaUpgrader.EnsureLatestAsync(dbContext);
} }
private static string GetRepositoryRoot() private static string GetRepositoryRoot() => TestRepositoryPaths.GetRepositoryRoot();
{
var probe = new DirectoryInfo(AppContext.BaseDirectory);
while (probe is not null)
{
if (File.Exists(Path.Combine(probe.FullName, "RolemasterDB.slnx")))
{
return probe.FullName;
}
probe = probe.Parent;
}
throw new InvalidOperationException("Could not find the repository root for integration tests.");
}
} }

View File

@@ -6,6 +6,7 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
{ {
private static readonly PdfXmlExtractor Extractor = new(); private static readonly PdfXmlExtractor Extractor = new();
private static readonly StandardCriticalTableParser StandardParser = new(); private static readonly StandardCriticalTableParser StandardParser = new();
private static readonly StandardOcrBootstrapper StandardOcrBootstrapper = new();
[Fact] [Fact]
public async Task Generated_artifacts_include_page_and_cell_source_images() public async Task Generated_artifacts_include_page_and_cell_source_images()
@@ -32,6 +33,34 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
Assert.True(File.Exists(artifactPaths.ResolveRelativePath(result.SourceImagePath!))); Assert.True(File.Exists(artifactPaths.ResolveRelativePath(result.SourceImagePath!)));
} }
// Checks that image artifacts generated for the OCR-extracted "void" table keep crop
// metadata that lines up with the parsed source bounds and the fixture's page geometry.
[Fact]
public async Task Generated_ocr_artifacts_preserve_pixel_space_crop_metadata()
{
    const string rollBand = "96-99";
    const string column = "D";

    var (parseResult, artifactPaths) = await LoadPreparedVoidParseResultAsync();
    var result = FindResult(parseResult, rollBand, column);
    var cellArtifact = parseResult.Cells.Single(item =>
        item.GroupKey is null &&
        item.RollBandLabel == rollBand &&
        item.ColumnKey == column);

    // The parsed result must point at a real page region.
    var bounds = result.SourceBounds;
    Assert.True(bounds.PageNumber > 0);
    Assert.True(bounds.Width > 0);
    Assert.True(bounds.Height > 0);

    Assert.NotNull(result.SourceImagePath);
    Assert.NotNull(result.SourceImageCrop);

    // Scale factor 1 plus unchanged width/height means the crop was not rescaled.
    var crop = result.SourceImageCrop!;
    Assert.Equal(1, crop.ScaleFactor);
    Assert.Equal(PdfXmlExtractor.ScaledRenderDpi, crop.RenderDpi);
    Assert.Equal(3600, crop.PageWidth);
    Assert.Equal(5070, crop.PageHeight);
    Assert.Equal(bounds.Width, crop.BoundsWidth);
    Assert.Equal(bounds.Height, crop.BoundsHeight);

    // The cell artifact mirrors the result's image metadata.
    Assert.Equal(result.SourceImagePath, cellArtifact.SourceImagePath);
    Assert.NotNull(cellArtifact.SourceImageCrop);

    // Both the page render and the cell crop must exist on disk.
    var pageImagePath = artifactPaths.GetPageImagePath(bounds.PageNumber);
    var cellImagePath = artifactPaths.ResolveRelativePath(result.SourceImagePath!);
    Assert.True(File.Exists(pageImagePath));
    Assert.True(File.Exists(cellImagePath));
}
private static async Task<(CriticalTableParseResult ParseResult, ImportArtifactPaths ArtifactPaths)> LoadPreparedSlashParseResultAsync() private static async Task<(CriticalTableParseResult ParseResult, ImportArtifactPaths ArtifactPaths)> LoadPreparedSlashParseResultAsync()
{ {
var entry = LoadManifest().Tables.Single(item => item.Slug == "slash"); var entry = LoadManifest().Tables.Single(item => item.Slug == "slash");
@@ -51,6 +80,25 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
return (parseResult, artifactPaths); return (parseResult, artifactPaths);
} }
// Builds a parse result for the "void" table from the checked-in OCR TSV fixture and
// generates its page/cell image artifacts into a fresh, uniquely named artifact directory.
private static async Task<(CriticalTableParseResult ParseResult, ImportArtifactPaths ArtifactPaths)> LoadPreparedVoidParseResultAsync()
{
    var entry = LoadManifest().Tables.Single(item => item.Slug == "void");

    // The fixture replaces a live OCR run; the page geometry matches the fixture's single page.
    var fixtureTsv = await File.ReadAllTextAsync(GetVoidFixturePath());
    var source = new ExtractedCriticalSource(
        extractionMethod: "ocr",
        importNotes: "Imported from PDF OCR extraction.",
        renderProfile: SourceRenderProfile.OcrPixels(PdfXmlExtractor.ScaledRenderDpi),
        pageGeometries: [new ParsedPdfPageGeometry(1, 3600, 5070)],
        fragments: OcrCriticalSourceExtractor.ParseTsv(fixtureTsv));

    var axisTemplate = StandardTableAxisTemplateCatalog.Resolve(entry.AxisTemplateSlug);
    var layout = StandardOcrBootstrapper.Bootstrap(source, axisTemplate);
    var parseResult = StandardParser.Parse(entry, source, layout);

    // A GUID-named root keeps concurrent test runs from sharing artifacts.
    var uniqueRoot = Path.Combine(GetArtifactCacheRoot(), Guid.NewGuid().ToString("N"));
    var artifactPaths = ImportArtifactPaths.Create(uniqueRoot, entry.Slug);

    var pdfPath = Path.Combine(GetRepositoryRoot(), entry.PdfPath);
    var generator = new CriticalSourceImageArtifactGenerator(new PdfXmlExtractor());
    await generator.GenerateAsync(pdfPath, artifactPaths, parseResult);

    return (parseResult, artifactPaths);
}
private static ParsedCriticalResult FindResult(CriticalTableParseResult parseResult, string rollBandLabel, string columnKey) => private static ParsedCriticalResult FindResult(CriticalTableParseResult parseResult, string rollBandLabel, string columnKey) =>
parseResult.Table.Results.Single(item => parseResult.Table.Results.Single(item =>
item.GroupKey is null && item.GroupKey is null &&
@@ -60,6 +108,9 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
private static CriticalImportManifest LoadManifest() => private static CriticalImportManifest LoadManifest() =>
new CriticalImportManifestLoader().Load(Path.Combine(GetRepositoryRoot(), "sources", "critical-import-manifest.json")); new CriticalImportManifestLoader().Load(Path.Combine(GetRepositoryRoot(), "sources", "critical-import-manifest.json"));
// Absolute path of the checked-in OCR TSV fixture for the "void" table.
private static string GetVoidFixturePath()
{
    string[] segments =
    [
        GetRepositoryRoot(),
        "src",
        "RolemasterDb.ImportTool.Tests",
        "Fixtures",
        "Void",
        "source.ocr.tsv"
    ];

    return Path.Combine(segments);
}
private static string GetArtifactCacheRoot() private static string GetArtifactCacheRoot()
{ {
var cacheRoot = Path.Combine(Path.GetTempPath(), "RolemasterDb.ImportTool.MergeTests"); var cacheRoot = Path.Combine(Path.GetTempPath(), "RolemasterDb.ImportTool.MergeTests");
@@ -67,20 +118,5 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
return cacheRoot; return cacheRoot;
} }
private static string GetRepositoryRoot() private static string GetRepositoryRoot() => TestRepositoryPaths.GetRepositoryRoot();
{
var probe = new DirectoryInfo(AppContext.BaseDirectory);
while (probe is not null)
{
if (File.Exists(Path.Combine(probe.FullName, "RolemasterDB.slnx")))
{
return probe.FullName;
}
probe = probe.Parent;
}
throw new InvalidOperationException("Could not find the repository root for integration tests.");
}
} }

View File

@@ -315,20 +315,5 @@ public sealed class CriticalImportMergeIntegrationTests
return cacheRoot; return cacheRoot;
} }
private static string GetRepositoryRoot() private static string GetRepositoryRoot() => TestRepositoryPaths.GetRepositoryRoot();
{
var probe = new DirectoryInfo(AppContext.BaseDirectory);
while (probe is not null)
{
if (File.Exists(Path.Combine(probe.FullName, "RolemasterDB.slnx")))
{
return probe.FullName;
}
probe = probe.Parent;
}
throw new InvalidOperationException("Could not find the repository root for integration tests.");
}
} }

File diff suppressed because it is too large Load Diff

View File

@@ -33,11 +33,13 @@ public sealed class StandardCriticalTableParserIntegrationTests
"subdual", "subdual",
"super_large_creature_weapon", "super_large_creature_weapon",
"tiny", "tiny",
"unbalance" "unbalance",
"void"
]; ];
private static readonly PdfXmlExtractor Extractor = new(); private static readonly PdfXmlExtractor Extractor = new();
private static readonly StandardCriticalTableParser StandardParser = new(); private static readonly StandardCriticalTableParser StandardParser = new();
private static readonly StandardOcrBootstrapper StandardOcrBootstrapper = new();
private static readonly VariantColumnCriticalTableParser VariantColumnParser = new(); private static readonly VariantColumnCriticalTableParser VariantColumnParser = new();
private static readonly GroupedVariantCriticalTableParser GroupedVariantParser = new(); private static readonly GroupedVariantCriticalTableParser GroupedVariantParser = new();
@@ -57,6 +59,7 @@ public sealed class StandardCriticalTableParserIntegrationTests
yield return new object[] { "mana", null!, "96-99", "E", "momentarily transformed" }; yield return new object[] { "mana", null!, "96-99", "E", "momentarily transformed" };
yield return new object[] { "mana", null!, "100", "E", "Mana consumes everything" }; yield return new object[] { "mana", null!, "100", "E", "Mana consumes everything" };
yield return new object[] { "tiny", null!, "100", "E", "Vein and artery severed" }; yield return new object[] { "tiny", null!, "100", "E", "Vein and artery severed" };
yield return new object[] { "void", null!, "96-99", "D", "Foe inhales the void" };
yield return new object[] { "large_creature_weapon", null!, "01-05", "NORMAL", "Weapon shatters" }; yield return new object[] { "large_creature_weapon", null!, "01-05", "NORMAL", "Weapon shatters" };
yield return new object[] { "super_large_creature_weapon", null!, "31-40", "SLAYING", "Boom! Solid without question" }; yield return new object[] { "super_large_creature_weapon", null!, "31-40", "SLAYING", "Boom! Solid without question" };
yield return new object[] { "large_creature_magic", "large", "251+", "NORMAL", "Foe lowers his eyes within your reach" }; yield return new object[] { "large_creature_magic", "large", "251+", "NORMAL", "Foe lowers his eyes within your reach" };
@@ -75,13 +78,16 @@ public sealed class StandardCriticalTableParserIntegrationTests
Assert.Equal(ExpectedEnabledSlugs, enabledTables.Select(item => item.Slug)); Assert.Equal(ExpectedEnabledSlugs, enabledTables.Select(item => item.Slug));
Assert.All(enabledTables, entry => Assert.All(enabledTables, entry =>
{ {
Assert.Equal("xml", entry.ExtractionMethod); Assert.True(
new[] { "xml", "ocr" }.Contains(entry.ExtractionMethod, StringComparer.Ordinal),
$"Unexpected extraction method '{entry.ExtractionMethod}' for '{entry.Slug}'.");
Assert.True(File.Exists(Path.Combine(GetRepositoryRoot(), entry.PdfPath)), $"Missing source PDF for '{entry.Slug}'."); Assert.True(File.Exists(Path.Combine(GetRepositoryRoot(), entry.PdfPath)), $"Missing source PDF for '{entry.Slug}'.");
}); });
Assert.Equal("variant_column", enabledTables.Single(item => item.Slug == "large_creature_weapon").Family); Assert.Equal("variant_column", enabledTables.Single(item => item.Slug == "large_creature_weapon").Family);
Assert.Equal("variant_column", enabledTables.Single(item => item.Slug == "super_large_creature_weapon").Family); Assert.Equal("variant_column", enabledTables.Single(item => item.Slug == "super_large_creature_weapon").Family);
Assert.Equal("grouped_variant", enabledTables.Single(item => item.Slug == "large_creature_magic").Family); Assert.Equal("grouped_variant", enabledTables.Single(item => item.Slug == "large_creature_magic").Family);
Assert.Equal("ocr", enabledTables.Single(item => item.Slug == "void").ExtractionMethod);
} }
[Theory] [Theory]
@@ -604,6 +610,25 @@ public sealed class StandardCriticalTableParserIntegrationTests
Assert.StartsWith("Strike to foe's hip.", result.RawCellText, StringComparison.Ordinal); Assert.StartsWith("Strike to foe's hip.", result.RawCellText, StringComparison.Ordinal);
} }
// Parses the "void" table from its OCR fixture, loads it into a temporary database copy,
// and verifies how many critical results were persisted for that table.
[Fact]
public async Task Loader_persists_void_table_from_fixture()
{
    var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "void", StringComparison.Ordinal));
    var parseResult = await LoadParseResultAsync(entry);
    var databasePath = CreateTemporaryDatabaseCopy();
    var loader = new CriticalImportLoader(databasePath);

    await loader.LoadAsync(parseResult.Table);

    await using var dbContext = CreateDbContext(databasePath);

    // No Include is needed: the query only returns a scalar count, and the filter on the
    // navigation property is translated on its own.
    var persistedResultCount = await dbContext.CriticalResults
        .CountAsync(item => item.CriticalTable.Slug == "void");

    // NOTE(review): 95 presumably equals 19 roll bands x 5 columns (axis template
    // "mana-standard-19") — confirm against the template definition.
    Assert.Equal(95, persistedResultCount);
}
[Fact] [Fact]
public async Task Lookup_service_returns_effects_for_results_and_branches() public async Task Lookup_service_returns_effects_for_results_and_branches()
{ {
@@ -632,6 +657,25 @@ public sealed class StandardCriticalTableParserIntegrationTests
private static async Task<CriticalTableParseResult> LoadParseResultAsync(CriticalImportManifestEntry entry) private static async Task<CriticalTableParseResult> LoadParseResultAsync(CriticalImportManifestEntry entry)
{ {
if (string.Equals(entry.ExtractionMethod, "ocr", StringComparison.OrdinalIgnoreCase))
{
var tsvContent = await File.ReadAllTextAsync(Path.Combine(GetRepositoryRoot(), "src", "RolemasterDb.ImportTool.Tests", "Fixtures", "Void", "source.ocr.tsv"));
var source = new ExtractedCriticalSource(
"ocr",
"Imported from PDF OCR extraction.",
SourceRenderProfile.OcrPixels(PdfXmlExtractor.ScaledRenderDpi),
[new ParsedPdfPageGeometry(1, 3600, 5070)],
OcrCriticalSourceExtractor.ParseTsv(tsvContent));
return entry.Family switch
{
"standard" => StandardParser.Parse(entry, source, StandardOcrBootstrapper.Bootstrap(source, StandardTableAxisTemplateCatalog.Resolve(entry.AxisTemplateSlug))),
"variant_column" => VariantColumnParser.Parse(entry, source),
"grouped_variant" => GroupedVariantParser.Parse(entry, source),
_ => throw new InvalidOperationException($"Unsupported manifest family '{entry.Family}'.")
};
}
var xmlPath = Path.Combine(GetArtifactCacheRoot(), $"{entry.Slug}.xml"); var xmlPath = Path.Combine(GetArtifactCacheRoot(), $"{entry.Slug}.xml");
if (!File.Exists(xmlPath)) if (!File.Exists(xmlPath))
@@ -701,20 +745,5 @@ public sealed class StandardCriticalTableParserIntegrationTests
await RolemasterDbSchemaUpgrader.EnsureLatestAsync(dbContext); await RolemasterDbSchemaUpgrader.EnsureLatestAsync(dbContext);
} }
private static string GetRepositoryRoot() private static string GetRepositoryRoot() => TestRepositoryPaths.GetRepositoryRoot();
{
var probe = new DirectoryInfo(AppContext.BaseDirectory);
while (probe is not null)
{
if (File.Exists(Path.Combine(probe.FullName, "RolemasterDB.slnx")))
{
return probe.FullName;
}
probe = probe.Parent;
}
throw new InvalidOperationException("Could not find the repository root for integration tests.");
}
} }

View File

@@ -0,0 +1,39 @@
namespace RolemasterDb.ImportTool.Tests;
/// <summary>
/// Locates the repository root (the directory containing <c>RolemasterDB.slnx</c>) for integration tests.
/// </summary>
internal static class TestRepositoryPaths
{
    private const string RepositoryRootEnvironmentVariable = "ROLEMASTERDB_REPOSITORY_ROOT";

    /// <summary>
    /// Resolves the repository root. The environment variable override is honoured first,
    /// but only when it points at a directory that contains the solution file; otherwise the
    /// ancestors of the application base directory and then of the current working directory
    /// are searched.
    /// </summary>
    /// <returns>The full path of the first directory found that contains <c>RolemasterDB.slnx</c>.</returns>
    /// <exception cref="InvalidOperationException">No candidate directory contains the solution file.</exception>
    public static string GetRepositoryRoot()
    {
        static bool ContainsSolutionFile(string directory) =>
            File.Exists(Path.Combine(directory, "RolemasterDB.slnx"));

        var overrideValue = Environment.GetEnvironmentVariable(RepositoryRootEnvironmentVariable);
        if (!string.IsNullOrWhiteSpace(overrideValue))
        {
            var overrideRoot = Path.GetFullPath(overrideValue);
            if (ContainsSolutionFile(overrideRoot))
            {
                return overrideRoot;
            }

            // An override that does not contain the solution file is ignored; fall through to probing.
        }

        DirectoryInfo[] startingPoints =
        [
            new DirectoryInfo(AppContext.BaseDirectory),
            new DirectoryInfo(Directory.GetCurrentDirectory())
        ];

        foreach (var startingPoint in startingPoints)
        {
            // Walk upwards until the filesystem root has no parent.
            var candidate = startingPoint;
            while (candidate is not null)
            {
                if (ContainsSolutionFile(candidate.FullName))
                {
                    return candidate.FullName;
                }

                candidate = candidate.Parent;
            }
        }

        throw new InvalidOperationException(
            $"Could not find the repository root for integration tests. Set {RepositoryRootEnvironmentVariable} to the repository path.");
    }
}

View File

@@ -7,6 +7,7 @@ public sealed class CriticalImportCommandRunner
private readonly CriticalImportManifestLoader manifestLoader = new(); private readonly CriticalImportManifestLoader manifestLoader = new();
private readonly ImportArtifactWriter artifactWriter = new(); private readonly ImportArtifactWriter artifactWriter = new();
private readonly PdfXmlExtractor pdfXmlExtractor = new(); private readonly PdfXmlExtractor pdfXmlExtractor = new();
private readonly StandardOcrBootstrapper standardOcrBootstrapper = new();
private readonly CriticalSourceImageArtifactGenerator sourceImageArtifactGenerator; private readonly CriticalSourceImageArtifactGenerator sourceImageArtifactGenerator;
private readonly StandardCriticalTableParser standardParser = new(); private readonly StandardCriticalTableParser standardParser = new();
private readonly VariantColumnCriticalTableParser variantColumnParser = new(); private readonly VariantColumnCriticalTableParser variantColumnParser = new();
@@ -35,8 +36,9 @@ public sealed class CriticalImportCommandRunner
{ {
var entry = GetManifestEntry(options.Table); var entry = GetManifestEntry(options.Table);
var artifactPaths = CreateArtifactPaths(entry.Slug); var artifactPaths = CreateArtifactPaths(entry.Slug);
await pdfXmlExtractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths.XmlPath); var extractor = CreateSourceExtractor(entry);
Console.WriteLine($"Extracted {entry.Slug} to {artifactPaths.XmlPath}"); await extractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths, CancellationToken.None);
Console.WriteLine($"Extracted {entry.Slug} to {artifactPaths.GetSourceArtifactPath(entry.ExtractionMethod)}");
return 0; return 0;
} }
@@ -44,15 +46,8 @@ public sealed class CriticalImportCommandRunner
{ {
var entry = GetManifestEntry(options.Table); var entry = GetManifestEntry(options.Table);
var artifactPaths = CreateArtifactPaths(entry.Slug); var artifactPaths = CreateArtifactPaths(entry.Slug);
var extractedSource = await LoadExtractedSourceAsync(entry, artifactPaths);
if (!File.Exists(artifactPaths.XmlPath)) var parseResult = Parse(entry, extractedSource);
{
Console.Error.WriteLine($"Missing XML artifact: {artifactPaths.XmlPath}");
return 1;
}
var xmlContent = await File.ReadAllTextAsync(artifactPaths.XmlPath);
var parseResult = Parse(entry, xmlContent);
await sourceImageArtifactGenerator.GenerateAsync( await sourceImageArtifactGenerator.GenerateAsync(
ResolveRepositoryPath(entry.PdfPath), ResolveRepositoryPath(entry.PdfPath),
artifactPaths, artifactPaths,
@@ -104,14 +99,14 @@ public sealed class CriticalImportCommandRunner
{ {
var entry = GetManifestEntry(options.Table); var entry = GetManifestEntry(options.Table);
var artifactPaths = CreateArtifactPaths(entry.Slug); var artifactPaths = CreateArtifactPaths(entry.Slug);
var extractor = CreateSourceExtractor(entry);
if (!File.Exists(artifactPaths.XmlPath)) if (!File.Exists(artifactPaths.GetSourceArtifactPath(entry.ExtractionMethod)))
{ {
await pdfXmlExtractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths.XmlPath); await extractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths, CancellationToken.None);
} }
var xmlContent = await File.ReadAllTextAsync(artifactPaths.XmlPath); var extractedSource = await extractor.LoadAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths, CancellationToken.None);
var parseResult = Parse(entry, xmlContent); var parseResult = Parse(entry, extractedSource);
await sourceImageArtifactGenerator.GenerateAsync( await sourceImageArtifactGenerator.GenerateAsync(
ResolveRepositoryPath(entry.PdfPath), ResolveRepositoryPath(entry.PdfPath),
artifactPaths, artifactPaths,
@@ -143,26 +138,61 @@ public sealed class CriticalImportCommandRunner
?? throw new InvalidOperationException($"No enabled manifest entry was found for '{tableSlug}'."); ?? throw new InvalidOperationException($"No enabled manifest entry was found for '{tableSlug}'.");
} }
private CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) private async Task<ExtractedCriticalSource> LoadExtractedSourceAsync(CriticalImportManifestEntry entry, ImportArtifactPaths artifactPaths)
{
var extractor = CreateSourceExtractor(entry);
var sourceArtifactPath = artifactPaths.GetSourceArtifactPath(entry.ExtractionMethod);
if (!File.Exists(sourceArtifactPath))
{
Console.Error.WriteLine($"Missing source artifact: {sourceArtifactPath}");
throw new FileNotFoundException($"Missing source artifact: {sourceArtifactPath}", sourceArtifactPath);
}
return await extractor.LoadAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths, CancellationToken.None);
}
private CriticalTableParseResult Parse(CriticalImportManifestEntry entry, ExtractedCriticalSource source)
{ {
if (string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase)) if (string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase))
{ {
return standardParser.Parse(entry, xmlContent); if (string.Equals(entry.ExtractionMethod, "ocr", StringComparison.OrdinalIgnoreCase))
{
var template = StandardTableAxisTemplateCatalog.Resolve(entry.AxisTemplateSlug);
var layout = standardOcrBootstrapper.Bootstrap(source, template);
return standardParser.Parse(entry, source, layout);
}
return standardParser.Parse(entry, source);
} }
if (string.Equals(entry.Family, "variant_column", StringComparison.OrdinalIgnoreCase)) if (string.Equals(entry.Family, "variant_column", StringComparison.OrdinalIgnoreCase))
{ {
return variantColumnParser.Parse(entry, xmlContent); return variantColumnParser.Parse(entry, source);
} }
if (string.Equals(entry.Family, "grouped_variant", StringComparison.OrdinalIgnoreCase)) if (string.Equals(entry.Family, "grouped_variant", StringComparison.OrdinalIgnoreCase))
{ {
return groupedVariantParser.Parse(entry, xmlContent); return groupedVariantParser.Parse(entry, source);
} }
throw new InvalidOperationException($"Family '{entry.Family}' is not supported by the importer."); throw new InvalidOperationException($"Family '{entry.Family}' is not supported by the importer.");
} }
// Picks the source extractor matching the manifest entry's extraction method
// ("xml" or "ocr", compared case-insensitively); any other value is rejected.
private ICriticalSourceExtractor CreateSourceExtractor(CriticalImportManifestEntry entry)
{
    var method = entry.ExtractionMethod;

    bool Matches(string candidate) =>
        string.Equals(method, candidate, StringComparison.OrdinalIgnoreCase);

    return method switch
    {
        _ when Matches("xml") => new XmlCriticalSourceExtractor(pdfXmlExtractor),
        _ when Matches("ocr") => new OcrCriticalSourceExtractor(pdfXmlExtractor),
        _ => throw new InvalidOperationException($"Extraction method '{entry.ExtractionMethod}' is not supported by the importer.")
    };
}
private static ImportArtifactPaths CreateArtifactPaths(string slug) => private static ImportArtifactPaths CreateArtifactPaths(string slug) =>
ImportArtifactPaths.Create(RepositoryPaths.Discover().ArtifactsRootPath, slug); ImportArtifactPaths.Create(RepositoryPaths.Discover().ArtifactsRootPath, slug);

View File

@@ -6,6 +6,7 @@ public sealed class CriticalImportManifestEntry
public string DisplayName { get; set; } = string.Empty; public string DisplayName { get; set; } = string.Empty;
public string Family { get; set; } = string.Empty; public string Family { get; set; } = string.Empty;
public string ExtractionMethod { get; set; } = string.Empty; public string ExtractionMethod { get; set; } = string.Empty;
public string? AxisTemplateSlug { get; set; }
public string PdfPath { get; set; } = string.Empty; public string PdfPath { get; set; } = string.Empty;
public bool Enabled { get; set; } = true; public bool Enabled { get; set; } = true;
} }

View File

@@ -23,6 +23,7 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
pdfPath, pdfPath,
pageGeometry.PageNumber, pageGeometry.PageNumber,
artifactPaths.GetPageImagePath(pageGeometry.PageNumber), artifactPaths.GetPageImagePath(pageGeometry.PageNumber),
parseResult.RenderProfile.RenderDpi,
cancellationToken); cancellationToken);
} }
@@ -38,7 +39,7 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
$"Missing page geometry for page {result.SourceBounds.PageNumber} in table '{parseResult.Table.Slug}'."); $"Missing page geometry for page {result.SourceBounds.PageNumber} in table '{parseResult.Table.Slug}'.");
} }
var crop = CreateCrop(result.SourceBounds, pageGeometry); var crop = CreateCrop(result.SourceBounds, pageGeometry, parseResult.RenderProfile);
var relativePath = artifactPaths.GetRelativeCellImagePath(result.GroupKey, result.ColumnKey, result.RollBandLabel); var relativePath = artifactPaths.GetRelativeCellImagePath(result.GroupKey, result.ColumnKey, result.RollBandLabel);
var fullPath = artifactPaths.ResolveRelativePath(relativePath); var fullPath = artifactPaths.ResolveRelativePath(relativePath);
@@ -50,6 +51,7 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
crop.CropWidth, crop.CropWidth,
crop.CropHeight, crop.CropHeight,
fullPath, fullPath,
parseResult.RenderProfile.RenderDpi,
cancellationToken); cancellationToken);
result.SourceImagePath = relativePath; result.SourceImagePath = relativePath;
@@ -66,7 +68,8 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
private static CriticalSourceImageCrop CreateCrop( private static CriticalSourceImageCrop CreateCrop(
ParsedCriticalSourceRect sourceBounds, ParsedCriticalSourceRect sourceBounds,
ParsedPdfPageGeometry pageGeometry) ParsedPdfPageGeometry pageGeometry,
SourceRenderProfile renderProfile)
{ {
var cropLeft = Math.Max(0, sourceBounds.Left - CropPaddingX); var cropLeft = Math.Max(0, sourceBounds.Left - CropPaddingX);
var cropTop = Math.Max(0, sourceBounds.Top - CropPaddingY); var cropTop = Math.Max(0, sourceBounds.Top - CropPaddingY);
@@ -75,18 +78,18 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
return new CriticalSourceImageCrop( return new CriticalSourceImageCrop(
sourceBounds.PageNumber, sourceBounds.PageNumber,
PdfXmlExtractor.ScaleCoordinate(pageGeometry.Width), renderProfile.ScaleCoordinate(pageGeometry.Width),
PdfXmlExtractor.ScaleCoordinate(pageGeometry.Height), renderProfile.ScaleCoordinate(pageGeometry.Height),
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Left), renderProfile.ScaleCoordinate(sourceBounds.Left),
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Top), renderProfile.ScaleCoordinate(sourceBounds.Top),
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Width), renderProfile.ScaleCoordinate(sourceBounds.Width),
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Height), renderProfile.ScaleCoordinate(sourceBounds.Height),
PdfXmlExtractor.ScaleCoordinate(cropLeft), renderProfile.ScaleCoordinate(cropLeft),
PdfXmlExtractor.ScaleCoordinate(cropTop), renderProfile.ScaleCoordinate(cropTop),
PdfXmlExtractor.ScaleCoordinate(Math.Max(1, cropRight - cropLeft)), renderProfile.ScaleCoordinate(Math.Max(1, cropRight - cropLeft)),
PdfXmlExtractor.ScaleCoordinate(Math.Max(1, cropBottom - cropTop)), renderProfile.ScaleCoordinate(Math.Max(1, cropBottom - cropTop)),
PdfXmlExtractor.ScaledRenderDpi, renderProfile.RenderDpi,
PdfXmlExtractor.RenderScaleFactor); renderProfile.ScaleFactor);
} }
private static string CreateCellKey(string? groupKey, string rollBandLabel, string columnKey) => private static string CreateCellKey(string? groupKey, string rollBandLabel, string columnKey) =>

View File

@@ -2,7 +2,7 @@ using CommandLine;
namespace RolemasterDb.ImportTool; namespace RolemasterDb.ImportTool;
[Verb("extract", HelpText = "Extract a critical table PDF into a text artifact.")] [Verb("extract", HelpText = "Extract a critical table PDF into its source artifact.")]
public sealed class ExtractOptions public sealed class ExtractOptions
{ {
[Value(0, MetaName = "table", Required = true, HelpText = "The manifest slug of the critical table to extract.")] [Value(0, MetaName = "table", Required = true, HelpText = "The manifest slug of the critical table to extract.")]

View File

@@ -0,0 +1,17 @@
using RolemasterDb.ImportTool.Parsing;
namespace RolemasterDb.ImportTool;
/// <summary>
/// Immutable bundle describing one extracted critical-table source: how it was extracted,
/// the render profile, the page geometries and the positioned text fragments.
/// </summary>
public sealed class ExtractedCriticalSource
{
    /// <summary>Creates a source bundle; every argument is stored as-is without validation.</summary>
    /// <param name="extractionMethod">Identifier of the extraction method (the importer uses "xml" and "ocr").</param>
    /// <param name="importNotes">Free-text note describing how the source was imported.</param>
    /// <param name="renderProfile">Render profile (DPI and scale information) for the source.</param>
    /// <param name="pageGeometries">Geometry of each source page.</param>
    /// <param name="fragments">Positioned text fragments extracted from the source.</param>
    public ExtractedCriticalSource(
        string extractionMethod,
        string importNotes,
        SourceRenderProfile renderProfile,
        IReadOnlyList<ParsedPdfPageGeometry> pageGeometries,
        IReadOnlyList<PositionedTextFragment> fragments)
    {
        ExtractionMethod = extractionMethod;
        ImportNotes = importNotes;
        RenderProfile = renderProfile;
        PageGeometries = pageGeometries;
        Fragments = fragments;
    }

    /// <summary>Identifier of the extraction method that produced this source.</summary>
    public string ExtractionMethod { get; }

    /// <summary>Free-text note describing how the source was imported.</summary>
    public string ImportNotes { get; }

    /// <summary>Render profile supplied at construction.</summary>
    public SourceRenderProfile RenderProfile { get; }

    /// <summary>Geometry of each source page.</summary>
    public IReadOnlyList<ParsedPdfPageGeometry> PageGeometries { get; }

    /// <summary>Positioned text fragments extracted from the source.</summary>
    public IReadOnlyList<PositionedTextFragment> Fragments { get; }
}

View File

@@ -0,0 +1,11 @@
namespace RolemasterDb.ImportTool;
/// <summary>
/// Abstraction over the ways a critical table PDF can be turned into an
/// <see cref="ExtractedCriticalSource"/>. The importer picks an implementation based on
/// the manifest entry's extraction method.
/// </summary>
public interface ICriticalSourceExtractor
{
/// <summary>
/// Extracts the PDF at <paramref name="pdfPath"/> into its source artifact.
/// NOTE(review): presumably writes to a location derived from <paramref name="artifactPaths"/> — confirm per implementation.
/// </summary>
/// <param name="pdfPath">Path of the source PDF.</param>
/// <param name="artifactPaths">Artifact locations for the table being imported.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
Task ExtractAsync(string pdfPath, ImportArtifactPaths artifactPaths, CancellationToken cancellationToken = default);
/// <summary>
/// Loads the extracted source for the PDF at <paramref name="pdfPath"/>.
/// NOTE(review): callers in the importer ensure the source artifact exists before calling this — confirm whether implementations require that.
/// </summary>
/// <param name="pdfPath">Path of the source PDF.</param>
/// <param name="artifactPaths">Artifact locations for the table being imported.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The extracted source.</returns>
Task<ExtractedCriticalSource> LoadAsync(
string pdfPath,
ImportArtifactPaths artifactPaths,
CancellationToken cancellationToken = default);
}

View File

@@ -9,9 +9,11 @@ public sealed class ImportArtifactPaths
string tableSlug, string tableSlug,
string directoryPath, string directoryPath,
string xmlPath, string xmlPath,
string ocrTsvPath,
string fragmentsJsonPath, string fragmentsJsonPath,
string parsedCellsJsonPath, string parsedCellsJsonPath,
string validationReportPath, string validationReportPath,
string ocrPagesDirectoryPath,
string pagesDirectoryPath, string pagesDirectoryPath,
string cellsDirectoryPath) string cellsDirectoryPath)
{ {
@@ -19,9 +21,11 @@ public sealed class ImportArtifactPaths
TableSlug = tableSlug; TableSlug = tableSlug;
DirectoryPath = directoryPath; DirectoryPath = directoryPath;
XmlPath = xmlPath; XmlPath = xmlPath;
OcrTsvPath = ocrTsvPath;
FragmentsJsonPath = fragmentsJsonPath; FragmentsJsonPath = fragmentsJsonPath;
ParsedCellsJsonPath = parsedCellsJsonPath; ParsedCellsJsonPath = parsedCellsJsonPath;
ValidationReportPath = validationReportPath; ValidationReportPath = validationReportPath;
OcrPagesDirectoryPath = ocrPagesDirectoryPath;
PagesDirectoryPath = pagesDirectoryPath; PagesDirectoryPath = pagesDirectoryPath;
CellsDirectoryPath = cellsDirectoryPath; CellsDirectoryPath = cellsDirectoryPath;
} }
@@ -30,15 +34,18 @@ public sealed class ImportArtifactPaths
public string TableSlug { get; } public string TableSlug { get; }
public string DirectoryPath { get; } public string DirectoryPath { get; }
public string XmlPath { get; } public string XmlPath { get; }
public string OcrTsvPath { get; }
public string FragmentsJsonPath { get; } public string FragmentsJsonPath { get; }
public string ParsedCellsJsonPath { get; } public string ParsedCellsJsonPath { get; }
public string ValidationReportPath { get; } public string ValidationReportPath { get; }
public string OcrPagesDirectoryPath { get; }
public string PagesDirectoryPath { get; } public string PagesDirectoryPath { get; }
public string CellsDirectoryPath { get; } public string CellsDirectoryPath { get; }
public static ImportArtifactPaths Create(string artifactsRootPath, string tableSlug) public static ImportArtifactPaths Create(string artifactsRootPath, string tableSlug)
{ {
var directoryPath = Path.Combine(artifactsRootPath, tableSlug); var directoryPath = Path.Combine(artifactsRootPath, tableSlug);
var ocrPagesDirectoryPath = Path.Combine(directoryPath, "ocr-pages");
var pagesDirectoryPath = Path.Combine(directoryPath, "pages"); var pagesDirectoryPath = Path.Combine(directoryPath, "pages");
var cellsDirectoryPath = Path.Combine(directoryPath, "cells"); var cellsDirectoryPath = Path.Combine(directoryPath, "cells");
@@ -47,13 +54,23 @@ public sealed class ImportArtifactPaths
tableSlug, tableSlug,
directoryPath, directoryPath,
Path.Combine(directoryPath, "source.xml"), Path.Combine(directoryPath, "source.xml"),
Path.Combine(directoryPath, "source.ocr.tsv"),
Path.Combine(directoryPath, "fragments.json"), Path.Combine(directoryPath, "fragments.json"),
Path.Combine(directoryPath, "parsed-cells.json"), Path.Combine(directoryPath, "parsed-cells.json"),
Path.Combine(directoryPath, "validation-report.json"), Path.Combine(directoryPath, "validation-report.json"),
ocrPagesDirectoryPath,
pagesDirectoryPath, pagesDirectoryPath,
cellsDirectoryPath); cellsDirectoryPath);
} }
/// <summary>
/// Selects the extracted source artifact that belongs to an extraction method.
/// </summary>
/// <param name="extractionMethod">Extraction method name; compared to "ocr" ignoring case.</param>
/// <returns>
/// <see cref="OcrTsvPath"/> when the method is "ocr"; <see cref="XmlPath"/> for every other value.
/// </returns>
public string GetSourceArtifactPath(string extractionMethod)
{
    var usesOcr = string.Equals(extractionMethod, "ocr", StringComparison.OrdinalIgnoreCase);
    if (usesOcr)
    {
        return OcrTsvPath;
    }

    return XmlPath;
}
/// <summary>
/// Builds the path of the PNG rendered for OCR for the given page, inside
/// <see cref="OcrPagesDirectoryPath"/>.
/// </summary>
/// <param name="pageNumber">Page number; formatted with at least three digits in the file name.</param>
/// <returns>The combined directory and file name, e.g. ending in <c>page-001.png</c> for page 1.</returns>
public string GetOcrPageImagePath(int pageNumber)
{
    var fileName = $"page-{pageNumber:000}.png";
    return Path.Combine(OcrPagesDirectoryPath, fileName);
}
public string GetPageImagePath(int pageNumber) => public string GetPageImagePath(int pageNumber) =>
Path.Combine(PagesDirectoryPath, $"page-{pageNumber:000}.png"); Path.Combine(PagesDirectoryPath, $"page-{pageNumber:000}.png");

View File

@@ -2,7 +2,7 @@ using CommandLine;
namespace RolemasterDb.ImportTool; namespace RolemasterDb.ImportTool;
[Verb("load", HelpText = "Load a parsed critical table from its extracted text artifact.")] [Verb("load", HelpText = "Load a parsed critical table from its extracted source artifact.")]
public sealed class LoadOptions public sealed class LoadOptions
{ {
[Value(0, MetaName = "table", Required = true, HelpText = "The manifest slug of the critical table to load.")] [Value(0, MetaName = "table", Required = true, HelpText = "The manifest slug of the critical table to load.")]

View File

@@ -0,0 +1,204 @@
using System.Globalization;
using System.Text;
using RolemasterDb.ImportTool.Parsing;
namespace RolemasterDb.ImportTool;
public sealed class OcrCriticalSourceExtractor(PdfXmlExtractor pdfXmlExtractor) : ICriticalSourceExtractor
{
private const int OcrRenderDpi = PdfXmlExtractor.ScaledRenderDpi;
private const string TesseractExeDefaultPath = @"C:\Program Files\Sejda PDF Desktop\resources\vendor\tesseract-windows-x64\tesseract.exe";
private const string TessdataDefaultPath = @"C:\Program Files\Sejda PDF Desktop\resources\vendor\tessdata";
/// <summary>
/// Renders page 1 of the PDF to a PNG, runs Tesseract over that image and writes the resulting
/// TSV text to <c>artifactPaths.OcrTsvPath</c>.
/// </summary>
/// <param name="pdfPath">Path of the PDF to extract.</param>
/// <param name="artifactPaths">Artifact locations; the artifact and OCR page directories are created if missing.</param>
/// <param name="cancellationToken">Flows into the PDF, Tesseract and file-write calls.</param>
/// <exception cref="InvalidOperationException">The PDF's page count is not exactly one.</exception>
public async Task ExtractAsync(string pdfPath, ImportArtifactPaths artifactPaths, CancellationToken cancellationToken = default)
{
    const int onlySupportedPage = 1;

    // The output directories are created before the PDF is inspected, so they exist even when
    // the page-count check below rejects the document.
    Directory.CreateDirectory(artifactPaths.DirectoryPath);
    Directory.CreateDirectory(artifactPaths.OcrPagesDirectoryPath);

    var documentInfo = await pdfXmlExtractor.ReadDocumentInfoAsync(pdfPath, cancellationToken);
    var isSinglePage = documentInfo.PageCount == 1;
    if (!isSinglePage)
    {
        throw new InvalidOperationException("The OCR extractor currently supports only single-page critical tables.");
    }

    var renderedPagePath = artifactPaths.GetOcrPageImagePath(onlySupportedPage);
    await pdfXmlExtractor.RenderPagePngAsync(pdfPath, onlySupportedPage, renderedPagePath, OcrRenderDpi, cancellationToken);

    var ocrTsv = await RunTesseractAsync(renderedPagePath, cancellationToken);
    await File.WriteAllTextAsync(artifactPaths.OcrTsvPath, ocrTsv, cancellationToken);
}
/// <summary>
/// Rebuilds an <see cref="ExtractedCriticalSource"/> from artifacts written earlier: the OCR TSV
/// file supplies the text fragments and the rendered page-1 PNG supplies the page dimensions.
/// </summary>
/// <param name="pdfPath">Not read by this method; only the artifacts are consulted.</param>
/// <param name="artifactPaths">Locations of the OCR TSV file and the OCR page image.</param>
/// <param name="cancellationToken">Flows into the TSV file read.</param>
/// <returns>A source tagged "ocr" with a single page geometry and the parsed fragments.</returns>
/// <exception cref="FileNotFoundException">The TSV artifact or the page image is missing.</exception>
public async Task<ExtractedCriticalSource> LoadAsync(
    string pdfPath,
    ImportArtifactPaths artifactPaths,
    CancellationToken cancellationToken = default)
{
    // The TSV artifact is checked first, then the page image, so a missing TSV is reported first.
    var tsvPath = artifactPaths.OcrTsvPath;
    if (!File.Exists(tsvPath))
    {
        throw new FileNotFoundException($"Missing OCR artifact: {tsvPath}", tsvPath);
    }

    var renderedPagePath = artifactPaths.GetOcrPageImagePath(1);
    if (!File.Exists(renderedPagePath))
    {
        throw new FileNotFoundException($"Missing OCR page image artifact: {renderedPagePath}", renderedPagePath);
    }

    var ocrTsv = await File.ReadAllTextAsync(tsvPath, cancellationToken);
    var pageSize = ReadPngDimensions(renderedPagePath);

    return new ExtractedCriticalSource(
        "ocr",
        "Imported from PDF OCR extraction.",
        SourceRenderProfile.OcrPixels(OcrRenderDpi),
        [new ParsedPdfPageGeometry(1, pageSize.Width, pageSize.Height)],
        ParseTsv(ocrTsv));
}
/// <summary>
/// Converts TSV text into positioned fragments. The first non-empty line is treated as a header
/// and skipped; only rows whose first column is "5" and that have at least 12 columns are used.
/// </summary>
/// <param name="tsvContent">TSV text with LF or CRLF line endings.</param>
/// <returns>
/// One fragment per accepted row whose normalized text is non-blank, in input order. Empty when
/// the input has no non-empty lines.
/// </returns>
/// <exception cref="FormatException">A numeric column of an accepted row is not a valid integer.</exception>
internal static IReadOnlyList<PositionedTextFragment> ParseTsv(string tsvContent)
{
    // Column positions as consumed below (assumed to follow Tesseract's TSV layout — confirm
    // against the Tesseract version in use).
    const int levelColumn = 0;
    const int pageColumn = 1;
    const int leftColumn = 6;
    const int topColumn = 7;
    const int widthColumn = 8;
    const int heightColumn = 9;
    const int confidenceColumn = 10;
    const int textColumn = 11;

    var rows = tsvContent.Split(["\r\n", "\n"], StringSplitOptions.RemoveEmptyEntries);
    if (rows.Length == 0)
    {
        return [];
    }

    var result = new List<PositionedTextFragment>();

    // Index 0 is the header row.
    for (var rowIndex = 1; rowIndex < rows.Length; rowIndex++)
    {
        var fields = rows[rowIndex].Split('\t');
        var isCandidateRow = fields.Length >= 12 && fields[levelColumn] == "5";
        if (!isCandidateRow)
        {
            continue;
        }

        // Text that itself contained tabs was split above; stitch it back together.
        var rawText = string.Join('\t', fields[textColumn..]);
        var text = CriticalTableParserSupport.NormalizeText(rawText);
        if (string.IsNullOrWhiteSpace(text))
        {
            continue;
        }

        // Parsed in constructor-argument order: page, top, left, width, height.
        var pageNumber = int.Parse(fields[pageColumn], CultureInfo.InvariantCulture);
        var top = int.Parse(fields[topColumn], CultureInfo.InvariantCulture);
        var left = int.Parse(fields[leftColumn], CultureInfo.InvariantCulture);
        var width = int.Parse(fields[widthColumn], CultureInfo.InvariantCulture);
        var height = int.Parse(fields[heightColumn], CultureInfo.InvariantCulture);

        result.Add(new PositionedTextFragment(
            pageNumber,
            top,
            left,
            width,
            height,
            text,
            ParseConfidence(fields[confidenceColumn])));
    }

    return result;
}
/// <summary>
/// Parses an OCR confidence column into a whole-number confidence.
/// </summary>
/// <remarks>
/// Both integer ("96") and fractional ("95.482010") values are accepted; fractional values are
/// rounded to the nearest integer (midpoints away from zero). Parsing only integers would turn
/// every fractional confidence into <c>null</c>.
/// </remarks>
/// <param name="value">Raw confidence text, parsed with the invariant culture.</param>
/// <returns>
/// The confidence, or <c>null</c> when the text is not a number, is negative, or does not fit
/// in an <see cref="int"/>.
/// </returns>
private static int? ParseConfidence(string value)
{
    // Same leniency as NumberStyles.Integer (surrounding whitespace, leading sign) plus a
    // decimal point; exponents stay rejected.
    const NumberStyles styles = NumberStyles.Integer | NumberStyles.AllowDecimalPoint;

    if (!double.TryParse(value, styles, CultureInfo.InvariantCulture, out var confidence))
    {
        return null;
    }

    // Written as a positive range test so that NaN (which fails every comparison) is rejected
    // together with negative and out-of-range values.
    var isUsable = confidence >= 0 && confidence <= int.MaxValue;
    if (!isUsable)
    {
        return null;
    }

    return (int)Math.Round(confidence, MidpointRounding.AwayFromZero);
}
/// <summary>
/// Reads the pixel width and height from the start of a PNG file without decoding the image.
/// </summary>
/// <param name="path">Path of the file to inspect.</param>
/// <returns>The two big-endian 32-bit values that follow the "IHDR" chunk type, as width then height.</returns>
/// <exception cref="InvalidOperationException">
/// The file does not start with the 8-byte PNG signature, or its first chunk type is not "IHDR".
/// </exception>
private static (int Width, int Height) ReadPngDimensions(string path)
{
    byte[] pngSignature = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];

    using var file = File.OpenRead(path);
    using var reader = new BinaryReader(file, Encoding.UTF8, leaveOpen: false);

    var leadingBytes = reader.ReadBytes(8);
    if (!leadingBytes.SequenceEqual(pngSignature))
    {
        throw new InvalidOperationException($"'{path}' is not a PNG file.");
    }

    // The first chunk's 4-byte length field is consumed but not needed.
    _ = ReadBigEndianInt32(reader);

    var firstChunkType = Encoding.ASCII.GetString(reader.ReadBytes(4));
    if (firstChunkType != "IHDR")
    {
        throw new InvalidOperationException($"'{path}' is missing a PNG IHDR header.");
    }

    var pixelWidth = ReadBigEndianInt32(reader);
    var pixelHeight = ReadBigEndianInt32(reader);
    return (pixelWidth, pixelHeight);
}
/// <summary>
/// Reads the next four bytes from <paramref name="reader"/> and interprets them as a signed
/// 32-bit integer stored most-significant byte first.
/// </summary>
/// <param name="reader">Reader positioned at the first of the four bytes.</param>
/// <returns>The decoded value.</returns>
/// <exception cref="EndOfStreamException">Fewer than four bytes remain.</exception>
private static int ReadBigEndianInt32(BinaryReader reader)
{
    var buffer = reader.ReadBytes(4);
    if (buffer.Length < 4)
    {
        throw new EndOfStreamException("Unexpected end of stream.");
    }

    // Assemble the value from explicit byte positions so the result does not depend on the
    // host's endianness.
    return (buffer[0] << 24) | (buffer[1] << 16) | (buffer[2] << 8) | buffer[3];
}
/// <summary>
/// Runs the Tesseract executable on an image and returns what it writes to standard output
/// (invoked with the <c>stdout</c> output base, <c>--psm 11</c> and the <c>tsv</c> config).
/// </summary>
/// <param name="imagePath">Path of the image handed to Tesseract.</param>
/// <param name="cancellationToken">
/// Cancels the stream reads and the wait for exit; the child process is killed when that happens.
/// </param>
/// <returns>The captured standard output.</returns>
/// <exception cref="InvalidOperationException">Tesseract exited with a non-zero exit code.</exception>
/// <exception cref="OperationCanceledException">The token was cancelled before Tesseract finished.</exception>
private static async Task<string> RunTesseractAsync(string imagePath, CancellationToken cancellationToken)
{
    var startInfo = new System.Diagnostics.ProcessStartInfo
    {
        FileName = ResolveTesseractExecutable(),
        RedirectStandardError = true,
        RedirectStandardOutput = true,
        UseShellExecute = false,
        CreateNoWindow = true
    };

    // Only override TESSDATA_PREFIX when a directory was actually resolved. Assigning an empty
    // string would mask a value inherited from the parent environment.
    var tessdataPath = ResolveTessdataPath();
    if (!string.IsNullOrEmpty(tessdataPath))
    {
        startInfo.Environment["TESSDATA_PREFIX"] = tessdataPath;
    }

    startInfo.ArgumentList.Add(imagePath);
    startInfo.ArgumentList.Add("stdout");
    startInfo.ArgumentList.Add("--psm");
    startInfo.ArgumentList.Add("11");
    startInfo.ArgumentList.Add("tsv");

    using var process = new System.Diagnostics.Process { StartInfo = startInfo };
    process.Start();

    string output;
    string error;
    try
    {
        // Drain both redirected streams concurrently. Reading stdout to the end while stderr
        // is left unread can deadlock once the child fills the stderr pipe buffer.
        var outputTask = process.StandardOutput.ReadToEndAsync(cancellationToken);
        var errorTask = process.StandardError.ReadToEndAsync(cancellationToken);
        await Task.WhenAll(outputTask, errorTask);
        await process.WaitForExitAsync(cancellationToken);

        output = await outputTask;
        error = await errorTask;
    }
    catch
    {
        // Cancellation or a failed read must not leave an orphaned tesseract process behind.
        KillIfRunning(process);
        throw;
    }

    if (process.ExitCode != 0)
    {
        throw new InvalidOperationException($"tesseract failed for '{imagePath}': {error}");
    }

    return output;

    // Best-effort termination; failures here are ignored so the original exception surfaces.
    static void KillIfRunning(System.Diagnostics.Process child)
    {
        try
        {
            if (!child.HasExited)
            {
                child.Kill(entireProcessTree: true);
            }
        }
        catch (InvalidOperationException)
        {
            // The process exited between the check and the kill.
        }
        catch (System.ComponentModel.Win32Exception)
        {
            // The process could not be terminated (e.g. it is already terminating).
        }
    }
}
/// <summary>
/// Chooses the Tesseract executable to launch.
/// </summary>
/// <returns>
/// The first existing file among the <c>ROLEMASTERDB_TESSERACT_PATH</c> environment variable and
/// <see cref="TesseractExeDefaultPath"/>, in that order; otherwise the bare command name
/// "tesseract".
/// </returns>
private static string ResolveTesseractExecutable()
{
    string?[] candidates =
    [
        Environment.GetEnvironmentVariable("ROLEMASTERDB_TESSERACT_PATH"),
        TesseractExeDefaultPath
    ];

    foreach (var candidate in candidates)
    {
        if (!string.IsNullOrWhiteSpace(candidate) && File.Exists(candidate))
        {
            return candidate;
        }
    }

    // Nothing on disk matched: return the bare command name.
    return "tesseract";
}
/// <summary>
/// Chooses the tessdata directory to pass to Tesseract.
/// </summary>
/// <returns>
/// The first existing directory among the <c>ROLEMASTERDB_TESSDATA_PREFIX</c> environment
/// variable and <see cref="TessdataDefaultPath"/>, in that order; an empty string when neither
/// exists.
/// </returns>
private static string ResolveTessdataPath()
{
    var overridePath = Environment.GetEnvironmentVariable("ROLEMASTERDB_TESSDATA_PREFIX");

    return new[] { overridePath, TessdataDefaultPath }
        .FirstOrDefault(path => !string.IsNullOrWhiteSpace(path) && Directory.Exists(path))
        ?? string.Empty;
}
}

View File

@@ -1,7 +1,7 @@
namespace RolemasterDb.ImportTool.Parsing; namespace RolemasterDb.ImportTool.Parsing;
internal sealed class ColumnarCellLine(string text, List<XmlTextFragment> fragments) internal sealed class ColumnarCellLine(string text, List<PositionedTextFragment> fragments)
{ {
public string Text { get; } = text; public string Text { get; } = text;
public List<XmlTextFragment> Fragments { get; } = fragments; public List<PositionedTextFragment> Fragments { get; } = fragments;
} }

View File

@@ -3,13 +3,15 @@ namespace RolemasterDb.ImportTool.Parsing;
public sealed class CriticalTableParseResult( public sealed class CriticalTableParseResult(
ParsedCriticalTable table, ParsedCriticalTable table,
IReadOnlyList<ParsedPdfPageGeometry> pageGeometries, IReadOnlyList<ParsedPdfPageGeometry> pageGeometries,
IReadOnlyList<XmlTextFragment> fragments, IReadOnlyList<PositionedTextFragment> fragments,
SourceRenderProfile renderProfile,
IReadOnlyList<ParsedCriticalCellArtifact> cells, IReadOnlyList<ParsedCriticalCellArtifact> cells,
ImportValidationReport validationReport) ImportValidationReport validationReport)
{ {
public ParsedCriticalTable Table { get; } = table; public ParsedCriticalTable Table { get; } = table;
public IReadOnlyList<ParsedPdfPageGeometry> PageGeometries { get; } = pageGeometries; public IReadOnlyList<ParsedPdfPageGeometry> PageGeometries { get; } = pageGeometries;
public IReadOnlyList<XmlTextFragment> Fragments { get; } = fragments; public IReadOnlyList<PositionedTextFragment> Fragments { get; } = fragments;
public SourceRenderProfile RenderProfile { get; } = renderProfile;
public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; } = cells; public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; } = cells;
public ImportValidationReport ValidationReport { get; } = validationReport; public ImportValidationReport ValidationReport { get; } = validationReport;
} }

View File

@@ -22,7 +22,7 @@ internal static class CriticalTableParserSupport
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|)\d+\)$", RegexOptions.Compiled); private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|)\d+\)$", RegexOptions.Compiled);
private static readonly Regex BoundaryBonusLineRegex = new(@"^(?:all allies|all foe's allies|all foes|all opponents)\b", RegexOptions.IgnoreCase | RegexOptions.Compiled); private static readonly Regex BoundaryBonusLineRegex = new(@"^(?:all allies|all foe's allies|all foes|all opponents)\b", RegexOptions.IgnoreCase | RegexOptions.Compiled);
internal static List<XmlTextFragment> LoadFragments(string xmlContent) internal static List<PositionedTextFragment> LoadFragments(string xmlContent)
{ {
using var stringReader = new StringReader(xmlContent); using var stringReader = new StringReader(xmlContent);
using var xmlReader = XmlReader.Create( using var xmlReader = XmlReader.Create(
@@ -39,7 +39,7 @@ internal static class CriticalTableParserSupport
{ {
var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1"); var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1");
return page.Elements("text") return page.Elements("text")
.Select(item => new XmlTextFragment( .Select(item => new PositionedTextFragment(
pageNumber, pageNumber,
int.Parse(item.Attribute("top")?.Value ?? throw new InvalidOperationException("Missing text top attribute.")), int.Parse(item.Attribute("top")?.Value ?? throw new InvalidOperationException("Missing text top attribute.")),
int.Parse(item.Attribute("left")?.Value ?? throw new InvalidOperationException("Missing text left attribute.")), int.Parse(item.Attribute("left")?.Value ?? throw new InvalidOperationException("Missing text left attribute.")),
@@ -73,8 +73,8 @@ internal static class CriticalTableParserSupport
.ToList(); .ToList();
} }
internal static List<XmlTextFragment> FindRowLabelFragments( internal static List<PositionedTextFragment> FindRowLabelFragments(
IReadOnlyList<XmlTextFragment> fragments, IReadOnlyList<PositionedTextFragment> fragments,
int leftCutoff, int leftCutoff,
int bodyStartTop, int bodyStartTop,
int keyTop) int keyTop)
@@ -89,7 +89,7 @@ internal static class CriticalTableParserSupport
.ThenBy(item => item.Left) .ThenBy(item => item.Left)
.ToList(); .ToList();
var merged = new List<XmlTextFragment>(); var merged = new List<PositionedTextFragment>();
for (var index = 0; index < candidates.Count; index++) for (var index = 0; index < candidates.Count; index++)
{ {
@@ -107,7 +107,7 @@ internal static class CriticalTableParserSupport
} }
} }
var deduped = new List<XmlTextFragment>(); var deduped = new List<PositionedTextFragment>();
foreach (var candidate in merged) foreach (var candidate in merged)
{ {
@@ -128,7 +128,7 @@ internal static class CriticalTableParserSupport
internal static bool IsRollBandLabel(string value) => internal static bool IsRollBandLabel(string value) =>
Regex.IsMatch(value.Trim(), @"^\d{2,3}(?:\s*-\s*\d{2,3})?$|^\d{2,3}\+$"); Regex.IsMatch(value.Trim(), @"^\d{2,3}(?:\s*-\s*\d{2,3})?$|^\d{2,3}\+$");
internal static bool IsPotentialRowLabelFragment(XmlTextFragment fragment, int leftCutoff) => internal static bool IsPotentialRowLabelFragment(PositionedTextFragment fragment, int leftCutoff) =>
fragment.Left < leftCutoff && fragment.Left < leftCutoff &&
(IsRollBandLabel(fragment.Text) || LooksLikeSplitRollBandStart(fragment.Text)); (IsRollBandLabel(fragment.Text) || LooksLikeSplitRollBandStart(fragment.Text));
@@ -163,9 +163,9 @@ internal static class CriticalTableParserSupport
return columns[^1].Key; return columns[^1].Key;
} }
internal static IReadOnlyList<ColumnarCellLine> BuildLines(IReadOnlyList<XmlTextFragment> fragments) internal static IReadOnlyList<ColumnarCellLine> BuildLines(IReadOnlyList<PositionedTextFragment> fragments)
{ {
var lines = new List<List<XmlTextFragment>>(); var lines = new List<List<PositionedTextFragment>>();
foreach (var fragment in fragments.OrderBy(item => item.Top).ThenBy(item => item.Left)) foreach (var fragment in fragments.OrderBy(item => item.Top).ThenBy(item => item.Left))
{ {
@@ -292,9 +292,9 @@ internal static class CriticalTableParserSupport
.Replace('', '\'') .Replace('', '\'')
.Trim(); .Trim();
private static List<XmlTextFragment> RemoveRedundantContainedFragments(IReadOnlyList<XmlTextFragment> fragments) private static List<PositionedTextFragment> RemoveRedundantContainedFragments(IReadOnlyList<PositionedTextFragment> fragments)
{ {
var redundant = new HashSet<XmlTextFragment>(); var redundant = new HashSet<PositionedTextFragment>();
foreach (var group in fragments.GroupBy(item => (item.PageNumber, item.Top, item.Height))) foreach (var group in fragments.GroupBy(item => (item.PageNumber, item.Top, item.Height)))
{ {
@@ -331,7 +331,7 @@ internal static class CriticalTableParserSupport
.ToList(); .ToList();
} }
private static bool IsHorizontallyContained(XmlTextFragment candidate, XmlTextFragment container) private static bool IsHorizontallyContained(PositionedTextFragment candidate, PositionedTextFragment container)
{ {
const int containmentTolerance = 1; const int containmentTolerance = 1;
@@ -353,7 +353,7 @@ internal static class CriticalTableParserSupport
return normalized.Length == 0 ? null : normalized; return normalized.Length == 0 ? null : normalized;
} }
internal static int FindKeyTop(IReadOnlyList<XmlTextFragment> fragments) => internal static int FindKeyTop(IReadOnlyList<PositionedTextFragment> fragments) =>
fragments fragments
.Where(item => .Where(item =>
string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) || string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
@@ -362,7 +362,7 @@ internal static class CriticalTableParserSupport
.Select(item => (int?)item.Top) .Select(item => (int?)item.Top)
.Min() ?? int.MaxValue; .Min() ?? int.MaxValue;
internal static AffixLegend ParseAffixLegend(IReadOnlyList<XmlTextFragment> fragments, int keyTop) internal static AffixLegend ParseAffixLegend(IReadOnlyList<PositionedTextFragment> fragments, int keyTop)
{ {
if (keyTop == int.MaxValue) if (keyTop == int.MaxValue)
{ {
@@ -401,12 +401,12 @@ internal static class CriticalTableParserSupport
supportsPowerPointModifier: footerText.Contains("powerpoint modification", StringComparison.OrdinalIgnoreCase)); supportsPowerPointModifier: footerText.Contains("powerpoint modification", StringComparison.OrdinalIgnoreCase));
} }
internal static List<XmlTextFragment> SplitBoundaryCrossingFragments( internal static List<PositionedTextFragment> SplitBoundaryCrossingFragments(
IReadOnlyList<XmlTextFragment> bodyFragments, IReadOnlyList<PositionedTextFragment> bodyFragments,
IReadOnlyList<(string Key, double CenterX)> columnCenters, IReadOnlyList<(string Key, double CenterX)> columnCenters,
IReadOnlySet<string> affixLegendSymbols) IReadOnlySet<string> affixLegendSymbols)
{ {
var splitFragments = new List<XmlTextFragment>(bodyFragments.Count); var splitFragments = new List<PositionedTextFragment>(bodyFragments.Count);
foreach (var fragment in bodyFragments) foreach (var fragment in bodyFragments)
{ {
@@ -417,7 +417,7 @@ internal static class CriticalTableParserSupport
} }
internal static List<(int Top, bool IsAffixLike)> BuildBodyLines( internal static List<(int Top, bool IsAffixLike)> BuildBodyLines(
IReadOnlyList<XmlTextFragment> bodyFragments, IReadOnlyList<PositionedTextFragment> bodyFragments,
IReadOnlyList<(string Key, double CenterX)> columnCenters, IReadOnlyList<(string Key, double CenterX)> columnCenters,
IReadOnlySet<string> affixLegendSymbols) IReadOnlySet<string> affixLegendSymbols)
{ {
@@ -440,7 +440,7 @@ internal static class CriticalTableParserSupport
return bodyLines; return bodyLines;
} }
internal static bool IsFooterPageNumberFragment(XmlTextFragment fragment, int keyTop) internal static bool IsFooterPageNumberFragment(PositionedTextFragment fragment, int keyTop)
{ {
if (keyTop == int.MaxValue) if (keyTop == int.MaxValue)
{ {
@@ -451,9 +451,9 @@ internal static class CriticalTableParserSupport
Regex.IsMatch(fragment.Text, @"^\d{2,3}$"); Regex.IsMatch(fragment.Text, @"^\d{2,3}$");
} }
internal static IEnumerable<List<XmlTextFragment>> GroupByTop(IReadOnlyList<XmlTextFragment> fragments) internal static IEnumerable<List<PositionedTextFragment>> GroupByTop(IReadOnlyList<PositionedTextFragment> fragments)
{ {
var groups = new List<List<XmlTextFragment>>(); var groups = new List<List<PositionedTextFragment>>();
foreach (var fragment in fragments) foreach (var fragment in fragments)
{ {
@@ -469,7 +469,7 @@ internal static class CriticalTableParserSupport
return groups; return groups;
} }
internal static List<RowAnchor> CreateRowAnchors(IReadOnlyList<XmlTextFragment> rowLabelFragments) => internal static List<RowAnchor> CreateRowAnchors(IReadOnlyList<PositionedTextFragment> rowLabelFragments) =>
rowLabelFragments rowLabelFragments
.OrderBy(item => item.Top) .OrderBy(item => item.Top)
.Select((item, index) => new RowAnchor(NormalizeRollBandLabel(item.Text), item.Top, index + 1)) .Select((item, index) => new RowAnchor(NormalizeRollBandLabel(item.Text), item.Top, index + 1))
@@ -489,13 +489,13 @@ internal static class CriticalTableParserSupport
rowAnchors[0].Top - HeaderToRowLabelMinimumGap - TopGroupingTolerance)); rowAnchors[0].Top - HeaderToRowLabelMinimumGap - TopGroupingTolerance));
} }
internal static List<XmlTextFragment> BuildBodyFragments( internal static List<PositionedTextFragment> BuildBodyFragments(
IReadOnlyList<XmlTextFragment> fragments, IReadOnlyList<PositionedTextFragment> fragments,
int bodyStartTop, int bodyStartTop,
int keyTop, int keyTop,
int leftCutoff, int leftCutoff,
IReadOnlyList<RowAnchor> rowAnchors, IReadOnlyList<RowAnchor> rowAnchors,
IReadOnlyCollection<XmlTextFragment> excludedFragments, IReadOnlyCollection<PositionedTextFragment> excludedFragments,
IReadOnlyList<(string Key, double CenterX)> columnCenters, IReadOnlyList<(string Key, double CenterX)> columnCenters,
IReadOnlySet<string> affixLegendSymbols) IReadOnlySet<string> affixLegendSymbols)
{ {
@@ -580,7 +580,9 @@ internal static class CriticalTableParserSupport
AffixLegend affixLegend, AffixLegend affixLegend,
List<ParsedCriticalCellArtifact> parsedCells, List<ParsedCriticalCellArtifact> parsedCells,
List<ParsedCriticalResult> parsedResults, List<ParsedCriticalResult> parsedResults,
List<string> validationErrors) List<string> validationErrors,
List<string>? validationWarnings = null,
bool downgradeCellContentValidationToWarnings = false)
{ {
var sharedLegend = ToSharedAffixLegend(affixLegend); var sharedLegend = ToSharedAffixLegend(affixLegend);
@@ -589,8 +591,16 @@ internal static class CriticalTableParserSupport
var lineTexts = cellEntry.Lines.Select(line => line.Text).ToList(); var lineTexts = cellEntry.Lines.Select(line => line.Text).ToList();
var content = SharedParsing.CriticalCellTextParser.Parse(lineTexts, sharedLegend); var content = SharedParsing.CriticalCellTextParser.Parse(lineTexts, sharedLegend);
var sourceBounds = BuildSourceBounds(cellEntry.Lines.SelectMany(line => line.Fragments).ToList()); var sourceBounds = BuildSourceBounds(cellEntry.Lines.SelectMany(line => line.Fragments).ToList());
validationErrors.AddRange(content.ValidationErrors.Select(error => var contentIssues = content.ValidationErrors.Select(error =>
$"Cell '{BuildCellIdentifier(cellEntry)}': {error}")); $"Cell '{BuildCellIdentifier(cellEntry)}': {error}");
if (downgradeCellContentValidationToWarnings)
{
validationWarnings?.AddRange(contentIssues);
}
else
{
validationErrors.AddRange(contentIssues);
}
var effects = content.Effects.Select(ToImportToolEffect).ToList(); var effects = content.Effects.Select(ToImportToolEffect).ToList();
var branches = content.Branches.Select(ToImportToolBranch).ToList(); var branches = content.Branches.Select(ToImportToolBranch).ToList();
@@ -621,7 +631,7 @@ internal static class CriticalTableParserSupport
} }
} }
private static ParsedCriticalSourceRect BuildSourceBounds(IReadOnlyList<XmlTextFragment> fragments) private static ParsedCriticalSourceRect BuildSourceBounds(IReadOnlyList<PositionedTextFragment> fragments)
{ {
if (fragments.Count == 0) if (fragments.Count == 0)
{ {
@@ -688,7 +698,7 @@ internal static class CriticalTableParserSupport
private static bool LooksLikeSplitRollBandStart(string value) => private static bool LooksLikeSplitRollBandStart(string value) =>
Regex.IsMatch(value.Trim(), @"^\d{2,3}\s*-$"); Regex.IsMatch(value.Trim(), @"^\d{2,3}\s*-$");
private static bool TryMergeSplitRollBand(IReadOnlyList<XmlTextFragment> candidates, int index, out XmlTextFragment mergedCandidate) private static bool TryMergeSplitRollBand(IReadOnlyList<PositionedTextFragment> candidates, int index, out PositionedTextFragment mergedCandidate)
{ {
var current = candidates[index]; var current = candidates[index];
if (!LooksLikeSplitRollBandStart(current.Text) || index + 1 >= candidates.Count) if (!LooksLikeSplitRollBandStart(current.Text) || index + 1 >= candidates.Count)
@@ -712,7 +722,7 @@ internal static class CriticalTableParserSupport
var mergedLabel = $"{startDigits}-{next.Text.Trim()}"; var mergedLabel = $"{startDigits}-{next.Text.Trim()}";
var right = Math.Max(current.Left + current.Width, next.Left + next.Width); var right = Math.Max(current.Left + current.Width, next.Left + next.Width);
mergedCandidate = new XmlTextFragment( mergedCandidate = new PositionedTextFragment(
current.PageNumber, current.PageNumber,
current.Top, current.Top,
Math.Min(current.Left, next.Left), Math.Min(current.Left, next.Left),
@@ -722,8 +732,8 @@ internal static class CriticalTableParserSupport
return true; return true;
} }
private static IReadOnlyList<XmlTextFragment> SplitBoundaryCrossingFragment( private static IReadOnlyList<PositionedTextFragment> SplitBoundaryCrossingFragment(
XmlTextFragment fragment, PositionedTextFragment fragment,
IReadOnlyList<(string Key, double CenterX)> columnCenters, IReadOnlyList<(string Key, double CenterX)> columnCenters,
IReadOnlySet<string> affixLegendSymbols) IReadOnlySet<string> affixLegendSymbols)
{ {
@@ -746,8 +756,8 @@ internal static class CriticalTableParserSupport
return [fragment]; return [fragment];
} }
private static IReadOnlyList<XmlTextFragment> BuildSplitFragmentsFromMatches( private static IReadOnlyList<PositionedTextFragment> BuildSplitFragmentsFromMatches(
XmlTextFragment fragment, PositionedTextFragment fragment,
MatchCollection matches, MatchCollection matches,
IReadOnlyList<(string Key, double CenterX)> columnCenters) IReadOnlyList<(string Key, double CenterX)> columnCenters)
{ {
@@ -757,7 +767,7 @@ internal static class CriticalTableParserSupport
} }
var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1); var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
var splitFragments = new List<XmlTextFragment>(matches.Count); var splitFragments = new List<PositionedTextFragment>(matches.Count);
foreach (Match match in matches) foreach (Match match in matches)
{ {
@@ -770,7 +780,7 @@ internal static class CriticalTableParserSupport
var segmentLeft = fragment.Left + (int)Math.Round(characterWidth * match.Index); var segmentLeft = fragment.Left + (int)Math.Round(characterWidth * match.Index);
var segmentWidth = Math.Max(1, (int)Math.Round(characterWidth * match.Length)); var segmentWidth = Math.Max(1, (int)Math.Round(characterWidth * match.Length));
splitFragments.Add(new XmlTextFragment( splitFragments.Add(new PositionedTextFragment(
fragment.PageNumber, fragment.PageNumber,
fragment.Top, fragment.Top,
segmentLeft, segmentLeft,
@@ -796,9 +806,9 @@ internal static class CriticalTableParserSupport
} }
private static bool TrySplitProseFragmentAtBoundaries( private static bool TrySplitProseFragmentAtBoundaries(
XmlTextFragment fragment, PositionedTextFragment fragment,
IReadOnlyList<(string Key, double CenterX)> columnCenters, IReadOnlyList<(string Key, double CenterX)> columnCenters,
out IReadOnlyList<XmlTextFragment> splitFragments) out IReadOnlyList<PositionedTextFragment> splitFragments)
{ {
splitFragments = null!; splitFragments = null!;
@@ -808,7 +818,7 @@ internal static class CriticalTableParserSupport
return false; return false;
} }
var segments = new List<XmlTextFragment>(); var segments = new List<PositionedTextFragment>();
var segmentStart = 0; var segmentStart = 0;
var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1); var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
@@ -839,7 +849,7 @@ internal static class CriticalTableParserSupport
} }
private static List<int> FindBoundarySplitIndexes( private static List<int> FindBoundarySplitIndexes(
XmlTextFragment fragment, PositionedTextFragment fragment,
IReadOnlyList<(string Key, double CenterX)> columnCenters) IReadOnlyList<(string Key, double CenterX)> columnCenters)
{ {
var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1); var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
@@ -907,8 +917,8 @@ internal static class CriticalTableParserSupport
return bestIndex; return bestIndex;
} }
private static XmlTextFragment? CreateFragmentSegment( private static PositionedTextFragment? CreateFragmentSegment(
XmlTextFragment fragment, PositionedTextFragment fragment,
int startIndex, int startIndex,
int length, int length,
double characterWidth) double characterWidth)
@@ -940,7 +950,7 @@ internal static class CriticalTableParserSupport
var actualLength = trimmedEnd - trimmedStart + 1; var actualLength = trimmedEnd - trimmedStart + 1;
var segmentText = CollapseWhitespace(fragment.Text.Substring(actualStart, actualLength)); var segmentText = CollapseWhitespace(fragment.Text.Substring(actualStart, actualLength));
return new XmlTextFragment( return new PositionedTextFragment(
fragment.PageNumber, fragment.PageNumber,
fragment.Top, fragment.Top,
fragment.Left + (int)Math.Round(characterWidth * actualStart), fragment.Left + (int)Math.Round(characterWidth * actualStart),
@@ -950,7 +960,7 @@ internal static class CriticalTableParserSupport
} }
private static bool CrossesColumnBoundary( private static bool CrossesColumnBoundary(
XmlTextFragment fragment, PositionedTextFragment fragment,
IReadOnlyList<(string Key, double CenterX)> columnCenters) IReadOnlyList<(string Key, double CenterX)> columnCenters)
{ {
var fragmentRight = fragment.Left + fragment.Width; var fragmentRight = fragment.Left + fragment.Width;

View File

@@ -14,10 +14,10 @@ public sealed class GroupedVariantCriticalTableParser
new("SLAYING", "Slaying", "variant", 2) new("SLAYING", "Slaying", "variant", 2)
]; ];
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, ExtractedCriticalSource source)
{ {
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent); var fragments = source.Fragments;
var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent); var pageGeometries = source.PageGeometries;
var groupHeaders = FindGroupHeaders(fragments); var groupHeaders = FindGroupHeaders(fragments);
var columnHeaders = FindColumnHeaders(fragments); var columnHeaders = FindColumnHeaders(fragments);
var validationErrors = new List<string>(); var validationErrors = new List<string>();
@@ -50,7 +50,7 @@ public sealed class GroupedVariantCriticalTableParser
if (rowAnchors.Count == 0) if (rowAnchors.Count == 0)
{ {
validationErrors.Add("No roll-band labels were found in the XML artifact."); validationErrors.Add("No roll-band labels were found in the source artifact.");
} }
var columnCenters = combinedColumnAnchors var columnCenters = combinedColumnAnchors
@@ -136,16 +136,28 @@ public sealed class GroupedVariantCriticalTableParser
entry.DisplayName, entry.DisplayName,
entry.Family, entry.Family,
Path.GetFileName(entry.PdfPath), Path.GetFileName(entry.PdfPath),
"Imported from PDF XML extraction.", source.ImportNotes,
ExpectedGroups, ExpectedGroups,
ExpectedColumns, ExpectedColumns,
parsedRollBands, parsedRollBands,
parsedResults); parsedResults);
return new CriticalTableParseResult(table, pageGeometries, fragments, parsedCells, validationReport); return new CriticalTableParseResult(table, pageGeometries, fragments, source.RenderProfile, parsedCells, validationReport);
} }
private static List<XmlTextFragment> FindGroupHeaders(IReadOnlyList<XmlTextFragment> fragments) public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
return Parse(
entry,
new ExtractedCriticalSource(
"xml",
"Imported from PDF XML extraction.",
SourceRenderProfile.XmlAligned(),
CriticalTableParserSupport.LoadPageGeometries(xmlContent),
CriticalTableParserSupport.LoadFragments(xmlContent)));
}
private static List<PositionedTextFragment> FindGroupHeaders(IReadOnlyList<PositionedTextFragment> fragments)
{ {
var expectedLabels = ExpectedGroups.Select(item => item.Label).ToList(); var expectedLabels = ExpectedGroups.Select(item => item.Label).ToList();
var headerCandidates = fragments var headerCandidates = fragments
@@ -164,10 +176,10 @@ public sealed class GroupedVariantCriticalTableParser
} }
} }
throw new InvalidOperationException("Could not find the grouped-variant section headers in the XML artifact."); throw new InvalidOperationException("Could not find the grouped-variant section headers in the source artifact.");
} }
private static List<XmlTextFragment> FindColumnHeaders(IReadOnlyList<XmlTextFragment> fragments) private static List<PositionedTextFragment> FindColumnHeaders(IReadOnlyList<PositionedTextFragment> fragments)
{ {
var expectedLabels = new[] { "normal", "slaying", "normal", "slaying" }; var expectedLabels = new[] { "normal", "slaying", "normal", "slaying" };
var headerCandidates = fragments var headerCandidates = fragments
@@ -190,6 +202,6 @@ public sealed class GroupedVariantCriticalTableParser
} }
} }
throw new InvalidOperationException("Could not find the grouped-variant column header row in the XML artifact."); throw new InvalidOperationException("Could not find the grouped-variant column header row in the source artifact.");
} }
} }

View File

@@ -0,0 +1,20 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// A piece of text together with the page and bounding box it was extracted at.
/// </summary>
/// <remarks>
/// Coordinates are stored exactly as supplied by the caller; this type does not scale or validate them.
/// </remarks>
public class PositionedTextFragment
{
    /// <summary>Creates a fragment from its page number, bounding box and text.</summary>
    /// <param name="pageNumber">Page the fragment belongs to.</param>
    /// <param name="top">Top edge of the bounding box.</param>
    /// <param name="left">Left edge of the bounding box.</param>
    /// <param name="width">Width of the bounding box.</param>
    /// <param name="height">Height of the bounding box.</param>
    /// <param name="text">Text content of the fragment.</param>
    /// <param name="confidence">
    /// Optional confidence score; <c>null</c> when the source supplies none
    /// (presumably an OCR recognition confidence — confirm against the OCR extractor).
    /// </param>
    public PositionedTextFragment(
        int pageNumber,
        int top,
        int left,
        int width,
        int height,
        string text,
        int? confidence = null)
    {
        PageNumber = pageNumber;
        Top = top;
        Left = left;
        Width = width;
        Height = height;
        Text = text;
        Confidence = confidence;
    }

    public int PageNumber { get; }

    public int Top { get; }

    public int Left { get; }

    public int Width { get; }

    public int Height { get; }

    public string Text { get; }

    public int? Confidence { get; }

    /// <summary>Horizontal midpoint of the bounding box (may be fractional for odd widths).</summary>
    public double CenterX
    {
        get { return Left + (Width / 2.0); }
    }
}

View File

@@ -2,12 +2,14 @@ namespace RolemasterDb.ImportTool.Parsing;
public sealed class StandardCriticalTableParseResult( public sealed class StandardCriticalTableParseResult(
ParsedCriticalTable table, ParsedCriticalTable table,
IReadOnlyList<XmlTextFragment> fragments, IReadOnlyList<PositionedTextFragment> fragments,
SourceRenderProfile renderProfile,
IReadOnlyList<ParsedCriticalCellArtifact> cells, IReadOnlyList<ParsedCriticalCellArtifact> cells,
ImportValidationReport validationReport) ImportValidationReport validationReport)
{ {
public ParsedCriticalTable Table { get; } = table; public ParsedCriticalTable Table { get; } = table;
public IReadOnlyList<XmlTextFragment> Fragments { get; } = fragments; public IReadOnlyList<PositionedTextFragment> Fragments { get; } = fragments;
public SourceRenderProfile RenderProfile { get; } = renderProfile;
public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; } = cells; public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; } = cells;
public ImportValidationReport ValidationReport { get; } = validationReport; public ImportValidationReport ValidationReport { get; } = validationReport;
} }

View File

@@ -2,23 +2,140 @@ namespace RolemasterDb.ImportTool.Parsing;
public sealed class StandardCriticalTableParser public sealed class StandardCriticalTableParser
{ {
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) internal CriticalTableParseResult Parse(CriticalImportManifestEntry entry, ExtractedCriticalSource source, StandardTableLayout? layout = null)
{ {
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent); var fragments = source.Fragments;
var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent); var pageGeometries = source.PageGeometries;
var headerFragments = FindHeaderFragments(fragments);
var validationErrors = new List<string>(); var validationErrors = new List<string>();
var validationWarnings = new List<string>(); var validationWarnings = new List<string>();
layout ??= BuildLayout(fragments, validationErrors);
validationWarnings.AddRange(layout.Warnings);
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, layout.KeyTop);
var affixLegendSymbols = affixLegend.ClassificationSymbols;
var bodyFragments = CriticalTableParserSupport.BuildBodyFragments(
fragments,
layout.BodyStartTop,
layout.KeyTop,
layout.LeftCutoff,
layout.RowAnchors,
layout.ExcludedFragments,
layout.ColumnCenters,
affixLegendSymbols);
var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, layout.ColumnCenters, affixLegendSymbols);
var parsedRollBands = layout.RowAnchors
.Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
.ToList();
var cellEntries = new List<ColumnarCellEntry>();
for (var rowIndex = 0; rowIndex < layout.RowAnchors.Count; rowIndex++)
{
var rowStart = rowIndex == 0
? layout.BodyStartTop
: CriticalTableParserSupport.ResolveRowBoundaryTop(layout.RowAnchors[rowIndex - 1], layout.RowAnchors[rowIndex], bodyLines);
var rowEnd = rowIndex == layout.RowAnchors.Count - 1
? layout.KeyTop - 1
: CriticalTableParserSupport.ResolveRowBoundaryTop(layout.RowAnchors[rowIndex], layout.RowAnchors[rowIndex + 1], bodyLines);
var rowFragments = bodyFragments
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
.ToList();
foreach (var columnAnchor in layout.ColumnCenters)
{
var cellFragments = rowFragments
.Where(item => CriticalTableParserSupport.ResolveColumn(item.CenterX, layout.ColumnCenters) == columnAnchor.Key)
.OrderBy(item => item.Top)
.ThenBy(item => item.Left)
.ToList();
if (cellFragments.Count == 0)
{
validationErrors.Add($"Missing content for roll band '{layout.RowAnchors[rowIndex].Label}', column '{columnAnchor.Key}'.");
continue;
}
cellEntries.Add(new ColumnarCellEntry(
null,
layout.RowAnchors[rowIndex].Label,
rowIndex,
columnAnchor.Key,
CriticalTableParserSupport.BuildLines(cellFragments).ToList()));
}
}
CriticalTableParserSupport.RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);
var parsedCells = new List<ParsedCriticalCellArtifact>();
var parsedResults = new List<ParsedCriticalResult>();
CriticalTableParserSupport.BuildParsedArtifacts(
cellEntries,
affixLegend,
parsedCells,
parsedResults,
validationErrors,
validationWarnings,
downgradeCellContentValidationToWarnings: string.Equals(source.ExtractionMethod, "ocr", StringComparison.OrdinalIgnoreCase));
if (layout.ColumnCenters.Count != 5)
{
validationErrors.Add($"Expected 5 standard-table columns but found {layout.ColumnCenters.Count}.");
}
if (parsedCells.Count != layout.RowAnchors.Count * layout.ColumnCenters.Count)
{
validationErrors.Add(
$"Expected {layout.RowAnchors.Count * layout.ColumnCenters.Count} parsed cells but produced {parsedCells.Count}.");
}
var validationReport = new ImportValidationReport(
validationErrors.Count == 0,
validationErrors,
validationWarnings,
layout.RowAnchors.Count,
parsedCells.Count);
var table = new ParsedCriticalTable(
entry.Slug,
entry.DisplayName,
entry.Family,
Path.GetFileName(entry.PdfPath),
source.ImportNotes,
[],
layout.ColumnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(),
parsedRollBands,
parsedResults);
return new CriticalTableParseResult(table, pageGeometries, fragments, source.RenderProfile, parsedCells, validationReport);
}
/// <summary>
/// Convenience overload that parses a standard critical table directly from XML text.
/// </summary>
/// <param name="entry">Manifest entry describing the table being imported.</param>
/// <param name="xmlContent">XML content from which page geometries and fragments are loaded.</param>
/// <returns>The result of the source-based <c>Parse</c> overload, run with an automatically built layout.</returns>
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
    // Wrap the XML in the same source shape the source-based overload consumes.
    var renderProfile = SourceRenderProfile.XmlAligned();
    var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
    var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
    var xmlSource = new ExtractedCriticalSource(
        "xml",
        "Imported from PDF XML extraction.",
        renderProfile,
        pageGeometries,
        fragments);

    return Parse(entry, xmlSource);
}
private static StandardTableLayout BuildLayout(
IReadOnlyList<PositionedTextFragment> fragments,
ICollection<string> validationErrors)
{
var headerFragments = FindHeaderFragments(fragments);
var columnCenters = headerFragments var columnCenters = headerFragments
.OrderBy(item => item.Left) .OrderBy(item => item.Left)
.Select(item => (Key: item.Text.ToUpperInvariant(), CenterX: item.CenterX)) .Select(item => (Key: item.Text.ToUpperInvariant(), CenterX: item.CenterX))
.ToList(); .ToList();
var headerTop = headerFragments.Max(item => item.Top); var headerTop = headerFragments.Max(item => item.Top);
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments); var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop);
var affixLegendSymbols = affixLegend.ClassificationSymbols;
var leftCutoff = headerFragments.Min(item => item.Left) - 10; var leftCutoff = headerFragments.Min(item => item.Left) - 10;
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments( var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
fragments, fragments,
@@ -30,102 +147,13 @@ public sealed class StandardCriticalTableParser
if (rowAnchors.Count == 0) if (rowAnchors.Count == 0)
{ {
validationErrors.Add("No roll-band labels were found in the XML artifact."); validationErrors.Add("No roll-band labels were found in the source artifact.");
} }
var bodyFragments = CriticalTableParserSupport.BuildBodyFragments( return new StandardTableLayout(headerFragments, columnCenters, rowAnchors, headerTop, bodyStartTop, keyTop, leftCutoff);
fragments,
bodyStartTop,
keyTop,
leftCutoff,
rowAnchors,
headerFragments,
columnCenters,
affixLegendSymbols);
var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
var parsedRollBands = rowAnchors
.Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
.ToList();
var cellEntries = new List<ColumnarCellEntry>();
for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
{
var rowStart = rowIndex == 0
? bodyStartTop
: CriticalTableParserSupport.ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines);
var rowEnd = rowIndex == rowAnchors.Count - 1
? keyTop - 1
: CriticalTableParserSupport.ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines);
var rowFragments = bodyFragments
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
.ToList();
foreach (var columnAnchor in columnCenters)
{
var cellFragments = rowFragments
.Where(item => CriticalTableParserSupport.ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key)
.OrderBy(item => item.Top)
.ThenBy(item => item.Left)
.ToList();
if (cellFragments.Count == 0)
{
validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', column '{columnAnchor.Key}'.");
continue;
}
cellEntries.Add(new ColumnarCellEntry(
null,
rowAnchors[rowIndex].Label,
rowIndex,
columnAnchor.Key,
CriticalTableParserSupport.BuildLines(cellFragments).ToList()));
}
}
CriticalTableParserSupport.RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);
var parsedCells = new List<ParsedCriticalCellArtifact>();
var parsedResults = new List<ParsedCriticalResult>();
CriticalTableParserSupport.BuildParsedArtifacts(cellEntries, affixLegend, parsedCells, parsedResults, validationErrors);
if (columnCenters.Count != 5)
{
validationErrors.Add($"Expected 5 standard-table columns but found {columnCenters.Count}.");
}
if (parsedCells.Count != rowAnchors.Count * columnCenters.Count)
{
validationErrors.Add(
$"Expected {rowAnchors.Count * columnCenters.Count} parsed cells but produced {parsedCells.Count}.");
}
var validationReport = new ImportValidationReport(
validationErrors.Count == 0,
validationErrors,
validationWarnings,
rowAnchors.Count,
parsedCells.Count);
var table = new ParsedCriticalTable(
entry.Slug,
entry.DisplayName,
entry.Family,
Path.GetFileName(entry.PdfPath),
"Imported from PDF XML extraction.",
[],
columnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(),
parsedRollBands,
parsedResults);
return new CriticalTableParseResult(table, pageGeometries, fragments, parsedCells, validationReport);
} }
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments) private static List<PositionedTextFragment> FindHeaderFragments(IReadOnlyList<PositionedTextFragment> fragments)
{ {
var headerCandidates = fragments var headerCandidates = fragments
.Where(item => item.Text.Length == 1 && char.IsLetter(item.Text[0])) .Where(item => item.Text.Length == 1 && char.IsLetter(item.Text[0]))
@@ -143,6 +171,6 @@ public sealed class StandardCriticalTableParser
} }
} }
throw new InvalidOperationException("Could not find the standard-table A-E header row in the XML artifact."); throw new InvalidOperationException("Could not find the standard-table A-E header row in the source artifact.");
} }
} }

View File

@@ -0,0 +1,150 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Builds the <see cref="StandardTableLayout"/> for a standard critical table from extracted fragments,
/// checking the detected header row and roll-band labels against a <see cref="StandardTableAxisTemplate"/>.
/// </summary>
internal sealed class StandardOcrBootstrapper
{
    // Header or row-label fragments with a confidence below this value are listed in a warning.
    private const int AnchorConfidenceWarningThreshold = 85;

    // Largest allowed difference in Top between a fragment and the first fragment of its header-row group.
    private const int HeaderTopTolerance = 12;

    /// <summary>
    /// Locates the header row and row labels in <paramref name="source"/> and returns the resulting layout.
    /// </summary>
    /// <param name="source">Extracted source whose fragments are scanned.</param>
    /// <param name="template">Expected column keys and roll-band labels.</param>
    /// <returns>The layout, including any header-normalization or low-confidence warnings.</returns>
    /// <exception cref="InvalidOperationException">
    /// The header row cannot be found, or the detected row anchors differ from the template in count or labels.
    /// </exception>
    public StandardTableLayout Bootstrap(ExtractedCriticalSource source, StandardTableAxisTemplate template)
    {
        var allFragments = source.Fragments;
        var headerRow = FindHeaderFragments(allFragments, template);

        var columnCenters = new List<(string Key, double CenterX)>();
        foreach (var header in headerRow.OrderBy(item => item.Left))
        {
            columnCenters.Add((NormalizeHeaderText(header.Text), header.CenterX));
        }

        var headerTop = headerRow.Max(item => item.Top);
        var keyTop = CriticalTableParserSupport.FindKeyTop(allFragments);
        var leftCutoff = ResolveRowLabelLeftCutoff(headerRow);
        var rowLabels = CriticalTableParserSupport.FindRowLabelFragments(
            allFragments,
            leftCutoff,
            headerTop + CriticalTableParserSupport.HeaderToRowLabelMinimumGap,
            keyTop);
        var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabels);
        var bodyStartTop = CriticalTableParserSupport.ResolveBodyStartTop(headerTop, rowAnchors);

        EnsureRowAnchorsMatchTemplate(rowAnchors, template);
        var warnings = CollectWarnings(headerRow, rowLabels);

        return new StandardTableLayout(
            headerRow,
            columnCenters,
            rowAnchors,
            headerTop,
            bodyStartTop,
            keyTop,
            leftCutoff,
            warnings);
    }

    // Throws when the detected row anchors disagree with the template, first on count and then on labels.
    private static void EnsureRowAnchorsMatchTemplate(
        IReadOnlyList<RowAnchor> rowAnchors,
        StandardTableAxisTemplate template)
    {
        if (rowAnchors.Count != template.RollBandLabels.Count)
        {
            throw new InvalidOperationException(
                $"OCR bootstrap found {rowAnchors.Count} row anchors but template '{template.Slug}' expects {template.RollBandLabels.Count}.");
        }

        var detectedLabels = rowAnchors.Select(item => item.Label).ToList();
        if (!detectedLabels.SequenceEqual(template.RollBandLabels, StringComparer.Ordinal))
        {
            throw new InvalidOperationException(
                $"OCR bootstrap row anchors do not match template '{template.Slug}'.");
        }
    }

    // Produces the header-normalization warning (if any) followed by the low-confidence warning (if any).
    private static List<string> CollectWarnings(
        IReadOnlyList<PositionedTextFragment> headerRow,
        IEnumerable<PositionedTextFragment> rowLabels)
    {
        var warnings = new List<string>();

        var normalizations = new List<string>();
        foreach (var header in headerRow)
        {
            var normalized = NormalizeHeaderText(header.Text);
            if (!string.Equals(header.Text, normalized, StringComparison.Ordinal))
            {
                normalizations.Add($"'{header.Text}' -> '{normalized}'");
            }
        }

        if (normalizations.Count > 0)
        {
            warnings.Add(
                $"OCR header normalization was applied for: {string.Join(", ", normalizations)}.");
        }

        // A null confidence never compares below the threshold, so such fragments are skipped.
        var lowConfidenceAnchors = headerRow
            .Concat(rowLabels)
            .Where(item => item.Confidence < AnchorConfidenceWarningThreshold)
            .Select(item => $"'{item.Text}' ({item.Confidence})")
            .ToList();

        if (lowConfidenceAnchors.Count > 0)
        {
            warnings.Add($"Low-confidence OCR anchors: {string.Join(", ", lowConfidenceAnchors)}.");
        }

        return warnings;
    }

    // Returns the first group of header-like fragments whose normalized texts, left to right,
    // equal the template's column keys.
    private static List<PositionedTextFragment> FindHeaderFragments(
        IReadOnlyList<PositionedTextFragment> fragments,
        StandardTableAxisTemplate template)
    {
        var candidates = fragments
            .Where(item => TryNormalizeHeaderText(item.Text, out _))
            .OrderBy(item => item.Top)
            .ThenBy(item => item.Left)
            .ToList();

        var matchingRow = GroupHeaderCandidates(candidates)
            .Select(row => row.OrderBy(item => item.Left).ToList())
            .FirstOrDefault(row => row
                .Select(item => NormalizeHeaderText(item.Text))
                .SequenceEqual(template.ColumnKeys, StringComparer.Ordinal));

        return matchingRow
            ?? throw new InvalidOperationException("Could not find the OCR standard-table A-E header row.");
    }

    // Strict variant of TryNormalizeHeaderText: throws for text that is not a recognized header.
    private static string NormalizeHeaderText(string value) =>
        TryNormalizeHeaderText(value, out var normalized)
            ? normalized
            : throw new InvalidOperationException($"Unsupported OCR header fragment '{value}'.");

    // Accepts A-E after trimming and upper-casing; "CC" is additionally accepted and mapped to "C".
    private static bool TryNormalizeHeaderText(string value, out string normalized)
    {
        var candidate = value.Trim().ToUpperInvariant();
        normalized = candidate == "CC" ? "C" : candidate;
        return normalized is "A" or "B" or "C" or "D" or "E";
    }

    // Splits fragments (in the order given) into runs; a new run starts whenever a fragment's Top
    // differs from the Top of the current run's first fragment by more than HeaderTopTolerance.
    private static IEnumerable<List<PositionedTextFragment>> GroupHeaderCandidates(IReadOnlyList<PositionedTextFragment> fragments)
    {
        var rows = new List<List<PositionedTextFragment>>();
        List<PositionedTextFragment>? currentRow = null;

        foreach (var fragment in fragments)
        {
            if (currentRow is null || Math.Abs(currentRow[0].Top - fragment.Top) > HeaderTopTolerance)
            {
                currentRow = [fragment];
                rows.Add(currentRow);
                continue;
            }

            currentRow.Add(fragment);
        }

        return rows;
    }

    // Cutoff is the leftmost header's Left minus half the gap to the second header
    // (or minus 10 when there is only one header), clamped at zero.
    private static int ResolveRowLabelLeftCutoff(IReadOnlyList<PositionedTextFragment> headerFragments)
    {
        var leftToRight = headerFragments.OrderBy(item => item.Left).ToList();
        var firstLeft = leftToRight[0].Left;
        var offset = leftToRight.Count < 2
            ? 10
            : (leftToRight[1].Left - firstLeft) / 2;

        return Math.Max(0, firstLeft - offset);
    }
}

View File

@@ -0,0 +1,11 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Names the expected axes of a standard table: its column keys and its roll-band labels.
/// </summary>
internal sealed class StandardTableAxisTemplate
{
    /// <summary>Creates a template from its identifier and the two expected label sequences.</summary>
    /// <param name="slug">Identifier of the template.</param>
    /// <param name="columnKeys">Expected column keys, in order.</param>
    /// <param name="rollBandLabels">Expected roll-band labels, in order.</param>
    public StandardTableAxisTemplate(
        string slug,
        IReadOnlyList<string> columnKeys,
        IReadOnlyList<string> rollBandLabels)
    {
        Slug = slug;
        ColumnKeys = columnKeys;
        RollBandLabels = rollBandLabels;
    }

    public string Slug { get; }

    public IReadOnlyList<string> ColumnKeys { get; }

    public IReadOnlyList<string> RollBandLabels { get; }
}

View File

@@ -0,0 +1,17 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Looks up the built-in <see cref="StandardTableAxisTemplate"/> definitions by slug.
/// </summary>
internal static class StandardTableAxisTemplateCatalog
{
    /// <summary>
    /// Returns a new template instance for <paramref name="slug"/> (compared case-insensitively).
    /// </summary>
    /// <param name="slug">Template identifier; only "mana-standard-19" is known.</param>
    /// <exception cref="InvalidOperationException">The slug is null or not a known template.</exception>
    internal static StandardTableAxisTemplate Resolve(string? slug)
    {
        if (!string.Equals(slug, "mana-standard-19", StringComparison.OrdinalIgnoreCase))
        {
            throw new InvalidOperationException($"Unsupported standard-table axis template '{slug ?? "<null>"}'.");
        }

        // Five columns and nineteen roll bands.
        IReadOnlyList<string> columnKeys = ["A", "B", "C", "D", "E"];
        IReadOnlyList<string> rollBandLabels =
        [
            "01-05", "06-10", "11-15", "16-20", "21-35", "36-45", "46-50", "51-55", "56-60", "61-65",
            "66", "67-70", "71-75", "76-80", "81-85", "86-90", "91-95", "96-99", "100"
        ];

        return new StandardTableAxisTemplate("mana-standard-19", columnKeys, rollBandLabels);
    }
}

View File

@@ -0,0 +1,21 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Geometry of a standard table: column centers, row anchors and the vertical and horizontal
/// bounds used when assigning fragments to cells, plus any warnings gathered while detecting it.
/// </summary>
internal sealed class StandardTableLayout
{
    /// <summary>Creates a layout from its detected parts.</summary>
    /// <param name="excludedFragments">Fragments to exclude from the table body.</param>
    /// <param name="columnCenters">Column key and horizontal center for each column.</param>
    /// <param name="rowAnchors">Anchors for each roll-band row.</param>
    /// <param name="headerTop">Top coordinate of the header row.</param>
    /// <param name="bodyStartTop">Top coordinate at which the table body starts.</param>
    /// <param name="keyTop">Top coordinate of the key below the table body.</param>
    /// <param name="leftCutoff">Horizontal cutoff separating row labels from body content.</param>
    /// <param name="warnings">Optional warnings; an empty list is used when omitted or null.</param>
    public StandardTableLayout(
        IReadOnlyList<PositionedTextFragment> excludedFragments,
        IReadOnlyList<(string Key, double CenterX)> columnCenters,
        IReadOnlyList<RowAnchor> rowAnchors,
        int headerTop,
        int bodyStartTop,
        int keyTop,
        int leftCutoff,
        IReadOnlyList<string>? warnings = null)
    {
        ExcludedFragments = excludedFragments;
        ColumnCenters = columnCenters;
        RowAnchors = rowAnchors;
        HeaderTop = headerTop;
        BodyStartTop = bodyStartTop;
        KeyTop = keyTop;
        LeftCutoff = leftCutoff;
        Warnings = warnings ?? [];
    }

    public IReadOnlyList<PositionedTextFragment> ExcludedFragments { get; }

    public IReadOnlyList<(string Key, double CenterX)> ColumnCenters { get; }

    public IReadOnlyList<RowAnchor> RowAnchors { get; }

    public int HeaderTop { get; }

    public int BodyStartTop { get; }

    public int KeyTop { get; }

    public int LeftCutoff { get; }

    /// <summary>Never null; empty when no warnings were supplied.</summary>
    public IReadOnlyList<string> Warnings { get; }
}

View File

@@ -11,10 +11,10 @@ public sealed class VariantColumnCriticalTableParser
new("SLAYING", "Slaying") new("SLAYING", "Slaying")
]; ];
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, ExtractedCriticalSource source)
{ {
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent); var fragments = source.Fragments;
var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent); var pageGeometries = source.PageGeometries;
var headerFragments = FindHeaderFragments(fragments); var headerFragments = FindHeaderFragments(fragments);
var validationErrors = new List<string>(); var validationErrors = new List<string>();
var validationWarnings = new List<string>(); var validationWarnings = new List<string>();
@@ -43,7 +43,7 @@ public sealed class VariantColumnCriticalTableParser
if (rowAnchors.Count == 0) if (rowAnchors.Count == 0)
{ {
validationErrors.Add("No roll-band labels were found in the XML artifact."); validationErrors.Add("No roll-band labels were found in the source artifact.");
} }
var columnCenters = columnAnchors var columnCenters = columnAnchors
@@ -132,16 +132,28 @@ public sealed class VariantColumnCriticalTableParser
entry.DisplayName, entry.DisplayName,
entry.Family, entry.Family,
Path.GetFileName(entry.PdfPath), Path.GetFileName(entry.PdfPath),
"Imported from PDF XML extraction.", source.ImportNotes,
[], [],
ExpectedColumns.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Label, "variant", index + 1)).ToList(), ExpectedColumns.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Label, "variant", index + 1)).ToList(),
parsedRollBands, parsedRollBands,
parsedResults); parsedResults);
return new CriticalTableParseResult(table, pageGeometries, fragments, parsedCells, validationReport); return new CriticalTableParseResult(table, pageGeometries, fragments, source.RenderProfile, parsedCells, validationReport);
} }
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments) public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
return Parse(
entry,
new ExtractedCriticalSource(
"xml",
"Imported from PDF XML extraction.",
SourceRenderProfile.XmlAligned(),
CriticalTableParserSupport.LoadPageGeometries(xmlContent),
CriticalTableParserSupport.LoadFragments(xmlContent)));
}
private static List<PositionedTextFragment> FindHeaderFragments(IReadOnlyList<PositionedTextFragment> fragments)
{ {
var expectedLabels = ExpectedColumns var expectedLabels = ExpectedColumns
.Select(item => item.Label.ToLowerInvariant()) .Select(item => item.Label.ToLowerInvariant())
@@ -163,7 +175,7 @@ public sealed class VariantColumnCriticalTableParser
} }
} }
throw new InvalidOperationException("Could not find the variant-column header row in the XML artifact."); throw new InvalidOperationException("Could not find the variant-column header row in the source artifact.");
} }
private static ColumnDefinition ResolveColumnDefinition(string value) => private static ColumnDefinition ResolveColumnDefinition(string value) =>

View File

@@ -7,12 +7,6 @@ public sealed class XmlTextFragment(
int width, int width,
int height, int height,
string text) string text)
: PositionedTextFragment(pageNumber, top, left, width, height, text)
{ {
public int PageNumber { get; } = pageNumber;
public int Top { get; } = top;
public int Left { get; } = left;
public int Width { get; } = width;
public int Height { get; } = height;
public string Text { get; } = text;
public double CenterX => Left + (Width / 2.0);
} }

View File

@@ -0,0 +1,8 @@
namespace RolemasterDb.ImportTool;
/// <summary>
/// Basic facts about a PDF document: how many pages it has and its page size in points.
/// </summary>
public sealed class PdfDocumentInfo
{
    /// <summary>Creates the info record from a page count and a page size.</summary>
    /// <param name="pageCount">Number of pages in the document.</param>
    /// <param name="pageWidthPoints">Page width in points.</param>
    /// <param name="pageHeightPoints">Page height in points.</param>
    public PdfDocumentInfo(int pageCount, double pageWidthPoints, double pageHeightPoints)
    {
        PageCount = pageCount;
        PageWidthPoints = pageWidthPoints;
        PageHeightPoints = pageHeightPoints;
    }

    public int PageCount { get; }

    public double PageWidthPoints { get; }

    public double PageHeightPoints { get; }
}

View File

@@ -1,4 +1,6 @@
using System.Diagnostics; using System.Diagnostics;
using System.Globalization;
using System.Text.RegularExpressions;
namespace RolemasterDb.ImportTool; namespace RolemasterDb.ImportTool;
@@ -7,6 +9,7 @@ public sealed class PdfXmlExtractor
public const int RenderScaleFactor = 4; public const int RenderScaleFactor = 4;
public const int XmlAlignedRenderDpi = 108; public const int XmlAlignedRenderDpi = 108;
public const int ScaledRenderDpi = XmlAlignedRenderDpi * RenderScaleFactor; public const int ScaledRenderDpi = XmlAlignedRenderDpi * RenderScaleFactor;
private const string PortableMiKTeXPath = @"D:\Code\miktex-portable\texmfs\install\miktex\bin\x64";
public static int ScaleCoordinate(int value) => checked(value * RenderScaleFactor); public static int ScaleCoordinate(int value) => checked(value * RenderScaleFactor);
@@ -16,7 +19,7 @@ public sealed class PdfXmlExtractor
var startInfo = new ProcessStartInfo var startInfo = new ProcessStartInfo
{ {
FileName = "pdftohtml", FileName = ResolveExecutable("ROLEMASTERDB_PDFTOHTML_PATH", "pdftohtml.exe"),
RedirectStandardError = true, RedirectStandardError = true,
RedirectStandardOutput = true, RedirectStandardOutput = true,
UseShellExecute = false, UseShellExecute = false,
@@ -40,12 +43,57 @@ public sealed class PdfXmlExtractor
} }
} }
/// <summary>
/// Runs <c>pdfinfo</c> on a PDF and parses the page count and page size from its output.
/// </summary>
/// <param name="pdfPath">Path of the PDF passed to <c>pdfinfo</c> as its only argument.</param>
/// <param name="cancellationToken">Cancels reading the process output and waiting for exit.</param>
/// <returns>The page count and the page width/height in points reported by <c>pdfinfo</c>.</returns>
/// <exception cref="InvalidOperationException">
/// <c>pdfinfo</c> exits with a non-zero code, or its output lacks a parsable "Pages:" or "Page size:" line.
/// </exception>
public async Task<PdfDocumentInfo> ReadDocumentInfoAsync(string pdfPath, CancellationToken cancellationToken = default)
{
    var startInfo = new ProcessStartInfo
    {
        FileName = ResolveExecutable("ROLEMASTERDB_PDFINFO_PATH", "pdfinfo.exe"),
        RedirectStandardError = true,
        RedirectStandardOutput = true,
        UseShellExecute = false,
        CreateNoWindow = true
    };
    startInfo.ArgumentList.Add(pdfPath);

    using var process = new Process { StartInfo = startInfo };
    process.Start();

    // Both pipes are redirected, so both must be drained while the process runs: if only stdout
    // were read, a child that fills the stderr pipe buffer would block and never exit.
    var outputTask = process.StandardOutput.ReadToEndAsync(cancellationToken);
    var errorTask = process.StandardError.ReadToEndAsync(cancellationToken);
    await Task.WhenAll(outputTask, errorTask, process.WaitForExitAsync(cancellationToken));

    var output = await outputTask;
    var error = await errorTask;

    if (process.ExitCode != 0)
    {
        throw new InvalidOperationException($"pdfinfo failed for '{pdfPath}': {error}");
    }

    var pageCountMatch = Regex.Match(output, @"Pages:\s*(\d+)", RegexOptions.Multiline);
    var sizeMatch = Regex.Match(output, @"Page size:\s*([0-9.]+)\s*x\s*([0-9.]+)\s*pts", RegexOptions.Multiline);
    if (!pageCountMatch.Success || !sizeMatch.Success)
    {
        throw new InvalidOperationException($"pdfinfo output for '{pdfPath}' could not be parsed.");
    }

    // Invariant culture: pdfinfo prints sizes with '.' as the decimal separator (per the regex above).
    return new PdfDocumentInfo(
        int.Parse(pageCountMatch.Groups[1].Value, CultureInfo.InvariantCulture),
        double.Parse(sizeMatch.Groups[1].Value, CultureInfo.InvariantCulture),
        double.Parse(sizeMatch.Groups[2].Value, CultureInfo.InvariantCulture));
}
public Task RenderPagePngAsync( public Task RenderPagePngAsync(
string pdfPath, string pdfPath,
int pageNumber, int pageNumber,
string outputPath, string outputPath,
CancellationToken cancellationToken = default) => CancellationToken cancellationToken = default) =>
RenderPngAsync(pdfPath, pageNumber, outputPath, null, null, null, null, cancellationToken); RenderPagePngAsync(pdfPath, pageNumber, outputPath, ScaledRenderDpi, cancellationToken);
/// <summary>
/// Renders one whole PDF page to a PNG at the given resolution.
/// </summary>
/// <param name="pdfPath">Path of the source PDF.</param>
/// <param name="pageNumber">Page to render.</param>
/// <param name="outputPath">Path of the PNG to create.</param>
/// <param name="renderDpi">Resolution passed through to the renderer.</param>
/// <param name="cancellationToken">Token forwarded to the render operation.</param>
public Task RenderPagePngAsync(
    string pdfPath,
    int pageNumber,
    string outputPath,
    int renderDpi,
    CancellationToken cancellationToken = default)
{
    // The four nulls are the crop rectangle; passing none of them renders the full page.
    return RenderPngAsync(
        pdfPath,
        pageNumber,
        outputPath,
        renderDpi,
        null,
        null,
        null,
        null,
        cancellationToken);
}
public Task RenderCropPngAsync( public Task RenderCropPngAsync(
string pdfPath, string pdfPath,
@@ -56,12 +104,25 @@ public sealed class PdfXmlExtractor
int height, int height,
string outputPath, string outputPath,
CancellationToken cancellationToken = default) => CancellationToken cancellationToken = default) =>
RenderPngAsync(pdfPath, pageNumber, outputPath, left, top, width, height, cancellationToken); RenderCropPngAsync(pdfPath, pageNumber, left, top, width, height, outputPath, ScaledRenderDpi, cancellationToken);
/// <summary>
/// Renders a rectangular region of one PDF page to a PNG at the given resolution.
/// </summary>
/// <param name="pdfPath">Path of the source PDF.</param>
/// <param name="pageNumber">Page to render.</param>
/// <param name="left">Left edge of the crop rectangle.</param>
/// <param name="top">Top edge of the crop rectangle.</param>
/// <param name="width">Width of the crop rectangle.</param>
/// <param name="height">Height of the crop rectangle.</param>
/// <param name="outputPath">Path of the PNG to create.</param>
/// <param name="renderDpi">Resolution passed through to the renderer.</param>
/// <param name="cancellationToken">Token forwarded to the render operation.</param>
public Task RenderCropPngAsync(
    string pdfPath,
    int pageNumber,
    int left,
    int top,
    int width,
    int height,
    string outputPath,
    int renderDpi,
    CancellationToken cancellationToken = default)
{
    // Note the argument order change: the shared renderer takes the output path and DPI before the crop rectangle.
    return RenderPngAsync(
        pdfPath,
        pageNumber,
        outputPath,
        renderDpi,
        left,
        top,
        width,
        height,
        cancellationToken);
}
private static async Task RenderPngAsync( private static async Task RenderPngAsync(
string pdfPath, string pdfPath,
int pageNumber, int pageNumber,
string outputPath, string outputPath,
int renderDpi,
int? left, int? left,
int? top, int? top,
int? width, int? width,
@@ -72,7 +133,7 @@ public sealed class PdfXmlExtractor
var startInfo = new ProcessStartInfo var startInfo = new ProcessStartInfo
{ {
FileName = "pdftoppm", FileName = ResolveExecutable("ROLEMASTERDB_PDFTOPPM_PATH", "pdftoppm.exe"),
RedirectStandardError = true, RedirectStandardError = true,
RedirectStandardOutput = true, RedirectStandardOutput = true,
UseShellExecute = false, UseShellExecute = false,
@@ -81,7 +142,7 @@ public sealed class PdfXmlExtractor
startInfo.ArgumentList.Add("-png"); startInfo.ArgumentList.Add("-png");
startInfo.ArgumentList.Add("-r"); startInfo.ArgumentList.Add("-r");
startInfo.ArgumentList.Add(ScaledRenderDpi.ToString()); startInfo.ArgumentList.Add(renderDpi.ToString(CultureInfo.InvariantCulture));
startInfo.ArgumentList.Add("-f"); startInfo.ArgumentList.Add("-f");
startInfo.ArgumentList.Add(pageNumber.ToString()); startInfo.ArgumentList.Add(pageNumber.ToString());
startInfo.ArgumentList.Add("-l"); startInfo.ArgumentList.Add("-l");
@@ -118,4 +179,21 @@ public sealed class PdfXmlExtractor
throw new InvalidOperationException($"pdftoppm completed but did not create '{outputPath}'."); throw new InvalidOperationException($"pdftoppm completed but did not create '{outputPath}'.");
} }
} }
/// <summary>
/// Chooses the executable to launch for an external PDF tool.
/// </summary>
/// <param name="environmentVariableName">Environment variable that may hold an explicit path to the tool.</param>
/// <param name="executableName">File name of the tool including its extension, e.g. "pdfinfo.exe".</param>
/// <returns>
/// The environment-variable path when it is set and the file exists; otherwise the file under
/// <c>PortableMiKTeXPath</c> when it exists; otherwise the bare tool name without its extension.
/// </returns>
private static string ResolveExecutable(string environmentVariableName, string executableName)
{
    // 1. Explicit override. A value pointing at a missing file is ignored rather than treated as an error.
    if (Environment.GetEnvironmentVariable(environmentVariableName) is { } overridePath
        && !string.IsNullOrWhiteSpace(overridePath)
        && File.Exists(overridePath))
    {
        return overridePath;
    }

    // 2. Portable install directory, 3. bare name left for the process launcher to resolve.
    var bundledPath = Path.Combine(PortableMiKTeXPath, executableName);
    return File.Exists(bundledPath)
        ? bundledPath
        : Path.GetFileNameWithoutExtension(executableName);
}
} }

View File

@@ -0,0 +1,3 @@
using System.Runtime.CompilerServices;
[assembly: InternalsVisibleTo("RolemasterDb.ImportTool.Tests")]

View File

@@ -0,0 +1,15 @@
namespace RolemasterDb.ImportTool;
/// <summary>
/// Pairs a render resolution with the factor used to scale source coordinates.
/// </summary>
public sealed class SourceRenderProfile
{
    /// <summary>Creates a profile from a render DPI and a coordinate scale factor.</summary>
    public SourceRenderProfile(int renderDpi, int scaleFactor)
    {
        RenderDpi = renderDpi;
        ScaleFactor = scaleFactor;
    }

    public int RenderDpi { get; }

    public int ScaleFactor { get; }

    /// <summary>Multiplies <paramref name="value"/> by <see cref="ScaleFactor"/>.</summary>
    /// <exception cref="OverflowException">The product does not fit in an <see cref="int"/>.</exception>
    public int ScaleCoordinate(int value)
    {
        return checked(value * ScaleFactor);
    }

    /// <summary>Profile built from the <see cref="PdfXmlExtractor"/> scaled DPI and scale factor constants.</summary>
    public static SourceRenderProfile XmlAligned()
    {
        return new SourceRenderProfile(PdfXmlExtractor.ScaledRenderDpi, PdfXmlExtractor.RenderScaleFactor);
    }

    /// <summary>Profile for the given DPI with a scale factor of 1, so coordinates are left unscaled.</summary>
    public static SourceRenderProfile OcrPixels(int renderDpi)
    {
        return new SourceRenderProfile(renderDpi, 1);
    }
}

View File

@@ -0,0 +1,28 @@
using RolemasterDb.ImportTool.Parsing;
namespace RolemasterDb.ImportTool;
/// <summary>
/// <see cref="ICriticalSourceExtractor"/> that works from an XML artifact produced by <see cref="PdfXmlExtractor"/>.
/// </summary>
public sealed class XmlCriticalSourceExtractor : ICriticalSourceExtractor
{
    private readonly PdfXmlExtractor _pdfXmlExtractor;

    /// <summary>Creates the extractor around the PDF-to-XML tool wrapper it delegates to.</summary>
    public XmlCriticalSourceExtractor(PdfXmlExtractor pdfXmlExtractor)
    {
        _pdfXmlExtractor = pdfXmlExtractor;
    }

    /// <summary>Extracts <paramref name="pdfPath"/> into the XML artifact path.</summary>
    public async Task ExtractAsync(string pdfPath, ImportArtifactPaths artifactPaths, CancellationToken cancellationToken = default)
    {
        await _pdfXmlExtractor.ExtractAsync(pdfPath, artifactPaths.XmlPath, cancellationToken);
    }

    /// <summary>
    /// Reads the XML artifact and loads its page geometries and fragments into an "xml" source.
    /// </summary>
    /// <param name="pdfPath">Not used by this implementation; only the XML artifact is read.</param>
    /// <param name="artifactPaths">Supplies the XML artifact path.</param>
    /// <param name="cancellationToken">Cancels reading the artifact file.</param>
    /// <exception cref="FileNotFoundException">The XML artifact does not exist.</exception>
    public async Task<ExtractedCriticalSource> LoadAsync(
        string pdfPath,
        ImportArtifactPaths artifactPaths,
        CancellationToken cancellationToken = default)
    {
        var xmlPath = artifactPaths.XmlPath;
        if (!File.Exists(xmlPath))
        {
            throw new FileNotFoundException($"Missing XML artifact: {xmlPath}", xmlPath);
        }

        var xmlContent = await File.ReadAllTextAsync(xmlPath, cancellationToken);

        var renderProfile = SourceRenderProfile.XmlAligned();
        var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
        var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
        return new ExtractedCriticalSource(
            "xml",
            "Imported from PDF XML extraction.",
            renderProfile,
            pageGeometries,
            fragments);
    }
}