Add OCR import support for void critical table
This commit is contained in:
@@ -167,6 +167,15 @@
|
||||
"extractionMethod": "xml",
|
||||
"pdfPath": "sources/Unbalance.pdf",
|
||||
"enabled": true
|
||||
},
|
||||
{
|
||||
"slug": "void",
|
||||
"displayName": "Void Critical Strike Table",
|
||||
"family": "standard",
|
||||
"extractionMethod": "ocr",
|
||||
"axisTemplateSlug": "mana-standard-19",
|
||||
"pdfPath": "sources/Void.pdf",
|
||||
"enabled": true
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
Binary file not shown.
@@ -478,12 +478,12 @@ public sealed class CriticalCellReparseIntegrationTests
|
||||
initialResponse.Branches));
|
||||
|
||||
Assert.NotNull(saveResponse);
|
||||
Assert.Contains(saveResponse!.Effects, effect => effect.EffectCode == AppCriticalEffectCodes.PowerPointModifier && effect.ValueExpression == "2d10-16");
|
||||
Assert.Contains(saveResponse!.Effects, effect => effect.EffectCode == AppCriticalEffectCodes.PowerPointModifier && effect.ValueExpression == "+2d10-16");
|
||||
|
||||
var reopenedResponse = await lookupService.GetCriticalCellEditorAsync("mana", resultId);
|
||||
Assert.NotNull(reopenedResponse);
|
||||
Assert.Contains("-2d10-16pp", reopenedResponse!.QuickParseInput, StringComparison.Ordinal);
|
||||
Assert.Contains(reopenedResponse.Effects, effect => effect.EffectCode == AppCriticalEffectCodes.PowerPointModifier && effect.ValueExpression == "2d10-16");
|
||||
Assert.Contains(reopenedResponse.Effects, effect => effect.EffectCode == AppCriticalEffectCodes.PowerPointModifier && effect.ValueExpression == "+2d10-16");
|
||||
|
||||
var reparsed = await lookupService.ReparseCriticalCellAsync(
|
||||
"mana",
|
||||
@@ -643,20 +643,5 @@ public sealed class CriticalCellReparseIntegrationTests
|
||||
await RolemasterDbSchemaUpgrader.EnsureLatestAsync(dbContext);
|
||||
}
|
||||
|
||||
private static string GetRepositoryRoot()
|
||||
{
|
||||
var probe = new DirectoryInfo(AppContext.BaseDirectory);
|
||||
|
||||
while (probe is not null)
|
||||
{
|
||||
if (File.Exists(Path.Combine(probe.FullName, "RolemasterDB.slnx")))
|
||||
{
|
||||
return probe.FullName;
|
||||
}
|
||||
|
||||
probe = probe.Parent;
|
||||
}
|
||||
|
||||
throw new InvalidOperationException("Could not find the repository root for integration tests.");
|
||||
}
|
||||
private static string GetRepositoryRoot() => TestRepositoryPaths.GetRepositoryRoot();
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
|
||||
{
|
||||
private static readonly PdfXmlExtractor Extractor = new();
|
||||
private static readonly StandardCriticalTableParser StandardParser = new();
|
||||
private static readonly StandardOcrBootstrapper StandardOcrBootstrapper = new();
|
||||
|
||||
[Fact]
|
||||
public async Task Generated_artifacts_include_page_and_cell_source_images()
|
||||
@@ -32,6 +33,34 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
|
||||
Assert.True(File.Exists(artifactPaths.ResolveRelativePath(result.SourceImagePath!)));
|
||||
}
|
||||
|
||||
/// <summary>
/// Checks that image artifacts generated for the OCR-parsed "void" table report crop metadata
/// with a scale factor of 1 against a 3600x5070 page, and that the page and cell images exist on disk.
/// </summary>
[Fact]
public async Task Generated_ocr_artifacts_preserve_pixel_space_crop_metadata()
{
    const string rollBand = "96-99";
    const string column = "D";

    var (parseResult, artifactPaths) = await LoadPreparedVoidParseResultAsync();
    var result = FindResult(parseResult, rollBand, column);
    var cellArtifact = parseResult.Cells.Single(cell =>
        cell.GroupKey is null && cell.RollBandLabel == rollBand && cell.ColumnKey == column);

    // The parsed result must point at a real page region.
    Assert.True(result.SourceBounds.PageNumber > 0);
    Assert.True(result.SourceBounds.Width > 0);
    Assert.True(result.SourceBounds.Height > 0);

    Assert.NotNull(result.SourceImagePath);
    Assert.NotNull(result.SourceImageCrop);

    // With a scale factor of 1 the crop bounds must equal the source bounds unchanged.
    var crop = result.SourceImageCrop!;
    Assert.Equal(1, crop.ScaleFactor);
    Assert.Equal(PdfXmlExtractor.ScaledRenderDpi, crop.RenderDpi);
    Assert.Equal(3600, crop.PageWidth);
    Assert.Equal(5070, crop.PageHeight);
    Assert.Equal(result.SourceBounds.Width, crop.BoundsWidth);
    Assert.Equal(result.SourceBounds.Height, crop.BoundsHeight);

    // The cell artifact must mirror the image metadata stored on the table result.
    Assert.Equal(result.SourceImagePath, cellArtifact.SourceImagePath);
    Assert.NotNull(cellArtifact.SourceImageCrop);

    var pageImagePath = artifactPaths.GetPageImagePath(result.SourceBounds.PageNumber);
    Assert.True(File.Exists(pageImagePath));

    var cellImagePath = artifactPaths.ResolveRelativePath(result.SourceImagePath!);
    Assert.True(File.Exists(cellImagePath));
}
|
||||
|
||||
private static async Task<(CriticalTableParseResult ParseResult, ImportArtifactPaths ArtifactPaths)> LoadPreparedSlashParseResultAsync()
|
||||
{
|
||||
var entry = LoadManifest().Tables.Single(item => item.Slug == "slash");
|
||||
@@ -51,6 +80,25 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
|
||||
return (parseResult, artifactPaths);
|
||||
}
|
||||
|
||||
/// <summary>
/// Parses the "void" table from the checked-in OCR TSV fixture and generates its source image
/// artifacts into a fresh, uniquely named directory under the artifact cache root.
/// </summary>
/// <returns>The parse result together with the artifact paths the images were written to.</returns>
private static async Task<(CriticalTableParseResult ParseResult, ImportArtifactPaths ArtifactPaths)> LoadPreparedVoidParseResultAsync()
{
    var voidEntry = LoadManifest().Tables.Single(item => item.Slug == "void");

    // Build the OCR source by hand from the fixture instead of running the OCR extractor.
    var renderProfile = SourceRenderProfile.OcrPixels(PdfXmlExtractor.ScaledRenderDpi);
    IReadOnlyList<ParsedPdfPageGeometry> pageGeometries = [new ParsedPdfPageGeometry(1, 3600, 5070)];
    var fixtureTsv = await File.ReadAllTextAsync(GetVoidFixturePath());
    var ocrSource = new ExtractedCriticalSource(
        "ocr",
        "Imported from PDF OCR extraction.",
        renderProfile,
        pageGeometries,
        OcrCriticalSourceExtractor.ParseTsv(fixtureTsv));

    var axisTemplate = StandardTableAxisTemplateCatalog.Resolve(voidEntry.AxisTemplateSlug);
    var bootstrappedLayout = StandardOcrBootstrapper.Bootstrap(ocrSource, axisTemplate);
    var parsed = StandardParser.Parse(voidEntry, ocrSource, bootstrappedLayout);

    // A GUID-named directory keeps repeated or parallel runs from sharing artifacts.
    var uniqueArtifactRoot = Path.Combine(GetArtifactCacheRoot(), Guid.NewGuid().ToString("N"));
    var paths = ImportArtifactPaths.Create(uniqueArtifactRoot, voidEntry.Slug);
    var imageGenerator = new CriticalSourceImageArtifactGenerator(new PdfXmlExtractor());
    var pdfFullPath = Path.Combine(GetRepositoryRoot(), voidEntry.PdfPath);

    await imageGenerator.GenerateAsync(pdfFullPath, paths, parsed);
    return (parsed, paths);
}
|
||||
|
||||
private static ParsedCriticalResult FindResult(CriticalTableParseResult parseResult, string rollBandLabel, string columnKey) =>
|
||||
parseResult.Table.Results.Single(item =>
|
||||
item.GroupKey is null &&
|
||||
@@ -60,6 +108,9 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
|
||||
private static CriticalImportManifest LoadManifest() =>
|
||||
new CriticalImportManifestLoader().Load(Path.Combine(GetRepositoryRoot(), "sources", "critical-import-manifest.json"));
|
||||
|
||||
/// <summary>
/// Returns the path of the checked-in OCR TSV fixture for the "void" table, rooted at the repository root.
/// </summary>
private static string GetVoidFixturePath()
{
    var repositoryRoot = GetRepositoryRoot();
    return Path.Combine(repositoryRoot, "src", "RolemasterDb.ImportTool.Tests", "Fixtures", "Void", "source.ocr.tsv");
}
|
||||
|
||||
private static string GetArtifactCacheRoot()
|
||||
{
|
||||
var cacheRoot = Path.Combine(Path.GetTempPath(), "RolemasterDb.ImportTool.MergeTests");
|
||||
@@ -67,20 +118,5 @@ public sealed class CriticalImportArtifactGenerationIntegrationTests
|
||||
return cacheRoot;
|
||||
}
|
||||
|
||||
private static string GetRepositoryRoot()
|
||||
{
|
||||
var probe = new DirectoryInfo(AppContext.BaseDirectory);
|
||||
|
||||
while (probe is not null)
|
||||
{
|
||||
if (File.Exists(Path.Combine(probe.FullName, "RolemasterDB.slnx")))
|
||||
{
|
||||
return probe.FullName;
|
||||
}
|
||||
|
||||
probe = probe.Parent;
|
||||
}
|
||||
|
||||
throw new InvalidOperationException("Could not find the repository root for integration tests.");
|
||||
}
|
||||
private static string GetRepositoryRoot() => TestRepositoryPaths.GetRepositoryRoot();
|
||||
}
|
||||
|
||||
@@ -315,20 +315,5 @@ public sealed class CriticalImportMergeIntegrationTests
|
||||
return cacheRoot;
|
||||
}
|
||||
|
||||
private static string GetRepositoryRoot()
|
||||
{
|
||||
var probe = new DirectoryInfo(AppContext.BaseDirectory);
|
||||
|
||||
while (probe is not null)
|
||||
{
|
||||
if (File.Exists(Path.Combine(probe.FullName, "RolemasterDB.slnx")))
|
||||
{
|
||||
return probe.FullName;
|
||||
}
|
||||
|
||||
probe = probe.Parent;
|
||||
}
|
||||
|
||||
throw new InvalidOperationException("Could not find the repository root for integration tests.");
|
||||
}
|
||||
private static string GetRepositoryRoot() => TestRepositoryPaths.GetRepositoryRoot();
|
||||
}
|
||||
|
||||
3317
src/RolemasterDb.ImportTool.Tests/Fixtures/Void/source.ocr.tsv
Normal file
3317
src/RolemasterDb.ImportTool.Tests/Fixtures/Void/source.ocr.tsv
Normal file
File diff suppressed because it is too large
Load Diff
@@ -33,11 +33,13 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
||||
"subdual",
|
||||
"super_large_creature_weapon",
|
||||
"tiny",
|
||||
"unbalance"
|
||||
"unbalance",
|
||||
"void"
|
||||
];
|
||||
|
||||
private static readonly PdfXmlExtractor Extractor = new();
|
||||
private static readonly StandardCriticalTableParser StandardParser = new();
|
||||
private static readonly StandardOcrBootstrapper StandardOcrBootstrapper = new();
|
||||
private static readonly VariantColumnCriticalTableParser VariantColumnParser = new();
|
||||
private static readonly GroupedVariantCriticalTableParser GroupedVariantParser = new();
|
||||
|
||||
@@ -57,6 +59,7 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
||||
yield return new object[] { "mana", null!, "96-99", "E", "momentarily transformed" };
|
||||
yield return new object[] { "mana", null!, "100", "E", "Mana consumes everything" };
|
||||
yield return new object[] { "tiny", null!, "100", "E", "Vein and artery severed" };
|
||||
yield return new object[] { "void", null!, "96-99", "D", "Foe inhales the void" };
|
||||
yield return new object[] { "large_creature_weapon", null!, "01-05", "NORMAL", "Weapon shatters" };
|
||||
yield return new object[] { "super_large_creature_weapon", null!, "31-40", "SLAYING", "Boom! Solid without question" };
|
||||
yield return new object[] { "large_creature_magic", "large", "251+", "NORMAL", "Foe lowers his eyes within your reach" };
|
||||
@@ -75,13 +78,16 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
||||
Assert.Equal(ExpectedEnabledSlugs, enabledTables.Select(item => item.Slug));
|
||||
Assert.All(enabledTables, entry =>
|
||||
{
|
||||
Assert.Equal("xml", entry.ExtractionMethod);
|
||||
Assert.True(
|
||||
new[] { "xml", "ocr" }.Contains(entry.ExtractionMethod, StringComparer.Ordinal),
|
||||
$"Unexpected extraction method '{entry.ExtractionMethod}' for '{entry.Slug}'.");
|
||||
Assert.True(File.Exists(Path.Combine(GetRepositoryRoot(), entry.PdfPath)), $"Missing source PDF for '{entry.Slug}'.");
|
||||
});
|
||||
|
||||
Assert.Equal("variant_column", enabledTables.Single(item => item.Slug == "large_creature_weapon").Family);
|
||||
Assert.Equal("variant_column", enabledTables.Single(item => item.Slug == "super_large_creature_weapon").Family);
|
||||
Assert.Equal("grouped_variant", enabledTables.Single(item => item.Slug == "large_creature_magic").Family);
|
||||
Assert.Equal("ocr", enabledTables.Single(item => item.Slug == "void").ExtractionMethod);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
@@ -604,6 +610,25 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
||||
Assert.StartsWith("Strike to foe's hip.", result.RawCellText, StringComparison.Ordinal);
|
||||
}
|
||||
|
||||
/// <summary>
/// Loads the "void" table into a temporary database copy and checks that exactly 95
/// critical results were persisted for it.
/// </summary>
[Fact]
public async Task Loader_persists_void_table_from_fixture()
{
    var voidEntry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "void", StringComparison.Ordinal));
    var parsed = await LoadParseResultAsync(voidEntry);
    var databasePath = CreateTemporaryDatabaseCopy();

    var importLoader = new CriticalImportLoader(databasePath);
    await importLoader.LoadAsync(parsed.Table);

    // Re-open the database through a separate context to count what the loader wrote.
    await using var dbContext = CreateDbContext(databasePath);
    var persistedVoidResults = dbContext.CriticalResults
        .Include(item => item.CriticalTable)
        .Where(item => item.CriticalTable.Slug == "void");
    var persistedCount = await persistedVoidResults.CountAsync();

    Assert.Equal(95, persistedCount);
}
|
||||
|
||||
[Fact]
|
||||
public async Task Lookup_service_returns_effects_for_results_and_branches()
|
||||
{
|
||||
@@ -632,6 +657,25 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
||||
|
||||
private static async Task<CriticalTableParseResult> LoadParseResultAsync(CriticalImportManifestEntry entry)
|
||||
{
|
||||
if (string.Equals(entry.ExtractionMethod, "ocr", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
var tsvContent = await File.ReadAllTextAsync(Path.Combine(GetRepositoryRoot(), "src", "RolemasterDb.ImportTool.Tests", "Fixtures", "Void", "source.ocr.tsv"));
|
||||
var source = new ExtractedCriticalSource(
|
||||
"ocr",
|
||||
"Imported from PDF OCR extraction.",
|
||||
SourceRenderProfile.OcrPixels(PdfXmlExtractor.ScaledRenderDpi),
|
||||
[new ParsedPdfPageGeometry(1, 3600, 5070)],
|
||||
OcrCriticalSourceExtractor.ParseTsv(tsvContent));
|
||||
|
||||
return entry.Family switch
|
||||
{
|
||||
"standard" => StandardParser.Parse(entry, source, StandardOcrBootstrapper.Bootstrap(source, StandardTableAxisTemplateCatalog.Resolve(entry.AxisTemplateSlug))),
|
||||
"variant_column" => VariantColumnParser.Parse(entry, source),
|
||||
"grouped_variant" => GroupedVariantParser.Parse(entry, source),
|
||||
_ => throw new InvalidOperationException($"Unsupported manifest family '{entry.Family}'.")
|
||||
};
|
||||
}
|
||||
|
||||
var xmlPath = Path.Combine(GetArtifactCacheRoot(), $"{entry.Slug}.xml");
|
||||
|
||||
if (!File.Exists(xmlPath))
|
||||
@@ -701,20 +745,5 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
||||
await RolemasterDbSchemaUpgrader.EnsureLatestAsync(dbContext);
|
||||
}
|
||||
|
||||
private static string GetRepositoryRoot()
|
||||
{
|
||||
var probe = new DirectoryInfo(AppContext.BaseDirectory);
|
||||
|
||||
while (probe is not null)
|
||||
{
|
||||
if (File.Exists(Path.Combine(probe.FullName, "RolemasterDB.slnx")))
|
||||
{
|
||||
return probe.FullName;
|
||||
}
|
||||
|
||||
probe = probe.Parent;
|
||||
}
|
||||
|
||||
throw new InvalidOperationException("Could not find the repository root for integration tests.");
|
||||
}
|
||||
private static string GetRepositoryRoot() => TestRepositoryPaths.GetRepositoryRoot();
|
||||
}
|
||||
|
||||
39
src/RolemasterDb.ImportTool.Tests/TestRepositoryPaths.cs
Normal file
39
src/RolemasterDb.ImportTool.Tests/TestRepositoryPaths.cs
Normal file
@@ -0,0 +1,39 @@
|
||||
namespace RolemasterDb.ImportTool.Tests;
|
||||
|
||||
/// <summary>
/// Locates the repository root (the directory containing <c>RolemasterDB.slnx</c>) for integration tests.
/// </summary>
internal static class TestRepositoryPaths
{
    private const string RepositoryRootEnvironmentVariable = "ROLEMASTERDB_REPOSITORY_ROOT";

    /// <summary>
    /// Resolves the repository root. The environment variable override is used when it points at a
    /// directory containing the solution file; otherwise the ancestors of the application base
    /// directory and then of the current working directory are searched, nearest first.
    /// </summary>
    /// <returns>The full path of the first directory found to contain the solution file.</returns>
    /// <exception cref="InvalidOperationException">No candidate directory contains the solution file.</exception>
    public static string GetRepositoryRoot()
    {
        var overrideRoot = Environment.GetEnvironmentVariable(RepositoryRootEnvironmentVariable);
        if (!string.IsNullOrWhiteSpace(overrideRoot))
        {
            var absoluteOverride = Path.GetFullPath(overrideRoot);
            if (ContainsSolutionFile(absoluteOverride))
            {
                return absoluteOverride;
            }

            // An override without the solution file is ignored and the search falls through to probing.
        }

        var startingPoints = new DirectoryInfo[]
        {
            new DirectoryInfo(AppContext.BaseDirectory),
            new DirectoryInfo(Directory.GetCurrentDirectory())
        };

        foreach (var startingPoint in startingPoints)
        {
            DirectoryInfo? candidate = startingPoint;
            while (candidate is not null)
            {
                if (ContainsSolutionFile(candidate.FullName))
                {
                    return candidate.FullName;
                }

                candidate = candidate.Parent;
            }
        }

        throw new InvalidOperationException(
            $"Could not find the repository root for integration tests. Set {RepositoryRootEnvironmentVariable} to the repository path.");
    }

    /// <summary>Returns whether <paramref name="directoryPath"/> directly contains the solution file.</summary>
    private static bool ContainsSolutionFile(string directoryPath) =>
        File.Exists(Path.Combine(directoryPath, "RolemasterDB.slnx"));
}
|
||||
@@ -7,6 +7,7 @@ public sealed class CriticalImportCommandRunner
|
||||
private readonly CriticalImportManifestLoader manifestLoader = new();
|
||||
private readonly ImportArtifactWriter artifactWriter = new();
|
||||
private readonly PdfXmlExtractor pdfXmlExtractor = new();
|
||||
private readonly StandardOcrBootstrapper standardOcrBootstrapper = new();
|
||||
private readonly CriticalSourceImageArtifactGenerator sourceImageArtifactGenerator;
|
||||
private readonly StandardCriticalTableParser standardParser = new();
|
||||
private readonly VariantColumnCriticalTableParser variantColumnParser = new();
|
||||
@@ -35,8 +36,9 @@ public sealed class CriticalImportCommandRunner
|
||||
{
|
||||
var entry = GetManifestEntry(options.Table);
|
||||
var artifactPaths = CreateArtifactPaths(entry.Slug);
|
||||
await pdfXmlExtractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths.XmlPath);
|
||||
Console.WriteLine($"Extracted {entry.Slug} to {artifactPaths.XmlPath}");
|
||||
var extractor = CreateSourceExtractor(entry);
|
||||
await extractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths, CancellationToken.None);
|
||||
Console.WriteLine($"Extracted {entry.Slug} to {artifactPaths.GetSourceArtifactPath(entry.ExtractionMethod)}");
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -44,15 +46,8 @@ public sealed class CriticalImportCommandRunner
|
||||
{
|
||||
var entry = GetManifestEntry(options.Table);
|
||||
var artifactPaths = CreateArtifactPaths(entry.Slug);
|
||||
|
||||
if (!File.Exists(artifactPaths.XmlPath))
|
||||
{
|
||||
Console.Error.WriteLine($"Missing XML artifact: {artifactPaths.XmlPath}");
|
||||
return 1;
|
||||
}
|
||||
|
||||
var xmlContent = await File.ReadAllTextAsync(artifactPaths.XmlPath);
|
||||
var parseResult = Parse(entry, xmlContent);
|
||||
var extractedSource = await LoadExtractedSourceAsync(entry, artifactPaths);
|
||||
var parseResult = Parse(entry, extractedSource);
|
||||
await sourceImageArtifactGenerator.GenerateAsync(
|
||||
ResolveRepositoryPath(entry.PdfPath),
|
||||
artifactPaths,
|
||||
@@ -104,14 +99,14 @@ public sealed class CriticalImportCommandRunner
|
||||
{
|
||||
var entry = GetManifestEntry(options.Table);
|
||||
var artifactPaths = CreateArtifactPaths(entry.Slug);
|
||||
|
||||
if (!File.Exists(artifactPaths.XmlPath))
|
||||
var extractor = CreateSourceExtractor(entry);
|
||||
if (!File.Exists(artifactPaths.GetSourceArtifactPath(entry.ExtractionMethod)))
|
||||
{
|
||||
await pdfXmlExtractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths.XmlPath);
|
||||
await extractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths, CancellationToken.None);
|
||||
}
|
||||
|
||||
var xmlContent = await File.ReadAllTextAsync(artifactPaths.XmlPath);
|
||||
var parseResult = Parse(entry, xmlContent);
|
||||
var extractedSource = await extractor.LoadAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths, CancellationToken.None);
|
||||
var parseResult = Parse(entry, extractedSource);
|
||||
await sourceImageArtifactGenerator.GenerateAsync(
|
||||
ResolveRepositoryPath(entry.PdfPath),
|
||||
artifactPaths,
|
||||
@@ -143,26 +138,61 @@ public sealed class CriticalImportCommandRunner
|
||||
?? throw new InvalidOperationException($"No enabled manifest entry was found for '{tableSlug}'.");
|
||||
}
|
||||
|
||||
private CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
|
||||
/// <summary>
/// Loads the previously extracted source artifact for <paramref name="entry"/>.
/// </summary>
/// <param name="entry">Manifest entry whose extraction method selects the extractor and the artifact file.</param>
/// <param name="artifactPaths">Artifact locations for the table.</param>
/// <returns>The extracted source produced by the matching extractor.</returns>
/// <exception cref="FileNotFoundException">The source artifact does not exist; the message is also written to standard error.</exception>
private async Task<ExtractedCriticalSource> LoadExtractedSourceAsync(CriticalImportManifestEntry entry, ImportArtifactPaths artifactPaths)
{
    // Resolve the extractor first so an unsupported extraction method is reported before the file check.
    var sourceExtractor = CreateSourceExtractor(entry);
    var artifactPath = artifactPaths.GetSourceArtifactPath(entry.ExtractionMethod);

    if (File.Exists(artifactPath))
    {
        return await sourceExtractor.LoadAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths, CancellationToken.None);
    }

    Console.Error.WriteLine($"Missing source artifact: {artifactPath}");
    throw new FileNotFoundException($"Missing source artifact: {artifactPath}", artifactPath);
}
|
||||
|
||||
/// <summary>
/// Parses an extracted source with the parser that matches the manifest entry's table family.
/// </summary>
/// <param name="entry">Manifest entry; its family picks the parser, and its extraction method picks the OCR or non-OCR path for standard tables.</param>
/// <param name="source">The extracted source to parse.</param>
/// <returns>The parse result produced by the selected parser.</returns>
/// <exception cref="InvalidOperationException">The entry's family is not one of "standard", "variant_column" or "grouped_variant".</exception>
private CriticalTableParseResult Parse(CriticalImportManifestEntry entry, ExtractedCriticalSource source)
{
    // Every branch parses `source`; the earlier returns that referenced the removed
    // `xmlContent` parameter are gone, so the statements below are reachable.
    if (string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase))
    {
        if (string.Equals(entry.ExtractionMethod, "ocr", StringComparison.OrdinalIgnoreCase))
        {
            // OCR sources get a layout bootstrapped from the entry's axis template before parsing.
            var template = StandardTableAxisTemplateCatalog.Resolve(entry.AxisTemplateSlug);
            var layout = standardOcrBootstrapper.Bootstrap(source, template);
            return standardParser.Parse(entry, source, layout);
        }

        return standardParser.Parse(entry, source);
    }

    // NOTE(review): the variant families receive the source without an OCR layout bootstrap —
    // confirm whether OCR extraction is meant to be supported for them.
    if (string.Equals(entry.Family, "variant_column", StringComparison.OrdinalIgnoreCase))
    {
        return variantColumnParser.Parse(entry, source);
    }

    if (string.Equals(entry.Family, "grouped_variant", StringComparison.OrdinalIgnoreCase))
    {
        return groupedVariantParser.Parse(entry, source);
    }

    throw new InvalidOperationException($"Family '{entry.Family}' is not supported by the importer.");
}
|
||||
|
||||
/// <summary>
/// Creates the source extractor for the entry's extraction method ("xml" or "ocr", compared case-insensitively).
/// </summary>
/// <param name="entry">Manifest entry whose extraction method is inspected.</param>
/// <returns>A new extractor that uses the runner's shared PDF extractor.</returns>
/// <exception cref="InvalidOperationException">The extraction method is neither "xml" nor "ocr".</exception>
private ICriticalSourceExtractor CreateSourceExtractor(CriticalImportManifestEntry entry)
{
    static bool IsMethod(string actual, string expected) =>
        string.Equals(actual, expected, StringComparison.OrdinalIgnoreCase);

    return entry.ExtractionMethod switch
    {
        var method when IsMethod(method, "xml") => new XmlCriticalSourceExtractor(pdfXmlExtractor),
        var method when IsMethod(method, "ocr") => new OcrCriticalSourceExtractor(pdfXmlExtractor),
        var method => throw new InvalidOperationException($"Extraction method '{method}' is not supported by the importer.")
    };
}
|
||||
|
||||
private static ImportArtifactPaths CreateArtifactPaths(string slug) =>
|
||||
ImportArtifactPaths.Create(RepositoryPaths.Discover().ArtifactsRootPath, slug);
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ public sealed class CriticalImportManifestEntry
|
||||
public string DisplayName { get; set; } = string.Empty;
|
||||
public string Family { get; set; } = string.Empty;
|
||||
public string ExtractionMethod { get; set; } = string.Empty;
|
||||
public string? AxisTemplateSlug { get; set; }
|
||||
public string PdfPath { get; set; } = string.Empty;
|
||||
public bool Enabled { get; set; } = true;
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
|
||||
pdfPath,
|
||||
pageGeometry.PageNumber,
|
||||
artifactPaths.GetPageImagePath(pageGeometry.PageNumber),
|
||||
parseResult.RenderProfile.RenderDpi,
|
||||
cancellationToken);
|
||||
}
|
||||
|
||||
@@ -38,7 +39,7 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
|
||||
$"Missing page geometry for page {result.SourceBounds.PageNumber} in table '{parseResult.Table.Slug}'.");
|
||||
}
|
||||
|
||||
var crop = CreateCrop(result.SourceBounds, pageGeometry);
|
||||
var crop = CreateCrop(result.SourceBounds, pageGeometry, parseResult.RenderProfile);
|
||||
var relativePath = artifactPaths.GetRelativeCellImagePath(result.GroupKey, result.ColumnKey, result.RollBandLabel);
|
||||
var fullPath = artifactPaths.ResolveRelativePath(relativePath);
|
||||
|
||||
@@ -50,6 +51,7 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
|
||||
crop.CropWidth,
|
||||
crop.CropHeight,
|
||||
fullPath,
|
||||
parseResult.RenderProfile.RenderDpi,
|
||||
cancellationToken);
|
||||
|
||||
result.SourceImagePath = relativePath;
|
||||
@@ -66,7 +68,8 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
|
||||
|
||||
private static CriticalSourceImageCrop CreateCrop(
|
||||
ParsedCriticalSourceRect sourceBounds,
|
||||
ParsedPdfPageGeometry pageGeometry)
|
||||
ParsedPdfPageGeometry pageGeometry,
|
||||
SourceRenderProfile renderProfile)
|
||||
{
|
||||
var cropLeft = Math.Max(0, sourceBounds.Left - CropPaddingX);
|
||||
var cropTop = Math.Max(0, sourceBounds.Top - CropPaddingY);
|
||||
@@ -75,18 +78,18 @@ public sealed class CriticalSourceImageArtifactGenerator(PdfXmlExtractor pdfXmlE
|
||||
|
||||
return new CriticalSourceImageCrop(
|
||||
sourceBounds.PageNumber,
|
||||
PdfXmlExtractor.ScaleCoordinate(pageGeometry.Width),
|
||||
PdfXmlExtractor.ScaleCoordinate(pageGeometry.Height),
|
||||
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Left),
|
||||
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Top),
|
||||
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Width),
|
||||
PdfXmlExtractor.ScaleCoordinate(sourceBounds.Height),
|
||||
PdfXmlExtractor.ScaleCoordinate(cropLeft),
|
||||
PdfXmlExtractor.ScaleCoordinate(cropTop),
|
||||
PdfXmlExtractor.ScaleCoordinate(Math.Max(1, cropRight - cropLeft)),
|
||||
PdfXmlExtractor.ScaleCoordinate(Math.Max(1, cropBottom - cropTop)),
|
||||
PdfXmlExtractor.ScaledRenderDpi,
|
||||
PdfXmlExtractor.RenderScaleFactor);
|
||||
renderProfile.ScaleCoordinate(pageGeometry.Width),
|
||||
renderProfile.ScaleCoordinate(pageGeometry.Height),
|
||||
renderProfile.ScaleCoordinate(sourceBounds.Left),
|
||||
renderProfile.ScaleCoordinate(sourceBounds.Top),
|
||||
renderProfile.ScaleCoordinate(sourceBounds.Width),
|
||||
renderProfile.ScaleCoordinate(sourceBounds.Height),
|
||||
renderProfile.ScaleCoordinate(cropLeft),
|
||||
renderProfile.ScaleCoordinate(cropTop),
|
||||
renderProfile.ScaleCoordinate(Math.Max(1, cropRight - cropLeft)),
|
||||
renderProfile.ScaleCoordinate(Math.Max(1, cropBottom - cropTop)),
|
||||
renderProfile.RenderDpi,
|
||||
renderProfile.ScaleFactor);
|
||||
}
|
||||
|
||||
private static string CreateCellKey(string? groupKey, string rollBandLabel, string columnKey) =>
|
||||
|
||||
@@ -2,7 +2,7 @@ using CommandLine;
|
||||
|
||||
namespace RolemasterDb.ImportTool;
|
||||
|
||||
[Verb("extract", HelpText = "Extract a critical table PDF into a text artifact.")]
|
||||
[Verb("extract", HelpText = "Extract a critical table PDF into its source artifact.")]
|
||||
public sealed class ExtractOptions
|
||||
{
|
||||
[Value(0, MetaName = "table", Required = true, HelpText = "The manifest slug of the critical table to extract.")]
|
||||
|
||||
17
src/RolemasterDb.ImportTool/ExtractedCriticalSource.cs
Normal file
17
src/RolemasterDb.ImportTool/ExtractedCriticalSource.cs
Normal file
@@ -0,0 +1,17 @@
|
||||
using RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
namespace RolemasterDb.ImportTool;
|
||||
|
||||
/// <summary>
/// Read-only bundle describing an extracted critical table source: the extraction method, import notes,
/// render profile, page geometries and positioned text fragments.
/// </summary>
public sealed class ExtractedCriticalSource
{
    /// <summary>Stores each argument in the get-only property of the same name.</summary>
    public ExtractedCriticalSource(
        string extractionMethod,
        string importNotes,
        SourceRenderProfile renderProfile,
        IReadOnlyList<ParsedPdfPageGeometry> pageGeometries,
        IReadOnlyList<PositionedTextFragment> fragments)
    {
        ExtractionMethod = extractionMethod;
        ImportNotes = importNotes;
        RenderProfile = renderProfile;
        PageGeometries = pageGeometries;
        Fragments = fragments;
    }

    /// <summary>Extraction method label supplied at construction.</summary>
    public string ExtractionMethod { get; }

    /// <summary>Import notes supplied at construction.</summary>
    public string ImportNotes { get; }

    /// <summary>Render profile supplied at construction.</summary>
    public SourceRenderProfile RenderProfile { get; }

    /// <summary>Page geometries supplied at construction.</summary>
    public IReadOnlyList<ParsedPdfPageGeometry> PageGeometries { get; }

    /// <summary>Positioned text fragments supplied at construction.</summary>
    public IReadOnlyList<PositionedTextFragment> Fragments { get; }
}
|
||||
11
src/RolemasterDb.ImportTool/ICriticalSourceExtractor.cs
Normal file
11
src/RolemasterDb.ImportTool/ICriticalSourceExtractor.cs
Normal file
@@ -0,0 +1,11 @@
|
||||
namespace RolemasterDb.ImportTool;
|
||||
|
||||
/// <summary>
/// Extracts a critical table PDF into a source artifact and loads that artifact back
/// as an <see cref="ExtractedCriticalSource"/>.
/// </summary>
public interface ICriticalSourceExtractor
{
    /// <summary>Extracts the PDF at <paramref name="pdfPath"/> into the locations described by <paramref name="artifactPaths"/>.</summary>
    /// <param name="pdfPath">Path of the source PDF.</param>
    /// <param name="artifactPaths">Artifact locations for the table being extracted.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task ExtractAsync(
        string pdfPath,
        ImportArtifactPaths artifactPaths,
        CancellationToken cancellationToken = default);

    /// <summary>Loads the extracted source for the table described by <paramref name="artifactPaths"/>.</summary>
    /// <param name="pdfPath">Path of the source PDF.</param>
    /// <param name="artifactPaths">Artifact locations for the table being loaded.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The extracted source.</returns>
    Task<ExtractedCriticalSource> LoadAsync(string pdfPath, ImportArtifactPaths artifactPaths, CancellationToken cancellationToken = default);
}
|
||||
@@ -9,9 +9,11 @@ public sealed class ImportArtifactPaths
|
||||
string tableSlug,
|
||||
string directoryPath,
|
||||
string xmlPath,
|
||||
string ocrTsvPath,
|
||||
string fragmentsJsonPath,
|
||||
string parsedCellsJsonPath,
|
||||
string validationReportPath,
|
||||
string ocrPagesDirectoryPath,
|
||||
string pagesDirectoryPath,
|
||||
string cellsDirectoryPath)
|
||||
{
|
||||
@@ -19,9 +21,11 @@ public sealed class ImportArtifactPaths
|
||||
TableSlug = tableSlug;
|
||||
DirectoryPath = directoryPath;
|
||||
XmlPath = xmlPath;
|
||||
OcrTsvPath = ocrTsvPath;
|
||||
FragmentsJsonPath = fragmentsJsonPath;
|
||||
ParsedCellsJsonPath = parsedCellsJsonPath;
|
||||
ValidationReportPath = validationReportPath;
|
||||
OcrPagesDirectoryPath = ocrPagesDirectoryPath;
|
||||
PagesDirectoryPath = pagesDirectoryPath;
|
||||
CellsDirectoryPath = cellsDirectoryPath;
|
||||
}
|
||||
@@ -30,15 +34,18 @@ public sealed class ImportArtifactPaths
|
||||
public string TableSlug { get; }
|
||||
public string DirectoryPath { get; }
|
||||
public string XmlPath { get; }
|
||||
public string OcrTsvPath { get; }
|
||||
public string FragmentsJsonPath { get; }
|
||||
public string ParsedCellsJsonPath { get; }
|
||||
public string ValidationReportPath { get; }
|
||||
public string OcrPagesDirectoryPath { get; }
|
||||
public string PagesDirectoryPath { get; }
|
||||
public string CellsDirectoryPath { get; }
|
||||
|
||||
public static ImportArtifactPaths Create(string artifactsRootPath, string tableSlug)
|
||||
{
|
||||
var directoryPath = Path.Combine(artifactsRootPath, tableSlug);
|
||||
var ocrPagesDirectoryPath = Path.Combine(directoryPath, "ocr-pages");
|
||||
var pagesDirectoryPath = Path.Combine(directoryPath, "pages");
|
||||
var cellsDirectoryPath = Path.Combine(directoryPath, "cells");
|
||||
|
||||
@@ -47,13 +54,23 @@ public sealed class ImportArtifactPaths
|
||||
tableSlug,
|
||||
directoryPath,
|
||||
Path.Combine(directoryPath, "source.xml"),
|
||||
Path.Combine(directoryPath, "source.ocr.tsv"),
|
||||
Path.Combine(directoryPath, "fragments.json"),
|
||||
Path.Combine(directoryPath, "parsed-cells.json"),
|
||||
Path.Combine(directoryPath, "validation-report.json"),
|
||||
ocrPagesDirectoryPath,
|
||||
pagesDirectoryPath,
|
||||
cellsDirectoryPath);
|
||||
}
|
||||
|
||||
/// <summary>
/// Returns the source artifact path for an extraction method: the OCR TSV path for "ocr"
/// (case-insensitive), and the XML path for any other value.
/// </summary>
/// <param name="extractionMethod">Extraction method label from the manifest entry.</param>
public string GetSourceArtifactPath(string extractionMethod)
{
    var isOcr = string.Equals(extractionMethod, "ocr", StringComparison.OrdinalIgnoreCase);
    if (isOcr)
    {
        return OcrTsvPath;
    }

    return XmlPath;
}
|
||||
|
||||
/// <summary>
/// Returns the PNG path for a page inside the OCR pages directory, using a three-digit
/// zero-padded page number (for example <c>page-001.png</c>).
/// </summary>
/// <param name="pageNumber">Page number to embed in the file name.</param>
public string GetOcrPageImagePath(int pageNumber)
{
    var fileName = $"page-{pageNumber:000}.png";
    return Path.Combine(OcrPagesDirectoryPath, fileName);
}
|
||||
|
||||
/// <summary>
/// Returns the PNG path for a page inside the pages directory, using a three-digit
/// zero-padded page number (for example <c>page-001.png</c>).
/// </summary>
/// <param name="pageNumber">Page number to embed in the file name.</param>
public string GetPageImagePath(int pageNumber)
{
    var fileName = $"page-{pageNumber:000}.png";
    return Path.Combine(PagesDirectoryPath, fileName);
}
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ using CommandLine;
|
||||
|
||||
namespace RolemasterDb.ImportTool;
|
||||
|
||||
[Verb("load", HelpText = "Load a parsed critical table from its extracted text artifact.")]
|
||||
[Verb("load", HelpText = "Load a parsed critical table from its extracted source artifact.")]
|
||||
public sealed class LoadOptions
|
||||
{
|
||||
[Value(0, MetaName = "table", Required = true, HelpText = "The manifest slug of the critical table to load.")]
|
||||
|
||||
204
src/RolemasterDb.ImportTool/OcrCriticalSourceExtractor.cs
Normal file
204
src/RolemasterDb.ImportTool/OcrCriticalSourceExtractor.cs
Normal file
@@ -0,0 +1,204 @@
|
||||
using System.Globalization;
|
||||
using System.Text;
|
||||
|
||||
using RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
namespace RolemasterDb.ImportTool;
|
||||
|
||||
public sealed class OcrCriticalSourceExtractor(PdfXmlExtractor pdfXmlExtractor) : ICriticalSourceExtractor
|
||||
{
|
||||
private const int OcrRenderDpi = PdfXmlExtractor.ScaledRenderDpi;
|
||||
private const string TesseractExeDefaultPath = @"C:\Program Files\Sejda PDF Desktop\resources\vendor\tesseract-windows-x64\tesseract.exe";
|
||||
private const string TessdataDefaultPath = @"C:\Program Files\Sejda PDF Desktop\resources\vendor\tessdata";
|
||||
|
||||
/// <summary>
/// Renders the single page of the PDF to a PNG in the OCR pages directory, runs Tesseract on it,
/// and writes the resulting TSV to the OCR TSV artifact path.
/// </summary>
/// <param name="pdfPath">Path of the source PDF.</param>
/// <param name="artifactPaths">Artifact locations; the table directory and OCR pages directory are created if needed.</param>
/// <param name="cancellationToken">Token passed through to the PDF, OCR and file operations.</param>
/// <exception cref="InvalidOperationException">The PDF does not have exactly one page.</exception>
public async Task ExtractAsync(string pdfPath, ImportArtifactPaths artifactPaths, CancellationToken cancellationToken = default)
{
    const int pageNumber = 1;

    Directory.CreateDirectory(artifactPaths.DirectoryPath);
    Directory.CreateDirectory(artifactPaths.OcrPagesDirectoryPath);

    var documentInfo = await pdfXmlExtractor.ReadDocumentInfoAsync(pdfPath, cancellationToken);
    var isSinglePage = documentInfo.PageCount == 1;
    if (!isSinglePage)
    {
        throw new InvalidOperationException("The OCR extractor currently supports only single-page critical tables.");
    }

    // Render at the OCR DPI, then OCR the rendered image.
    var renderedPagePath = artifactPaths.GetOcrPageImagePath(pageNumber);
    await pdfXmlExtractor.RenderPagePngAsync(pdfPath, pageNumber, renderedPagePath, OcrRenderDpi, cancellationToken);

    var ocrTsv = await RunTesseractAsync(renderedPagePath, cancellationToken);
    await File.WriteAllTextAsync(artifactPaths.OcrTsvPath, ocrTsv, cancellationToken);
}
|
||||
|
||||
/// <summary>
/// Loads previously extracted OCR artifacts and converts them into an <c>ExtractedCriticalSource</c>.
/// </summary>
/// <param name="pdfPath">Path of the source PDF; not read by this method.</param>
/// <param name="artifactPaths">Locations of the OCR TSV file and the page-1 image.</param>
/// <param name="cancellationToken">Token used while reading the TSV file.</param>
/// <returns>An "ocr" source whose single page geometry is taken from the PNG header of the page image.</returns>
/// <exception cref="FileNotFoundException">The TSV artifact or the page image is missing.</exception>
public async Task<ExtractedCriticalSource> LoadAsync(
    string pdfPath,
    ImportArtifactPaths artifactPaths,
    CancellationToken cancellationToken = default)
{
    var tsvPath = artifactPaths.OcrTsvPath;
    if (!File.Exists(tsvPath))
    {
        throw new FileNotFoundException($"Missing OCR artifact: {artifactPaths.OcrTsvPath}", artifactPaths.OcrTsvPath);
    }

    var firstPageImagePath = artifactPaths.GetOcrPageImagePath(1);
    if (!File.Exists(firstPageImagePath))
    {
        throw new FileNotFoundException($"Missing OCR page image artifact: {firstPageImagePath}", firstPageImagePath);
    }

    var tsvText = await File.ReadAllTextAsync(artifactPaths.OcrTsvPath, cancellationToken);

    // The page geometry is the size recorded in the rendered PNG.
    var dimensions = ReadPngDimensions(firstPageImagePath);

    return new ExtractedCriticalSource(
        "ocr",
        "Imported from PDF OCR extraction.",
        SourceRenderProfile.OcrPixels(OcrRenderDpi),
        [new ParsedPdfPageGeometry(1, dimensions.Width, dimensions.Height)],
        ParseTsv(tsvText));
}
|
||||
|
||||
/// <summary>
/// Converts Tesseract TSV text into positioned text fragments.
/// </summary>
/// <remarks>
/// The first non-empty line is treated as the header and skipped. A row is kept only when it has
/// at least 12 tab-separated columns, its first column is "5", and its normalized text is not blank.
/// Everything from the twelfth column onward is re-joined with tabs to form the text.
/// </remarks>
/// <param name="tsvContent">Raw TSV text.</param>
/// <returns>The fragments in file order; empty when the content has no lines.</returns>
internal static IReadOnlyList<PositionedTextFragment> ParseTsv(string tsvContent)
{
    var rows = tsvContent.Split(["\r\n", "\n"], StringSplitOptions.RemoveEmptyEntries);
    if (rows.Length == 0)
    {
        return [];
    }

    var words = new List<PositionedTextFragment>();

    // Index 0 is the header row.
    for (var rowIndex = 1; rowIndex < rows.Length; rowIndex++)
    {
        var cells = rows[rowIndex].Split('\t');
        var isWordRow = cells.Length >= 12 && cells[0] == "5";
        if (!isWordRow)
        {
            continue;
        }

        var rawText = string.Join('\t', cells.Skip(11));
        var text = CriticalTableParserSupport.NormalizeText(rawText);
        if (string.IsNullOrWhiteSpace(text))
        {
            continue;
        }

        // Columns are read in constructor-argument order: page, top, left, width, height.
        var pageNumber = int.Parse(cells[1], CultureInfo.InvariantCulture);
        var top = int.Parse(cells[7], CultureInfo.InvariantCulture);
        var left = int.Parse(cells[6], CultureInfo.InvariantCulture);
        var width = int.Parse(cells[8], CultureInfo.InvariantCulture);
        var height = int.Parse(cells[9], CultureInfo.InvariantCulture);
        var confidence = ParseConfidence(cells[10]);

        words.Add(new PositionedTextFragment(pageNumber, top, left, width, height, text, confidence));
    }

    return words;
}
|
||||
|
||||
/// <summary>
/// Parses the confidence column of a TSV row.
/// </summary>
/// <remarks>
/// Accepts both whole-number values ("96") and decimal values ("96.063751"); decimals are rounded
/// to the nearest integer, midpoints away from zero. Integer-only parsing would reject decimal
/// values and silently lose the confidence.
/// </remarks>
/// <param name="value">Raw column text.</param>
/// <returns>
/// The confidence as an integer, or null when the text is not a number, is negative,
/// or does not fit in an <see cref="int"/>.
/// </returns>
private static int? ParseConfidence(string value)
{
    const NumberStyles styles = NumberStyles.Integer | NumberStyles.AllowDecimalPoint;

    if (!double.TryParse(value, styles, CultureInfo.InvariantCulture, out var confidence))
    {
        return null;
    }

    // The negated comparison also rejects NaN; the upper bound keeps the cast well defined.
    if (!(confidence >= 0) || confidence > int.MaxValue)
    {
        return null;
    }

    return (int)Math.Round(confidence, MidpointRounding.AwayFromZero);
}
|
||||
|
||||
/// <summary>
/// Reads the image width and height from the header of a PNG file.
/// </summary>
/// <param name="path">Path of the file to inspect.</param>
/// <returns>The two big-endian 32-bit values that follow the "IHDR" chunk type, as (Width, Height).</returns>
/// <exception cref="InvalidOperationException">
/// The first eight bytes are not the PNG signature, or the first chunk type is not "IHDR".
/// </exception>
private static (int Width, int Height) ReadPngDimensions(string path)
{
    byte[] pngSignature = [137, 80, 78, 71, 13, 10, 26, 10];

    using var file = File.OpenRead(path);
    using var header = new BinaryReader(file, Encoding.UTF8, leaveOpen: false);

    var leadingBytes = header.ReadBytes(8);
    var hasPngSignature = leadingBytes.SequenceEqual(pngSignature);
    if (!hasPngSignature)
    {
        throw new InvalidOperationException($"'{path}' is not a PNG file.");
    }

    // The chunk length precedes the chunk type; its value is not needed.
    _ = ReadBigEndianInt32(header);

    var firstChunkType = Encoding.ASCII.GetString(header.ReadBytes(4));
    if (firstChunkType is not "IHDR")
    {
        throw new InvalidOperationException($"'{path}' is missing a PNG IHDR header.");
    }

    var imageWidth = ReadBigEndianInt32(header);
    var imageHeight = ReadBigEndianInt32(header);
    return (imageWidth, imageHeight);
}
|
||||
|
||||
/// <summary>
/// Reads four bytes from the reader and interprets them as a big-endian signed 32-bit integer.
/// </summary>
/// <param name="reader">Reader positioned at the first of the four bytes.</param>
/// <returns>The decoded value.</returns>
/// <exception cref="EndOfStreamException">Fewer than four bytes remain in the stream.</exception>
private static int ReadBigEndianInt32(BinaryReader reader)
{
    var bytes = reader.ReadBytes(4);
    if (bytes.Length != 4)
    {
        throw new EndOfStreamException("Unexpected end of stream.");
    }

    // Decodes independently of the host byte order, without mutating the buffer.
    return System.Buffers.Binary.BinaryPrimitives.ReadInt32BigEndian(bytes);
}
|
||||
|
||||
/// <summary>
/// Runs Tesseract on an image with "--psm 11" and TSV output written to stdout, and returns that output.
/// </summary>
/// <param name="imagePath">Path of the image to recognise.</param>
/// <param name="cancellationToken">Token that aborts the run; the child process is killed on cancellation.</param>
/// <returns>Everything the process wrote to standard output.</returns>
/// <exception cref="InvalidOperationException">The process exited with a non-zero code; the message includes its stderr.</exception>
private static async Task<string> RunTesseractAsync(string imagePath, CancellationToken cancellationToken)
{
    var startInfo = new System.Diagnostics.ProcessStartInfo
    {
        FileName = ResolveTesseractExecutable(),
        RedirectStandardError = true,
        RedirectStandardOutput = true,
        UseShellExecute = false,
        CreateNoWindow = true
    };

    // Only override TESSDATA_PREFIX when a directory was resolved; an empty value would
    // clobber a working value inherited from the parent environment.
    var tessdataPath = ResolveTessdataPath();
    if (!string.IsNullOrEmpty(tessdataPath))
    {
        startInfo.Environment["TESSDATA_PREFIX"] = tessdataPath;
    }

    startInfo.ArgumentList.Add(imagePath);
    startInfo.ArgumentList.Add("stdout");
    startInfo.ArgumentList.Add("--psm");
    startInfo.ArgumentList.Add("11");
    startInfo.ArgumentList.Add("tsv");

    using var process = new System.Diagnostics.Process { StartInfo = startInfo };
    process.Start();

    // Drain both redirected streams concurrently. Reading stdout to the end before touching
    // stderr can deadlock once the child fills the stderr pipe buffer.
    var outputTask = process.StandardOutput.ReadToEndAsync(cancellationToken);
    var errorTask = process.StandardError.ReadToEndAsync(cancellationToken);

    try
    {
        await Task.WhenAll(outputTask, errorTask);
        await process.WaitForExitAsync(cancellationToken);
    }
    catch (OperationCanceledException)
    {
        // Do not leave an orphaned OCR process behind when the caller cancels.
        try
        {
            if (!process.HasExited)
            {
                process.Kill(entireProcessTree: true);
            }
        }
        catch (InvalidOperationException)
        {
            // The process ended on its own between the check and the kill.
        }

        throw;
    }

    if (process.ExitCode != 0)
    {
        var error = await errorTask;
        throw new InvalidOperationException($"tesseract failed for '{imagePath}': {error}");
    }

    return await outputTask;
}
|
||||
|
||||
/// <summary>
/// Chooses the Tesseract executable to launch.
/// </summary>
/// <returns>
/// The ROLEMASTERDB_TESSERACT_PATH value when it names an existing file; otherwise the built-in
/// default path when that file exists; otherwise the bare command name "tesseract".
/// </returns>
private static string ResolveTesseractExecutable()
{
    var overridePath = Environment.GetEnvironmentVariable("ROLEMASTERDB_TESSERACT_PATH");
    if (!string.IsNullOrWhiteSpace(overridePath) && File.Exists(overridePath))
    {
        return overridePath;
    }

    return File.Exists(TesseractExeDefaultPath)
        ? TesseractExeDefaultPath
        : "tesseract";
}
|
||||
|
||||
/// <summary>
/// Chooses the tessdata directory to hand to Tesseract.
/// </summary>
/// <returns>
/// The ROLEMASTERDB_TESSDATA_PREFIX value when it names an existing directory; otherwise the
/// built-in default directory when it exists; otherwise an empty string.
/// </returns>
private static string ResolveTessdataPath()
{
    var overridePath = Environment.GetEnvironmentVariable("ROLEMASTERDB_TESSDATA_PREFIX");
    if (!string.IsNullOrWhiteSpace(overridePath) && Directory.Exists(overridePath))
    {
        return overridePath;
    }

    return Directory.Exists(TessdataDefaultPath)
        ? TessdataDefaultPath
        : string.Empty;
}
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
internal sealed class ColumnarCellLine(string text, List<XmlTextFragment> fragments)
|
||||
internal sealed class ColumnarCellLine(string text, List<PositionedTextFragment> fragments)
|
||||
{
|
||||
public string Text { get; } = text;
|
||||
public List<XmlTextFragment> Fragments { get; } = fragments;
|
||||
public List<PositionedTextFragment> Fragments { get; } = fragments;
|
||||
}
|
||||
|
||||
@@ -3,13 +3,15 @@ namespace RolemasterDb.ImportTool.Parsing;
|
||||
public sealed class CriticalTableParseResult(
|
||||
ParsedCriticalTable table,
|
||||
IReadOnlyList<ParsedPdfPageGeometry> pageGeometries,
|
||||
IReadOnlyList<XmlTextFragment> fragments,
|
||||
IReadOnlyList<PositionedTextFragment> fragments,
|
||||
SourceRenderProfile renderProfile,
|
||||
IReadOnlyList<ParsedCriticalCellArtifact> cells,
|
||||
ImportValidationReport validationReport)
|
||||
{
|
||||
public ParsedCriticalTable Table { get; } = table;
|
||||
public IReadOnlyList<ParsedPdfPageGeometry> PageGeometries { get; } = pageGeometries;
|
||||
public IReadOnlyList<XmlTextFragment> Fragments { get; } = fragments;
|
||||
public IReadOnlyList<PositionedTextFragment> Fragments { get; } = fragments;
|
||||
public SourceRenderProfile RenderProfile { get; } = renderProfile;
|
||||
public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; } = cells;
|
||||
public ImportValidationReport ValidationReport { get; } = validationReport;
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@ internal static class CriticalTableParserSupport
|
||||
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|–)\d+\)$", RegexOptions.Compiled);
|
||||
private static readonly Regex BoundaryBonusLineRegex = new(@"^(?:all allies|all foe's allies|all foes|all opponents)\b", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||
|
||||
internal static List<XmlTextFragment> LoadFragments(string xmlContent)
|
||||
internal static List<PositionedTextFragment> LoadFragments(string xmlContent)
|
||||
{
|
||||
using var stringReader = new StringReader(xmlContent);
|
||||
using var xmlReader = XmlReader.Create(
|
||||
@@ -39,7 +39,7 @@ internal static class CriticalTableParserSupport
|
||||
{
|
||||
var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1");
|
||||
return page.Elements("text")
|
||||
.Select(item => new XmlTextFragment(
|
||||
.Select(item => new PositionedTextFragment(
|
||||
pageNumber,
|
||||
int.Parse(item.Attribute("top")?.Value ?? throw new InvalidOperationException("Missing text top attribute.")),
|
||||
int.Parse(item.Attribute("left")?.Value ?? throw new InvalidOperationException("Missing text left attribute.")),
|
||||
@@ -73,8 +73,8 @@ internal static class CriticalTableParserSupport
|
||||
.ToList();
|
||||
}
|
||||
|
||||
internal static List<XmlTextFragment> FindRowLabelFragments(
|
||||
IReadOnlyList<XmlTextFragment> fragments,
|
||||
internal static List<PositionedTextFragment> FindRowLabelFragments(
|
||||
IReadOnlyList<PositionedTextFragment> fragments,
|
||||
int leftCutoff,
|
||||
int bodyStartTop,
|
||||
int keyTop)
|
||||
@@ -89,7 +89,7 @@ internal static class CriticalTableParserSupport
|
||||
.ThenBy(item => item.Left)
|
||||
.ToList();
|
||||
|
||||
var merged = new List<XmlTextFragment>();
|
||||
var merged = new List<PositionedTextFragment>();
|
||||
|
||||
for (var index = 0; index < candidates.Count; index++)
|
||||
{
|
||||
@@ -107,7 +107,7 @@ internal static class CriticalTableParserSupport
|
||||
}
|
||||
}
|
||||
|
||||
var deduped = new List<XmlTextFragment>();
|
||||
var deduped = new List<PositionedTextFragment>();
|
||||
|
||||
foreach (var candidate in merged)
|
||||
{
|
||||
@@ -128,7 +128,7 @@ internal static class CriticalTableParserSupport
|
||||
internal static bool IsRollBandLabel(string value) =>
|
||||
Regex.IsMatch(value.Trim(), @"^\d{2,3}(?:\s*-\s*\d{2,3})?$|^\d{2,3}\+$");
|
||||
|
||||
internal static bool IsPotentialRowLabelFragment(XmlTextFragment fragment, int leftCutoff) =>
|
||||
internal static bool IsPotentialRowLabelFragment(PositionedTextFragment fragment, int leftCutoff) =>
|
||||
fragment.Left < leftCutoff &&
|
||||
(IsRollBandLabel(fragment.Text) || LooksLikeSplitRollBandStart(fragment.Text));
|
||||
|
||||
@@ -163,9 +163,9 @@ internal static class CriticalTableParserSupport
|
||||
return columns[^1].Key;
|
||||
}
|
||||
|
||||
internal static IReadOnlyList<ColumnarCellLine> BuildLines(IReadOnlyList<XmlTextFragment> fragments)
|
||||
internal static IReadOnlyList<ColumnarCellLine> BuildLines(IReadOnlyList<PositionedTextFragment> fragments)
|
||||
{
|
||||
var lines = new List<List<XmlTextFragment>>();
|
||||
var lines = new List<List<PositionedTextFragment>>();
|
||||
|
||||
foreach (var fragment in fragments.OrderBy(item => item.Top).ThenBy(item => item.Left))
|
||||
{
|
||||
@@ -292,9 +292,9 @@ internal static class CriticalTableParserSupport
|
||||
.Replace('’', '\'')
|
||||
.Trim();
|
||||
|
||||
private static List<XmlTextFragment> RemoveRedundantContainedFragments(IReadOnlyList<XmlTextFragment> fragments)
|
||||
private static List<PositionedTextFragment> RemoveRedundantContainedFragments(IReadOnlyList<PositionedTextFragment> fragments)
|
||||
{
|
||||
var redundant = new HashSet<XmlTextFragment>();
|
||||
var redundant = new HashSet<PositionedTextFragment>();
|
||||
|
||||
foreach (var group in fragments.GroupBy(item => (item.PageNumber, item.Top, item.Height)))
|
||||
{
|
||||
@@ -331,7 +331,7 @@ internal static class CriticalTableParserSupport
|
||||
.ToList();
|
||||
}
|
||||
|
||||
private static bool IsHorizontallyContained(XmlTextFragment candidate, XmlTextFragment container)
|
||||
private static bool IsHorizontallyContained(PositionedTextFragment candidate, PositionedTextFragment container)
|
||||
{
|
||||
const int containmentTolerance = 1;
|
||||
|
||||
@@ -353,7 +353,7 @@ internal static class CriticalTableParserSupport
|
||||
return normalized.Length == 0 ? null : normalized;
|
||||
}
|
||||
|
||||
internal static int FindKeyTop(IReadOnlyList<XmlTextFragment> fragments) =>
|
||||
internal static int FindKeyTop(IReadOnlyList<PositionedTextFragment> fragments) =>
|
||||
fragments
|
||||
.Where(item =>
|
||||
string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
|
||||
@@ -362,7 +362,7 @@ internal static class CriticalTableParserSupport
|
||||
.Select(item => (int?)item.Top)
|
||||
.Min() ?? int.MaxValue;
|
||||
|
||||
internal static AffixLegend ParseAffixLegend(IReadOnlyList<XmlTextFragment> fragments, int keyTop)
|
||||
internal static AffixLegend ParseAffixLegend(IReadOnlyList<PositionedTextFragment> fragments, int keyTop)
|
||||
{
|
||||
if (keyTop == int.MaxValue)
|
||||
{
|
||||
@@ -401,12 +401,12 @@ internal static class CriticalTableParserSupport
|
||||
supportsPowerPointModifier: footerText.Contains("powerpoint modification", StringComparison.OrdinalIgnoreCase));
|
||||
}
|
||||
|
||||
internal static List<XmlTextFragment> SplitBoundaryCrossingFragments(
|
||||
IReadOnlyList<XmlTextFragment> bodyFragments,
|
||||
internal static List<PositionedTextFragment> SplitBoundaryCrossingFragments(
|
||||
IReadOnlyList<PositionedTextFragment> bodyFragments,
|
||||
IReadOnlyList<(string Key, double CenterX)> columnCenters,
|
||||
IReadOnlySet<string> affixLegendSymbols)
|
||||
{
|
||||
var splitFragments = new List<XmlTextFragment>(bodyFragments.Count);
|
||||
var splitFragments = new List<PositionedTextFragment>(bodyFragments.Count);
|
||||
|
||||
foreach (var fragment in bodyFragments)
|
||||
{
|
||||
@@ -417,7 +417,7 @@ internal static class CriticalTableParserSupport
|
||||
}
|
||||
|
||||
internal static List<(int Top, bool IsAffixLike)> BuildBodyLines(
|
||||
IReadOnlyList<XmlTextFragment> bodyFragments,
|
||||
IReadOnlyList<PositionedTextFragment> bodyFragments,
|
||||
IReadOnlyList<(string Key, double CenterX)> columnCenters,
|
||||
IReadOnlySet<string> affixLegendSymbols)
|
||||
{
|
||||
@@ -440,7 +440,7 @@ internal static class CriticalTableParserSupport
|
||||
return bodyLines;
|
||||
}
|
||||
|
||||
internal static bool IsFooterPageNumberFragment(XmlTextFragment fragment, int keyTop)
|
||||
internal static bool IsFooterPageNumberFragment(PositionedTextFragment fragment, int keyTop)
|
||||
{
|
||||
if (keyTop == int.MaxValue)
|
||||
{
|
||||
@@ -451,9 +451,9 @@ internal static class CriticalTableParserSupport
|
||||
Regex.IsMatch(fragment.Text, @"^\d{2,3}$");
|
||||
}
|
||||
|
||||
internal static IEnumerable<List<XmlTextFragment>> GroupByTop(IReadOnlyList<XmlTextFragment> fragments)
|
||||
internal static IEnumerable<List<PositionedTextFragment>> GroupByTop(IReadOnlyList<PositionedTextFragment> fragments)
|
||||
{
|
||||
var groups = new List<List<XmlTextFragment>>();
|
||||
var groups = new List<List<PositionedTextFragment>>();
|
||||
|
||||
foreach (var fragment in fragments)
|
||||
{
|
||||
@@ -469,7 +469,7 @@ internal static class CriticalTableParserSupport
|
||||
return groups;
|
||||
}
|
||||
|
||||
internal static List<RowAnchor> CreateRowAnchors(IReadOnlyList<XmlTextFragment> rowLabelFragments) =>
|
||||
internal static List<RowAnchor> CreateRowAnchors(IReadOnlyList<PositionedTextFragment> rowLabelFragments) =>
|
||||
rowLabelFragments
|
||||
.OrderBy(item => item.Top)
|
||||
.Select((item, index) => new RowAnchor(NormalizeRollBandLabel(item.Text), item.Top, index + 1))
|
||||
@@ -489,13 +489,13 @@ internal static class CriticalTableParserSupport
|
||||
rowAnchors[0].Top - HeaderToRowLabelMinimumGap - TopGroupingTolerance));
|
||||
}
|
||||
|
||||
internal static List<XmlTextFragment> BuildBodyFragments(
|
||||
IReadOnlyList<XmlTextFragment> fragments,
|
||||
internal static List<PositionedTextFragment> BuildBodyFragments(
|
||||
IReadOnlyList<PositionedTextFragment> fragments,
|
||||
int bodyStartTop,
|
||||
int keyTop,
|
||||
int leftCutoff,
|
||||
IReadOnlyList<RowAnchor> rowAnchors,
|
||||
IReadOnlyCollection<XmlTextFragment> excludedFragments,
|
||||
IReadOnlyCollection<PositionedTextFragment> excludedFragments,
|
||||
IReadOnlyList<(string Key, double CenterX)> columnCenters,
|
||||
IReadOnlySet<string> affixLegendSymbols)
|
||||
{
|
||||
@@ -580,7 +580,9 @@ internal static class CriticalTableParserSupport
|
||||
AffixLegend affixLegend,
|
||||
List<ParsedCriticalCellArtifact> parsedCells,
|
||||
List<ParsedCriticalResult> parsedResults,
|
||||
List<string> validationErrors)
|
||||
List<string> validationErrors,
|
||||
List<string>? validationWarnings = null,
|
||||
bool downgradeCellContentValidationToWarnings = false)
|
||||
{
|
||||
var sharedLegend = ToSharedAffixLegend(affixLegend);
|
||||
|
||||
@@ -589,8 +591,16 @@ internal static class CriticalTableParserSupport
|
||||
var lineTexts = cellEntry.Lines.Select(line => line.Text).ToList();
|
||||
var content = SharedParsing.CriticalCellTextParser.Parse(lineTexts, sharedLegend);
|
||||
var sourceBounds = BuildSourceBounds(cellEntry.Lines.SelectMany(line => line.Fragments).ToList());
|
||||
validationErrors.AddRange(content.ValidationErrors.Select(error =>
|
||||
$"Cell '{BuildCellIdentifier(cellEntry)}': {error}"));
|
||||
var contentIssues = content.ValidationErrors.Select(error =>
|
||||
$"Cell '{BuildCellIdentifier(cellEntry)}': {error}");
|
||||
if (downgradeCellContentValidationToWarnings)
|
||||
{
|
||||
validationWarnings?.AddRange(contentIssues);
|
||||
}
|
||||
else
|
||||
{
|
||||
validationErrors.AddRange(contentIssues);
|
||||
}
|
||||
|
||||
var effects = content.Effects.Select(ToImportToolEffect).ToList();
|
||||
var branches = content.Branches.Select(ToImportToolBranch).ToList();
|
||||
@@ -621,7 +631,7 @@ internal static class CriticalTableParserSupport
|
||||
}
|
||||
}
|
||||
|
||||
private static ParsedCriticalSourceRect BuildSourceBounds(IReadOnlyList<XmlTextFragment> fragments)
|
||||
private static ParsedCriticalSourceRect BuildSourceBounds(IReadOnlyList<PositionedTextFragment> fragments)
|
||||
{
|
||||
if (fragments.Count == 0)
|
||||
{
|
||||
@@ -688,7 +698,7 @@ internal static class CriticalTableParserSupport
|
||||
private static bool LooksLikeSplitRollBandStart(string value) =>
|
||||
Regex.IsMatch(value.Trim(), @"^\d{2,3}\s*-$");
|
||||
|
||||
private static bool TryMergeSplitRollBand(IReadOnlyList<XmlTextFragment> candidates, int index, out XmlTextFragment mergedCandidate)
|
||||
private static bool TryMergeSplitRollBand(IReadOnlyList<PositionedTextFragment> candidates, int index, out PositionedTextFragment mergedCandidate)
|
||||
{
|
||||
var current = candidates[index];
|
||||
if (!LooksLikeSplitRollBandStart(current.Text) || index + 1 >= candidates.Count)
|
||||
@@ -712,7 +722,7 @@ internal static class CriticalTableParserSupport
|
||||
var mergedLabel = $"{startDigits}-{next.Text.Trim()}";
|
||||
var right = Math.Max(current.Left + current.Width, next.Left + next.Width);
|
||||
|
||||
mergedCandidate = new XmlTextFragment(
|
||||
mergedCandidate = new PositionedTextFragment(
|
||||
current.PageNumber,
|
||||
current.Top,
|
||||
Math.Min(current.Left, next.Left),
|
||||
@@ -722,8 +732,8 @@ internal static class CriticalTableParserSupport
|
||||
return true;
|
||||
}
|
||||
|
||||
private static IReadOnlyList<XmlTextFragment> SplitBoundaryCrossingFragment(
|
||||
XmlTextFragment fragment,
|
||||
private static IReadOnlyList<PositionedTextFragment> SplitBoundaryCrossingFragment(
|
||||
PositionedTextFragment fragment,
|
||||
IReadOnlyList<(string Key, double CenterX)> columnCenters,
|
||||
IReadOnlySet<string> affixLegendSymbols)
|
||||
{
|
||||
@@ -746,8 +756,8 @@ internal static class CriticalTableParserSupport
|
||||
return [fragment];
|
||||
}
|
||||
|
||||
private static IReadOnlyList<XmlTextFragment> BuildSplitFragmentsFromMatches(
|
||||
XmlTextFragment fragment,
|
||||
private static IReadOnlyList<PositionedTextFragment> BuildSplitFragmentsFromMatches(
|
||||
PositionedTextFragment fragment,
|
||||
MatchCollection matches,
|
||||
IReadOnlyList<(string Key, double CenterX)> columnCenters)
|
||||
{
|
||||
@@ -757,7 +767,7 @@ internal static class CriticalTableParserSupport
|
||||
}
|
||||
|
||||
var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
|
||||
var splitFragments = new List<XmlTextFragment>(matches.Count);
|
||||
var splitFragments = new List<PositionedTextFragment>(matches.Count);
|
||||
|
||||
foreach (Match match in matches)
|
||||
{
|
||||
@@ -770,7 +780,7 @@ internal static class CriticalTableParserSupport
|
||||
var segmentLeft = fragment.Left + (int)Math.Round(characterWidth * match.Index);
|
||||
var segmentWidth = Math.Max(1, (int)Math.Round(characterWidth * match.Length));
|
||||
|
||||
splitFragments.Add(new XmlTextFragment(
|
||||
splitFragments.Add(new PositionedTextFragment(
|
||||
fragment.PageNumber,
|
||||
fragment.Top,
|
||||
segmentLeft,
|
||||
@@ -796,9 +806,9 @@ internal static class CriticalTableParserSupport
|
||||
}
|
||||
|
||||
private static bool TrySplitProseFragmentAtBoundaries(
|
||||
XmlTextFragment fragment,
|
||||
PositionedTextFragment fragment,
|
||||
IReadOnlyList<(string Key, double CenterX)> columnCenters,
|
||||
out IReadOnlyList<XmlTextFragment> splitFragments)
|
||||
out IReadOnlyList<PositionedTextFragment> splitFragments)
|
||||
{
|
||||
splitFragments = null!;
|
||||
|
||||
@@ -808,7 +818,7 @@ internal static class CriticalTableParserSupport
|
||||
return false;
|
||||
}
|
||||
|
||||
var segments = new List<XmlTextFragment>();
|
||||
var segments = new List<PositionedTextFragment>();
|
||||
var segmentStart = 0;
|
||||
var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
|
||||
|
||||
@@ -839,7 +849,7 @@ internal static class CriticalTableParserSupport
|
||||
}
|
||||
|
||||
private static List<int> FindBoundarySplitIndexes(
|
||||
XmlTextFragment fragment,
|
||||
PositionedTextFragment fragment,
|
||||
IReadOnlyList<(string Key, double CenterX)> columnCenters)
|
||||
{
|
||||
var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
|
||||
@@ -907,8 +917,8 @@ internal static class CriticalTableParserSupport
|
||||
return bestIndex;
|
||||
}
|
||||
|
||||
private static XmlTextFragment? CreateFragmentSegment(
|
||||
XmlTextFragment fragment,
|
||||
private static PositionedTextFragment? CreateFragmentSegment(
|
||||
PositionedTextFragment fragment,
|
||||
int startIndex,
|
||||
int length,
|
||||
double characterWidth)
|
||||
@@ -940,7 +950,7 @@ internal static class CriticalTableParserSupport
|
||||
var actualLength = trimmedEnd - trimmedStart + 1;
|
||||
var segmentText = CollapseWhitespace(fragment.Text.Substring(actualStart, actualLength));
|
||||
|
||||
return new XmlTextFragment(
|
||||
return new PositionedTextFragment(
|
||||
fragment.PageNumber,
|
||||
fragment.Top,
|
||||
fragment.Left + (int)Math.Round(characterWidth * actualStart),
|
||||
@@ -950,7 +960,7 @@ internal static class CriticalTableParserSupport
|
||||
}
|
||||
|
||||
private static bool CrossesColumnBoundary(
|
||||
XmlTextFragment fragment,
|
||||
PositionedTextFragment fragment,
|
||||
IReadOnlyList<(string Key, double CenterX)> columnCenters)
|
||||
{
|
||||
var fragmentRight = fragment.Left + fragment.Width;
|
||||
|
||||
@@ -14,10 +14,10 @@ public sealed class GroupedVariantCriticalTableParser
|
||||
new("SLAYING", "Slaying", "variant", 2)
|
||||
];
|
||||
|
||||
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
|
||||
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, ExtractedCriticalSource source)
|
||||
{
|
||||
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
|
||||
var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
|
||||
var fragments = source.Fragments;
|
||||
var pageGeometries = source.PageGeometries;
|
||||
var groupHeaders = FindGroupHeaders(fragments);
|
||||
var columnHeaders = FindColumnHeaders(fragments);
|
||||
var validationErrors = new List<string>();
|
||||
@@ -50,7 +50,7 @@ public sealed class GroupedVariantCriticalTableParser
|
||||
|
||||
if (rowAnchors.Count == 0)
|
||||
{
|
||||
validationErrors.Add("No roll-band labels were found in the XML artifact.");
|
||||
validationErrors.Add("No roll-band labels were found in the source artifact.");
|
||||
}
|
||||
|
||||
var columnCenters = combinedColumnAnchors
|
||||
@@ -136,16 +136,28 @@ public sealed class GroupedVariantCriticalTableParser
|
||||
entry.DisplayName,
|
||||
entry.Family,
|
||||
Path.GetFileName(entry.PdfPath),
|
||||
"Imported from PDF XML extraction.",
|
||||
source.ImportNotes,
|
||||
ExpectedGroups,
|
||||
ExpectedColumns,
|
||||
parsedRollBands,
|
||||
parsedResults);
|
||||
|
||||
return new CriticalTableParseResult(table, pageGeometries, fragments, parsedCells, validationReport);
|
||||
return new CriticalTableParseResult(table, pageGeometries, fragments, source.RenderProfile, parsedCells, validationReport);
|
||||
}
|
||||
|
||||
private static List<XmlTextFragment> FindGroupHeaders(IReadOnlyList<XmlTextFragment> fragments)
|
||||
/// <summary>
/// Parses a grouped-variant table from PDF XML content by wrapping it in an "xml" extracted
/// source and delegating to the source-based overload.
/// </summary>
/// <param name="entry">Manifest entry of the table being parsed.</param>
/// <param name="xmlContent">XML text from which page geometries and fragments are loaded.</param>
/// <returns>The result of the source-based parse.</returns>
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
    var renderProfile = SourceRenderProfile.XmlAligned();
    var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
    var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);

    var xmlSource = new ExtractedCriticalSource(
        "xml",
        "Imported from PDF XML extraction.",
        renderProfile,
        pageGeometries,
        fragments);

    return Parse(entry, xmlSource);
}
|
||||
|
||||
private static List<PositionedTextFragment> FindGroupHeaders(IReadOnlyList<PositionedTextFragment> fragments)
|
||||
{
|
||||
var expectedLabels = ExpectedGroups.Select(item => item.Label).ToList();
|
||||
var headerCandidates = fragments
|
||||
@@ -164,10 +176,10 @@ public sealed class GroupedVariantCriticalTableParser
|
||||
}
|
||||
}
|
||||
|
||||
throw new InvalidOperationException("Could not find the grouped-variant section headers in the XML artifact.");
|
||||
throw new InvalidOperationException("Could not find the grouped-variant section headers in the source artifact.");
|
||||
}
|
||||
|
||||
private static List<XmlTextFragment> FindColumnHeaders(IReadOnlyList<XmlTextFragment> fragments)
|
||||
private static List<PositionedTextFragment> FindColumnHeaders(IReadOnlyList<PositionedTextFragment> fragments)
|
||||
{
|
||||
var expectedLabels = new[] { "normal", "slaying", "normal", "slaying" };
|
||||
var headerCandidates = fragments
|
||||
@@ -190,6 +202,6 @@ public sealed class GroupedVariantCriticalTableParser
|
||||
}
|
||||
}
|
||||
|
||||
throw new InvalidOperationException("Could not find the grouped-variant column header row in the XML artifact.");
|
||||
throw new InvalidOperationException("Could not find the grouped-variant column header row in the source artifact.");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
/// <summary>
/// An immutable piece of text together with its page number and bounding box.
/// </summary>
public class PositionedTextFragment
{
    /// <summary>
    /// Creates a fragment from its page, box, and text.
    /// </summary>
    /// <param name="pageNumber">Page the fragment belongs to.</param>
    /// <param name="top">Top coordinate of the box.</param>
    /// <param name="left">Left coordinate of the box.</param>
    /// <param name="width">Width of the box.</param>
    /// <param name="height">Height of the box.</param>
    /// <param name="text">Text content.</param>
    /// <param name="confidence">Optional confidence value; null when not supplied.</param>
    public PositionedTextFragment(
        int pageNumber,
        int top,
        int left,
        int width,
        int height,
        string text,
        int? confidence = null)
    {
        PageNumber = pageNumber;
        Top = top;
        Left = left;
        Width = width;
        Height = height;
        Text = text;
        Confidence = confidence;
    }

    public int PageNumber { get; }

    public int Top { get; }

    public int Left { get; }

    public int Width { get; }

    public int Height { get; }

    public string Text { get; }

    public int? Confidence { get; }

    /// <summary>Horizontal midpoint of the box: <c>Left</c> plus half of <c>Width</c>.</summary>
    public double CenterX
    {
        get { return Left + (Width / 2.0); }
    }
}
|
||||
@@ -2,12 +2,14 @@ namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
public sealed class StandardCriticalTableParseResult(
|
||||
ParsedCriticalTable table,
|
||||
IReadOnlyList<XmlTextFragment> fragments,
|
||||
IReadOnlyList<PositionedTextFragment> fragments,
|
||||
SourceRenderProfile renderProfile,
|
||||
IReadOnlyList<ParsedCriticalCellArtifact> cells,
|
||||
ImportValidationReport validationReport)
|
||||
{
|
||||
public ParsedCriticalTable Table { get; } = table;
|
||||
public IReadOnlyList<XmlTextFragment> Fragments { get; } = fragments;
|
||||
public IReadOnlyList<PositionedTextFragment> Fragments { get; } = fragments;
|
||||
public SourceRenderProfile RenderProfile { get; } = renderProfile;
|
||||
public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; } = cells;
|
||||
public ImportValidationReport ValidationReport { get; } = validationReport;
|
||||
}
|
||||
|
||||
@@ -2,23 +2,140 @@ namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
public sealed class StandardCriticalTableParser
|
||||
{
|
||||
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
|
||||
/// <summary>
/// Parses a standard critical table from already-extracted source fragments.
/// </summary>
/// <remarks>
/// Body fragments are bucketed into one cell per roll-band row and column, and each cell is parsed
/// into results. Cell-content issues are recorded as warnings rather than errors when the source's
/// extraction method is "ocr" (compared ignoring case). A missing cell, a column count other than
/// five, and a cell-count mismatch are always recorded as errors.
/// </remarks>
/// <param name="entry">Manifest entry providing the slug, display name, family, and PDF path recorded on the table.</param>
/// <param name="source">Extracted fragments, page geometries, render profile, import notes, and extraction method.</param>
/// <param name="layout">Optional layout to use; when null, a layout is built from the fragments.</param>
/// <returns>The parsed table, its cell artifacts, and the validation report.</returns>
internal CriticalTableParseResult Parse(CriticalImportManifestEntry entry, ExtractedCriticalSource source, StandardTableLayout? layout = null)
{
    var sourceFragments = source.Fragments;
    var sourcePageGeometries = source.PageGeometries;
    var errors = new List<string>();
    var warnings = new List<string>();

    layout ??= BuildLayout(sourceFragments, errors);
    warnings.AddRange(layout.Warnings);

    var rowAnchors = layout.RowAnchors;
    var columnCenters = layout.ColumnCenters;

    var legend = CriticalTableParserSupport.ParseAffixLegend(sourceFragments, layout.KeyTop);
    var legendSymbols = legend.ClassificationSymbols;
    var bodyFragments = CriticalTableParserSupport.BuildBodyFragments(
        sourceFragments,
        layout.BodyStartTop,
        layout.KeyTop,
        layout.LeftCutoff,
        rowAnchors,
        layout.ExcludedFragments,
        columnCenters,
        legendSymbols);
    var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, legendSymbols);

    var rollBands = rowAnchors
        .Select(row => CriticalTableParserSupport.CreateRollBand(row.Label, row.SortOrder))
        .ToList();

    var cellEntries = new List<ColumnarCellEntry>();
    var lastRowIndex = rowAnchors.Count - 1;

    for (var rowIndex = 0; rowIndex <= lastRowIndex; rowIndex++)
    {
        var currentRow = rowAnchors[rowIndex];

        // The first row starts at the body top; later rows start at the boundary with the previous row.
        var rowStart = rowIndex == 0
            ? layout.BodyStartTop
            : CriticalTableParserSupport.ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], currentRow, bodyLines);

        // The last row ends just above the key; earlier rows end at the boundary with the next row.
        var rowEnd = rowIndex == lastRowIndex
            ? layout.KeyTop - 1
            : CriticalTableParserSupport.ResolveRowBoundaryTop(currentRow, rowAnchors[rowIndex + 1], bodyLines);

        var fragmentsInRow = bodyFragments
            .Where(fragment => fragment.Top >= rowStart && fragment.Top < rowEnd)
            .ToList();

        foreach (var column in columnCenters)
        {
            var fragmentsInCell = fragmentsInRow
                .Where(fragment => CriticalTableParserSupport.ResolveColumn(fragment.CenterX, columnCenters) == column.Key)
                .OrderBy(fragment => fragment.Top)
                .ThenBy(fragment => fragment.Left)
                .ToList();

            if (fragmentsInCell.Count == 0)
            {
                errors.Add($"Missing content for roll band '{currentRow.Label}', column '{column.Key}'.");
                continue;
            }

            var cellLines = CriticalTableParserSupport.BuildLines(fragmentsInCell).ToList();
            cellEntries.Add(new ColumnarCellEntry(
                null,
                currentRow.Label,
                rowIndex,
                column.Key,
                cellLines));
        }
    }

    CriticalTableParserSupport.RepairLeadingAffixLeakage(cellEntries, legendSymbols);

    var parsedCells = new List<ParsedCriticalCellArtifact>();
    var parsedResults = new List<ParsedCriticalResult>();
    var isOcrSource = string.Equals(source.ExtractionMethod, "ocr", StringComparison.OrdinalIgnoreCase);
    CriticalTableParserSupport.BuildParsedArtifacts(
        cellEntries,
        legend,
        parsedCells,
        parsedResults,
        errors,
        warnings,
        downgradeCellContentValidationToWarnings: isOcrSource);

    if (columnCenters.Count != 5)
    {
        errors.Add($"Expected 5 standard-table columns but found {columnCenters.Count}.");
    }

    var expectedCellCount = rowAnchors.Count * columnCenters.Count;
    if (parsedCells.Count != expectedCellCount)
    {
        errors.Add(
            $"Expected {expectedCellCount} parsed cells but produced {parsedCells.Count}.");
    }

    var isValid = errors.Count == 0;
    var report = new ImportValidationReport(
        isValid,
        errors,
        warnings,
        rowAnchors.Count,
        parsedCells.Count);

    var columns = columnCenters
        .Select((column, position) => new ParsedCriticalColumn(column.Key, column.Key, "severity", position + 1))
        .ToList();

    var parsedTable = new ParsedCriticalTable(
        entry.Slug,
        entry.DisplayName,
        entry.Family,
        Path.GetFileName(entry.PdfPath),
        source.ImportNotes,
        [],
        columns,
        rollBands,
        parsedResults);

    return new CriticalTableParseResult(parsedTable, sourcePageGeometries, sourceFragments, source.RenderProfile, parsedCells, report);
}
|
||||
|
||||
/// <summary>
/// Convenience overload for callers that hold raw PDF-to-XML output: loads the page
/// geometries and fragments from <paramref name="xmlContent"/>, wraps them in an
/// <see cref="ExtractedCriticalSource"/> tagged "xml", and forwards to the source-based overload.
/// </summary>
/// <param name="entry">Manifest entry describing the table being imported.</param>
/// <param name="xmlContent">XML text handed to <c>CriticalTableParserSupport</c> for loading.</param>
/// <returns>The result of the source-based <c>Parse</c> overload.</returns>
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
    // Evaluated in the same order as the constructor arguments they feed.
    var renderProfile = SourceRenderProfile.XmlAligned();
    var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
    var textFragments = CriticalTableParserSupport.LoadFragments(xmlContent);

    var xmlSource = new ExtractedCriticalSource(
        "xml",
        "Imported from PDF XML extraction.",
        renderProfile,
        pageGeometries,
        textFragments);

    return Parse(entry, xmlSource);
}
|
||||
|
||||
private static StandardTableLayout BuildLayout(
|
||||
IReadOnlyList<PositionedTextFragment> fragments,
|
||||
ICollection<string> validationErrors)
|
||||
{
|
||||
var headerFragments = FindHeaderFragments(fragments);
|
||||
var columnCenters = headerFragments
|
||||
.OrderBy(item => item.Left)
|
||||
.Select(item => (Key: item.Text.ToUpperInvariant(), CenterX: item.CenterX))
|
||||
.ToList();
|
||||
|
||||
var headerTop = headerFragments.Max(item => item.Top);
|
||||
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
|
||||
var affixLegend = CriticalTableParserSupport.ParseAffixLegend(fragments, keyTop);
|
||||
var affixLegendSymbols = affixLegend.ClassificationSymbols;
|
||||
var leftCutoff = headerFragments.Min(item => item.Left) - 10;
|
||||
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
|
||||
fragments,
|
||||
@@ -30,102 +147,13 @@ public sealed class StandardCriticalTableParser
|
||||
|
||||
if (rowAnchors.Count == 0)
|
||||
{
|
||||
validationErrors.Add("No roll-band labels were found in the XML artifact.");
|
||||
validationErrors.Add("No roll-band labels were found in the source artifact.");
|
||||
}
|
||||
|
||||
var bodyFragments = CriticalTableParserSupport.BuildBodyFragments(
|
||||
fragments,
|
||||
bodyStartTop,
|
||||
keyTop,
|
||||
leftCutoff,
|
||||
rowAnchors,
|
||||
headerFragments,
|
||||
columnCenters,
|
||||
affixLegendSymbols);
|
||||
var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
|
||||
|
||||
var parsedRollBands = rowAnchors
|
||||
.Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
|
||||
.ToList();
|
||||
|
||||
var cellEntries = new List<ColumnarCellEntry>();
|
||||
|
||||
for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
|
||||
{
|
||||
var rowStart = rowIndex == 0
|
||||
? bodyStartTop
|
||||
: CriticalTableParserSupport.ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines);
|
||||
|
||||
var rowEnd = rowIndex == rowAnchors.Count - 1
|
||||
? keyTop - 1
|
||||
: CriticalTableParserSupport.ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines);
|
||||
|
||||
var rowFragments = bodyFragments
|
||||
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
|
||||
.ToList();
|
||||
|
||||
foreach (var columnAnchor in columnCenters)
|
||||
{
|
||||
var cellFragments = rowFragments
|
||||
.Where(item => CriticalTableParserSupport.ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key)
|
||||
.OrderBy(item => item.Top)
|
||||
.ThenBy(item => item.Left)
|
||||
.ToList();
|
||||
|
||||
if (cellFragments.Count == 0)
|
||||
{
|
||||
validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', column '{columnAnchor.Key}'.");
|
||||
continue;
|
||||
}
|
||||
|
||||
cellEntries.Add(new ColumnarCellEntry(
|
||||
null,
|
||||
rowAnchors[rowIndex].Label,
|
||||
rowIndex,
|
||||
columnAnchor.Key,
|
||||
CriticalTableParserSupport.BuildLines(cellFragments).ToList()));
|
||||
}
|
||||
}
|
||||
|
||||
CriticalTableParserSupport.RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);
|
||||
|
||||
var parsedCells = new List<ParsedCriticalCellArtifact>();
|
||||
var parsedResults = new List<ParsedCriticalResult>();
|
||||
CriticalTableParserSupport.BuildParsedArtifacts(cellEntries, affixLegend, parsedCells, parsedResults, validationErrors);
|
||||
|
||||
if (columnCenters.Count != 5)
|
||||
{
|
||||
validationErrors.Add($"Expected 5 standard-table columns but found {columnCenters.Count}.");
|
||||
}
|
||||
|
||||
if (parsedCells.Count != rowAnchors.Count * columnCenters.Count)
|
||||
{
|
||||
validationErrors.Add(
|
||||
$"Expected {rowAnchors.Count * columnCenters.Count} parsed cells but produced {parsedCells.Count}.");
|
||||
}
|
||||
|
||||
var validationReport = new ImportValidationReport(
|
||||
validationErrors.Count == 0,
|
||||
validationErrors,
|
||||
validationWarnings,
|
||||
rowAnchors.Count,
|
||||
parsedCells.Count);
|
||||
|
||||
var table = new ParsedCriticalTable(
|
||||
entry.Slug,
|
||||
entry.DisplayName,
|
||||
entry.Family,
|
||||
Path.GetFileName(entry.PdfPath),
|
||||
"Imported from PDF XML extraction.",
|
||||
[],
|
||||
columnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(),
|
||||
parsedRollBands,
|
||||
parsedResults);
|
||||
|
||||
return new CriticalTableParseResult(table, pageGeometries, fragments, parsedCells, validationReport);
|
||||
return new StandardTableLayout(headerFragments, columnCenters, rowAnchors, headerTop, bodyStartTop, keyTop, leftCutoff);
|
||||
}
|
||||
|
||||
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
|
||||
private static List<PositionedTextFragment> FindHeaderFragments(IReadOnlyList<PositionedTextFragment> fragments)
|
||||
{
|
||||
var headerCandidates = fragments
|
||||
.Where(item => item.Text.Length == 1 && char.IsLetter(item.Text[0]))
|
||||
@@ -143,6 +171,6 @@ public sealed class StandardCriticalTableParser
|
||||
}
|
||||
}
|
||||
|
||||
throw new InvalidOperationException("Could not find the standard-table A-E header row in the XML artifact.");
|
||||
throw new InvalidOperationException("Could not find the standard-table A-E header row in the source artifact.");
|
||||
}
|
||||
}
|
||||
|
||||
150
src/RolemasterDb.ImportTool/Parsing/StandardOcrBootstrapper.cs
Normal file
150
src/RolemasterDb.ImportTool/Parsing/StandardOcrBootstrapper.cs
Normal file
@@ -0,0 +1,150 @@
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
/// <summary>
/// Derives a <see cref="StandardTableLayout"/> from OCR fragments, using an axis template to
/// locate the header row and to verify the detected roll-band labels.
/// </summary>
internal sealed class StandardOcrBootstrapper
{
    // Header or row-label fragments with a confidence below this value are reported as warnings.
    private const int AnchorConfidenceWarningThreshold = 85;

    // Largest difference in Top (relative to the first fragment of a row) for a header
    // candidate to be grouped into that row.
    private const int HeaderTopTolerance = 12;

    /// <summary>
    /// Builds the table layout for an OCR-extracted source.
    /// </summary>
    /// <param name="source">Extracted source whose fragments are analysed.</param>
    /// <param name="template">Expected column keys and roll-band labels.</param>
    /// <returns>The layout, including any header-normalization or low-confidence warnings.</returns>
    /// <exception cref="InvalidOperationException">
    /// Thrown when no header row matches the template, or when the detected row anchors differ
    /// from the template in count or in label sequence.
    /// </exception>
    public StandardTableLayout Bootstrap(ExtractedCriticalSource source, StandardTableAxisTemplate template)
    {
        var fragments = source.Fragments;
        var headerRow = FindHeaderFragments(fragments, template);

        var columnCenters =
            (from header in headerRow
             orderby header.Left
             select (Key: NormalizeHeaderText(header.Text), CenterX: header.CenterX)).ToList();

        var headerTop = headerRow.Select(item => item.Top).Max();
        var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
        var leftCutoff = ResolveRowLabelLeftCutoff(headerRow);

        var minimumRowLabelTop = headerTop + CriticalTableParserSupport.HeaderToRowLabelMinimumGap;
        var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
            fragments,
            leftCutoff,
            minimumRowLabelTop,
            keyTop);
        var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
        var bodyStartTop = CriticalTableParserSupport.ResolveBodyStartTop(headerTop, rowAnchors);
        var warnings = new List<string>();

        // The template is authoritative: any disagreement in row count or labels aborts the bootstrap.
        var expectedLabels = template.RollBandLabels;
        if (rowAnchors.Count != expectedLabels.Count)
        {
            throw new InvalidOperationException(
                $"OCR bootstrap found {rowAnchors.Count} row anchors but template '{template.Slug}' expects {template.RollBandLabels.Count}.");
        }

        var labelsMatch = rowAnchors
            .Select(item => item.Label)
            .ToList()
            .SequenceEqual(expectedLabels, StringComparer.Ordinal);
        if (!labelsMatch)
        {
            throw new InvalidOperationException(
                $"OCR bootstrap row anchors do not match template '{template.Slug}'.");
        }

        AddHeaderNormalizationWarning(headerRow, warnings);
        AddLowConfidenceWarning(headerRow.Concat(rowLabelFragments), warnings);

        return new StandardTableLayout(
            headerRow,
            columnCenters,
            rowAnchors,
            headerTop,
            bodyStartTop,
            keyTop,
            leftCutoff,
            warnings);
    }

    // Records a single warning listing every header whose raw text differs from its normalized key.
    private static void AddHeaderNormalizationWarning(
        IReadOnlyList<PositionedTextFragment> headerRow,
        List<string> warnings)
    {
        var rewrites = new List<string>();
        foreach (var header in headerRow)
        {
            var canonical = NormalizeHeaderText(header.Text);
            if (!string.Equals(header.Text, canonical, StringComparison.Ordinal))
            {
                rewrites.Add($"'{header.Text}' -> '{canonical}'");
            }
        }

        if (rewrites.Count > 0)
        {
            warnings.Add(
                $"OCR header normalization was applied for: {string.Join(", ", rewrites)}.");
        }
    }

    // Records a single warning listing every anchor whose confidence is present and below the threshold.
    private static void AddLowConfidenceWarning(
        IEnumerable<PositionedTextFragment> anchors,
        List<string> warnings)
    {
        var flagged = new List<string>();
        foreach (var anchor in anchors)
        {
            if (anchor.Confidence is int confidence && confidence < AnchorConfidenceWarningThreshold)
            {
                flagged.Add($"'{anchor.Text}' ({anchor.Confidence})");
            }
        }

        if (flagged.Count > 0)
        {
            warnings.Add($"Low-confidence OCR anchors: {string.Join(", ", flagged)}.");
        }
    }

    /// <summary>
    /// Returns the first row of header candidates (scanned top to bottom) whose normalized texts,
    /// read left to right, equal the template's column keys.
    /// </summary>
    private static List<PositionedTextFragment> FindHeaderFragments(
        IReadOnlyList<PositionedTextFragment> fragments,
        StandardTableAxisTemplate template)
    {
        var candidates =
            (from fragment in fragments
             where TryNormalizeHeaderText(fragment.Text, out _)
             orderby fragment.Top, fragment.Left
             select fragment).ToList();

        var matchingRow = GroupHeaderCandidates(candidates)
            .Select(row => row.OrderBy(item => item.Left).ToList())
            .FirstOrDefault(row => row
                .Select(item => NormalizeHeaderText(item.Text))
                .ToList()
                .SequenceEqual(template.ColumnKeys, StringComparer.Ordinal));

        return matchingRow
            ?? throw new InvalidOperationException("Could not find the OCR standard-table A-E header row.");
    }

    // Strict variant of TryNormalizeHeaderText: throws when the text is not a recognised header.
    private static string NormalizeHeaderText(string value) =>
        TryNormalizeHeaderText(value, out var normalized)
            ? normalized
            : throw new InvalidOperationException($"Unsupported OCR header fragment '{value}'.");

    /// <summary>
    /// Maps OCR text to a header key. After trimming and upper-casing, "A", "B", "D" and "E" map
    /// to themselves; "C" and "CC" both map to "C". For anything else the method returns false
    /// and <paramref name="normalized"/> holds the trimmed, upper-cased input.
    /// </summary>
    private static bool TryNormalizeHeaderText(string value, out string normalized)
    {
        var candidate = value.Trim().ToUpperInvariant();
        switch (candidate)
        {
            case "A":
            case "B":
            case "D":
            case "E":
                normalized = candidate;
                return true;

            case "C":
            case "CC":
                normalized = "C";
                return true;

            default:
                normalized = candidate;
                return false;
        }
    }

    /// <summary>
    /// Splits the candidates (in the order given) into rows: a fragment joins the current row
    /// when its Top is within <see cref="HeaderTopTolerance"/> of that row's first fragment,
    /// otherwise it starts a new row.
    /// </summary>
    private static IEnumerable<List<PositionedTextFragment>> GroupHeaderCandidates(IReadOnlyList<PositionedTextFragment> fragments)
    {
        var rows = new List<List<PositionedTextFragment>>();
        List<PositionedTextFragment>? currentRow = null;

        foreach (var fragment in fragments)
        {
            if (currentRow is not null && Math.Abs(currentRow[0].Top - fragment.Top) <= HeaderTopTolerance)
            {
                currentRow.Add(fragment);
                continue;
            }

            currentRow = [fragment];
            rows.Add(currentRow);
        }

        return rows;
    }

    /// <summary>
    /// Computes the left cutoff passed to the row-label search: the leftmost header's Left minus
    /// half the distance to the next header (or minus 10 when there is only one header),
    /// clamped to be non-negative.
    /// </summary>
    private static int ResolveRowLabelLeftCutoff(IReadOnlyList<PositionedTextFragment> headerFragments)
    {
        var byLeft = headerFragments.OrderBy(item => item.Left).ToList();
        var leftmost = byLeft[0].Left;
        var offset = byLeft.Count < 2
            ? 10
            : (byLeft[1].Left - leftmost) / 2;

        return Math.Max(0, leftmost - offset);
    }
}
|
||||
@@ -0,0 +1,11 @@
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
/// <summary>
/// Immutable description of a standard table's axes: an identifying slug, the ordered column
/// keys, and the ordered roll-band labels.
/// </summary>
internal sealed class StandardTableAxisTemplate
{
    /// <summary>Creates a template; the supplied values are stored as given.</summary>
    public StandardTableAxisTemplate(
        string slug,
        IReadOnlyList<string> columnKeys,
        IReadOnlyList<string> rollBandLabels)
    {
        Slug = slug;
        ColumnKeys = columnKeys;
        RollBandLabels = rollBandLabels;
    }

    /// <summary>Identifier of the template.</summary>
    public string Slug { get; }

    /// <summary>Column keys in their expected order.</summary>
    public IReadOnlyList<string> ColumnKeys { get; }

    /// <summary>Roll-band labels in their expected order.</summary>
    public IReadOnlyList<string> RollBandLabels { get; }
}
|
||||
@@ -0,0 +1,17 @@
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
/// <summary>
/// Lookup of the known standard-table axis templates by slug.
/// </summary>
internal static class StandardTableAxisTemplateCatalog
{
    /// <summary>
    /// Returns the template registered under <paramref name="slug"/> (compared case-insensitively).
    /// Only "mana-standard-19" is known: five columns A-E and nineteen roll bands.
    /// </summary>
    /// <exception cref="InvalidOperationException">The slug is null or not a known template.</exception>
    internal static StandardTableAxisTemplate Resolve(string? slug)
    {
        var isManaStandard19 = string.Equals(slug, "mana-standard-19", StringComparison.OrdinalIgnoreCase);
        if (!isManaStandard19)
        {
            throw new InvalidOperationException($"Unsupported standard-table axis template '{slug ?? "<null>"}'.");
        }

        // A fresh instance is built on every call.
        return new StandardTableAxisTemplate(
            "mana-standard-19",
            ["A", "B", "C", "D", "E"],
            [
                "01-05", "06-10", "11-15", "16-20", "21-35", "36-45", "46-50", "51-55", "56-60", "61-65",
                "66", "67-70", "71-75", "76-80", "81-85", "86-90", "91-95", "96-99", "100"
            ]);
    }
}
|
||||
21
src/RolemasterDb.ImportTool/Parsing/StandardTableLayout.cs
Normal file
21
src/RolemasterDb.ImportTool/Parsing/StandardTableLayout.cs
Normal file
@@ -0,0 +1,21 @@
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
/// <summary>
/// Immutable bundle of the geometry a standard-table parse works from: column centers,
/// row anchors, the vertical boundaries of the body, the row-label cutoff, and any warnings
/// collected while the layout was derived.
/// </summary>
internal sealed class StandardTableLayout
{
    /// <summary>
    /// Stores the supplied values as given; a null <paramref name="warnings"/> becomes an empty list.
    /// </summary>
    public StandardTableLayout(
        IReadOnlyList<PositionedTextFragment> excludedFragments,
        IReadOnlyList<(string Key, double CenterX)> columnCenters,
        IReadOnlyList<RowAnchor> rowAnchors,
        int headerTop,
        int bodyStartTop,
        int keyTop,
        int leftCutoff,
        IReadOnlyList<string>? warnings = null)
    {
        ExcludedFragments = excludedFragments;
        ColumnCenters = columnCenters;
        RowAnchors = rowAnchors;
        HeaderTop = headerTop;
        BodyStartTop = bodyStartTop;
        KeyTop = keyTop;
        LeftCutoff = leftCutoff;
        Warnings = warnings ?? [];
    }

    /// <summary>Fragments to exclude from the body (presumably the header fragments; confirm against callers).</summary>
    public IReadOnlyList<PositionedTextFragment> ExcludedFragments { get; }

    /// <summary>Column key paired with the horizontal center used to assign fragments to columns.</summary>
    public IReadOnlyList<(string Key, double CenterX)> ColumnCenters { get; }

    /// <summary>Anchors for the roll-band rows.</summary>
    public IReadOnlyList<RowAnchor> RowAnchors { get; }

    /// <summary>Top coordinate of the header row.</summary>
    public int HeaderTop { get; }

    /// <summary>Top coordinate at which the table body starts.</summary>
    public int BodyStartTop { get; }

    /// <summary>Top coordinate of the key/legend area below the body.</summary>
    public int KeyTop { get; }

    /// <summary>Left cutoff used when searching for row labels.</summary>
    public int LeftCutoff { get; }

    /// <summary>Warnings gathered while building the layout; never null.</summary>
    public IReadOnlyList<string> Warnings { get; }
}
|
||||
@@ -11,10 +11,10 @@ public sealed class VariantColumnCriticalTableParser
|
||||
new("SLAYING", "Slaying")
|
||||
];
|
||||
|
||||
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
|
||||
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, ExtractedCriticalSource source)
|
||||
{
|
||||
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
|
||||
var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
|
||||
var fragments = source.Fragments;
|
||||
var pageGeometries = source.PageGeometries;
|
||||
var headerFragments = FindHeaderFragments(fragments);
|
||||
var validationErrors = new List<string>();
|
||||
var validationWarnings = new List<string>();
|
||||
@@ -43,7 +43,7 @@ public sealed class VariantColumnCriticalTableParser
|
||||
|
||||
if (rowAnchors.Count == 0)
|
||||
{
|
||||
validationErrors.Add("No roll-band labels were found in the XML artifact.");
|
||||
validationErrors.Add("No roll-band labels were found in the source artifact.");
|
||||
}
|
||||
|
||||
var columnCenters = columnAnchors
|
||||
@@ -132,16 +132,28 @@ public sealed class VariantColumnCriticalTableParser
|
||||
entry.DisplayName,
|
||||
entry.Family,
|
||||
Path.GetFileName(entry.PdfPath),
|
||||
"Imported from PDF XML extraction.",
|
||||
source.ImportNotes,
|
||||
[],
|
||||
ExpectedColumns.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Label, "variant", index + 1)).ToList(),
|
||||
parsedRollBands,
|
||||
parsedResults);
|
||||
|
||||
return new CriticalTableParseResult(table, pageGeometries, fragments, parsedCells, validationReport);
|
||||
return new CriticalTableParseResult(table, pageGeometries, fragments, source.RenderProfile, parsedCells, validationReport);
|
||||
}
|
||||
|
||||
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
|
||||
/// <summary>
/// Convenience overload for callers that hold raw PDF-to-XML output: loads the page
/// geometries and fragments from <paramref name="xmlContent"/>, wraps them in an
/// <see cref="ExtractedCriticalSource"/> tagged "xml", and forwards to the source-based overload.
/// </summary>
/// <param name="entry">Manifest entry describing the table being imported.</param>
/// <param name="xmlContent">XML text handed to <c>CriticalTableParserSupport</c> for loading.</param>
/// <returns>The result of the source-based <c>Parse</c> overload.</returns>
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
    // Evaluated in the same order as the constructor arguments they feed.
    var renderProfile = SourceRenderProfile.XmlAligned();
    var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
    var textFragments = CriticalTableParserSupport.LoadFragments(xmlContent);

    var xmlSource = new ExtractedCriticalSource(
        "xml",
        "Imported from PDF XML extraction.",
        renderProfile,
        pageGeometries,
        textFragments);

    return Parse(entry, xmlSource);
}
|
||||
|
||||
private static List<PositionedTextFragment> FindHeaderFragments(IReadOnlyList<PositionedTextFragment> fragments)
|
||||
{
|
||||
var expectedLabels = ExpectedColumns
|
||||
.Select(item => item.Label.ToLowerInvariant())
|
||||
@@ -163,7 +175,7 @@ public sealed class VariantColumnCriticalTableParser
|
||||
}
|
||||
}
|
||||
|
||||
throw new InvalidOperationException("Could not find the variant-column header row in the XML artifact.");
|
||||
throw new InvalidOperationException("Could not find the variant-column header row in the source artifact.");
|
||||
}
|
||||
|
||||
private static ColumnDefinition ResolveColumnDefinition(string value) =>
|
||||
|
||||
@@ -7,12 +7,6 @@ public sealed class XmlTextFragment(
|
||||
int width,
|
||||
int height,
|
||||
string text)
|
||||
: PositionedTextFragment(pageNumber, top, left, width, height, text)
|
||||
{
|
||||
public int PageNumber { get; } = pageNumber;
|
||||
public int Top { get; } = top;
|
||||
public int Left { get; } = left;
|
||||
public int Width { get; } = width;
|
||||
public int Height { get; } = height;
|
||||
public string Text { get; } = text;
|
||||
public double CenterX => Left + (Width / 2.0);
|
||||
}
|
||||
|
||||
8
src/RolemasterDb.ImportTool/PdfDocumentInfo.cs
Normal file
8
src/RolemasterDb.ImportTool/PdfDocumentInfo.cs
Normal file
@@ -0,0 +1,8 @@
|
||||
namespace RolemasterDb.ImportTool;
|
||||
|
||||
/// <summary>
/// Page count and page size (in points) of a PDF document.
/// </summary>
public sealed class PdfDocumentInfo
{
    /// <summary>Stores the supplied values as given.</summary>
    public PdfDocumentInfo(int pageCount, double pageWidthPoints, double pageHeightPoints)
    {
        PageCount = pageCount;
        PageWidthPoints = pageWidthPoints;
        PageHeightPoints = pageHeightPoints;
    }

    /// <summary>Number of pages in the document.</summary>
    public int PageCount { get; }

    /// <summary>Page width in points.</summary>
    public double PageWidthPoints { get; }

    /// <summary>Page height in points.</summary>
    public double PageHeightPoints { get; }
}
|
||||
@@ -1,4 +1,6 @@
|
||||
using System.Diagnostics;
|
||||
using System.Globalization;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace RolemasterDb.ImportTool;
|
||||
|
||||
@@ -7,6 +9,7 @@ public sealed class PdfXmlExtractor
|
||||
public const int RenderScaleFactor = 4;
|
||||
public const int XmlAlignedRenderDpi = 108;
|
||||
public const int ScaledRenderDpi = XmlAlignedRenderDpi * RenderScaleFactor;
|
||||
private const string PortableMiKTeXPath = @"D:\Code\miktex-portable\texmfs\install\miktex\bin\x64";
|
||||
|
||||
public static int ScaleCoordinate(int value) => checked(value * RenderScaleFactor);
|
||||
|
||||
@@ -16,7 +19,7 @@ public sealed class PdfXmlExtractor
|
||||
|
||||
var startInfo = new ProcessStartInfo
|
||||
{
|
||||
FileName = "pdftohtml",
|
||||
FileName = ResolveExecutable("ROLEMASTERDB_PDFTOHTML_PATH", "pdftohtml.exe"),
|
||||
RedirectStandardError = true,
|
||||
RedirectStandardOutput = true,
|
||||
UseShellExecute = false,
|
||||
@@ -40,12 +43,57 @@ public sealed class PdfXmlExtractor
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Runs <c>pdfinfo</c> on <paramref name="pdfPath"/> and parses the page count and the page
/// size (in points) from its standard output.
/// </summary>
/// <param name="pdfPath">Path of the PDF passed to <c>pdfinfo</c> as its only argument.</param>
/// <param name="cancellationToken">Cancels the stream reads and the wait for process exit.</param>
/// <returns>The parsed page count, width and height.</returns>
/// <exception cref="InvalidOperationException">
/// <c>pdfinfo</c> exited with a non-zero code, or its output lacked a "Pages:" or
/// "Page size: W x H pts" entry.
/// </exception>
public async Task<PdfDocumentInfo> ReadDocumentInfoAsync(string pdfPath, CancellationToken cancellationToken = default)
{
    var startInfo = new ProcessStartInfo
    {
        FileName = ResolveExecutable("ROLEMASTERDB_PDFINFO_PATH", "pdfinfo.exe"),
        RedirectStandardError = true,
        RedirectStandardOutput = true,
        UseShellExecute = false,
        CreateNoWindow = true
    };

    startInfo.ArgumentList.Add(pdfPath);

    using var process = new Process { StartInfo = startInfo };
    process.Start();

    // Both pipes are redirected, so both must be drained while the child runs: reading only
    // stdout and touching stderr after exit can deadlock once the child fills the stderr buffer.
    var outputTask = process.StandardOutput.ReadToEndAsync(cancellationToken);
    var errorTask = process.StandardError.ReadToEndAsync(cancellationToken);
    await Task.WhenAll(outputTask, errorTask);
    await process.WaitForExitAsync(cancellationToken);

    var output = await outputTask;

    if (process.ExitCode != 0)
    {
        var error = await errorTask;
        throw new InvalidOperationException($"pdfinfo failed for '{pdfPath}': {error}");
    }

    // Only the first "Pages:" and "Page size:" entries in the output are used.
    var pageCountMatch = Regex.Match(output, @"Pages:\s*(\d+)", RegexOptions.Multiline);
    var sizeMatch = Regex.Match(output, @"Page size:\s*([0-9.]+)\s*x\s*([0-9.]+)\s*pts", RegexOptions.Multiline);
    if (!pageCountMatch.Success || !sizeMatch.Success)
    {
        throw new InvalidOperationException($"pdfinfo output for '{pdfPath}' could not be parsed.");
    }

    // Invariant culture: pdfinfo prints '.' as the decimal separator regardless of the host locale.
    return new PdfDocumentInfo(
        int.Parse(pageCountMatch.Groups[1].Value, CultureInfo.InvariantCulture),
        double.Parse(sizeMatch.Groups[1].Value, CultureInfo.InvariantCulture),
        double.Parse(sizeMatch.Groups[2].Value, CultureInfo.InvariantCulture));
}
|
||||
|
||||
public Task RenderPagePngAsync(
|
||||
string pdfPath,
|
||||
int pageNumber,
|
||||
string outputPath,
|
||||
CancellationToken cancellationToken = default) =>
|
||||
RenderPngAsync(pdfPath, pageNumber, outputPath, null, null, null, null, cancellationToken);
|
||||
RenderPagePngAsync(pdfPath, pageNumber, outputPath, ScaledRenderDpi, cancellationToken);
|
||||
|
||||
/// <summary>
/// Renders one whole page of the PDF to a PNG at the requested resolution.
/// </summary>
/// <param name="pdfPath">Source PDF.</param>
/// <param name="pageNumber">Page to render.</param>
/// <param name="outputPath">Destination PNG path.</param>
/// <param name="renderDpi">Resolution forwarded to the renderer.</param>
/// <param name="cancellationToken">Forwarded to the renderer.</param>
public Task RenderPagePngAsync(
    string pdfPath,
    int pageNumber,
    string outputPath,
    int renderDpi,
    CancellationToken cancellationToken = default)
{
    // No crop rectangle: left, top, width and height are all passed as null.
    return RenderPngAsync(
        pdfPath,
        pageNumber,
        outputPath,
        renderDpi,
        null,
        null,
        null,
        null,
        cancellationToken);
}
|
||||
|
||||
public Task RenderCropPngAsync(
|
||||
string pdfPath,
|
||||
@@ -56,12 +104,25 @@ public sealed class PdfXmlExtractor
|
||||
int height,
|
||||
string outputPath,
|
||||
CancellationToken cancellationToken = default) =>
|
||||
RenderPngAsync(pdfPath, pageNumber, outputPath, left, top, width, height, cancellationToken);
|
||||
RenderCropPngAsync(pdfPath, pageNumber, left, top, width, height, outputPath, ScaledRenderDpi, cancellationToken);
|
||||
|
||||
/// <summary>
/// Renders a rectangular region of one PDF page to a PNG at the requested resolution.
/// </summary>
/// <param name="pdfPath">Source PDF.</param>
/// <param name="pageNumber">Page to render.</param>
/// <param name="left">Left edge of the crop rectangle.</param>
/// <param name="top">Top edge of the crop rectangle.</param>
/// <param name="width">Width of the crop rectangle.</param>
/// <param name="height">Height of the crop rectangle.</param>
/// <param name="outputPath">Destination PNG path.</param>
/// <param name="renderDpi">Resolution forwarded to the renderer.</param>
/// <param name="cancellationToken">Forwarded to the renderer.</param>
public Task RenderCropPngAsync(
    string pdfPath,
    int pageNumber,
    int left,
    int top,
    int width,
    int height,
    string outputPath,
    int renderDpi,
    CancellationToken cancellationToken = default)
{
    // The shared renderer takes the output path and DPI before the crop rectangle.
    return RenderPngAsync(
        pdfPath,
        pageNumber,
        outputPath,
        renderDpi,
        left,
        top,
        width,
        height,
        cancellationToken);
}
|
||||
|
||||
private static async Task RenderPngAsync(
|
||||
string pdfPath,
|
||||
int pageNumber,
|
||||
string outputPath,
|
||||
int renderDpi,
|
||||
int? left,
|
||||
int? top,
|
||||
int? width,
|
||||
@@ -72,7 +133,7 @@ public sealed class PdfXmlExtractor
|
||||
|
||||
var startInfo = new ProcessStartInfo
|
||||
{
|
||||
FileName = "pdftoppm",
|
||||
FileName = ResolveExecutable("ROLEMASTERDB_PDFTOPPM_PATH", "pdftoppm.exe"),
|
||||
RedirectStandardError = true,
|
||||
RedirectStandardOutput = true,
|
||||
UseShellExecute = false,
|
||||
@@ -81,7 +142,7 @@ public sealed class PdfXmlExtractor
|
||||
|
||||
startInfo.ArgumentList.Add("-png");
|
||||
startInfo.ArgumentList.Add("-r");
|
||||
startInfo.ArgumentList.Add(ScaledRenderDpi.ToString());
|
||||
startInfo.ArgumentList.Add(renderDpi.ToString(CultureInfo.InvariantCulture));
|
||||
startInfo.ArgumentList.Add("-f");
|
||||
startInfo.ArgumentList.Add(pageNumber.ToString());
|
||||
startInfo.ArgumentList.Add("-l");
|
||||
@@ -118,4 +179,21 @@ public sealed class PdfXmlExtractor
|
||||
throw new InvalidOperationException($"pdftoppm completed but did not create '{outputPath}'.");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Chooses the executable to launch, trying in order: the path held in the named environment
/// variable (if set and the file exists), the file of that name under the portable MiKTeX
/// directory (if it exists), and finally the bare executable name without its extension.
/// </summary>
/// <param name="environmentVariableName">Environment variable that may hold an explicit path.</param>
/// <param name="executableName">Executable file name, including extension (e.g. "pdfinfo.exe").</param>
/// <returns>An existing full path, or the extension-less name as a fallback.</returns>
private static string ResolveExecutable(string environmentVariableName, string executableName)
{
    // A configured path that does not exist is ignored rather than reported.
    if (Environment.GetEnvironmentVariable(environmentVariableName) is { } fromEnvironment
        && !string.IsNullOrWhiteSpace(fromEnvironment)
        && File.Exists(fromEnvironment))
    {
        return fromEnvironment;
    }

    var bundledPath = Path.Combine(PortableMiKTeXPath, executableName);

    return File.Exists(bundledPath)
        ? bundledPath
        : Path.GetFileNameWithoutExtension(executableName);
}
|
||||
}
|
||||
|
||||
3
src/RolemasterDb.ImportTool/Properties/AssemblyInfo.cs
Normal file
3
src/RolemasterDb.ImportTool/Properties/AssemblyInfo.cs
Normal file
@@ -0,0 +1,3 @@
|
||||
using System.Runtime.CompilerServices;
|
||||
|
||||
[assembly: InternalsVisibleTo("RolemasterDb.ImportTool.Tests")]
|
||||
15
src/RolemasterDb.ImportTool/SourceRenderProfile.cs
Normal file
15
src/RolemasterDb.ImportTool/SourceRenderProfile.cs
Normal file
@@ -0,0 +1,15 @@
|
||||
namespace RolemasterDb.ImportTool;
|
||||
|
||||
/// <summary>
/// Pairs a render resolution with the integer factor used to scale source coordinates.
/// </summary>
public sealed class SourceRenderProfile
{
    /// <summary>Stores the supplied values as given.</summary>
    public SourceRenderProfile(int renderDpi, int scaleFactor)
    {
        RenderDpi = renderDpi;
        ScaleFactor = scaleFactor;
    }

    /// <summary>Render resolution in DPI.</summary>
    public int RenderDpi { get; }

    /// <summary>Multiplier applied by <see cref="ScaleCoordinate"/>.</summary>
    public int ScaleFactor { get; }

    /// <summary>
    /// Multiplies <paramref name="value"/> by <see cref="ScaleFactor"/>.
    /// </summary>
    /// <exception cref="OverflowException">The product does not fit in an <see cref="int"/>.</exception>
    public int ScaleCoordinate(int value)
    {
        checked
        {
            return value * ScaleFactor;
        }
    }

    /// <summary>Profile built from <c>PdfXmlExtractor</c>'s scaled render DPI and render scale factor.</summary>
    public static SourceRenderProfile XmlAligned()
    {
        return new SourceRenderProfile(PdfXmlExtractor.ScaledRenderDpi, PdfXmlExtractor.RenderScaleFactor);
    }

    /// <summary>Profile with the given DPI and a scale factor of 1, leaving coordinates unscaled.</summary>
    public static SourceRenderProfile OcrPixels(int renderDpi)
    {
        return new SourceRenderProfile(renderDpi, 1);
    }
}
|
||||
28
src/RolemasterDb.ImportTool/XmlCriticalSourceExtractor.cs
Normal file
28
src/RolemasterDb.ImportTool/XmlCriticalSourceExtractor.cs
Normal file
@@ -0,0 +1,28 @@
|
||||
using RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
namespace RolemasterDb.ImportTool;
|
||||
|
||||
/// <summary>
/// <see cref="ICriticalSourceExtractor"/> backed by PDF-to-XML extraction: writes the XML
/// artifact via <see cref="PdfXmlExtractor"/> and loads it back as an
/// <see cref="ExtractedCriticalSource"/>.
/// </summary>
public sealed class XmlCriticalSourceExtractor : ICriticalSourceExtractor
{
    private readonly PdfXmlExtractor _pdfXmlExtractor;

    /// <summary>Creates the extractor around the given PDF-to-XML tool wrapper.</summary>
    public XmlCriticalSourceExtractor(PdfXmlExtractor pdfXmlExtractor)
    {
        _pdfXmlExtractor = pdfXmlExtractor;
    }

    /// <summary>Extracts <paramref name="pdfPath"/> to the XML path in <paramref name="artifactPaths"/>.</summary>
    public async Task ExtractAsync(string pdfPath, ImportArtifactPaths artifactPaths, CancellationToken cancellationToken = default)
    {
        await _pdfXmlExtractor.ExtractAsync(pdfPath, artifactPaths.XmlPath, cancellationToken);
    }

    /// <summary>
    /// Reads the previously extracted XML artifact and turns it into an extracted source
    /// tagged "xml". <paramref name="pdfPath"/> is not used by this implementation.
    /// </summary>
    /// <exception cref="FileNotFoundException">The XML artifact does not exist.</exception>
    public async Task<ExtractedCriticalSource> LoadAsync(
        string pdfPath,
        ImportArtifactPaths artifactPaths,
        CancellationToken cancellationToken = default)
    {
        var xmlPath = artifactPaths.XmlPath;
        if (!File.Exists(xmlPath))
        {
            throw new FileNotFoundException($"Missing XML artifact: {artifactPaths.XmlPath}", artifactPaths.XmlPath);
        }

        var xmlContent = await File.ReadAllTextAsync(artifactPaths.XmlPath, cancellationToken);

        // Evaluated in the same order as the constructor arguments they feed.
        var renderProfile = SourceRenderProfile.XmlAligned();
        var pageGeometries = CriticalTableParserSupport.LoadPageGeometries(xmlContent);
        var textFragments = CriticalTableParserSupport.LoadFragments(xmlContent);

        return new ExtractedCriticalSource(
            "xml",
            "Imported from PDF XML extraction.",
            renderProfile,
            pageGeometries,
            textFragments);
    }
}
|
||||
Reference in New Issue
Block a user