Add OCR import support for void critical table

This commit is contained in:
2026-03-19 23:16:09 +01:00
parent b4c8f8c142
commit 7bb0c1b8d1
35 changed files with 4379 additions and 285 deletions

View File

@@ -9,9 +9,11 @@ public sealed class ImportArtifactPaths
string tableSlug,
string directoryPath,
string xmlPath,
string ocrTsvPath,
string fragmentsJsonPath,
string parsedCellsJsonPath,
string validationReportPath,
string ocrPagesDirectoryPath,
string pagesDirectoryPath,
string cellsDirectoryPath)
{
@@ -19,9 +21,11 @@ public sealed class ImportArtifactPaths
TableSlug = tableSlug;
DirectoryPath = directoryPath;
XmlPath = xmlPath;
OcrTsvPath = ocrTsvPath;
FragmentsJsonPath = fragmentsJsonPath;
ParsedCellsJsonPath = parsedCellsJsonPath;
ValidationReportPath = validationReportPath;
OcrPagesDirectoryPath = ocrPagesDirectoryPath;
PagesDirectoryPath = pagesDirectoryPath;
CellsDirectoryPath = cellsDirectoryPath;
}
@@ -30,15 +34,18 @@ public sealed class ImportArtifactPaths
/// <summary>Slug of the table these artifacts belong to; <c>Create</c> also uses it as the directory name under the artifacts root.</summary>
public string TableSlug { get; }
/// <summary>Directory holding this table's artifacts; <c>Create</c> builds it by combining the artifacts root path with <see cref="TableSlug"/>.</summary>
public string DirectoryPath { get; }
/// <summary>Path of the XML source artifact; <c>Create</c> sets it to <c>source.xml</c> inside <see cref="DirectoryPath"/>.</summary>
public string XmlPath { get; }
/// <summary>Path of the OCR TSV source artifact; <c>Create</c> sets it to <c>source.ocr.tsv</c> inside <see cref="DirectoryPath"/>.</summary>
public string OcrTsvPath { get; }
/// <summary>Path of the fragments JSON artifact; <c>Create</c> sets it to <c>fragments.json</c> inside <see cref="DirectoryPath"/>.</summary>
public string FragmentsJsonPath { get; }
/// <summary>Path of the parsed-cells JSON artifact; <c>Create</c> sets it to <c>parsed-cells.json</c> inside <see cref="DirectoryPath"/>.</summary>
public string ParsedCellsJsonPath { get; }
/// <summary>Path of the validation report; <c>Create</c> sets it to <c>validation-report.json</c> inside <see cref="DirectoryPath"/>.</summary>
public string ValidationReportPath { get; }
/// <summary>Directory for OCR page images; <c>Create</c> sets it to the <c>ocr-pages</c> subdirectory of <see cref="DirectoryPath"/>.</summary>
public string OcrPagesDirectoryPath { get; }
/// <summary>Directory for page images; <c>Create</c> sets it to the <c>pages</c> subdirectory of <see cref="DirectoryPath"/>.</summary>
public string PagesDirectoryPath { get; }
/// <summary>Directory for cell artifacts; <c>Create</c> sets it to the <c>cells</c> subdirectory of <see cref="DirectoryPath"/>.</summary>
public string CellsDirectoryPath { get; }
public static ImportArtifactPaths Create(string artifactsRootPath, string tableSlug)
{
var directoryPath = Path.Combine(artifactsRootPath, tableSlug);
var ocrPagesDirectoryPath = Path.Combine(directoryPath, "ocr-pages");
var pagesDirectoryPath = Path.Combine(directoryPath, "pages");
var cellsDirectoryPath = Path.Combine(directoryPath, "cells");
@@ -47,13 +54,23 @@ public sealed class ImportArtifactPaths
tableSlug,
directoryPath,
Path.Combine(directoryPath, "source.xml"),
Path.Combine(directoryPath, "source.ocr.tsv"),
Path.Combine(directoryPath, "fragments.json"),
Path.Combine(directoryPath, "parsed-cells.json"),
Path.Combine(directoryPath, "validation-report.json"),
ocrPagesDirectoryPath,
pagesDirectoryPath,
cellsDirectoryPath);
}
/// <summary>
/// Selects the source artifact path that corresponds to an extraction method.
/// </summary>
/// <param name="extractionMethod">
/// Extraction method name. Only <c>"ocr"</c> (compared ordinally, ignoring case) selects
/// the OCR artifact; every other value, including <c>null</c>, selects the XML artifact.
/// </param>
/// <returns><see cref="OcrTsvPath"/> for the OCR method; otherwise <see cref="XmlPath"/>.</returns>
public string GetSourceArtifactPath(string extractionMethod)
{
    // The static string.Equals overload tolerates a null argument, so no separate null guard is needed.
    var isOcr = string.Equals(extractionMethod, "ocr", StringComparison.OrdinalIgnoreCase);
    if (isOcr)
    {
        return OcrTsvPath;
    }

    return XmlPath;
}
/// <summary>
/// Builds the path of the OCR image file for a page.
/// </summary>
/// <param name="pageNumber">Page number; formatted with the <c>000</c> pattern, i.e. zero-padded to at least three digits.</param>
/// <returns>A <c>page-NNN.png</c> path inside <see cref="OcrPagesDirectoryPath"/>.</returns>
public string GetOcrPageImagePath(int pageNumber)
{
    var fileName = $"page-{pageNumber:000}.png";
    return Path.Combine(OcrPagesDirectoryPath, fileName);
}
/// <summary>
/// Builds the path of the image file for a page.
/// </summary>
/// <param name="pageNumber">Page number; formatted with the <c>000</c> pattern, i.e. zero-padded to at least three digits.</param>
/// <returns>A <c>page-NNN.png</c> path inside <see cref="PagesDirectoryPath"/>.</returns>
public string GetPageImagePath(int pageNumber)
{
    var fileName = $"page-{pageNumber:000}.png";
    return Path.Combine(PagesDirectoryPath, fileName);
}