Use XML geometry for critical PDF import
This commit is contained in:
Binary file not shown.
@@ -5,7 +5,8 @@ namespace RolemasterDb.ImportTool;
|
|||||||
public sealed class CriticalImportCommandRunner
|
public sealed class CriticalImportCommandRunner
|
||||||
{
|
{
|
||||||
private readonly CriticalImportManifestLoader manifestLoader = new();
|
private readonly CriticalImportManifestLoader manifestLoader = new();
|
||||||
private readonly PdfTextExtractor pdfTextExtractor = new();
|
private readonly ImportArtifactWriter artifactWriter = new();
|
||||||
|
private readonly PdfXmlExtractor pdfXmlExtractor = new();
|
||||||
private readonly StandardCriticalTableParser standardParser = new();
|
private readonly StandardCriticalTableParser standardParser = new();
|
||||||
|
|
||||||
public async Task<int> RunAsync(ResetOptions options)
|
public async Task<int> RunAsync(ResetOptions options)
|
||||||
@@ -26,8 +27,8 @@ public sealed class CriticalImportCommandRunner
|
|||||||
{
|
{
|
||||||
var entry = GetManifestEntry(options.Table);
|
var entry = GetManifestEntry(options.Table);
|
||||||
var artifactPaths = CreateArtifactPaths(entry.Slug);
|
var artifactPaths = CreateArtifactPaths(entry.Slug);
|
||||||
await pdfTextExtractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths.ExtractedTextPath);
|
await pdfXmlExtractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths.XmlPath);
|
||||||
Console.WriteLine($"Extracted {entry.Slug} to {artifactPaths.ExtractedTextPath}");
|
Console.WriteLine($"Extracted {entry.Slug} to {artifactPaths.XmlPath}");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -36,16 +37,24 @@ public sealed class CriticalImportCommandRunner
|
|||||||
var entry = GetManifestEntry(options.Table);
|
var entry = GetManifestEntry(options.Table);
|
||||||
var artifactPaths = CreateArtifactPaths(entry.Slug);
|
var artifactPaths = CreateArtifactPaths(entry.Slug);
|
||||||
|
|
||||||
if (!File.Exists(artifactPaths.ExtractedTextPath))
|
if (!File.Exists(artifactPaths.XmlPath))
|
||||||
{
|
{
|
||||||
Console.Error.WriteLine($"Missing extracted text artifact: {artifactPaths.ExtractedTextPath}");
|
Console.Error.WriteLine($"Missing XML artifact: {artifactPaths.XmlPath}");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
var extractedText = await File.ReadAllTextAsync(artifactPaths.ExtractedTextPath);
|
var xmlContent = await File.ReadAllTextAsync(artifactPaths.XmlPath);
|
||||||
var parsedTable = Parse(entry, extractedText);
|
var parseResult = Parse(entry, xmlContent);
|
||||||
|
await artifactWriter.WriteAsync(artifactPaths, parseResult, CancellationToken.None);
|
||||||
|
|
||||||
|
if (!parseResult.ValidationReport.IsValid)
|
||||||
|
{
|
||||||
|
throw new InvalidOperationException(
|
||||||
|
$"Validation failed for '{entry.Slug}'. See {artifactPaths.ValidationReportPath} for details.");
|
||||||
|
}
|
||||||
|
|
||||||
var loader = new CriticalImportLoader(ResolveDatabasePath(options.DatabasePath));
|
var loader = new CriticalImportLoader(ResolveDatabasePath(options.DatabasePath));
|
||||||
var result = await loader.LoadAsync(parsedTable);
|
var result = await loader.LoadAsync(parseResult.Table);
|
||||||
|
|
||||||
Console.WriteLine(
|
Console.WriteLine(
|
||||||
$"Loaded {result.TableSlug}: {result.ColumnCount} columns, {result.RollBandCount} roll bands, {result.ResultCount} results.");
|
$"Loaded {result.TableSlug}: {result.ColumnCount} columns, {result.RollBandCount} roll bands, {result.ResultCount} results.");
|
||||||
@@ -82,14 +91,14 @@ public sealed class CriticalImportCommandRunner
|
|||||||
?? throw new InvalidOperationException($"No enabled manifest entry was found for '{tableSlug}'.");
|
?? throw new InvalidOperationException($"No enabled manifest entry was found for '{tableSlug}'.");
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
/// Dispatches the XML artifact of a manifest entry to the parser for its table family.
/// </summary>
/// <param name="entry">Manifest entry whose <c>Family</c> selects the parser.</param>
/// <param name="xmlContent">XML artifact content handed to the parser unchanged.</param>
/// <returns>The parse result produced by the standard-table parser.</returns>
/// <exception cref="InvalidOperationException">The entry's family is not "standard" (case-insensitive).</exception>
private StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
    var isStandardFamily = "standard".Equals(entry.Family, StringComparison.OrdinalIgnoreCase);

    return isStandardFamily
        ? standardParser.Parse(entry, xmlContent)
        : throw new InvalidOperationException($"Family '{entry.Family}' is not supported by phase 2.");
}
|
||||||
|
|
||||||
private static ImportArtifactPaths CreateArtifactPaths(string slug) =>
|
private static ImportArtifactPaths CreateArtifactPaths(string slug) =>
|
||||||
|
|||||||
@@ -2,18 +2,34 @@ namespace RolemasterDb.ImportTool;
|
|||||||
|
|
||||||
/// <summary>
/// Immutable set of file-system locations for the artifacts of one imported table.
/// Instances are only obtainable through <see cref="Create"/>.
/// </summary>
public sealed class ImportArtifactPaths
{
    private ImportArtifactPaths(
        string directoryPath,
        string xmlPath,
        string fragmentsJsonPath,
        string parsedCellsJsonPath,
        string validationReportPath) =>
        (DirectoryPath, XmlPath, FragmentsJsonPath, ParsedCellsJsonPath, ValidationReportPath) =
            (directoryPath, xmlPath, fragmentsJsonPath, parsedCellsJsonPath, validationReportPath);

    /// <summary>Directory that holds every artifact of the table.</summary>
    public string DirectoryPath { get; }

    /// <summary>Path of <c>source.xml</c> inside <see cref="DirectoryPath"/>.</summary>
    public string XmlPath { get; }

    /// <summary>Path of <c>fragments.json</c> inside <see cref="DirectoryPath"/>.</summary>
    public string FragmentsJsonPath { get; }

    /// <summary>Path of <c>parsed-cells.json</c> inside <see cref="DirectoryPath"/>.</summary>
    public string ParsedCellsJsonPath { get; }

    /// <summary>Path of <c>validation-report.json</c> inside <see cref="DirectoryPath"/>.</summary>
    public string ValidationReportPath { get; }

    /// <summary>
    /// Builds the artifact paths for a table: the directory is
    /// <paramref name="artifactsRootPath"/> combined with <paramref name="tableSlug"/>,
    /// and each artifact file lives directly inside it.
    /// </summary>
    /// <param name="artifactsRootPath">Root directory under which per-table directories are placed.</param>
    /// <param name="tableSlug">Name of the per-table sub-directory.</param>
    /// <returns>The computed paths; nothing is created on disk.</returns>
    public static ImportArtifactPaths Create(string artifactsRootPath, string tableSlug)
    {
        var tableDirectory = Path.Combine(artifactsRootPath, tableSlug);

        string InTableDirectory(string fileName) => Path.Combine(tableDirectory, fileName);

        return new ImportArtifactPaths(
            tableDirectory,
            InTableDirectory("source.xml"),
            InTableDirectory("fragments.json"),
            InTableDirectory("parsed-cells.json"),
            InTableDirectory("validation-report.json"));
    }
}
|
||||||
|
|||||||
33
src/RolemasterDb.ImportTool/ImportArtifactWriter.cs
Normal file
33
src/RolemasterDb.ImportTool/ImportArtifactWriter.cs
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
using System.Text.Json;
|
||||||
|
|
||||||
|
using RolemasterDb.ImportTool.Parsing;
|
||||||
|
|
||||||
|
namespace RolemasterDb.ImportTool;
|
||||||
|
|
||||||
|
/// <summary>
/// Persists the intermediate outputs of a parse run (fragments, parsed cells and the
/// validation report) as indented JSON files at the locations given by an
/// <see cref="ImportArtifactPaths"/>.
/// </summary>
public sealed class ImportArtifactWriter
{
    private static readonly JsonSerializerOptions JsonOptions = new() { WriteIndented = true };

    /// <summary>
    /// Ensures the artifact directory exists, then writes the three JSON artifacts one after another.
    /// </summary>
    /// <param name="artifactPaths">Target directory and file paths.</param>
    /// <param name="parseResult">Parse result whose fragments, cells and validation report are serialized.</param>
    /// <param name="cancellationToken">Token forwarded to each file write.</param>
    public async Task WriteAsync(ImportArtifactPaths artifactPaths, StandardCriticalTableParseResult parseResult, CancellationToken cancellationToken = default)
    {
        Directory.CreateDirectory(artifactPaths.DirectoryPath);

        await WriteJsonAsync(artifactPaths.FragmentsJsonPath, parseResult.Fragments, cancellationToken);
        await WriteJsonAsync(artifactPaths.ParsedCellsJsonPath, parseResult.Cells, cancellationToken);
        await WriteJsonAsync(artifactPaths.ValidationReportPath, parseResult.ValidationReport, cancellationToken);
    }

    // Serializes one payload with the shared options and writes it to the given path.
    private static Task WriteJsonAsync<TPayload>(string path, TPayload payload, CancellationToken cancellationToken) =>
        File.WriteAllTextAsync(path, JsonSerializer.Serialize(payload, JsonOptions), cancellationToken);
}
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
namespace RolemasterDb.ImportTool.Parsing;
|
||||||
|
|
||||||
|
/// <summary>
/// Read-only summary of a parse run: a validity flag, the collected error messages,
/// and the number of rows and cells that were produced.
/// </summary>
public sealed class ImportValidationReport
{
    /// <summary>Stores the supplied values as-is; no consistency check between them is performed.</summary>
    public ImportValidationReport(
        bool isValid,
        IReadOnlyList<string> errors,
        int rowCount,
        int cellCount)
    {
        IsValid = isValid;
        Errors = errors;
        RowCount = rowCount;
        CellCount = cellCount;
    }

    /// <summary>Validity flag supplied by the producer of the report.</summary>
    public bool IsValid { get; }

    /// <summary>Error messages supplied by the producer of the report.</summary>
    public IReadOnlyList<string> Errors { get; }

    /// <summary>Row count supplied by the producer of the report.</summary>
    public int RowCount { get; }

    /// <summary>Cell count supplied by the producer of the report.</summary>
    public int CellCount { get; }
}
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
namespace RolemasterDb.ImportTool.Parsing;
|
||||||
|
|
||||||
|
/// <summary>
/// Read-only snapshot of one parsed table cell, identified by roll-band label and column key,
/// together with its text in line, raw, description and affix form.
/// </summary>
public sealed class ParsedCriticalCellArtifact
{
    /// <summary>Stores the supplied values unchanged.</summary>
    public ParsedCriticalCellArtifact(
        string rollBandLabel,
        string columnKey,
        IReadOnlyList<string> lines,
        string rawCellText,
        string descriptionText,
        string? rawAffixText)
    {
        RollBandLabel = rollBandLabel;
        ColumnKey = columnKey;
        Lines = lines;
        RawCellText = rawCellText;
        DescriptionText = descriptionText;
        RawAffixText = rawAffixText;
    }

    /// <summary>Label of the roll band (row) the cell belongs to.</summary>
    public string RollBandLabel { get; }

    /// <summary>Key of the column the cell belongs to.</summary>
    public string ColumnKey { get; }

    /// <summary>The cell's text lines.</summary>
    public IReadOnlyList<string> Lines { get; }

    /// <summary>The cell's raw text.</summary>
    public string RawCellText { get; }

    /// <summary>The cell's description text.</summary>
    public string DescriptionText { get; }

    /// <summary>The cell's raw affix text, or <c>null</c> when none was supplied.</summary>
    public string? RawAffixText { get; }
}
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
namespace RolemasterDb.ImportTool.Parsing;
|
||||||
|
|
||||||
|
/// <summary>
/// Bundles everything a standard-table parse produces: the parsed table, the text
/// fragments it was built from, the per-cell artifacts and the validation report.
/// </summary>
public sealed class StandardCriticalTableParseResult
{
    /// <summary>Stores the supplied values unchanged.</summary>
    public StandardCriticalTableParseResult(
        ParsedCriticalTable table,
        IReadOnlyList<XmlTextFragment> fragments,
        IReadOnlyList<ParsedCriticalCellArtifact> cells,
        ImportValidationReport validationReport)
    {
        Table = table;
        Fragments = fragments;
        Cells = cells;
        ValidationReport = validationReport;
    }

    /// <summary>The parsed table.</summary>
    public ParsedCriticalTable Table { get; }

    /// <summary>The text fragments associated with the parse.</summary>
    public IReadOnlyList<XmlTextFragment> Fragments { get; }

    /// <summary>The per-cell artifacts associated with the parse.</summary>
    public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; }

    /// <summary>The validation report associated with the parse.</summary>
    public ImportValidationReport ValidationReport { get; }
}
|
||||||
@@ -1,208 +1,206 @@
|
|||||||
using System.Text.RegularExpressions;
|
using System.Text.RegularExpressions;
|
||||||
|
using System.Xml;
|
||||||
|
using System.Xml.Linq;
|
||||||
|
|
||||||
namespace RolemasterDb.ImportTool.Parsing;
|
namespace RolemasterDb.ImportTool.Parsing;
|
||||||
|
|
||||||
public sealed class StandardCriticalTableParser
|
public sealed class StandardCriticalTableParser
|
||||||
{
|
{
|
||||||
private static readonly Regex ColumnRegex = new(@"\b([A-E])\b", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
// Added to the largest header-fragment `Top` to obtain the first `Top` value that is
// treated as table body; fragments above that threshold are not body or row-label candidates.
private const int HeaderToBodyMinimumGap = 20;
|
||||||
private static readonly Regex RollBandRegex = new(@"^\s*(?<label>\d{2,3}(?:-\d{2,3})?|\d{2,3}\+)\s*$", RegexOptions.Compiled);
|
// Largest `Top` difference (against the first fragment of the current line) for which
// BuildLines still places a fragment on the same text line.
private const int TopGroupingTolerance = 2;
|
||||||
private static readonly Regex RollBandLineRegex = new(@"^\s*(?<label>\d{2,3}(?:-\d{2,3})?|\d{2,3}\+)(?<rest>\s+.*)?$", RegexOptions.Compiled);
|
|
||||||
|
|
||||||
/// <summary>
/// Parses the XML artifact of a standard (A-E column) critical table into a table model,
/// per-cell artifacts and a validation report.
/// </summary>
/// <remarks>
/// Fragments are assigned to rows by their <c>Top</c> coordinate (row boundaries are the
/// midpoints between consecutive roll-band labels) and to columns by their horizontal centre
/// (see <c>ResolveColumn</c>). Problems such as missing cells are collected in the validation
/// report instead of being thrown, so the caller can still persist the intermediate artifacts.
/// </remarks>
/// <param name="entry">Manifest entry supplying slug, display name, family and PDF path.</param>
/// <param name="xmlContent">Content of the XML artifact to parse.</param>
/// <returns>The parsed table together with its fragments, cell artifacts and validation report.</returns>
/// <exception cref="ArgumentNullException"><paramref name="entry"/> is <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">
/// The A-E header row cannot be found, or a text element lacks a required geometry attribute.
/// </exception>
public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
    // Fail before doing any work instead of with a NullReferenceException at the very end.
    ArgumentNullException.ThrowIfNull(entry);

    var fragments = LoadFragments(xmlContent);
    var headerFragments = FindHeaderFragments(fragments);
    var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments);
    var validationErrors = new List<string>();

    var columnCenters = headerFragments
        .OrderBy(item => item.Left)
        .Select(item => new ColumnAnchor(item.Text.ToUpperInvariant(), item.CenterX))
        .ToList();

    var rowAnchors = rowLabelFragments
        .OrderBy(item => item.Top)
        .Select((item, index) => new RowAnchor(item.Text, item.Top, index + 1))
        .ToList();

    if (rowAnchors.Count == 0)
    {
        validationErrors.Add("No roll-band labels were found in the XML artifact.");
    }

    var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;

    // Only a "Key:" fragment inside the body region may terminate the table. Taking the
    // minimum over all fragments would let a "Key:" above the body cut off every row.
    var keyTop = fragments
        .Where(item =>
            item.Top >= bodyStartTop &&
            string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase))
        .Select(item => (int?)item.Top)
        .Min() ?? int.MaxValue;

    // Body = everything between header and key that is neither a row label nor a header cell.
    var bodyFragments = fragments
        .Where(item =>
            item.Top >= bodyStartTop &&
            item.Top < keyTop - 1 &&
            !rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
            !headerFragments.Contains(item))
        .ToList();

    var parsedRollBands = rowAnchors
        .Select(anchor => CreateRollBand(anchor.Label, anchor.SortOrder))
        .ToList();

    var parsedCells = new List<ParsedCriticalCellArtifact>();
    var parsedResults = new List<ParsedCriticalResult>();

    for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
    {
        // A row spans from the midpoint to the previous label down to the midpoint to the next one;
        // the first row starts at the body start and the last row ends just above the key.
        var rowStart = rowIndex == 0
            ? bodyStartTop
            : (int)Math.Floor((rowAnchors[rowIndex - 1].Top + rowAnchors[rowIndex].Top) / 2.0);

        var rowEnd = rowIndex == rowAnchors.Count - 1
            ? keyTop - 1
            : (int)Math.Floor((rowAnchors[rowIndex].Top + rowAnchors[rowIndex + 1].Top) / 2.0);

        // Resolve each fragment's column once per row instead of once per (fragment, column) pair.
        var rowFragmentsByColumn = bodyFragments
            .Where(item => item.Top >= rowStart && item.Top < rowEnd)
            .ToLookup(item => ResolveColumn(item.CenterX, columnCenters));

        foreach (var columnAnchor in columnCenters)
        {
            var cellFragments = rowFragmentsByColumn[columnAnchor.Key]
                .OrderBy(item => item.Top)
                .ThenBy(item => item.Left)
                .ToList();

            if (cellFragments.Count == 0)
            {
                validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', column '{columnAnchor.Key}'.");
                continue;
            }

            var lines = BuildLines(cellFragments);
            var rawAffixLines = lines.Where(IsAffixLikeLine).ToList();
            var descriptionLines = lines.Where(line => !IsAffixLikeLine(line)).ToList();
            var rawCellText = string.Join(Environment.NewLine, lines);
            var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines));
            var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);

            parsedCells.Add(new ParsedCriticalCellArtifact(
                rowAnchors[rowIndex].Label,
                columnAnchor.Key,
                lines,
                rawCellText,
                descriptionText,
                rawAffixText));

            parsedResults.Add(new ParsedCriticalResult(
                columnAnchor.Key,
                rowAnchors[rowIndex].Label,
                rawCellText,
                descriptionText,
                rawAffixText));
        }
    }

    if (columnCenters.Count != 5)
    {
        validationErrors.Add($"Expected 5 standard-table columns but found {columnCenters.Count}.");
    }

    if (parsedCells.Count != rowAnchors.Count * columnCenters.Count)
    {
        validationErrors.Add(
            $"Expected {rowAnchors.Count * columnCenters.Count} parsed cells but produced {parsedCells.Count}.");
    }

    var validationReport = new ImportValidationReport(
        validationErrors.Count == 0,
        validationErrors,
        rowAnchors.Count,
        parsedCells.Count);

    var table = new ParsedCriticalTable(
        entry.Slug,
        entry.DisplayName,
        entry.Family,
        Path.GetFileName(entry.PdfPath),
        "Imported from PDF XML extraction.",
        columnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(),
        parsedRollBands,
        parsedResults);

    return new StandardCriticalTableParseResult(table, fragments, parsedCells, validationReport);
}
|
||||||
|
|
||||||
/// <summary>
/// Reads every <c>text</c> element under every <c>page</c> element of the XML artifact and
/// converts it into an <see cref="XmlTextFragment"/> carrying its geometry and normalized text.
/// Fragments whose normalized text is empty or whitespace are dropped.
/// </summary>
/// <param name="xmlContent">XML document content.</param>
/// <returns>All non-blank fragments in document order.</returns>
/// <exception cref="InvalidOperationException">
/// A <c>text</c> element lacks its <c>top</c>, <c>left</c>, <c>width</c> or <c>height</c> attribute.
/// </exception>
private static List<XmlTextFragment> LoadFragments(string xmlContent)
{
    using var stringReader = new StringReader(xmlContent);
    using var xmlReader = XmlReader.Create(
        stringReader,
        new XmlReaderSettings
        {
            // Skip any DOCTYPE declaration rather than rejecting the document or processing the DTD.
            DtdProcessing = DtdProcessing.Ignore
        });

    var document = XDocument.Load(xmlReader);

    return document.Descendants("page")
        .SelectMany(page =>
        {
            // A page without a number attribute is treated as page 1.
            var pageNumber = ParseInvariantInt(page.Attribute("number")?.Value ?? "1");

            return page.Elements("text")
                .Select(item => new XmlTextFragment(
                    pageNumber,
                    ReadRequiredInt(item, "top"),
                    ReadRequiredInt(item, "left"),
                    ReadRequiredInt(item, "width"),
                    ReadRequiredInt(item, "height"),
                    // Concatenate all nested text nodes so inline markup inside the element is flattened.
                    NormalizeText(string.Concat(item.DescendantNodes().OfType<XText>().Select(node => node.Value)))))
                .Where(item => !string.IsNullOrWhiteSpace(item.Text));
        })
        .ToList();

    // The artifact is machine-generated, so numbers must not be interpreted with the
    // current UI culture (which may, for example, use a different negative sign).
    static int ParseInvariantInt(string text) =>
        int.Parse(
            text,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture);

    // Reads a mandatory integer geometry attribute of a text element.
    static int ReadRequiredInt(XElement element, string attributeName) =>
        ParseInvariantInt(
            element.Attribute(attributeName)?.Value
            ?? throw new InvalidOperationException($"Missing text {attributeName} attribute."));
}
|
||||||
|
|
||||||
/// <summary>
/// Locates the table's column header: the topmost group of single-letter fragments sharing
/// the same <c>Top</c> whose texts, read left to right and upper-cased, are exactly A, B, C, D, E.
/// </summary>
/// <param name="fragments">All fragments of the artifact.</param>
/// <returns>The five header fragments ordered by <c>Left</c>.</returns>
/// <exception cref="InvalidOperationException">No such header row exists.</exception>
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
{
    string[] expectedLabels = ["A", "B", "C", "D", "E"];

    var headerRow = fragments
        .Where(candidate => candidate.Text.Length == 1 && char.IsLetter(candidate.Text[0]))
        .GroupBy(candidate => candidate.Top)
        .OrderBy(sameTop => sameTop.Key)
        .Select(sameTop => sameTop.OrderBy(candidate => candidate.Left).ToList())
        .FirstOrDefault(row => row
            .Select(candidate => candidate.Text.ToUpperInvariant())
            .SequenceEqual(expectedLabels));

    return headerRow
        ?? throw new InvalidOperationException("Could not find the standard-table A-E header row in the XML artifact.");
}
|
||||||
|
|
||||||
/// <summary>
/// Collects the fragments that act as roll-band (row) labels: they lie more than 10 units to
/// the left of the leftmost header fragment, at or below the body start, and their text has
/// roll-band form (see <c>IsRollBandLabel</c>).
/// </summary>
/// <param name="fragments">All fragments of the artifact.</param>
/// <param name="headerFragments">The header fragments; must not be empty.</param>
/// <returns>The label fragments ordered by <c>Top</c>.</returns>
private static List<XmlTextFragment> FindRowLabelFragments(
    IReadOnlyList<XmlTextFragment> fragments,
    IReadOnlyList<XmlTextFragment> headerFragments)
{
    var labelColumnRightEdge = headerFragments.Min(header => header.Left) - 10;
    var firstBodyTop = headerFragments.Max(header => header.Top) + HeaderToBodyMinimumGap;

    var labels =
        from candidate in fragments
        where candidate.Left < labelColumnRightEdge
        where candidate.Top >= firstBodyTop
        where IsRollBandLabel(candidate.Text)
        orderby candidate.Top
        select candidate;

    return labels.ToList();
}
|
||||||
|
|
||||||
/// <summary>
/// Tells whether the trimmed text has roll-band form: two or three digits, optionally
/// followed by a hyphen and two or three more digits, or two or three digits followed by "+".
/// </summary>
/// <param name="value">Candidate label text.</param>
/// <returns><c>true</c> when the text matches one of the accepted forms.</returns>
private static bool IsRollBandLabel(string value)
{
    const string rollBandPattern = @"^\d{2,3}(?:-\d{2,3})?$|^\d{2,3}\+$";

    var candidate = value.Trim();
    return Regex.IsMatch(candidate, rollBandPattern);
}
|
|
||||||
|
|
||||||
private static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder)
|
private static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder)
|
||||||
{
|
{
|
||||||
@@ -217,35 +215,39 @@ public sealed class StandardCriticalTableParser
|
|||||||
: new ParsedCriticalRollBand(label, int.Parse(parts[0]), int.Parse(parts[1]), sortOrder);
|
: new ParsedCriticalRollBand(label, int.Parse(parts[0]), int.Parse(parts[1]), sortOrder);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
/// Maps a horizontal centre position to a column key. The boundary between two neighbouring
/// columns is the midpoint of their centres; a position left of a boundary belongs to the
/// column on its left, and anything not left of the final boundary belongs to the last column.
/// </summary>
/// <param name="centerX">Horizontal centre of the fragment being classified.</param>
/// <param name="columns">Column anchors in left-to-right order; must not be empty.</param>
/// <returns>The key of the column the position falls into.</returns>
private static string ResolveColumn(double centerX, IReadOnlyList<ColumnAnchor> columns)
{
    var lastIndex = columns.Count - 1;
    var index = 0;

    // Advance while the position is not left of the boundary to the next column.
    while (index < lastIndex && !(centerX < BoundaryAfter(index)))
    {
        index++;
    }

    return index < lastIndex ? columns[index].Key : columns[^1].Key;

    double BoundaryAfter(int columnIndex) =>
        (columns[columnIndex].CenterX + columns[columnIndex + 1].CenterX) / 2.0;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Reassembles fragments into text lines. Fragments are visited top-to-bottom, then
/// left-to-right; a fragment joins the current line when its <c>Top</c> differs from the
/// line's first fragment by at most <c>TopGroupingTolerance</c>, otherwise it opens a new line.
/// Each line's fragments are joined left-to-right with single spaces and whitespace-collapsed;
/// lines that end up blank are omitted.
/// </summary>
/// <param name="fragments">Fragments belonging to one cell.</param>
/// <returns>The cell's text lines in top-to-bottom order.</returns>
private static IReadOnlyList<string> BuildLines(IReadOnlyList<XmlTextFragment> fragments)
{
    var groupedLines = new List<List<XmlTextFragment>>();
    List<XmlTextFragment>? currentLine = null;

    foreach (var piece in fragments.OrderBy(item => item.Top).ThenBy(item => item.Left))
    {
        var startsNewLine =
            currentLine is null ||
            Math.Abs(currentLine[0].Top - piece.Top) > TopGroupingTolerance;

        if (startsNewLine)
        {
            currentLine = [piece];
            groupedLines.Add(currentLine);
        }
        else
        {
            currentLine!.Add(piece);
        }
    }

    var textLines = new List<string>();
    foreach (var line in groupedLines)
    {
        var words = line.OrderBy(item => item.Left).Select(item => item.Text);
        var text = CollapseWhitespace(string.Join(' ', words));
        if (!string.IsNullOrWhiteSpace(text))
        {
            textLines.Add(text);
        }
    }

    return textLines;
}
|
||||||
|
|
||||||
private static bool IsAffixLikeLine(string line)
|
private static bool IsAffixLikeLine(string line)
|
||||||
@@ -256,7 +258,7 @@ public sealed class StandardCriticalTableParser
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (value == "—")
|
if (value == "-" || value == "\u2014")
|
||||||
{
|
{
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -270,16 +272,27 @@ public sealed class StandardCriticalTableParser
|
|||||||
}
|
}
|
||||||
|
|
||||||
return value.StartsWith("+", StringComparison.Ordinal) ||
|
return value.StartsWith("+", StringComparison.Ordinal) ||
|
||||||
value.StartsWith('∑') ||
|
value.StartsWith("\u2211", StringComparison.Ordinal) ||
|
||||||
value.StartsWith('∏') ||
|
value.StartsWith("\u220F", StringComparison.Ordinal) ||
|
||||||
value.StartsWith('π') ||
|
value.StartsWith("\u03C0", StringComparison.Ordinal) ||
|
||||||
value.StartsWith('∫') ||
|
value.StartsWith("\u222B", StringComparison.Ordinal) ||
|
||||||
char.IsDigit(value[0]) ||
|
char.IsDigit(value[0]) ||
|
||||||
value.Contains(" – ", StringComparison.Ordinal) ||
|
value.Contains(" - ", StringComparison.Ordinal) ||
|
||||||
value.Contains("(-", StringComparison.Ordinal) ||
|
value.Contains("(-", StringComparison.Ordinal) ||
|
||||||
value.Contains("(+", StringComparison.Ordinal);
|
value.Contains("(+", StringComparison.Ordinal);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
/// Trims the text and replaces every run of whitespace inside it with a single space.
/// </summary>
/// <param name="value">Text to normalize.</param>
/// <returns>The trimmed text with single-space separators.</returns>
private static string CollapseWhitespace(string value)
{
    var trimmed = value.Trim();
    return Regex.Replace(trimmed, @"\s+", " ");
}
|
||||||
|
|
||||||
|
/// <summary>
/// Replaces each non-breaking space, carriage return and line feed with a regular space,
/// then trims the result. Inner runs of spaces are left as they are.
/// </summary>
/// <param name="value">Raw fragment text.</param>
/// <returns>The text on a single line without leading or trailing whitespace.</returns>
private static string NormalizeText(string value)
{
    var flattened = value
        .Select(character => character is '\u00a0' or '\r' or '\n' ? ' ' : character)
        .ToArray();

    return new string(flattened).Trim();
}
|
||||||
|
|
||||||
|
// Column key paired with the horizontal centre used to assign fragments to that column
// (Parse builds these from the header fragments: upper-cased text and CenterX).
private sealed record ColumnAnchor(string Key, double CenterX);
|
||||||
|
|
||||||
|
// Roll-band label paired with its Top coordinate and 1-based position in top-to-bottom order
// (Parse builds these from the row-label fragments).
private sealed record RowAnchor(string Label, int Top, int SortOrder);
|
||||||
}
|
}
|
||||||
|
|||||||
18
src/RolemasterDb.ImportTool/Parsing/XmlTextFragment.cs
Normal file
18
src/RolemasterDb.ImportTool/Parsing/XmlTextFragment.cs
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
namespace RolemasterDb.ImportTool.Parsing;
|
||||||
|
|
||||||
|
/// <summary>
/// One positioned piece of text: the page it is on, its bounding-box values
/// (<c>Top</c>, <c>Left</c>, <c>Width</c>, <c>Height</c>) and its text.
/// </summary>
public sealed class XmlTextFragment
{
    /// <summary>Stores the supplied values unchanged.</summary>
    public XmlTextFragment(
        int pageNumber,
        int top,
        int left,
        int width,
        int height,
        string text)
    {
        PageNumber = pageNumber;
        Top = top;
        Left = left;
        Width = width;
        Height = height;
        Text = text;
    }

    /// <summary>Number of the page the fragment is on.</summary>
    public int PageNumber { get; }

    /// <summary>Top coordinate of the fragment.</summary>
    public int Top { get; }

    /// <summary>Left coordinate of the fragment.</summary>
    public int Left { get; }

    /// <summary>Width of the fragment.</summary>
    public int Width { get; }

    /// <summary>Height of the fragment.</summary>
    public int Height { get; }

    /// <summary>Text content of the fragment.</summary>
    public string Text { get; }

    /// <summary>Horizontal centre: <see cref="Left"/> plus half of <see cref="Width"/>.</summary>
    public double CenterX
    {
        get { return Left + (Width / 2.0); }
    }
}
|
||||||
@@ -2,7 +2,7 @@ using System.Diagnostics;
|
|||||||
|
|
||||||
namespace RolemasterDb.ImportTool;
|
namespace RolemasterDb.ImportTool;
|
||||||
|
|
||||||
public sealed class PdfTextExtractor
|
public sealed class PdfXmlExtractor
|
||||||
{
|
{
|
||||||
public async Task ExtractAsync(string pdfPath, string outputPath, CancellationToken cancellationToken = default)
|
public async Task ExtractAsync(string pdfPath, string outputPath, CancellationToken cancellationToken = default)
|
||||||
{
|
{
|
||||||
@@ -10,14 +10,16 @@ public sealed class PdfTextExtractor
|
|||||||
|
|
||||||
var startInfo = new ProcessStartInfo
|
var startInfo = new ProcessStartInfo
|
||||||
{
|
{
|
||||||
FileName = "pdftotext",
|
FileName = "pdftohtml",
|
||||||
RedirectStandardError = true,
|
RedirectStandardError = true,
|
||||||
RedirectStandardOutput = true,
|
RedirectStandardOutput = true,
|
||||||
UseShellExecute = false,
|
UseShellExecute = false,
|
||||||
CreateNoWindow = true
|
CreateNoWindow = true
|
||||||
};
|
};
|
||||||
|
|
||||||
startInfo.ArgumentList.Add("-layout");
|
startInfo.ArgumentList.Add("-xml");
|
||||||
|
startInfo.ArgumentList.Add("-i");
|
||||||
|
startInfo.ArgumentList.Add("-noframes");
|
||||||
startInfo.ArgumentList.Add(pdfPath);
|
startInfo.ArgumentList.Add(pdfPath);
|
||||||
startInfo.ArgumentList.Add(outputPath);
|
startInfo.ArgumentList.Add(outputPath);
|
||||||
|
|
||||||
@@ -28,7 +30,7 @@ public sealed class PdfTextExtractor
|
|||||||
if (process.ExitCode != 0)
|
if (process.ExitCode != 0)
|
||||||
{
|
{
|
||||||
var error = await process.StandardError.ReadToEndAsync(cancellationToken);
|
var error = await process.StandardError.ReadToEndAsync(cancellationToken);
|
||||||
throw new InvalidOperationException($"pdftotext failed for '{pdfPath}': {error}");
|
throw new InvalidOperationException($"pdftohtml failed for '{pdfPath}': {error}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user