Use XML geometry for critical PDF import

This commit is contained in:
2026-03-14 01:25:43 +01:00
parent f70d610c92
commit 719355da90
10 changed files with 335 additions and 201 deletions

Binary file not shown.

View File

@@ -5,7 +5,8 @@ namespace RolemasterDb.ImportTool;
public sealed class CriticalImportCommandRunner public sealed class CriticalImportCommandRunner
{ {
private readonly CriticalImportManifestLoader manifestLoader = new(); private readonly CriticalImportManifestLoader manifestLoader = new();
private readonly PdfTextExtractor pdfTextExtractor = new(); private readonly ImportArtifactWriter artifactWriter = new();
private readonly PdfXmlExtractor pdfXmlExtractor = new();
private readonly StandardCriticalTableParser standardParser = new(); private readonly StandardCriticalTableParser standardParser = new();
public async Task<int> RunAsync(ResetOptions options) public async Task<int> RunAsync(ResetOptions options)
@@ -26,8 +27,8 @@ public sealed class CriticalImportCommandRunner
{ {
var entry = GetManifestEntry(options.Table); var entry = GetManifestEntry(options.Table);
var artifactPaths = CreateArtifactPaths(entry.Slug); var artifactPaths = CreateArtifactPaths(entry.Slug);
await pdfTextExtractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths.ExtractedTextPath); await pdfXmlExtractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths.XmlPath);
Console.WriteLine($"Extracted {entry.Slug} to {artifactPaths.ExtractedTextPath}"); Console.WriteLine($"Extracted {entry.Slug} to {artifactPaths.XmlPath}");
return 0; return 0;
} }
@@ -36,16 +37,24 @@ public sealed class CriticalImportCommandRunner
var entry = GetManifestEntry(options.Table); var entry = GetManifestEntry(options.Table);
var artifactPaths = CreateArtifactPaths(entry.Slug); var artifactPaths = CreateArtifactPaths(entry.Slug);
if (!File.Exists(artifactPaths.ExtractedTextPath)) if (!File.Exists(artifactPaths.XmlPath))
{ {
Console.Error.WriteLine($"Missing extracted text artifact: {artifactPaths.ExtractedTextPath}"); Console.Error.WriteLine($"Missing XML artifact: {artifactPaths.XmlPath}");
return 1; return 1;
} }
var extractedText = await File.ReadAllTextAsync(artifactPaths.ExtractedTextPath); var xmlContent = await File.ReadAllTextAsync(artifactPaths.XmlPath);
var parsedTable = Parse(entry, extractedText); var parseResult = Parse(entry, xmlContent);
await artifactWriter.WriteAsync(artifactPaths, parseResult, CancellationToken.None);
if (!parseResult.ValidationReport.IsValid)
{
throw new InvalidOperationException(
$"Validation failed for '{entry.Slug}'. See {artifactPaths.ValidationReportPath} for details.");
}
var loader = new CriticalImportLoader(ResolveDatabasePath(options.DatabasePath)); var loader = new CriticalImportLoader(ResolveDatabasePath(options.DatabasePath));
var result = await loader.LoadAsync(parsedTable); var result = await loader.LoadAsync(parseResult.Table);
Console.WriteLine( Console.WriteLine(
$"Loaded {result.TableSlug}: {result.ColumnCount} columns, {result.RollBandCount} roll bands, {result.ResultCount} results."); $"Loaded {result.TableSlug}: {result.ColumnCount} columns, {result.RollBandCount} roll bands, {result.ResultCount} results.");
@@ -82,14 +91,14 @@ public sealed class CriticalImportCommandRunner
?? throw new InvalidOperationException($"No enabled manifest entry was found for '{tableSlug}'."); ?? throw new InvalidOperationException($"No enabled manifest entry was found for '{tableSlug}'.");
} }
private ParsedCriticalTable Parse(CriticalImportManifestEntry entry, string extractedText) private StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{ {
if (!string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase)) if (!string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase))
{ {
throw new InvalidOperationException($"Family '{entry.Family}' is not supported by phase 1."); throw new InvalidOperationException($"Family '{entry.Family}' is not supported by phase 2.");
} }
return standardParser.Parse(entry, extractedText); return standardParser.Parse(entry, xmlContent);
} }
private static ImportArtifactPaths CreateArtifactPaths(string slug) => private static ImportArtifactPaths CreateArtifactPaths(string slug) =>

View File

@@ -2,18 +2,34 @@ namespace RolemasterDb.ImportTool;
public sealed class ImportArtifactPaths public sealed class ImportArtifactPaths
{ {
private ImportArtifactPaths(string directoryPath, string extractedTextPath) private ImportArtifactPaths(
string directoryPath,
string xmlPath,
string fragmentsJsonPath,
string parsedCellsJsonPath,
string validationReportPath)
{ {
DirectoryPath = directoryPath; DirectoryPath = directoryPath;
ExtractedTextPath = extractedTextPath; XmlPath = xmlPath;
FragmentsJsonPath = fragmentsJsonPath;
ParsedCellsJsonPath = parsedCellsJsonPath;
ValidationReportPath = validationReportPath;
} }
public string DirectoryPath { get; } public string DirectoryPath { get; }
public string ExtractedTextPath { get; } public string XmlPath { get; }
public string FragmentsJsonPath { get; }
public string ParsedCellsJsonPath { get; }
public string ValidationReportPath { get; }
public static ImportArtifactPaths Create(string artifactsRootPath, string tableSlug) public static ImportArtifactPaths Create(string artifactsRootPath, string tableSlug)
{ {
var directoryPath = Path.Combine(artifactsRootPath, tableSlug); var directoryPath = Path.Combine(artifactsRootPath, tableSlug);
return new ImportArtifactPaths(directoryPath, Path.Combine(directoryPath, "extracted.txt")); return new ImportArtifactPaths(
directoryPath,
Path.Combine(directoryPath, "source.xml"),
Path.Combine(directoryPath, "fragments.json"),
Path.Combine(directoryPath, "parsed-cells.json"),
Path.Combine(directoryPath, "validation-report.json"));
} }
} }

View File

@@ -0,0 +1,33 @@
using System.Text.Json;
using RolemasterDb.ImportTool.Parsing;
namespace RolemasterDb.ImportTool;
/// <summary>
/// Persists the intermediate outputs of a parse run (fragments, parsed cells and
/// the validation report) as indented JSON files.
/// </summary>
public sealed class ImportArtifactWriter
{
    // Shared serializer settings; indented output is requested for every artifact.
    private static readonly JsonSerializerOptions SerializerOptions = new()
    {
        WriteIndented = true
    };

    /// <summary>
    /// Creates the artifact directory if needed and writes the three JSON artifacts in order:
    /// fragments, parsed cells, validation report.
    /// </summary>
    /// <param name="artifactPaths">Supplies the target directory and the three file paths.</param>
    /// <param name="parseResult">Supplies the fragments, cells and validation report to serialize.</param>
    /// <param name="cancellationToken">Forwarded to each file write.</param>
    public async Task WriteAsync(ImportArtifactPaths artifactPaths, StandardCriticalTableParseResult parseResult, CancellationToken cancellationToken = default)
    {
        Directory.CreateDirectory(artifactPaths.DirectoryPath);

        await WriteJsonAsync(artifactPaths.FragmentsJsonPath, parseResult.Fragments);
        await WriteJsonAsync(artifactPaths.ParsedCellsJsonPath, parseResult.Cells);
        await WriteJsonAsync(artifactPaths.ValidationReportPath, parseResult.ValidationReport);

        // Serializes one payload and writes it to the given path.
        Task WriteJsonAsync<TPayload>(string targetPath, TPayload payload) =>
            File.WriteAllTextAsync(
                targetPath,
                JsonSerializer.Serialize(payload, SerializerOptions),
                cancellationToken);
    }
}

View File

@@ -0,0 +1,13 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable summary of a validation pass: a validity flag, the collected error
/// messages, and the row and cell counts.
/// </summary>
public sealed class ImportValidationReport
{
    /// <summary>Stores the supplied values as-is; the error list is not copied.</summary>
    public ImportValidationReport(
        bool isValid,
        IReadOnlyList<string> errors,
        int rowCount,
        int cellCount)
    {
        IsValid = isValid;
        Errors = errors;
        RowCount = rowCount;
        CellCount = cellCount;
    }

    /// <summary>Validity flag supplied at construction.</summary>
    public bool IsValid { get; }

    /// <summary>Error messages supplied at construction.</summary>
    public IReadOnlyList<string> Errors { get; }

    /// <summary>Row count supplied at construction.</summary>
    public int RowCount { get; }

    /// <summary>Cell count supplied at construction.</summary>
    public int CellCount { get; }
}

View File

@@ -0,0 +1,17 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable snapshot of one parsed table cell, identified by its roll-band label
/// and column key, together with its text in several forms.
/// </summary>
public sealed class ParsedCriticalCellArtifact
{
    /// <summary>Stores the supplied values as-is; the line list is not copied.</summary>
    public ParsedCriticalCellArtifact(
        string rollBandLabel,
        string columnKey,
        IReadOnlyList<string> lines,
        string rawCellText,
        string descriptionText,
        string? rawAffixText)
    {
        RollBandLabel = rollBandLabel;
        ColumnKey = columnKey;
        Lines = lines;
        RawCellText = rawCellText;
        DescriptionText = descriptionText;
        RawAffixText = rawAffixText;
    }

    /// <summary>Label of the roll band (row) the cell belongs to.</summary>
    public string RollBandLabel { get; }

    /// <summary>Key of the column the cell belongs to.</summary>
    public string ColumnKey { get; }

    /// <summary>The cell's text lines.</summary>
    public IReadOnlyList<string> Lines { get; }

    /// <summary>The cell's raw text.</summary>
    public string RawCellText { get; }

    /// <summary>The cell's description text.</summary>
    public string DescriptionText { get; }

    /// <summary>The cell's raw affix text, or <c>null</c> when none was supplied.</summary>
    public string? RawAffixText { get; }
}

View File

@@ -0,0 +1,13 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Bundles everything a parse run produces: the parsed table, the source
/// fragments, the per-cell artifacts and the validation report.
/// </summary>
public sealed class StandardCriticalTableParseResult
{
    /// <summary>Stores the supplied values as-is; the lists are not copied.</summary>
    public StandardCriticalTableParseResult(
        ParsedCriticalTable table,
        IReadOnlyList<XmlTextFragment> fragments,
        IReadOnlyList<ParsedCriticalCellArtifact> cells,
        ImportValidationReport validationReport)
    {
        Table = table;
        Fragments = fragments;
        Cells = cells;
        ValidationReport = validationReport;
    }

    /// <summary>The parsed table.</summary>
    public ParsedCriticalTable Table { get; }

    /// <summary>The text fragments the parse was based on.</summary>
    public IReadOnlyList<XmlTextFragment> Fragments { get; }

    /// <summary>The per-cell parse artifacts.</summary>
    public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; }

    /// <summary>The validation report for this parse.</summary>
    public ImportValidationReport ValidationReport { get; }
}

View File

@@ -1,208 +1,206 @@
using System.Text.RegularExpressions; using System.Text.RegularExpressions;
using System.Xml;
using System.Xml.Linq;
namespace RolemasterDb.ImportTool.Parsing; namespace RolemasterDb.ImportTool.Parsing;
public sealed class StandardCriticalTableParser public sealed class StandardCriticalTableParser
{ {
private static readonly Regex ColumnRegex = new(@"\b([A-E])\b", RegexOptions.IgnoreCase | RegexOptions.Compiled); private const int HeaderToBodyMinimumGap = 20;
private static readonly Regex RollBandRegex = new(@"^\s*(?<label>\d{2,3}(?:-\d{2,3})?|\d{2,3}\+)\s*$", RegexOptions.Compiled); private const int TopGroupingTolerance = 2;
private static readonly Regex RollBandLineRegex = new(@"^\s*(?<label>\d{2,3}(?:-\d{2,3})?|\d{2,3}\+)(?<rest>\s+.*)?$", RegexOptions.Compiled);
public ParsedCriticalTable Parse(CriticalImportManifestEntry entry, string extractedText) public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{ {
var lines = extractedText.Replace("\r\n", "\n", StringComparison.Ordinal) var fragments = LoadFragments(xmlContent);
.Replace('\f', '\n') var headerFragments = FindHeaderFragments(fragments);
.Split('\n'); var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments);
var validationErrors = new List<string>();
var headerIndex = Array.FindIndex(lines, IsColumnHeaderLine); var columnCenters = headerFragments
if (headerIndex < 0) .OrderBy(item => item.Left)
{ .Select(item => new ColumnAnchor(item.Text.ToUpperInvariant(), item.CenterX))
throw new InvalidOperationException("The standard table header could not be found in the extracted text.");
}
var columnStarts = GetColumnStarts(lines[headerIndex]);
var boundaries = GetColumnBoundaries(columnStarts);
var columns = columnStarts
.Select((item, index) => new ParsedCriticalColumn(item.Label, item.Label, "severity", index + 1))
.ToList(); .ToList();
var firstRollBandIndex = FindNextRollBandIndex(lines, headerIndex + 1); var rowAnchors = rowLabelFragments
if (firstRollBandIndex < 0) .OrderBy(item => item.Top)
{ .Select((item, index) => new RowAnchor(item.Text, item.Top, index + 1))
throw new InvalidOperationException("No roll bands were found in the extracted text.");
}
var keyLineIndex = Array.FindIndex(lines, firstRollBandIndex, item => item.TrimStart().StartsWith("Key:", StringComparison.OrdinalIgnoreCase));
if (keyLineIndex < 0)
{
keyLineIndex = lines.Length;
}
var leadingLines = lines[(headerIndex + 1)..firstRollBandIndex]
.Where(item => !string.IsNullOrWhiteSpace(item))
.ToList(); .ToList();
var rollBands = new List<ParsedCriticalRollBand>(); if (rowAnchors.Count == 0)
var results = new List<ParsedCriticalResult>();
var currentLabel = string.Empty;
var currentRowLines = new List<string>();
var rowIndex = 0;
void FlushCurrentRow()
{ {
if (string.IsNullOrEmpty(currentLabel)) validationErrors.Add("No roll-band labels were found in the XML artifact.");
{
return;
} }
rowIndex++; var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
var rollBand = CreateRollBand(currentLabel, rowIndex); var keyTop = fragments
rollBands.Add(rollBand); .Where(item => string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase))
.Select(item => (int?)item.Top)
.Min() ?? int.MaxValue;
var cellLines = SplitRowLines(currentRowLines, boundaries, columns.Count); var bodyFragments = fragments
for (var columnIndex = 0; columnIndex < columns.Count; columnIndex++) .Where(item =>
{ item.Top >= bodyStartTop &&
var rawCellLines = cellLines[columnIndex] item.Top < keyTop - 1 &&
.Where(item => !string.IsNullOrWhiteSpace(item)) !rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
!headerFragments.Contains(item))
.ToList(); .ToList();
var rawAffixLines = rawCellLines var parsedRollBands = rowAnchors
.Where(IsAffixLikeLine) .Select(anchor => CreateRollBand(anchor.Label, anchor.SortOrder))
.ToList(); .ToList();
var descriptionLines = rawCellLines var parsedCells = new List<ParsedCriticalCellArtifact>();
.Where(item => !IsAffixLikeLine(item)) var parsedResults = new List<ParsedCriticalResult>();
for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
{
var rowStart = rowIndex == 0
? bodyStartTop
: (int)Math.Floor((rowAnchors[rowIndex - 1].Top + rowAnchors[rowIndex].Top) / 2.0);
var rowEnd = rowIndex == rowAnchors.Count - 1
? keyTop - 1
: (int)Math.Floor((rowAnchors[rowIndex].Top + rowAnchors[rowIndex + 1].Top) / 2.0);
var rowFragments = bodyFragments
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
.ToList(); .ToList();
results.Add(new ParsedCriticalResult( foreach (var columnAnchor in columnCenters)
columns[columnIndex].ColumnKey,
rollBand.Label,
string.Join(Environment.NewLine, rawCellLines),
CollapseWhitespace(string.Join(' ', descriptionLines)),
rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines)));
}
currentLabel = string.Empty;
currentRowLines = new List<string>();
}
for (var lineIndex = firstRollBandIndex; lineIndex < keyLineIndex; lineIndex++)
{ {
if (TryParseRollBandLine(lines[lineIndex], out var label, out var trailingText)) var cellFragments = rowFragments
{ .Where(item => ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key)
var trailingTextBelongsToCurrentRow = IsAffixLikeLine(trailingText); .OrderBy(item => item.Top)
.ThenBy(item => item.Left)
.ToList();
if (!string.IsNullOrWhiteSpace(trailingText) && if (cellFragments.Count == 0)
!string.IsNullOrEmpty(currentLabel) &&
!trailingTextBelongsToCurrentRow)
{ {
currentRowLines.Add(trailingText); validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', column '{columnAnchor.Key}'.");
}
FlushCurrentRow();
currentLabel = label;
if (rowIndex == 0)
{
currentRowLines.AddRange(leadingLines);
}
if (!string.IsNullOrWhiteSpace(trailingText) && trailingTextBelongsToCurrentRow)
{
currentRowLines.Add(trailingText);
}
continue; continue;
} }
if (!string.IsNullOrWhiteSpace(lines[lineIndex])) var lines = BuildLines(cellFragments);
var rawAffixLines = lines.Where(IsAffixLikeLine).ToList();
var descriptionLines = lines.Where(line => !IsAffixLikeLine(line)).ToList();
var rawCellText = string.Join(Environment.NewLine, lines);
var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines));
var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
parsedCells.Add(new ParsedCriticalCellArtifact(
rowAnchors[rowIndex].Label,
columnAnchor.Key,
lines,
rawCellText,
descriptionText,
rawAffixText));
parsedResults.Add(new ParsedCriticalResult(
columnAnchor.Key,
rowAnchors[rowIndex].Label,
rawCellText,
descriptionText,
rawAffixText));
}
}
if (columnCenters.Count != 5)
{ {
currentRowLines.Add(lines[lineIndex]); validationErrors.Add($"Expected 5 standard-table columns but found {columnCenters.Count}.");
}
} }
FlushCurrentRow(); if (parsedCells.Count != rowAnchors.Count * columnCenters.Count)
{
validationErrors.Add(
$"Expected {rowAnchors.Count * columnCenters.Count} parsed cells but produced {parsedCells.Count}.");
}
return new ParsedCriticalTable( var validationReport = new ImportValidationReport(
validationErrors.Count == 0,
validationErrors,
rowAnchors.Count,
parsedCells.Count);
var table = new ParsedCriticalTable(
entry.Slug, entry.Slug,
entry.DisplayName, entry.DisplayName,
entry.Family, entry.Family,
Path.GetFileName(entry.PdfPath), Path.GetFileName(entry.PdfPath),
"Imported from PDF text extraction.", "Imported from PDF XML extraction.",
columns, columnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(),
rollBands, parsedRollBands,
results); parsedResults);
return new StandardCriticalTableParseResult(table, fragments, parsedCells, validationReport);
} }
private static bool IsColumnHeaderLine(string line) private static List<XmlTextFragment> LoadFragments(string xmlContent)
{ {
var matches = ColumnRegex.Matches(line); using var stringReader = new StringReader(xmlContent);
return matches.Count == 5; using var xmlReader = XmlReader.Create(
} stringReader,
new XmlReaderSettings
{
DtdProcessing = DtdProcessing.Ignore
});
private static List<(string Label, int Start)> GetColumnStarts(string headerLine) var document = XDocument.Load(xmlReader);
return document.Descendants("page")
.SelectMany(page =>
{ {
var matches = ColumnRegex.Matches(headerLine); var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1");
return matches return page.Elements("text")
.Select(match => (match.Groups[1].Value.ToUpperInvariant(), match.Index)) .Select(item => new XmlTextFragment(
pageNumber,
int.Parse(item.Attribute("top")?.Value ?? throw new InvalidOperationException("Missing text top attribute.")),
int.Parse(item.Attribute("left")?.Value ?? throw new InvalidOperationException("Missing text left attribute.")),
int.Parse(item.Attribute("width")?.Value ?? throw new InvalidOperationException("Missing text width attribute.")),
int.Parse(item.Attribute("height")?.Value ?? throw new InvalidOperationException("Missing text height attribute.")),
NormalizeText(string.Concat(item.DescendantNodes().OfType<XText>().Select(node => node.Value)))))
.Where(item => !string.IsNullOrWhiteSpace(item.Text));
})
.ToList(); .ToList();
} }
private static int[] GetColumnBoundaries(IReadOnlyList<(string Label, int Start)> columns) private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
{ {
var boundaries = new int[columns.Count - 1]; var groupedByTop = fragments
for (var index = 0; index < boundaries.Length; index++) .Where(item => item.Text.Length == 1 && char.IsLetter(item.Text[0]))
{ .GroupBy(item => item.Top)
boundaries[index] = (columns[index].Start + columns[index + 1].Start) / 2; .OrderBy(group => group.Key);
}
return boundaries; foreach (var group in groupedByTop)
}
private static int FindNextRollBandIndex(IReadOnlyList<string> lines, int startIndex)
{ {
for (var index = startIndex; index < lines.Count; index++) var ordered = group.OrderBy(item => item.Left).ToList();
var labels = ordered.Select(item => item.Text.ToUpperInvariant()).ToList();
if (labels.SequenceEqual(["A", "B", "C", "D", "E"]))
{ {
if (TryParseRollBandLine(lines[index], out _, out _)) return ordered;
{
return index;
} }
} }
return -1; throw new InvalidOperationException("Could not find the standard-table A-E header row in the XML artifact.");
} }
private static bool TryParseRollBandLabel(string line, out string label) private static List<XmlTextFragment> FindRowLabelFragments(
IReadOnlyList<XmlTextFragment> fragments,
IReadOnlyList<XmlTextFragment> headerFragments)
{ {
var match = RollBandRegex.Match(line); var leftCutoff = headerFragments.Min(item => item.Left) - 10;
if (!match.Success) var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
{
label = string.Empty; return fragments
return false; .Where(item =>
item.Left < leftCutoff &&
item.Top >= bodyStartTop &&
IsRollBandLabel(item.Text))
.OrderBy(item => item.Top)
.ToList();
} }
label = match.Groups[1].Value.Replace(" ", string.Empty, StringComparison.Ordinal); private static bool IsRollBandLabel(string value) =>
return true; Regex.IsMatch(value.Trim(), @"^\d{2,3}(?:-\d{2,3})?$|^\d{2,3}\+$");
}
private static bool TryParseRollBandLine(string line, out string label, out string trailingText)
{
var match = RollBandLineRegex.Match(line);
if (!match.Success)
{
label = string.Empty;
trailingText = string.Empty;
return false;
}
label = match.Groups["label"].Value.Replace(" ", string.Empty, StringComparison.Ordinal);
var restGroup = match.Groups["rest"];
trailingText = restGroup.Success
? string.Concat(new string(' ', restGroup.Index), restGroup.Value.TrimEnd())
: string.Empty;
return true;
}
private static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder) private static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder)
{ {
@@ -217,35 +215,39 @@ public sealed class StandardCriticalTableParser
: new ParsedCriticalRollBand(label, int.Parse(parts[0]), int.Parse(parts[1]), sortOrder); : new ParsedCriticalRollBand(label, int.Parse(parts[0]), int.Parse(parts[1]), sortOrder);
} }
private static List<string>[] SplitRowLines(IReadOnlyList<string> rowLines, int[] boundaries, int columnCount) private static string ResolveColumn(double centerX, IReadOnlyList<ColumnAnchor> columns)
{ {
var result = Enumerable.Range(0, columnCount) for (var index = 0; index < columns.Count - 1; index++)
.Select(_ => new List<string>()) {
.ToArray(); var boundary = (columns[index].CenterX + columns[index + 1].CenterX) / 2.0;
if (centerX < boundary)
{
return columns[index].Key;
}
}
foreach (var line in rowLines) return columns[^1].Key;
{ }
for (var columnIndex = 0; columnIndex < columnCount; columnIndex++)
{
var start = columnIndex == 0 ? 0 : boundaries[columnIndex - 1];
var end = columnIndex == columnCount - 1
? line.Length
: Math.Min(boundaries[columnIndex], line.Length);
if (start >= line.Length || end <= start) private static IReadOnlyList<string> BuildLines(IReadOnlyList<XmlTextFragment> fragments)
{ {
var lines = new List<List<XmlTextFragment>>();
foreach (var fragment in fragments.OrderBy(item => item.Top).ThenBy(item => item.Left))
{
if (lines.Count == 0 || Math.Abs(lines[^1][0].Top - fragment.Top) > TopGroupingTolerance)
{
lines.Add([fragment]);
continue; continue;
} }
var segment = line[start..end].Trim(); lines[^1].Add(fragment);
if (!string.IsNullOrWhiteSpace(segment))
{
result[columnIndex].Add(segment);
}
}
} }
return result; return lines
.Select(line => CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text))))
.Where(item => !string.IsNullOrWhiteSpace(item))
.ToList();
} }
private static bool IsAffixLikeLine(string line) private static bool IsAffixLikeLine(string line)
@@ -256,7 +258,7 @@ public sealed class StandardCriticalTableParser
return false; return false;
} }
if (value == "") if (value == "-" || value == "\u2014")
{ {
return true; return true;
} }
@@ -270,16 +272,27 @@ public sealed class StandardCriticalTableParser
} }
return value.StartsWith("+", StringComparison.Ordinal) || return value.StartsWith("+", StringComparison.Ordinal) ||
value.StartsWith('∑') || value.StartsWith("\u2211", StringComparison.Ordinal) ||
value.StartsWith('∏') || value.StartsWith("\u220F", StringComparison.Ordinal) ||
value.StartsWith('π') || value.StartsWith("\u03C0", StringComparison.Ordinal) ||
value.StartsWith('∫') || value.StartsWith("\u222B", StringComparison.Ordinal) ||
char.IsDigit(value[0]) || char.IsDigit(value[0]) ||
value.Contains(" ", StringComparison.Ordinal) || value.Contains(" - ", StringComparison.Ordinal) ||
value.Contains("(-", StringComparison.Ordinal) || value.Contains("(-", StringComparison.Ordinal) ||
value.Contains("(+", StringComparison.Ordinal); value.Contains("(+", StringComparison.Ordinal);
} }
private static string CollapseWhitespace(string value) => private static string CollapseWhitespace(string value) =>
Regex.Replace(value.Trim(), @"\s+", " "); Regex.Replace(value.Trim(), @"\s+", " ");
// Replaces non-breaking spaces, carriage returns and line feeds with ordinary
// spaces, then trims leading and trailing whitespace.
private static string NormalizeText(string value)
{
    var characters = value.ToCharArray();
    for (var index = 0; index < characters.Length; index++)
    {
        if (characters[index] is '\u00a0' or '\r' or '\n')
        {
            characters[index] = ' ';
        }
    }

    return new string(characters).Trim();
}
// A column key paired with the horizontal centre used to assign fragments to that column.
private sealed record ColumnAnchor(string Key, double CenterX);
// A roll-band label with the top coordinate of its label fragment and its 1-based sort order.
private sealed record RowAnchor(string Label, int Top, int SortOrder);
} }

View File

@@ -0,0 +1,18 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable text fragment with its page number, bounding-box geometry
/// (top, left, width, height) and text content.
/// </summary>
public sealed class XmlTextFragment
{
    /// <summary>Stores the supplied values as-is.</summary>
    public XmlTextFragment(
        int pageNumber,
        int top,
        int left,
        int width,
        int height,
        string text)
    {
        PageNumber = pageNumber;
        Top = top;
        Left = left;
        Width = width;
        Height = height;
        Text = text;
    }

    /// <summary>Page number supplied at construction.</summary>
    public int PageNumber { get; }

    /// <summary>Top coordinate of the fragment.</summary>
    public int Top { get; }

    /// <summary>Left coordinate of the fragment.</summary>
    public int Left { get; }

    /// <summary>Width of the fragment.</summary>
    public int Width { get; }

    /// <summary>Height of the fragment.</summary>
    public int Height { get; }

    /// <summary>Text content of the fragment.</summary>
    public string Text { get; }

    /// <summary>Horizontal centre of the fragment: <c>Left</c> plus half of <c>Width</c>.</summary>
    public double CenterX
    {
        get
        {
            var halfWidth = Width / 2.0;
            return Left + halfWidth;
        }
    }
}

View File

@@ -2,7 +2,7 @@ using System.Diagnostics;
namespace RolemasterDb.ImportTool; namespace RolemasterDb.ImportTool;
public sealed class PdfTextExtractor public sealed class PdfXmlExtractor
{ {
public async Task ExtractAsync(string pdfPath, string outputPath, CancellationToken cancellationToken = default) public async Task ExtractAsync(string pdfPath, string outputPath, CancellationToken cancellationToken = default)
{ {
@@ -10,14 +10,16 @@ public sealed class PdfTextExtractor
var startInfo = new ProcessStartInfo var startInfo = new ProcessStartInfo
{ {
FileName = "pdftotext", FileName = "pdftohtml",
RedirectStandardError = true, RedirectStandardError = true,
RedirectStandardOutput = true, RedirectStandardOutput = true,
UseShellExecute = false, UseShellExecute = false,
CreateNoWindow = true CreateNoWindow = true
}; };
startInfo.ArgumentList.Add("-layout"); startInfo.ArgumentList.Add("-xml");
startInfo.ArgumentList.Add("-i");
startInfo.ArgumentList.Add("-noframes");
startInfo.ArgumentList.Add(pdfPath); startInfo.ArgumentList.Add(pdfPath);
startInfo.ArgumentList.Add(outputPath); startInfo.ArgumentList.Add(outputPath);
@@ -28,7 +30,7 @@ public sealed class PdfTextExtractor
if (process.ExitCode != 0) if (process.ExitCode != 0)
{ {
var error = await process.StandardError.ReadToEndAsync(cancellationToken); var error = await process.StandardError.ReadToEndAsync(cancellationToken);
throw new InvalidOperationException($"pdftotext failed for '{pdfPath}': {error}"); throw new InvalidOperationException($"pdftohtml failed for '{pdfPath}': {error}");
} }
} }
} }