Use XML geometry for critical PDF import

This commit is contained in:
2026-03-14 01:25:43 +01:00
parent f70d610c92
commit 719355da90
10 changed files with 335 additions and 201 deletions

Binary file not shown.

View File

@@ -5,7 +5,8 @@ namespace RolemasterDb.ImportTool;
public sealed class CriticalImportCommandRunner
{
private readonly CriticalImportManifestLoader manifestLoader = new();
private readonly PdfTextExtractor pdfTextExtractor = new();
private readonly ImportArtifactWriter artifactWriter = new();
private readonly PdfXmlExtractor pdfXmlExtractor = new();
private readonly StandardCriticalTableParser standardParser = new();
public async Task<int> RunAsync(ResetOptions options)
@@ -26,8 +27,8 @@ public sealed class CriticalImportCommandRunner
{
var entry = GetManifestEntry(options.Table);
var artifactPaths = CreateArtifactPaths(entry.Slug);
await pdfTextExtractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths.ExtractedTextPath);
Console.WriteLine($"Extracted {entry.Slug} to {artifactPaths.ExtractedTextPath}");
await pdfXmlExtractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths.XmlPath);
Console.WriteLine($"Extracted {entry.Slug} to {artifactPaths.XmlPath}");
return 0;
}
@@ -36,16 +37,24 @@ public sealed class CriticalImportCommandRunner
var entry = GetManifestEntry(options.Table);
var artifactPaths = CreateArtifactPaths(entry.Slug);
if (!File.Exists(artifactPaths.ExtractedTextPath))
if (!File.Exists(artifactPaths.XmlPath))
{
Console.Error.WriteLine($"Missing extracted text artifact: {artifactPaths.ExtractedTextPath}");
Console.Error.WriteLine($"Missing XML artifact: {artifactPaths.XmlPath}");
return 1;
}
var extractedText = await File.ReadAllTextAsync(artifactPaths.ExtractedTextPath);
var parsedTable = Parse(entry, extractedText);
var xmlContent = await File.ReadAllTextAsync(artifactPaths.XmlPath);
var parseResult = Parse(entry, xmlContent);
await artifactWriter.WriteAsync(artifactPaths, parseResult, CancellationToken.None);
if (!parseResult.ValidationReport.IsValid)
{
throw new InvalidOperationException(
$"Validation failed for '{entry.Slug}'. See {artifactPaths.ValidationReportPath} for details.");
}
var loader = new CriticalImportLoader(ResolveDatabasePath(options.DatabasePath));
var result = await loader.LoadAsync(parsedTable);
var result = await loader.LoadAsync(parseResult.Table);
Console.WriteLine(
$"Loaded {result.TableSlug}: {result.ColumnCount} columns, {result.RollBandCount} roll bands, {result.ResultCount} results.");
@@ -82,14 +91,14 @@ public sealed class CriticalImportCommandRunner
?? throw new InvalidOperationException($"No enabled manifest entry was found for '{tableSlug}'.");
}
private ParsedCriticalTable Parse(CriticalImportManifestEntry entry, string extractedText)
private StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
if (!string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase))
{
throw new InvalidOperationException($"Family '{entry.Family}' is not supported by phase 1.");
throw new InvalidOperationException($"Family '{entry.Family}' is not supported by phase 2.");
}
return standardParser.Parse(entry, extractedText);
return standardParser.Parse(entry, xmlContent);
}
private static ImportArtifactPaths CreateArtifactPaths(string slug) =>

View File

@@ -2,18 +2,34 @@ namespace RolemasterDb.ImportTool;
public sealed class ImportArtifactPaths
{
private ImportArtifactPaths(string directoryPath, string extractedTextPath)
private ImportArtifactPaths(
string directoryPath,
string xmlPath,
string fragmentsJsonPath,
string parsedCellsJsonPath,
string validationReportPath)
{
DirectoryPath = directoryPath;
ExtractedTextPath = extractedTextPath;
XmlPath = xmlPath;
FragmentsJsonPath = fragmentsJsonPath;
ParsedCellsJsonPath = parsedCellsJsonPath;
ValidationReportPath = validationReportPath;
}
public string DirectoryPath { get; }
public string ExtractedTextPath { get; }
public string XmlPath { get; }
public string FragmentsJsonPath { get; }
public string ParsedCellsJsonPath { get; }
public string ValidationReportPath { get; }
public static ImportArtifactPaths Create(string artifactsRootPath, string tableSlug)
{
var directoryPath = Path.Combine(artifactsRootPath, tableSlug);
return new ImportArtifactPaths(directoryPath, Path.Combine(directoryPath, "extracted.txt"));
return new ImportArtifactPaths(
directoryPath,
Path.Combine(directoryPath, "source.xml"),
Path.Combine(directoryPath, "fragments.json"),
Path.Combine(directoryPath, "parsed-cells.json"),
Path.Combine(directoryPath, "validation-report.json"));
}
}

View File

@@ -0,0 +1,33 @@
using System.Text.Json;
using RolemasterDb.ImportTool.Parsing;
namespace RolemasterDb.ImportTool;
/// <summary>
/// Persists the intermediate results of a critical-table parse as indented JSON files
/// so that an import run can be inspected afterwards.
/// </summary>
public sealed class ImportArtifactWriter
{
    // Indented output keeps the artifacts readable and diff-friendly.
    private static readonly JsonSerializerOptions JsonOptions = new() { WriteIndented = true };

    /// <summary>
    /// Creates the artifact directory if needed and writes the fragments, parsed cells and
    /// validation report of <paramref name="parseResult"/> to their respective paths, in that order.
    /// </summary>
    /// <param name="artifactPaths">Target directory and file paths for the artifacts.</param>
    /// <param name="parseResult">Parse result whose fragments, cells and validation report are serialized.</param>
    /// <param name="cancellationToken">Token forwarded to each file write.</param>
    public async Task WriteAsync(ImportArtifactPaths artifactPaths, StandardCriticalTableParseResult parseResult, CancellationToken cancellationToken = default)
    {
        Directory.CreateDirectory(artifactPaths.DirectoryPath);

        await WriteJsonAsync(artifactPaths.FragmentsJsonPath, parseResult.Fragments, cancellationToken);
        await WriteJsonAsync(artifactPaths.ParsedCellsJsonPath, parseResult.Cells, cancellationToken);
        await WriteJsonAsync(artifactPaths.ValidationReportPath, parseResult.ValidationReport, cancellationToken);
    }

    // Serializes one payload with the shared options and writes it to the given path.
    private static Task WriteJsonAsync<TPayload>(string path, TPayload payload, CancellationToken cancellationToken) =>
        File.WriteAllTextAsync(path, JsonSerializer.Serialize(payload, JsonOptions), cancellationToken);
}

View File

@@ -0,0 +1,13 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable summary of a table-import validation pass: whether it succeeded,
/// the error messages collected, and the row and cell counts that were recorded.
/// </summary>
public sealed class ImportValidationReport
{
    /// <summary>Creates a report from already-computed validation results.</summary>
    /// <param name="isValid">Whether validation succeeded.</param>
    /// <param name="errors">The validation error messages.</param>
    /// <param name="rowCount">Number of rows recorded for the report.</param>
    /// <param name="cellCount">Number of cells recorded for the report.</param>
    public ImportValidationReport(
        bool isValid,
        IReadOnlyList<string> errors,
        int rowCount,
        int cellCount)
    {
        IsValid = isValid;
        Errors = errors;
        RowCount = rowCount;
        CellCount = cellCount;
    }

    /// <summary>Whether validation succeeded.</summary>
    public bool IsValid { get; }

    /// <summary>The validation error messages.</summary>
    public IReadOnlyList<string> Errors { get; }

    /// <summary>Number of rows recorded for the report.</summary>
    public int RowCount { get; }

    /// <summary>Number of cells recorded for the report.</summary>
    public int CellCount { get; }
}

View File

@@ -0,0 +1,17 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable snapshot of one parsed table cell, identified by roll band and column,
/// carrying its text lines plus the raw, description and affix text derived from it.
/// </summary>
public sealed class ParsedCriticalCellArtifact
{
    /// <summary>Creates a cell artifact from already-derived values.</summary>
    /// <param name="rollBandLabel">Label of the roll band (row) the cell belongs to.</param>
    /// <param name="columnKey">Key of the column the cell belongs to.</param>
    /// <param name="lines">The cell's text lines.</param>
    /// <param name="rawCellText">The cell's raw text.</param>
    /// <param name="descriptionText">The cell's description text.</param>
    /// <param name="rawAffixText">The cell's affix text, or <c>null</c> when there is none.</param>
    public ParsedCriticalCellArtifact(
        string rollBandLabel,
        string columnKey,
        IReadOnlyList<string> lines,
        string rawCellText,
        string descriptionText,
        string? rawAffixText)
    {
        RollBandLabel = rollBandLabel;
        ColumnKey = columnKey;
        Lines = lines;
        RawCellText = rawCellText;
        DescriptionText = descriptionText;
        RawAffixText = rawAffixText;
    }

    /// <summary>Label of the roll band (row) the cell belongs to.</summary>
    public string RollBandLabel { get; }

    /// <summary>Key of the column the cell belongs to.</summary>
    public string ColumnKey { get; }

    /// <summary>The cell's text lines.</summary>
    public IReadOnlyList<string> Lines { get; }

    /// <summary>The cell's raw text.</summary>
    public string RawCellText { get; }

    /// <summary>The cell's description text.</summary>
    public string DescriptionText { get; }

    /// <summary>The cell's affix text, or <c>null</c> when there is none.</summary>
    public string? RawAffixText { get; }
}

View File

@@ -0,0 +1,13 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Bundles everything a standard critical-table parse produces: the parsed table,
/// the XML text fragments, the per-cell artifacts and the validation report.
/// </summary>
public sealed class StandardCriticalTableParseResult
{
    /// <summary>Creates a parse result from its four parts.</summary>
    /// <param name="table">The parsed table.</param>
    /// <param name="fragments">The XML text fragments associated with the parse.</param>
    /// <param name="cells">The per-cell artifacts.</param>
    /// <param name="validationReport">The validation report for the parse.</param>
    public StandardCriticalTableParseResult(
        ParsedCriticalTable table,
        IReadOnlyList<XmlTextFragment> fragments,
        IReadOnlyList<ParsedCriticalCellArtifact> cells,
        ImportValidationReport validationReport)
    {
        Table = table;
        Fragments = fragments;
        Cells = cells;
        ValidationReport = validationReport;
    }

    /// <summary>The parsed table.</summary>
    public ParsedCriticalTable Table { get; }

    /// <summary>The XML text fragments associated with the parse.</summary>
    public IReadOnlyList<XmlTextFragment> Fragments { get; }

    /// <summary>The per-cell artifacts.</summary>
    public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; }

    /// <summary>The validation report for the parse.</summary>
    public ImportValidationReport ValidationReport { get; }
}

View File

@@ -1,208 +1,206 @@
using System.Text.RegularExpressions;
using System.Xml;
using System.Xml.Linq;
namespace RolemasterDb.ImportTool.Parsing;
public sealed class StandardCriticalTableParser
{
private static readonly Regex ColumnRegex = new(@"\b([A-E])\b", RegexOptions.IgnoreCase | RegexOptions.Compiled);
private static readonly Regex RollBandRegex = new(@"^\s*(?<label>\d{2,3}(?:-\d{2,3})?|\d{2,3}\+)\s*$", RegexOptions.Compiled);
private static readonly Regex RollBandLineRegex = new(@"^\s*(?<label>\d{2,3}(?:-\d{2,3})?|\d{2,3}\+)(?<rest>\s+.*)?$", RegexOptions.Compiled);
private const int HeaderToBodyMinimumGap = 20;
private const int TopGroupingTolerance = 2;
public ParsedCriticalTable Parse(CriticalImportManifestEntry entry, string extractedText)
public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
var lines = extractedText.Replace("\r\n", "\n", StringComparison.Ordinal)
.Replace('\f', '\n')
.Split('\n');
var fragments = LoadFragments(xmlContent);
var headerFragments = FindHeaderFragments(fragments);
var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments);
var validationErrors = new List<string>();
var headerIndex = Array.FindIndex(lines, IsColumnHeaderLine);
if (headerIndex < 0)
{
throw new InvalidOperationException("The standard table header could not be found in the extracted text.");
}
var columnStarts = GetColumnStarts(lines[headerIndex]);
var boundaries = GetColumnBoundaries(columnStarts);
var columns = columnStarts
.Select((item, index) => new ParsedCriticalColumn(item.Label, item.Label, "severity", index + 1))
var columnCenters = headerFragments
.OrderBy(item => item.Left)
.Select(item => new ColumnAnchor(item.Text.ToUpperInvariant(), item.CenterX))
.ToList();
var firstRollBandIndex = FindNextRollBandIndex(lines, headerIndex + 1);
if (firstRollBandIndex < 0)
{
throw new InvalidOperationException("No roll bands were found in the extracted text.");
}
var keyLineIndex = Array.FindIndex(lines, firstRollBandIndex, item => item.TrimStart().StartsWith("Key:", StringComparison.OrdinalIgnoreCase));
if (keyLineIndex < 0)
{
keyLineIndex = lines.Length;
}
var leadingLines = lines[(headerIndex + 1)..firstRollBandIndex]
.Where(item => !string.IsNullOrWhiteSpace(item))
var rowAnchors = rowLabelFragments
.OrderBy(item => item.Top)
.Select((item, index) => new RowAnchor(item.Text, item.Top, index + 1))
.ToList();
var rollBands = new List<ParsedCriticalRollBand>();
var results = new List<ParsedCriticalResult>();
var currentLabel = string.Empty;
var currentRowLines = new List<string>();
var rowIndex = 0;
void FlushCurrentRow()
if (rowAnchors.Count == 0)
{
if (string.IsNullOrEmpty(currentLabel))
{
return;
}
rowIndex++;
var rollBand = CreateRollBand(currentLabel, rowIndex);
rollBands.Add(rollBand);
var cellLines = SplitRowLines(currentRowLines, boundaries, columns.Count);
for (var columnIndex = 0; columnIndex < columns.Count; columnIndex++)
{
var rawCellLines = cellLines[columnIndex]
.Where(item => !string.IsNullOrWhiteSpace(item))
.ToList();
var rawAffixLines = rawCellLines
.Where(IsAffixLikeLine)
.ToList();
var descriptionLines = rawCellLines
.Where(item => !IsAffixLikeLine(item))
.ToList();
results.Add(new ParsedCriticalResult(
columns[columnIndex].ColumnKey,
rollBand.Label,
string.Join(Environment.NewLine, rawCellLines),
CollapseWhitespace(string.Join(' ', descriptionLines)),
rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines)));
}
currentLabel = string.Empty;
currentRowLines = new List<string>();
validationErrors.Add("No roll-band labels were found in the XML artifact.");
}
for (var lineIndex = firstRollBandIndex; lineIndex < keyLineIndex; lineIndex++)
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
var keyTop = fragments
.Where(item => string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase))
.Select(item => (int?)item.Top)
.Min() ?? int.MaxValue;
var bodyFragments = fragments
.Where(item =>
item.Top >= bodyStartTop &&
item.Top < keyTop - 1 &&
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
!headerFragments.Contains(item))
.ToList();
var parsedRollBands = rowAnchors
.Select(anchor => CreateRollBand(anchor.Label, anchor.SortOrder))
.ToList();
var parsedCells = new List<ParsedCriticalCellArtifact>();
var parsedResults = new List<ParsedCriticalResult>();
for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
{
if (TryParseRollBandLine(lines[lineIndex], out var label, out var trailingText))
var rowStart = rowIndex == 0
? bodyStartTop
: (int)Math.Floor((rowAnchors[rowIndex - 1].Top + rowAnchors[rowIndex].Top) / 2.0);
var rowEnd = rowIndex == rowAnchors.Count - 1
? keyTop - 1
: (int)Math.Floor((rowAnchors[rowIndex].Top + rowAnchors[rowIndex + 1].Top) / 2.0);
var rowFragments = bodyFragments
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
.ToList();
foreach (var columnAnchor in columnCenters)
{
var trailingTextBelongsToCurrentRow = IsAffixLikeLine(trailingText);
var cellFragments = rowFragments
.Where(item => ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key)
.OrderBy(item => item.Top)
.ThenBy(item => item.Left)
.ToList();
if (!string.IsNullOrWhiteSpace(trailingText) &&
!string.IsNullOrEmpty(currentLabel) &&
!trailingTextBelongsToCurrentRow)
if (cellFragments.Count == 0)
{
currentRowLines.Add(trailingText);
validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', column '{columnAnchor.Key}'.");
continue;
}
FlushCurrentRow();
currentLabel = label;
if (rowIndex == 0)
{
currentRowLines.AddRange(leadingLines);
}
var lines = BuildLines(cellFragments);
var rawAffixLines = lines.Where(IsAffixLikeLine).ToList();
var descriptionLines = lines.Where(line => !IsAffixLikeLine(line)).ToList();
var rawCellText = string.Join(Environment.NewLine, lines);
var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines));
var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
if (!string.IsNullOrWhiteSpace(trailingText) && trailingTextBelongsToCurrentRow)
{
currentRowLines.Add(trailingText);
}
parsedCells.Add(new ParsedCriticalCellArtifact(
rowAnchors[rowIndex].Label,
columnAnchor.Key,
lines,
rawCellText,
descriptionText,
rawAffixText));
continue;
}
if (!string.IsNullOrWhiteSpace(lines[lineIndex]))
{
currentRowLines.Add(lines[lineIndex]);
parsedResults.Add(new ParsedCriticalResult(
columnAnchor.Key,
rowAnchors[rowIndex].Label,
rawCellText,
descriptionText,
rawAffixText));
}
}
FlushCurrentRow();
if (columnCenters.Count != 5)
{
validationErrors.Add($"Expected 5 standard-table columns but found {columnCenters.Count}.");
}
return new ParsedCriticalTable(
if (parsedCells.Count != rowAnchors.Count * columnCenters.Count)
{
validationErrors.Add(
$"Expected {rowAnchors.Count * columnCenters.Count} parsed cells but produced {parsedCells.Count}.");
}
var validationReport = new ImportValidationReport(
validationErrors.Count == 0,
validationErrors,
rowAnchors.Count,
parsedCells.Count);
var table = new ParsedCriticalTable(
entry.Slug,
entry.DisplayName,
entry.Family,
Path.GetFileName(entry.PdfPath),
"Imported from PDF text extraction.",
columns,
rollBands,
results);
"Imported from PDF XML extraction.",
columnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(),
parsedRollBands,
parsedResults);
return new StandardCriticalTableParseResult(table, fragments, parsedCells, validationReport);
}
private static bool IsColumnHeaderLine(string line)
private static List<XmlTextFragment> LoadFragments(string xmlContent)
{
var matches = ColumnRegex.Matches(line);
return matches.Count == 5;
}
using var stringReader = new StringReader(xmlContent);
using var xmlReader = XmlReader.Create(
stringReader,
new XmlReaderSettings
{
DtdProcessing = DtdProcessing.Ignore
});
private static List<(string Label, int Start)> GetColumnStarts(string headerLine)
{
var matches = ColumnRegex.Matches(headerLine);
return matches
.Select(match => (match.Groups[1].Value.ToUpperInvariant(), match.Index))
var document = XDocument.Load(xmlReader);
return document.Descendants("page")
.SelectMany(page =>
{
var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1");
return page.Elements("text")
.Select(item => new XmlTextFragment(
pageNumber,
int.Parse(item.Attribute("top")?.Value ?? throw new InvalidOperationException("Missing text top attribute.")),
int.Parse(item.Attribute("left")?.Value ?? throw new InvalidOperationException("Missing text left attribute.")),
int.Parse(item.Attribute("width")?.Value ?? throw new InvalidOperationException("Missing text width attribute.")),
int.Parse(item.Attribute("height")?.Value ?? throw new InvalidOperationException("Missing text height attribute.")),
NormalizeText(string.Concat(item.DescendantNodes().OfType<XText>().Select(node => node.Value)))))
.Where(item => !string.IsNullOrWhiteSpace(item.Text));
})
.ToList();
}
private static int[] GetColumnBoundaries(IReadOnlyList<(string Label, int Start)> columns)
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
{
var boundaries = new int[columns.Count - 1];
for (var index = 0; index < boundaries.Length; index++)
{
boundaries[index] = (columns[index].Start + columns[index + 1].Start) / 2;
}
var groupedByTop = fragments
.Where(item => item.Text.Length == 1 && char.IsLetter(item.Text[0]))
.GroupBy(item => item.Top)
.OrderBy(group => group.Key);
return boundaries;
}
private static int FindNextRollBandIndex(IReadOnlyList<string> lines, int startIndex)
{
for (var index = startIndex; index < lines.Count; index++)
foreach (var group in groupedByTop)
{
if (TryParseRollBandLine(lines[index], out _, out _))
var ordered = group.OrderBy(item => item.Left).ToList();
var labels = ordered.Select(item => item.Text.ToUpperInvariant()).ToList();
if (labels.SequenceEqual(["A", "B", "C", "D", "E"]))
{
return index;
return ordered;
}
}
return -1;
throw new InvalidOperationException("Could not find the standard-table A-E header row in the XML artifact.");
}
private static bool TryParseRollBandLabel(string line, out string label)
private static List<XmlTextFragment> FindRowLabelFragments(
IReadOnlyList<XmlTextFragment> fragments,
IReadOnlyList<XmlTextFragment> headerFragments)
{
var match = RollBandRegex.Match(line);
if (!match.Success)
{
label = string.Empty;
return false;
}
var leftCutoff = headerFragments.Min(item => item.Left) - 10;
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
label = match.Groups[1].Value.Replace(" ", string.Empty, StringComparison.Ordinal);
return true;
return fragments
.Where(item =>
item.Left < leftCutoff &&
item.Top >= bodyStartTop &&
IsRollBandLabel(item.Text))
.OrderBy(item => item.Top)
.ToList();
}
private static bool TryParseRollBandLine(string line, out string label, out string trailingText)
{
var match = RollBandLineRegex.Match(line);
if (!match.Success)
{
label = string.Empty;
trailingText = string.Empty;
return false;
}
label = match.Groups["label"].Value.Replace(" ", string.Empty, StringComparison.Ordinal);
var restGroup = match.Groups["rest"];
trailingText = restGroup.Success
? string.Concat(new string(' ', restGroup.Index), restGroup.Value.TrimEnd())
: string.Empty;
return true;
}
/// <summary>
/// Determines whether <paramref name="value"/> (ignoring surrounding whitespace) is a roll-band label:
/// a 2-3 digit number ("100"), a range of two such numbers ("01-05"), or a number followed by "+" ("251+").
/// </summary>
/// <param name="value">Candidate fragment text.</param>
/// <returns><c>true</c> when the trimmed text has one of the roll-band label shapes.</returns>
private static bool IsRollBandLabel(string value)
{
    // [0-9] rather than \d: \d also matches non-ASCII Unicode decimal digits, which
    // int.Parse rejects when the label's bounds are later converted to numbers.
    const string rollBandPattern = @"^[0-9]{2,3}(?:-[0-9]{2,3})?$|^[0-9]{2,3}\+$";

    return Regex.IsMatch(value.Trim(), rollBandPattern);
}
private static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder)
{
@@ -217,35 +215,39 @@ public sealed class StandardCriticalTableParser
: new ParsedCriticalRollBand(label, int.Parse(parts[0]), int.Parse(parts[1]), sortOrder);
}
private static List<string>[] SplitRowLines(IReadOnlyList<string> rowLines, int[] boundaries, int columnCount)
private static string ResolveColumn(double centerX, IReadOnlyList<ColumnAnchor> columns)
{
var result = Enumerable.Range(0, columnCount)
.Select(_ => new List<string>())
.ToArray();
foreach (var line in rowLines)
for (var index = 0; index < columns.Count - 1; index++)
{
for (var columnIndex = 0; columnIndex < columnCount; columnIndex++)
var boundary = (columns[index].CenterX + columns[index + 1].CenterX) / 2.0;
if (centerX < boundary)
{
var start = columnIndex == 0 ? 0 : boundaries[columnIndex - 1];
var end = columnIndex == columnCount - 1
? line.Length
: Math.Min(boundaries[columnIndex], line.Length);
if (start >= line.Length || end <= start)
{
continue;
}
var segment = line[start..end].Trim();
if (!string.IsNullOrWhiteSpace(segment))
{
result[columnIndex].Add(segment);
}
return columns[index].Key;
}
}
return result;
return columns[^1].Key;
}
/// <summary>
/// Reassembles text fragments into text lines. Fragments are visited top-to-bottom,
/// then left-to-right; a fragment starts a new line when its top differs from the top of the
/// current line's first fragment by more than <c>TopGroupingTolerance</c>. Each line is the
/// space-joined, whitespace-collapsed text of its fragments in left-to-right order.
/// </summary>
/// <param name="fragments">Fragments to group into lines.</param>
/// <returns>The non-blank lines, in top-to-bottom order.</returns>
private static IReadOnlyList<string> BuildLines(IReadOnlyList<XmlTextFragment> fragments)
{
    var visualLines = new List<List<XmlTextFragment>>();
    List<XmlTextFragment>? currentLine = null;

    foreach (var fragment in fragments.OrderBy(item => item.Top).ThenBy(item => item.Left))
    {
        // The comparison is always against the line's first fragment, not its most recent one.
        if (currentLine is null || Math.Abs(currentLine[0].Top - fragment.Top) > TopGroupingTolerance)
        {
            currentLine = new List<XmlTextFragment>();
            visualLines.Add(currentLine);
        }

        currentLine.Add(fragment);
    }

    var renderedLines = new List<string>(visualLines.Count);
    foreach (var visualLine in visualLines)
    {
        var pieces = visualLine.OrderBy(item => item.Left).Select(item => item.Text);
        var rendered = CollapseWhitespace(string.Join(' ', pieces));
        if (!string.IsNullOrWhiteSpace(rendered))
        {
            renderedLines.Add(rendered);
        }
    }

    return renderedLines;
}
private static bool IsAffixLikeLine(string line)
@@ -256,7 +258,7 @@ public sealed class StandardCriticalTableParser
return false;
}
if (value == "")
if (value == "-" || value == "\u2014")
{
return true;
}
@@ -270,16 +272,27 @@ public sealed class StandardCriticalTableParser
}
return value.StartsWith("+", StringComparison.Ordinal) ||
value.StartsWith('∑') ||
value.StartsWith('∏') ||
value.StartsWith('π') ||
value.StartsWith('∫') ||
value.StartsWith("\u2211", StringComparison.Ordinal) ||
value.StartsWith("\u220F", StringComparison.Ordinal) ||
value.StartsWith("\u03C0", StringComparison.Ordinal) ||
value.StartsWith("\u222B", StringComparison.Ordinal) ||
char.IsDigit(value[0]) ||
value.Contains(" ", StringComparison.Ordinal) ||
value.Contains(" - ", StringComparison.Ordinal) ||
value.Contains("(-", StringComparison.Ordinal) ||
value.Contains("(+", StringComparison.Ordinal);
}
/// <summary>
/// Trims <paramref name="value"/> and replaces every run of whitespace inside it with a single space.
/// </summary>
/// <param name="value">Text to normalize.</param>
/// <returns>The trimmed text with internal whitespace runs collapsed.</returns>
private static string CollapseWhitespace(string value)
{
    var trimmed = value.Trim();
    return Regex.Replace(trimmed, @"\s+", " ");
}
/// <summary>
/// Replaces each non-breaking space, carriage return and line feed in <paramref name="value"/>
/// with an ordinary space, then trims the result. Runs of spaces are not collapsed.
/// </summary>
/// <param name="value">Text to normalize.</param>
/// <returns>The normalized, trimmed text.</returns>
private static string NormalizeText(string value)
{
    var characters = value.ToCharArray();
    for (var index = 0; index < characters.Length; index++)
    {
        if (characters[index] is '\u00a0' or '\r' or '\n')
        {
            characters[index] = ' ';
        }
    }

    return new string(characters).Trim();
}
// Column key paired with the horizontal centre (CenterX) used to assign fragments to that column.
private sealed record ColumnAnchor(string Key, double CenterX);
// Roll-band label paired with the Top coordinate of its label fragment and its 1-based sort order.
private sealed record RowAnchor(string Label, int Top, int SortOrder);
}

View File

@@ -0,0 +1,18 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable text fragment with its page number, bounding box (top, left, width, height) and text.
/// </summary>
public sealed class XmlTextFragment
{
    /// <summary>Creates a fragment from its page number, bounding box and text.</summary>
    /// <param name="pageNumber">Number of the page the fragment is on.</param>
    /// <param name="top">Top coordinate of the bounding box.</param>
    /// <param name="left">Left coordinate of the bounding box.</param>
    /// <param name="width">Width of the bounding box.</param>
    /// <param name="height">Height of the bounding box.</param>
    /// <param name="text">Text content of the fragment.</param>
    public XmlTextFragment(
        int pageNumber,
        int top,
        int left,
        int width,
        int height,
        string text)
    {
        PageNumber = pageNumber;
        Top = top;
        Left = left;
        Width = width;
        Height = height;
        Text = text;
    }

    /// <summary>Number of the page the fragment is on.</summary>
    public int PageNumber { get; }

    /// <summary>Top coordinate of the bounding box.</summary>
    public int Top { get; }

    /// <summary>Left coordinate of the bounding box.</summary>
    public int Left { get; }

    /// <summary>Width of the bounding box.</summary>
    public int Width { get; }

    /// <summary>Height of the bounding box.</summary>
    public int Height { get; }

    /// <summary>Text content of the fragment.</summary>
    public string Text { get; }

    /// <summary>Horizontal midpoint of the bounding box: <c>Left + Width / 2</c>, computed in floating point.</summary>
    public double CenterX
    {
        get { return Left + (Width / 2.0); }
    }
}

View File

@@ -2,7 +2,7 @@ using System.Diagnostics;
namespace RolemasterDb.ImportTool;
public sealed class PdfTextExtractor
public sealed class PdfXmlExtractor
{
public async Task ExtractAsync(string pdfPath, string outputPath, CancellationToken cancellationToken = default)
{
@@ -10,14 +10,16 @@ public sealed class PdfTextExtractor
var startInfo = new ProcessStartInfo
{
FileName = "pdftotext",
FileName = "pdftohtml",
RedirectStandardError = true,
RedirectStandardOutput = true,
UseShellExecute = false,
CreateNoWindow = true
};
startInfo.ArgumentList.Add("-layout");
startInfo.ArgumentList.Add("-xml");
startInfo.ArgumentList.Add("-i");
startInfo.ArgumentList.Add("-noframes");
startInfo.ArgumentList.Add(pdfPath);
startInfo.ArgumentList.Add(outputPath);
@@ -28,7 +30,7 @@ public sealed class PdfTextExtractor
if (process.ExitCode != 0)
{
var error = await process.StandardError.ReadToEndAsync(cancellationToken);
throw new InvalidOperationException($"pdftotext failed for '{pdfPath}': {error}");
throw new InvalidOperationException($"pdftohtml failed for '{pdfPath}': {error}");
}
}
}