Use XML geometry for critical PDF import

This commit is contained in:
2026-03-14 01:25:43 +01:00
parent f70d610c92
commit 719355da90
10 changed files with 335 additions and 201 deletions

View File

@@ -0,0 +1,36 @@
using System.Diagnostics;
namespace RolemasterDb.ImportTool;
/// <summary>
/// Converts a PDF file to XML by launching the external <c>pdftohtml</c> command-line tool.
/// </summary>
public sealed class PdfXmlExtractor
{
    /// <summary>
    /// Runs <c>pdftohtml -xml -i -noframes</c> on <paramref name="pdfPath"/> and waits for it to finish.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file passed to the tool as its input argument.</param>
    /// <param name="outputPath">Path passed to the tool as its output argument; its parent directory is created when it has one.</param>
    /// <param name="cancellationToken">Token that aborts the wait; the child process is killed when it fires.</param>
    /// <exception cref="InvalidOperationException">The tool exited with a non-zero exit code; the message carries its standard error text.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled before the tool finished.</exception>
    public async Task ExtractAsync(string pdfPath, string outputPath, CancellationToken cancellationToken = default)
    {
        // GetDirectoryName yields "" for a bare file name and null for a root path;
        // CreateDirectory rejects both, so only create a directory when there is one to create.
        var outputDirectory = Path.GetDirectoryName(outputPath);
        if (!string.IsNullOrEmpty(outputDirectory))
        {
            Directory.CreateDirectory(outputDirectory);
        }

        var startInfo = new ProcessStartInfo
        {
            FileName = "pdftohtml",
            RedirectStandardError = true,
            RedirectStandardOutput = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };
        // ArgumentList escapes each entry individually, so paths with spaces or quotes are safe.
        startInfo.ArgumentList.Add("-xml");
        startInfo.ArgumentList.Add("-i");
        startInfo.ArgumentList.Add("-noframes");
        startInfo.ArgumentList.Add(pdfPath);
        startInfo.ArgumentList.Add(outputPath);

        using var process = new Process { StartInfo = startInfo };
        process.Start();

        // Drain both redirected pipes while the process runs. If nobody reads them, the child
        // blocks as soon as an OS pipe buffer fills up and WaitForExitAsync never completes.
        var standardOutputTask = process.StandardOutput.ReadToEndAsync(cancellationToken);
        var standardErrorTask = process.StandardError.ReadToEndAsync(cancellationToken);

        try
        {
            await process.WaitForExitAsync(cancellationToken);
            await Task.WhenAll(standardOutputTask, standardErrorTask);
        }
        catch (OperationCanceledException)
        {
            // Do not leave an orphaned converter running after the caller gave up.
            KillQuietly(process);
            throw;
        }

        if (process.ExitCode != 0)
        {
            var error = await standardErrorTask;
            throw new InvalidOperationException($"pdftohtml failed for '{pdfPath}': {error}");
        }
    }

    // Best-effort termination of the child process tree; never masks the exception being propagated.
    private static void KillQuietly(Process process)
    {
        try
        {
            if (!process.HasExited)
            {
                process.Kill(entireProcessTree: true);
            }
        }
        catch (InvalidOperationException)
        {
            // The process exited between the HasExited check and Kill.
        }
        catch (System.ComponentModel.Win32Exception)
        {
            // The process is already terminating or could not be terminated; nothing more to do.
        }
    }
}