using System.Diagnostics;
using System.Globalization;

namespace RolemasterDb.ImportTool;

/// <summary>
/// Thin wrapper around the external <c>pdftohtml</c> and <c>pdftoppm</c> command-line tools.
/// Both executables are launched by bare name, so they must be resolvable by the operating system
/// (for example through <c>PATH</c>).
/// </summary>
public sealed class PdfXmlExtractor
{
    /// <summary>
    /// Resolution, in DPI, passed to <c>pdftoppm</c> through its <c>-r</c> option.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the name suggests this value was chosen so that rendered pixels line up with the
    /// coordinates in the <c>pdftohtml -xml</c> output - confirm before changing it.
    /// </remarks>
    public const int XmlAlignedRenderDpi = 108;

    /// <summary>
    /// Runs <c>pdftohtml -xml -i -noframes</c> on <paramref name="pdfPath"/>, asking it to write to
    /// <paramref name="outputPath"/>.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file handed to the tool.</param>
    /// <param name="outputPath">Output path handed to the tool; its parent directory is created if needed.</param>
    /// <param name="cancellationToken">Cancels the wait and terminates the tool.</param>
    /// <exception cref="InvalidOperationException">The tool exited with a non-zero exit code.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
    public async Task ExtractAsync(string pdfPath, string outputPath, CancellationToken cancellationToken = default)
    {
        EnsureParentDirectory(outputPath);

        var startInfo = CreateStartInfo("pdftohtml");
        startInfo.ArgumentList.Add("-xml");
        startInfo.ArgumentList.Add("-i");
        startInfo.ArgumentList.Add("-noframes");
        startInfo.ArgumentList.Add(pdfPath);
        startInfo.ArgumentList.Add(outputPath);

        await RunAsync(startInfo, pdfPath, cancellationToken);
    }

    /// <summary>
    /// Renders a single, uncropped page of <paramref name="pdfPath"/> to a PNG file.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file handed to <c>pdftoppm</c>.</param>
    /// <param name="pageNumber">Page passed as both the first (<c>-f</c>) and last (<c>-l</c>) page.</param>
    /// <param name="outputPath">Expected path of the resulting PNG file.</param>
    /// <param name="cancellationToken">Cancels the wait and terminates the tool.</param>
    public Task RenderPagePngAsync(
        string pdfPath,
        int pageNumber,
        string outputPath,
        CancellationToken cancellationToken = default) =>
        RenderPngAsync(pdfPath, pageNumber, outputPath, null, null, null, null, cancellationToken);

    /// <summary>
    /// Renders a rectangular crop of a single page of <paramref name="pdfPath"/> to a PNG file.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file handed to <c>pdftoppm</c>.</param>
    /// <param name="pageNumber">Page passed as both the first (<c>-f</c>) and last (<c>-l</c>) page.</param>
    /// <param name="left">Value passed to the <c>-x</c> option.</param>
    /// <param name="top">Value passed to the <c>-y</c> option.</param>
    /// <param name="width">Value passed to the <c>-W</c> option.</param>
    /// <param name="height">Value passed to the <c>-H</c> option.</param>
    /// <param name="outputPath">Expected path of the resulting PNG file.</param>
    /// <param name="cancellationToken">Cancels the wait and terminates the tool.</param>
    public Task RenderCropPngAsync(
        string pdfPath,
        int pageNumber,
        int left,
        int top,
        int width,
        int height,
        string outputPath,
        CancellationToken cancellationToken = default) =>
        RenderPngAsync(pdfPath, pageNumber, outputPath, left, top, width, height, cancellationToken);

    /// <summary>
    /// Shared implementation of the PNG renderers: runs <c>pdftoppm -png -singlefile</c> for one page
    /// and verifies that <paramref name="outputPath"/> exists afterwards.
    /// </summary>
    /// <remarks>
    /// The crop options are only passed when all four of <paramref name="left"/>, <paramref name="top"/>,
    /// <paramref name="width"/> and <paramref name="height"/> have a value; otherwise none of them is passed.
    /// </remarks>
    /// <exception cref="InvalidOperationException">
    /// The tool exited with a non-zero exit code, or it exited with zero but <paramref name="outputPath"/>
    /// does not exist.
    /// </exception>
    private static async Task RenderPngAsync(
        string pdfPath,
        int pageNumber,
        string outputPath,
        int? left,
        int? top,
        int? width,
        int? height,
        CancellationToken cancellationToken)
    {
        EnsureParentDirectory(outputPath);

        // Command-line numbers must not depend on the current culture (e.g. a localized negative sign).
        var invariant = CultureInfo.InvariantCulture;
        var page = pageNumber.ToString(invariant);

        var startInfo = CreateStartInfo("pdftoppm");
        startInfo.ArgumentList.Add("-png");
        startInfo.ArgumentList.Add("-r");
        startInfo.ArgumentList.Add(XmlAlignedRenderDpi.ToString(invariant));
        startInfo.ArgumentList.Add("-f");
        startInfo.ArgumentList.Add(page);
        startInfo.ArgumentList.Add("-l");
        startInfo.ArgumentList.Add(page);
        startInfo.ArgumentList.Add("-singlefile");

        if (left.HasValue && top.HasValue && width.HasValue && height.HasValue)
        {
            startInfo.ArgumentList.Add("-x");
            startInfo.ArgumentList.Add(left.Value.ToString(invariant));
            startInfo.ArgumentList.Add("-y");
            startInfo.ArgumentList.Add(top.Value.ToString(invariant));
            startInfo.ArgumentList.Add("-W");
            startInfo.ArgumentList.Add(width.Value.ToString(invariant));
            startInfo.ArgumentList.Add("-H");
            startInfo.ArgumentList.Add(height.Value.ToString(invariant));
        }

        startInfo.ArgumentList.Add(pdfPath);

        // The tool receives the output path without its extension; the existence check below relies on
        // the tool appending the extension itself, so outputPath is expected to end in ".png".
        var outputRoot = Path.Combine(
            Path.GetDirectoryName(outputPath) ?? string.Empty,
            Path.GetFileNameWithoutExtension(outputPath));
        startInfo.ArgumentList.Add(outputRoot);

        await RunAsync(startInfo, pdfPath, cancellationToken);

        if (!File.Exists(outputPath))
        {
            throw new InvalidOperationException($"pdftoppm completed but did not create '{outputPath}'.");
        }
    }

    /// <summary>
    /// Creates the directory that will contain <paramref name="outputPath"/>, if the path has one.
    /// </summary>
    private static void EnsureParentDirectory(string outputPath)
    {
        // GetDirectoryName yields "" for a bare file name and null for a root path; in both cases there
        // is nothing to create, and Directory.CreateDirectory would throw on an empty or null path.
        var directory = Path.GetDirectoryName(outputPath);
        if (!string.IsNullOrEmpty(directory))
        {
            Directory.CreateDirectory(directory);
        }
    }

    /// <summary>
    /// Builds the start info shared by both tools: no shell, no window, both output streams redirected.
    /// </summary>
    private static ProcessStartInfo CreateStartInfo(string fileName) =>
        new()
        {
            FileName = fileName,
            RedirectStandardError = true,
            RedirectStandardOutput = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };

    /// <summary>
    /// Starts the tool described by <paramref name="startInfo"/>, waits for it to exit and throws when
    /// the exit code is non-zero.
    /// </summary>
    /// <param name="startInfo">Fully populated start info; its <c>FileName</c> is used in the error message.</param>
    /// <param name="pdfPath">PDF path, used only for the error message.</param>
    /// <param name="cancellationToken">Cancels the wait and terminates the tool.</param>
    /// <exception cref="InvalidOperationException">The tool exited with a non-zero exit code.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
    private static async Task RunAsync(ProcessStartInfo startInfo, string pdfPath, CancellationToken cancellationToken)
    {
        using var process = new Process { StartInfo = startInfo };
        process.Start();

        // Both streams are redirected, so they must be drained while the tool runs: a child that fills
        // an unread pipe blocks on its next write and would never exit.
        var standardOutput = process.StandardOutput.ReadToEndAsync(cancellationToken);
        var standardError = process.StandardError.ReadToEndAsync(cancellationToken);

        string error;
        try
        {
            await process.WaitForExitAsync(cancellationToken);
            await standardOutput; // Content is not used; awaited only to finish draining the pipe.
            error = await standardError;
        }
        catch (OperationCanceledException)
        {
            // Cancelling only stops the wait; terminate the tool so it does not linger in the background.
            TerminateQuietly(process);
            throw;
        }

        if (process.ExitCode != 0)
        {
            throw new InvalidOperationException($"{startInfo.FileName} failed for '{pdfPath}': {error}");
        }
    }

    /// <summary>
    /// Kills <paramref name="process"/> and its descendants if it is still running.
    /// </summary>
    private static void TerminateQuietly(Process process)
    {
        try
        {
            if (!process.HasExited)
            {
                process.Kill(entireProcessTree: true);
            }
        }
        catch (InvalidOperationException)
        {
            // The process exited between the HasExited check and Kill; there is nothing left to stop.
        }
    }
}