Files
RolemasterDB/src/RolemasterDb.ImportTool/PdfXmlExtractor.cs

118 lines
4.0 KiB
C#

using System.Diagnostics;
namespace RolemasterDb.ImportTool;
/// <summary>
/// Thin wrapper around the external <c>pdftohtml</c> and <c>pdftoppm</c> command-line tools.
/// Both tools are resolved by name, so they must be reachable through the process search path.
/// </summary>
public sealed class PdfXmlExtractor
{
    /// <summary>
    /// Resolution (dots per inch) passed to <c>pdftoppm -r</c> for every rendered PNG.
    /// NOTE(review): the name suggests this value makes rendered pixels line up with the
    /// coordinates in the <c>pdftohtml</c> XML output - confirm before changing it.
    /// </summary>
    public const int XmlAlignedRenderDpi = 108;

    /// <summary>
    /// Runs <c>pdftohtml -xml -i -noframes</c> on <paramref name="pdfPath"/>, passing
    /// <paramref name="outputPath"/> as the output argument.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF handed to the tool.</param>
    /// <param name="outputPath">Output path handed to the tool; its parent directory is created if needed.</param>
    /// <param name="cancellationToken">Cancels the wait and terminates the child process.</param>
    /// <exception cref="InvalidOperationException">The tool exited with a non-zero exit code.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
    public async Task ExtractAsync(string pdfPath, string outputPath, CancellationToken cancellationToken = default)
    {
        EnsureParentDirectory(outputPath);

        var startInfo = CreateStartInfo("pdftohtml");
        startInfo.ArgumentList.Add("-xml");
        startInfo.ArgumentList.Add("-i");
        startInfo.ArgumentList.Add("-noframes");
        startInfo.ArgumentList.Add(pdfPath);
        startInfo.ArgumentList.Add(outputPath);

        await RunToolAsync(startInfo, pdfPath, cancellationToken);
    }

    /// <summary>
    /// Renders a single page of <paramref name="pdfPath"/> to a PNG at <see cref="XmlAlignedRenderDpi"/>.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF to render.</param>
    /// <param name="pageNumber">Page passed as both the first (<c>-f</c>) and last (<c>-l</c>) page.</param>
    /// <param name="outputPath">Expected path of the resulting PNG.</param>
    /// <param name="cancellationToken">Cancels the wait and terminates the child process.</param>
    /// <exception cref="InvalidOperationException">
    /// The tool exited with a non-zero exit code, or <paramref name="outputPath"/> does not exist afterwards.
    /// </exception>
    public Task RenderPagePngAsync(
        string pdfPath,
        int pageNumber,
        string outputPath,
        CancellationToken cancellationToken = default) =>
        RenderPngAsync(pdfPath, pageNumber, outputPath, null, null, null, null, cancellationToken);

    /// <summary>
    /// Renders a rectangular region of a single page to a PNG at <see cref="XmlAlignedRenderDpi"/>.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF to render.</param>
    /// <param name="pageNumber">Page passed as both the first (<c>-f</c>) and last (<c>-l</c>) page.</param>
    /// <param name="left">Value passed to <c>-x</c>.</param>
    /// <param name="top">Value passed to <c>-y</c>.</param>
    /// <param name="width">Value passed to <c>-W</c>.</param>
    /// <param name="height">Value passed to <c>-H</c>.</param>
    /// <param name="outputPath">Expected path of the resulting PNG.</param>
    /// <param name="cancellationToken">Cancels the wait and terminates the child process.</param>
    /// <exception cref="InvalidOperationException">
    /// The tool exited with a non-zero exit code, or <paramref name="outputPath"/> does not exist afterwards.
    /// </exception>
    public Task RenderCropPngAsync(
        string pdfPath,
        int pageNumber,
        int left,
        int top,
        int width,
        int height,
        string outputPath,
        CancellationToken cancellationToken = default) =>
        RenderPngAsync(pdfPath, pageNumber, outputPath, left, top, width, height, cancellationToken);

    /// <summary>
    /// Shared <c>pdftoppm</c> invocation behind the two public render methods.
    /// The crop arguments are only forwarded when all four are supplied.
    /// </summary>
    private static async Task RenderPngAsync(
        string pdfPath,
        int pageNumber,
        string outputPath,
        int? left,
        int? top,
        int? width,
        int? height,
        CancellationToken cancellationToken)
    {
        EnsureParentDirectory(outputPath);

        // Command-line numbers are machine-readable, so never format them with the user's culture.
        var invariant = System.Globalization.CultureInfo.InvariantCulture;
        var page = pageNumber.ToString(invariant);

        var startInfo = CreateStartInfo("pdftoppm");
        startInfo.ArgumentList.Add("-png");
        startInfo.ArgumentList.Add("-r");
        startInfo.ArgumentList.Add(XmlAlignedRenderDpi.ToString(invariant));
        startInfo.ArgumentList.Add("-f");
        startInfo.ArgumentList.Add(page);
        startInfo.ArgumentList.Add("-l");
        startInfo.ArgumentList.Add(page);
        startInfo.ArgumentList.Add("-singlefile");

        if (left.HasValue && top.HasValue && width.HasValue && height.HasValue)
        {
            startInfo.ArgumentList.Add("-x");
            startInfo.ArgumentList.Add(left.Value.ToString(invariant));
            startInfo.ArgumentList.Add("-y");
            startInfo.ArgumentList.Add(top.Value.ToString(invariant));
            startInfo.ArgumentList.Add("-W");
            startInfo.ArgumentList.Add(width.Value.ToString(invariant));
            startInfo.ArgumentList.Add("-H");
            startInfo.ArgumentList.Add(height.Value.ToString(invariant));
        }

        startInfo.ArgumentList.Add(pdfPath);

        // The tool receives the output path without its extension; the existence check below
        // relies on the tool adding the extension back itself.
        var outputDirectory = Path.GetDirectoryName(outputPath) ?? string.Empty;
        startInfo.ArgumentList.Add(Path.Combine(outputDirectory, Path.GetFileNameWithoutExtension(outputPath)));

        await RunToolAsync(startInfo, pdfPath, cancellationToken);

        if (!File.Exists(outputPath))
        {
            throw new InvalidOperationException($"pdftoppm completed but did not create '{outputPath}'.");
        }
    }

    /// <summary>
    /// Builds the start info shared by both tools: no shell, no window, stdout and stderr redirected.
    /// </summary>
    private static ProcessStartInfo CreateStartInfo(string toolName) =>
        new()
        {
            FileName = toolName,
            RedirectStandardError = true,
            RedirectStandardOutput = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };

    /// <summary>
    /// Creates the directory that will contain <paramref name="outputPath"/>, if the path names one.
    /// A bare file name (or a root path) has no parent directory to create, so it is skipped.
    /// </summary>
    private static void EnsureParentDirectory(string outputPath)
    {
        var directory = Path.GetDirectoryName(outputPath);
        if (!string.IsNullOrEmpty(directory))
        {
            Directory.CreateDirectory(directory);
        }
    }

    /// <summary>
    /// Starts the configured tool, waits for it to exit, and throws when it reports failure.
    /// </summary>
    /// <param name="startInfo">Fully populated start info with redirected stdout and stderr.</param>
    /// <param name="pdfPath">PDF path, used only for the error message.</param>
    /// <param name="cancellationToken">Cancels the wait and terminates the child process.</param>
    /// <exception cref="InvalidOperationException">The tool exited with a non-zero exit code.</exception>
    private static async Task RunToolAsync(
        ProcessStartInfo startInfo,
        string pdfPath,
        CancellationToken cancellationToken)
    {
        using var process = new Process { StartInfo = startInfo };
        process.Start();

        // Both pipes are redirected, so both must be drained while the child runs. Otherwise a
        // child that fills a pipe buffer blocks on write and never exits.
        var standardOutput = process.StandardOutput.ReadToEndAsync(cancellationToken);
        var standardError = process.StandardError.ReadToEndAsync(cancellationToken);

        string error;
        try
        {
            await Task.WhenAll(process.WaitForExitAsync(cancellationToken), standardOutput, standardError);
            error = await standardError;
        }
        catch (OperationCanceledException)
        {
            // Cancelling only abandons the wait; stop the child so it does not keep running.
            KillQuietly(process);
            throw;
        }

        if (process.ExitCode != 0)
        {
            throw new InvalidOperationException($"{startInfo.FileName} failed for '{pdfPath}': {error}");
        }
    }

    /// <summary>
    /// Best-effort termination used on cancellation; failures to kill must not mask the cancellation.
    /// </summary>
    private static void KillQuietly(Process process)
    {
        try
        {
            if (!process.HasExited)
            {
                process.Kill();
            }
        }
        catch (InvalidOperationException)
        {
            // The process is already gone; nothing left to stop.
        }
        catch (System.ComponentModel.Win32Exception)
        {
            // The process could not be terminated (for example, it is already exiting).
        }
    }
}