Use XML geometry for critical PDF import
This commit is contained in:
Binary file not shown.
@@ -5,7 +5,8 @@ namespace RolemasterDb.ImportTool;
|
||||
public sealed class CriticalImportCommandRunner
|
||||
{
|
||||
private readonly CriticalImportManifestLoader manifestLoader = new();
|
||||
private readonly PdfTextExtractor pdfTextExtractor = new();
|
||||
private readonly ImportArtifactWriter artifactWriter = new();
|
||||
private readonly PdfXmlExtractor pdfXmlExtractor = new();
|
||||
private readonly StandardCriticalTableParser standardParser = new();
|
||||
|
||||
public async Task<int> RunAsync(ResetOptions options)
|
||||
@@ -26,8 +27,8 @@ public sealed class CriticalImportCommandRunner
|
||||
{
|
||||
var entry = GetManifestEntry(options.Table);
|
||||
var artifactPaths = CreateArtifactPaths(entry.Slug);
|
||||
await pdfTextExtractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths.ExtractedTextPath);
|
||||
Console.WriteLine($"Extracted {entry.Slug} to {artifactPaths.ExtractedTextPath}");
|
||||
await pdfXmlExtractor.ExtractAsync(ResolveRepositoryPath(entry.PdfPath), artifactPaths.XmlPath);
|
||||
Console.WriteLine($"Extracted {entry.Slug} to {artifactPaths.XmlPath}");
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -36,16 +37,24 @@ public sealed class CriticalImportCommandRunner
|
||||
var entry = GetManifestEntry(options.Table);
|
||||
var artifactPaths = CreateArtifactPaths(entry.Slug);
|
||||
|
||||
if (!File.Exists(artifactPaths.ExtractedTextPath))
|
||||
if (!File.Exists(artifactPaths.XmlPath))
|
||||
{
|
||||
Console.Error.WriteLine($"Missing extracted text artifact: {artifactPaths.ExtractedTextPath}");
|
||||
Console.Error.WriteLine($"Missing XML artifact: {artifactPaths.XmlPath}");
|
||||
return 1;
|
||||
}
|
||||
|
||||
var extractedText = await File.ReadAllTextAsync(artifactPaths.ExtractedTextPath);
|
||||
var parsedTable = Parse(entry, extractedText);
|
||||
var xmlContent = await File.ReadAllTextAsync(artifactPaths.XmlPath);
|
||||
var parseResult = Parse(entry, xmlContent);
|
||||
await artifactWriter.WriteAsync(artifactPaths, parseResult, CancellationToken.None);
|
||||
|
||||
if (!parseResult.ValidationReport.IsValid)
|
||||
{
|
||||
throw new InvalidOperationException(
|
||||
$"Validation failed for '{entry.Slug}'. See {artifactPaths.ValidationReportPath} for details.");
|
||||
}
|
||||
|
||||
var loader = new CriticalImportLoader(ResolveDatabasePath(options.DatabasePath));
|
||||
var result = await loader.LoadAsync(parsedTable);
|
||||
var result = await loader.LoadAsync(parseResult.Table);
|
||||
|
||||
Console.WriteLine(
|
||||
$"Loaded {result.TableSlug}: {result.ColumnCount} columns, {result.RollBandCount} roll bands, {result.ResultCount} results.");
|
||||
@@ -82,14 +91,14 @@ public sealed class CriticalImportCommandRunner
|
||||
?? throw new InvalidOperationException($"No enabled manifest entry was found for '{tableSlug}'.");
|
||||
}
|
||||
|
||||
private ParsedCriticalTable Parse(CriticalImportManifestEntry entry, string extractedText)
|
||||
private StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
|
||||
{
|
||||
if (!string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
throw new InvalidOperationException($"Family '{entry.Family}' is not supported by phase 1.");
|
||||
throw new InvalidOperationException($"Family '{entry.Family}' is not supported by phase 2.");
|
||||
}
|
||||
|
||||
return standardParser.Parse(entry, extractedText);
|
||||
return standardParser.Parse(entry, xmlContent);
|
||||
}
|
||||
|
||||
private static ImportArtifactPaths CreateArtifactPaths(string slug) =>
|
||||
|
||||
@@ -2,18 +2,34 @@ namespace RolemasterDb.ImportTool;
|
||||
|
||||
public sealed class ImportArtifactPaths
|
||||
{
|
||||
private ImportArtifactPaths(string directoryPath, string extractedTextPath)
|
||||
private ImportArtifactPaths(
|
||||
string directoryPath,
|
||||
string xmlPath,
|
||||
string fragmentsJsonPath,
|
||||
string parsedCellsJsonPath,
|
||||
string validationReportPath)
|
||||
{
|
||||
DirectoryPath = directoryPath;
|
||||
ExtractedTextPath = extractedTextPath;
|
||||
XmlPath = xmlPath;
|
||||
FragmentsJsonPath = fragmentsJsonPath;
|
||||
ParsedCellsJsonPath = parsedCellsJsonPath;
|
||||
ValidationReportPath = validationReportPath;
|
||||
}
|
||||
|
||||
public string DirectoryPath { get; }
|
||||
public string ExtractedTextPath { get; }
|
||||
public string XmlPath { get; }
|
||||
public string FragmentsJsonPath { get; }
|
||||
public string ParsedCellsJsonPath { get; }
|
||||
public string ValidationReportPath { get; }
|
||||
|
||||
public static ImportArtifactPaths Create(string artifactsRootPath, string tableSlug)
|
||||
{
|
||||
var directoryPath = Path.Combine(artifactsRootPath, tableSlug);
|
||||
return new ImportArtifactPaths(directoryPath, Path.Combine(directoryPath, "extracted.txt"));
|
||||
return new ImportArtifactPaths(
|
||||
directoryPath,
|
||||
Path.Combine(directoryPath, "source.xml"),
|
||||
Path.Combine(directoryPath, "fragments.json"),
|
||||
Path.Combine(directoryPath, "parsed-cells.json"),
|
||||
Path.Combine(directoryPath, "validation-report.json"));
|
||||
}
|
||||
}
|
||||
|
||||
33
src/RolemasterDb.ImportTool/ImportArtifactWriter.cs
Normal file
33
src/RolemasterDb.ImportTool/ImportArtifactWriter.cs
Normal file
@@ -0,0 +1,33 @@
|
||||
using System.Text.Json;
|
||||
|
||||
using RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
namespace RolemasterDb.ImportTool;
|
||||
|
||||
/// <summary>
/// Persists the intermediate outputs of a parse run (fragments, parsed cells and
/// the validation report) as indented JSON files at the locations given by an
/// <see cref="ImportArtifactPaths"/> instance.
/// </summary>
public sealed class ImportArtifactWriter
{
    // Shared serializer settings: indented output for every artifact file.
    private static readonly JsonSerializerOptions JsonOptions = new() { WriteIndented = true };

    /// <summary>
    /// Ensures the artifact directory exists, then writes the fragments, the parsed
    /// cells and the validation report, one file after the other.
    /// </summary>
    /// <param name="artifactPaths">Target directory and file paths for the artifacts.</param>
    /// <param name="parseResult">Parse result whose fragments, cells and validation report are serialized.</param>
    /// <param name="cancellationToken">Token forwarded to each file write.</param>
    public async Task WriteAsync(ImportArtifactPaths artifactPaths, StandardCriticalTableParseResult parseResult, CancellationToken cancellationToken = default)
    {
        Directory.CreateDirectory(artifactPaths.DirectoryPath);

        await WriteJsonAsync(artifactPaths.FragmentsJsonPath, parseResult.Fragments, cancellationToken);
        await WriteJsonAsync(artifactPaths.ParsedCellsJsonPath, parseResult.Cells, cancellationToken);
        await WriteJsonAsync(artifactPaths.ValidationReportPath, parseResult.ValidationReport, cancellationToken);
    }

    // Serializes one value with the shared options and writes it to the given path.
    private static async Task WriteJsonAsync<TValue>(string path, TValue value, CancellationToken cancellationToken)
    {
        var json = JsonSerializer.Serialize(value, JsonOptions);
        await File.WriteAllTextAsync(path, json, cancellationToken);
    }
}
|
||||
@@ -0,0 +1,13 @@
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
/// <summary>
/// Immutable summary of a validation pass: a validity flag, the collected error
/// messages, and the row and cell counts supplied by the caller.
/// </summary>
public sealed class ImportValidationReport
{
    /// <summary>Creates a report from the supplied values; nothing is derived or checked here.</summary>
    /// <param name="isValid">Validity flag to expose.</param>
    /// <param name="errors">Error messages to expose.</param>
    /// <param name="rowCount">Row count to expose.</param>
    /// <param name="cellCount">Cell count to expose.</param>
    public ImportValidationReport(bool isValid, IReadOnlyList<string> errors, int rowCount, int cellCount)
    {
        IsValid = isValid;
        Errors = errors;
        RowCount = rowCount;
        CellCount = cellCount;
    }

    /// <summary>Validity flag as passed to the constructor.</summary>
    public bool IsValid { get; }

    /// <summary>Error messages as passed to the constructor.</summary>
    public IReadOnlyList<string> Errors { get; }

    /// <summary>Row count as passed to the constructor.</summary>
    public int RowCount { get; }

    /// <summary>Cell count as passed to the constructor.</summary>
    public int CellCount { get; }
}
|
||||
@@ -0,0 +1,17 @@
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
/// <summary>
/// Immutable snapshot of one parsed table cell, identified by its roll-band label
/// and column key, carrying the cell's lines and the text values derived from them.
/// </summary>
public sealed class ParsedCriticalCellArtifact
{
    /// <summary>Creates a cell artifact from the supplied values; nothing is derived or checked here.</summary>
    /// <param name="rollBandLabel">Label of the roll band (row) the cell belongs to.</param>
    /// <param name="columnKey">Key of the column the cell belongs to.</param>
    /// <param name="lines">Text lines of the cell.</param>
    /// <param name="rawCellText">Raw text of the whole cell.</param>
    /// <param name="descriptionText">Description text of the cell.</param>
    /// <param name="rawAffixText">Raw affix text, or <c>null</c> when there is none.</param>
    public ParsedCriticalCellArtifact(
        string rollBandLabel,
        string columnKey,
        IReadOnlyList<string> lines,
        string rawCellText,
        string descriptionText,
        string? rawAffixText)
    {
        RollBandLabel = rollBandLabel;
        ColumnKey = columnKey;
        Lines = lines;
        RawCellText = rawCellText;
        DescriptionText = descriptionText;
        RawAffixText = rawAffixText;
    }

    /// <summary>Label of the roll band (row) the cell belongs to.</summary>
    public string RollBandLabel { get; }

    /// <summary>Key of the column the cell belongs to.</summary>
    public string ColumnKey { get; }

    /// <summary>Text lines of the cell.</summary>
    public IReadOnlyList<string> Lines { get; }

    /// <summary>Raw text of the whole cell.</summary>
    public string RawCellText { get; }

    /// <summary>Description text of the cell.</summary>
    public string DescriptionText { get; }

    /// <summary>Raw affix text, or <c>null</c> when there is none.</summary>
    public string? RawAffixText { get; }
}
|
||||
@@ -0,0 +1,13 @@
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
/// <summary>
/// Immutable bundle returned by the standard-table parser: the parsed table together
/// with the source fragments, the per-cell artifacts and the validation report.
/// </summary>
public sealed class StandardCriticalTableParseResult
{
    /// <summary>Creates a parse result from the supplied parts; nothing is derived or checked here.</summary>
    /// <param name="table">The parsed table.</param>
    /// <param name="fragments">Text fragments the table was built from.</param>
    /// <param name="cells">Per-cell artifacts produced while parsing.</param>
    /// <param name="validationReport">Validation report for this parse.</param>
    public StandardCriticalTableParseResult(
        ParsedCriticalTable table,
        IReadOnlyList<XmlTextFragment> fragments,
        IReadOnlyList<ParsedCriticalCellArtifact> cells,
        ImportValidationReport validationReport)
    {
        Table = table;
        Fragments = fragments;
        Cells = cells;
        ValidationReport = validationReport;
    }

    /// <summary>The parsed table.</summary>
    public ParsedCriticalTable Table { get; }

    /// <summary>Text fragments the table was built from.</summary>
    public IReadOnlyList<XmlTextFragment> Fragments { get; }

    /// <summary>Per-cell artifacts produced while parsing.</summary>
    public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; }

    /// <summary>Validation report for this parse.</summary>
    public ImportValidationReport ValidationReport { get; }
}
|
||||
@@ -1,208 +1,206 @@
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Xml;
|
||||
using System.Xml.Linq;
|
||||
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
public sealed class StandardCriticalTableParser
|
||||
{
|
||||
private static readonly Regex ColumnRegex = new(@"\b([A-E])\b", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||
private static readonly Regex RollBandRegex = new(@"^\s*(?<label>\d{2,3}(?:-\d{2,3})?|\d{2,3}\+)\s*$", RegexOptions.Compiled);
|
||||
private static readonly Regex RollBandLineRegex = new(@"^\s*(?<label>\d{2,3}(?:-\d{2,3})?|\d{2,3}\+)(?<rest>\s+.*)?$", RegexOptions.Compiled);
|
||||
private const int HeaderToBodyMinimumGap = 20;
|
||||
private const int TopGroupingTolerance = 2;
|
||||
|
||||
public ParsedCriticalTable Parse(CriticalImportManifestEntry entry, string extractedText)
|
||||
public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
|
||||
{
|
||||
var lines = extractedText.Replace("\r\n", "\n", StringComparison.Ordinal)
|
||||
.Replace('\f', '\n')
|
||||
.Split('\n');
|
||||
var fragments = LoadFragments(xmlContent);
|
||||
var headerFragments = FindHeaderFragments(fragments);
|
||||
var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments);
|
||||
var validationErrors = new List<string>();
|
||||
|
||||
var headerIndex = Array.FindIndex(lines, IsColumnHeaderLine);
|
||||
if (headerIndex < 0)
|
||||
{
|
||||
throw new InvalidOperationException("The standard table header could not be found in the extracted text.");
|
||||
}
|
||||
|
||||
var columnStarts = GetColumnStarts(lines[headerIndex]);
|
||||
var boundaries = GetColumnBoundaries(columnStarts);
|
||||
var columns = columnStarts
|
||||
.Select((item, index) => new ParsedCriticalColumn(item.Label, item.Label, "severity", index + 1))
|
||||
var columnCenters = headerFragments
|
||||
.OrderBy(item => item.Left)
|
||||
.Select(item => new ColumnAnchor(item.Text.ToUpperInvariant(), item.CenterX))
|
||||
.ToList();
|
||||
|
||||
var firstRollBandIndex = FindNextRollBandIndex(lines, headerIndex + 1);
|
||||
if (firstRollBandIndex < 0)
|
||||
{
|
||||
throw new InvalidOperationException("No roll bands were found in the extracted text.");
|
||||
}
|
||||
|
||||
var keyLineIndex = Array.FindIndex(lines, firstRollBandIndex, item => item.TrimStart().StartsWith("Key:", StringComparison.OrdinalIgnoreCase));
|
||||
if (keyLineIndex < 0)
|
||||
{
|
||||
keyLineIndex = lines.Length;
|
||||
}
|
||||
|
||||
var leadingLines = lines[(headerIndex + 1)..firstRollBandIndex]
|
||||
.Where(item => !string.IsNullOrWhiteSpace(item))
|
||||
var rowAnchors = rowLabelFragments
|
||||
.OrderBy(item => item.Top)
|
||||
.Select((item, index) => new RowAnchor(item.Text, item.Top, index + 1))
|
||||
.ToList();
|
||||
|
||||
var rollBands = new List<ParsedCriticalRollBand>();
|
||||
var results = new List<ParsedCriticalResult>();
|
||||
var currentLabel = string.Empty;
|
||||
var currentRowLines = new List<string>();
|
||||
var rowIndex = 0;
|
||||
|
||||
void FlushCurrentRow()
|
||||
if (rowAnchors.Count == 0)
|
||||
{
|
||||
if (string.IsNullOrEmpty(currentLabel))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
rowIndex++;
|
||||
var rollBand = CreateRollBand(currentLabel, rowIndex);
|
||||
rollBands.Add(rollBand);
|
||||
|
||||
var cellLines = SplitRowLines(currentRowLines, boundaries, columns.Count);
|
||||
for (var columnIndex = 0; columnIndex < columns.Count; columnIndex++)
|
||||
{
|
||||
var rawCellLines = cellLines[columnIndex]
|
||||
.Where(item => !string.IsNullOrWhiteSpace(item))
|
||||
.ToList();
|
||||
|
||||
var rawAffixLines = rawCellLines
|
||||
.Where(IsAffixLikeLine)
|
||||
.ToList();
|
||||
|
||||
var descriptionLines = rawCellLines
|
||||
.Where(item => !IsAffixLikeLine(item))
|
||||
.ToList();
|
||||
|
||||
results.Add(new ParsedCriticalResult(
|
||||
columns[columnIndex].ColumnKey,
|
||||
rollBand.Label,
|
||||
string.Join(Environment.NewLine, rawCellLines),
|
||||
CollapseWhitespace(string.Join(' ', descriptionLines)),
|
||||
rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines)));
|
||||
}
|
||||
|
||||
currentLabel = string.Empty;
|
||||
currentRowLines = new List<string>();
|
||||
validationErrors.Add("No roll-band labels were found in the XML artifact.");
|
||||
}
|
||||
|
||||
for (var lineIndex = firstRollBandIndex; lineIndex < keyLineIndex; lineIndex++)
|
||||
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
|
||||
var keyTop = fragments
|
||||
.Where(item => string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase))
|
||||
.Select(item => (int?)item.Top)
|
||||
.Min() ?? int.MaxValue;
|
||||
|
||||
var bodyFragments = fragments
|
||||
.Where(item =>
|
||||
item.Top >= bodyStartTop &&
|
||||
item.Top < keyTop - 1 &&
|
||||
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
|
||||
!headerFragments.Contains(item))
|
||||
.ToList();
|
||||
|
||||
var parsedRollBands = rowAnchors
|
||||
.Select(anchor => CreateRollBand(anchor.Label, anchor.SortOrder))
|
||||
.ToList();
|
||||
|
||||
var parsedCells = new List<ParsedCriticalCellArtifact>();
|
||||
var parsedResults = new List<ParsedCriticalResult>();
|
||||
|
||||
for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
|
||||
{
|
||||
if (TryParseRollBandLine(lines[lineIndex], out var label, out var trailingText))
|
||||
var rowStart = rowIndex == 0
|
||||
? bodyStartTop
|
||||
: (int)Math.Floor((rowAnchors[rowIndex - 1].Top + rowAnchors[rowIndex].Top) / 2.0);
|
||||
|
||||
var rowEnd = rowIndex == rowAnchors.Count - 1
|
||||
? keyTop - 1
|
||||
: (int)Math.Floor((rowAnchors[rowIndex].Top + rowAnchors[rowIndex + 1].Top) / 2.0);
|
||||
|
||||
var rowFragments = bodyFragments
|
||||
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
|
||||
.ToList();
|
||||
|
||||
foreach (var columnAnchor in columnCenters)
|
||||
{
|
||||
var trailingTextBelongsToCurrentRow = IsAffixLikeLine(trailingText);
|
||||
var cellFragments = rowFragments
|
||||
.Where(item => ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key)
|
||||
.OrderBy(item => item.Top)
|
||||
.ThenBy(item => item.Left)
|
||||
.ToList();
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(trailingText) &&
|
||||
!string.IsNullOrEmpty(currentLabel) &&
|
||||
!trailingTextBelongsToCurrentRow)
|
||||
if (cellFragments.Count == 0)
|
||||
{
|
||||
currentRowLines.Add(trailingText);
|
||||
validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', column '{columnAnchor.Key}'.");
|
||||
continue;
|
||||
}
|
||||
|
||||
FlushCurrentRow();
|
||||
currentLabel = label;
|
||||
if (rowIndex == 0)
|
||||
{
|
||||
currentRowLines.AddRange(leadingLines);
|
||||
}
|
||||
var lines = BuildLines(cellFragments);
|
||||
var rawAffixLines = lines.Where(IsAffixLikeLine).ToList();
|
||||
var descriptionLines = lines.Where(line => !IsAffixLikeLine(line)).ToList();
|
||||
var rawCellText = string.Join(Environment.NewLine, lines);
|
||||
var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines));
|
||||
var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(trailingText) && trailingTextBelongsToCurrentRow)
|
||||
{
|
||||
currentRowLines.Add(trailingText);
|
||||
}
|
||||
parsedCells.Add(new ParsedCriticalCellArtifact(
|
||||
rowAnchors[rowIndex].Label,
|
||||
columnAnchor.Key,
|
||||
lines,
|
||||
rawCellText,
|
||||
descriptionText,
|
||||
rawAffixText));
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(lines[lineIndex]))
|
||||
{
|
||||
currentRowLines.Add(lines[lineIndex]);
|
||||
parsedResults.Add(new ParsedCriticalResult(
|
||||
columnAnchor.Key,
|
||||
rowAnchors[rowIndex].Label,
|
||||
rawCellText,
|
||||
descriptionText,
|
||||
rawAffixText));
|
||||
}
|
||||
}
|
||||
|
||||
FlushCurrentRow();
|
||||
if (columnCenters.Count != 5)
|
||||
{
|
||||
validationErrors.Add($"Expected 5 standard-table columns but found {columnCenters.Count}.");
|
||||
}
|
||||
|
||||
return new ParsedCriticalTable(
|
||||
if (parsedCells.Count != rowAnchors.Count * columnCenters.Count)
|
||||
{
|
||||
validationErrors.Add(
|
||||
$"Expected {rowAnchors.Count * columnCenters.Count} parsed cells but produced {parsedCells.Count}.");
|
||||
}
|
||||
|
||||
var validationReport = new ImportValidationReport(
|
||||
validationErrors.Count == 0,
|
||||
validationErrors,
|
||||
rowAnchors.Count,
|
||||
parsedCells.Count);
|
||||
|
||||
var table = new ParsedCriticalTable(
|
||||
entry.Slug,
|
||||
entry.DisplayName,
|
||||
entry.Family,
|
||||
Path.GetFileName(entry.PdfPath),
|
||||
"Imported from PDF text extraction.",
|
||||
columns,
|
||||
rollBands,
|
||||
results);
|
||||
"Imported from PDF XML extraction.",
|
||||
columnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(),
|
||||
parsedRollBands,
|
||||
parsedResults);
|
||||
|
||||
return new StandardCriticalTableParseResult(table, fragments, parsedCells, validationReport);
|
||||
}
|
||||
|
||||
private static bool IsColumnHeaderLine(string line)
|
||||
private static List<XmlTextFragment> LoadFragments(string xmlContent)
|
||||
{
|
||||
var matches = ColumnRegex.Matches(line);
|
||||
return matches.Count == 5;
|
||||
}
|
||||
using var stringReader = new StringReader(xmlContent);
|
||||
using var xmlReader = XmlReader.Create(
|
||||
stringReader,
|
||||
new XmlReaderSettings
|
||||
{
|
||||
DtdProcessing = DtdProcessing.Ignore
|
||||
});
|
||||
|
||||
private static List<(string Label, int Start)> GetColumnStarts(string headerLine)
|
||||
{
|
||||
var matches = ColumnRegex.Matches(headerLine);
|
||||
return matches
|
||||
.Select(match => (match.Groups[1].Value.ToUpperInvariant(), match.Index))
|
||||
var document = XDocument.Load(xmlReader);
|
||||
|
||||
return document.Descendants("page")
|
||||
.SelectMany(page =>
|
||||
{
|
||||
var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1");
|
||||
return page.Elements("text")
|
||||
.Select(item => new XmlTextFragment(
|
||||
pageNumber,
|
||||
int.Parse(item.Attribute("top")?.Value ?? throw new InvalidOperationException("Missing text top attribute.")),
|
||||
int.Parse(item.Attribute("left")?.Value ?? throw new InvalidOperationException("Missing text left attribute.")),
|
||||
int.Parse(item.Attribute("width")?.Value ?? throw new InvalidOperationException("Missing text width attribute.")),
|
||||
int.Parse(item.Attribute("height")?.Value ?? throw new InvalidOperationException("Missing text height attribute.")),
|
||||
NormalizeText(string.Concat(item.DescendantNodes().OfType<XText>().Select(node => node.Value)))))
|
||||
.Where(item => !string.IsNullOrWhiteSpace(item.Text));
|
||||
})
|
||||
.ToList();
|
||||
}
|
||||
|
||||
private static int[] GetColumnBoundaries(IReadOnlyList<(string Label, int Start)> columns)
|
||||
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
|
||||
{
|
||||
var boundaries = new int[columns.Count - 1];
|
||||
for (var index = 0; index < boundaries.Length; index++)
|
||||
{
|
||||
boundaries[index] = (columns[index].Start + columns[index + 1].Start) / 2;
|
||||
}
|
||||
var groupedByTop = fragments
|
||||
.Where(item => item.Text.Length == 1 && char.IsLetter(item.Text[0]))
|
||||
.GroupBy(item => item.Top)
|
||||
.OrderBy(group => group.Key);
|
||||
|
||||
return boundaries;
|
||||
}
|
||||
|
||||
private static int FindNextRollBandIndex(IReadOnlyList<string> lines, int startIndex)
|
||||
{
|
||||
for (var index = startIndex; index < lines.Count; index++)
|
||||
foreach (var group in groupedByTop)
|
||||
{
|
||||
if (TryParseRollBandLine(lines[index], out _, out _))
|
||||
var ordered = group.OrderBy(item => item.Left).ToList();
|
||||
var labels = ordered.Select(item => item.Text.ToUpperInvariant()).ToList();
|
||||
if (labels.SequenceEqual(["A", "B", "C", "D", "E"]))
|
||||
{
|
||||
return index;
|
||||
return ordered;
|
||||
}
|
||||
}
|
||||
|
||||
return -1;
|
||||
throw new InvalidOperationException("Could not find the standard-table A-E header row in the XML artifact.");
|
||||
}
|
||||
|
||||
private static bool TryParseRollBandLabel(string line, out string label)
|
||||
private static List<XmlTextFragment> FindRowLabelFragments(
|
||||
IReadOnlyList<XmlTextFragment> fragments,
|
||||
IReadOnlyList<XmlTextFragment> headerFragments)
|
||||
{
|
||||
var match = RollBandRegex.Match(line);
|
||||
if (!match.Success)
|
||||
{
|
||||
label = string.Empty;
|
||||
return false;
|
||||
}
|
||||
var leftCutoff = headerFragments.Min(item => item.Left) - 10;
|
||||
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
|
||||
|
||||
label = match.Groups[1].Value.Replace(" ", string.Empty, StringComparison.Ordinal);
|
||||
return true;
|
||||
return fragments
|
||||
.Where(item =>
|
||||
item.Left < leftCutoff &&
|
||||
item.Top >= bodyStartTop &&
|
||||
IsRollBandLabel(item.Text))
|
||||
.OrderBy(item => item.Top)
|
||||
.ToList();
|
||||
}
|
||||
|
||||
private static bool TryParseRollBandLine(string line, out string label, out string trailingText)
|
||||
{
|
||||
var match = RollBandLineRegex.Match(line);
|
||||
if (!match.Success)
|
||||
{
|
||||
label = string.Empty;
|
||||
trailingText = string.Empty;
|
||||
return false;
|
||||
}
|
||||
|
||||
label = match.Groups["label"].Value.Replace(" ", string.Empty, StringComparison.Ordinal);
|
||||
|
||||
var restGroup = match.Groups["rest"];
|
||||
trailingText = restGroup.Success
|
||||
? string.Concat(new string(' ', restGroup.Index), restGroup.Value.TrimEnd())
|
||||
: string.Empty;
|
||||
|
||||
return true;
|
||||
}
|
||||
/// <summary>
/// Returns <c>true</c> when the trimmed text is a roll-band label: a 2-3 digit number,
/// a range of two such numbers joined by a hyphen, or a 2-3 digit number followed by '+'.
/// </summary>
private static bool IsRollBandLabel(string value)
{
    var candidate = value.Trim();
    return Regex.IsMatch(candidate, @"^\d{2,3}(?:-\d{2,3})?$|^\d{2,3}\+$");
}
|
||||
|
||||
private static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder)
|
||||
{
|
||||
@@ -217,35 +215,39 @@ public sealed class StandardCriticalTableParser
|
||||
: new ParsedCriticalRollBand(label, int.Parse(parts[0]), int.Parse(parts[1]), sortOrder);
|
||||
}
|
||||
|
||||
private static List<string>[] SplitRowLines(IReadOnlyList<string> rowLines, int[] boundaries, int columnCount)
|
||||
private static string ResolveColumn(double centerX, IReadOnlyList<ColumnAnchor> columns)
|
||||
{
|
||||
var result = Enumerable.Range(0, columnCount)
|
||||
.Select(_ => new List<string>())
|
||||
.ToArray();
|
||||
|
||||
foreach (var line in rowLines)
|
||||
for (var index = 0; index < columns.Count - 1; index++)
|
||||
{
|
||||
for (var columnIndex = 0; columnIndex < columnCount; columnIndex++)
|
||||
var boundary = (columns[index].CenterX + columns[index + 1].CenterX) / 2.0;
|
||||
if (centerX < boundary)
|
||||
{
|
||||
var start = columnIndex == 0 ? 0 : boundaries[columnIndex - 1];
|
||||
var end = columnIndex == columnCount - 1
|
||||
? line.Length
|
||||
: Math.Min(boundaries[columnIndex], line.Length);
|
||||
|
||||
if (start >= line.Length || end <= start)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
var segment = line[start..end].Trim();
|
||||
if (!string.IsNullOrWhiteSpace(segment))
|
||||
{
|
||||
result[columnIndex].Add(segment);
|
||||
}
|
||||
return columns[index].Key;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
return columns[^1].Key;
|
||||
}
|
||||
|
||||
/// <summary>
/// Groups fragments into text lines by vertical position and returns each line's text.
/// Fragments are visited top-to-bottom, then left-to-right. A fragment joins the current
/// line when its Top is within <c>TopGroupingTolerance</c> of that line's first fragment;
/// otherwise it starts a new line. Each line's fragments are joined left-to-right with
/// single spaces, whitespace is collapsed, and blank lines are dropped.
/// </summary>
/// <param name="fragments">Fragments belonging to one cell.</param>
/// <returns>The non-blank line texts in top-to-bottom order.</returns>
private static IReadOnlyList<string> BuildLines(IReadOnlyList<XmlTextFragment> fragments)
{
    var groupedLines = new List<List<XmlTextFragment>>();
    List<XmlTextFragment>? currentLine = null;

    foreach (var fragment in fragments.OrderBy(item => item.Top).ThenBy(item => item.Left))
    {
        // The first fragment of a line is the reference point for the tolerance check.
        if (currentLine is null || Math.Abs(currentLine[0].Top - fragment.Top) > TopGroupingTolerance)
        {
            currentLine = new List<XmlTextFragment>();
            groupedLines.Add(currentLine);
        }

        currentLine.Add(fragment);
    }

    var lineTexts = new List<string>(groupedLines.Count);
    foreach (var lineFragments in groupedLines)
    {
        var pieces = lineFragments.OrderBy(item => item.Left).Select(item => item.Text);
        var text = CollapseWhitespace(string.Join(' ', pieces));
        if (!string.IsNullOrWhiteSpace(text))
        {
            lineTexts.Add(text);
        }
    }

    return lineTexts;
}
|
||||
|
||||
private static bool IsAffixLikeLine(string line)
|
||||
@@ -256,7 +258,7 @@ public sealed class StandardCriticalTableParser
|
||||
return false;
|
||||
}
|
||||
|
||||
if (value == "—")
|
||||
if (value == "-" || value == "\u2014")
|
||||
{
|
||||
return true;
|
||||
}
|
||||
@@ -270,16 +272,27 @@ public sealed class StandardCriticalTableParser
|
||||
}
|
||||
|
||||
return value.StartsWith("+", StringComparison.Ordinal) ||
|
||||
value.StartsWith('∑') ||
|
||||
value.StartsWith('∏') ||
|
||||
value.StartsWith('π') ||
|
||||
value.StartsWith('∫') ||
|
||||
value.StartsWith("\u2211", StringComparison.Ordinal) ||
|
||||
value.StartsWith("\u220F", StringComparison.Ordinal) ||
|
||||
value.StartsWith("\u03C0", StringComparison.Ordinal) ||
|
||||
value.StartsWith("\u222B", StringComparison.Ordinal) ||
|
||||
char.IsDigit(value[0]) ||
|
||||
value.Contains(" – ", StringComparison.Ordinal) ||
|
||||
value.Contains(" - ", StringComparison.Ordinal) ||
|
||||
value.Contains("(-", StringComparison.Ordinal) ||
|
||||
value.Contains("(+", StringComparison.Ordinal);
|
||||
}
|
||||
|
||||
/// <summary>
/// Trims the value and replaces every run of whitespace inside it with a single space.
/// </summary>
private static string CollapseWhitespace(string value)
{
    var trimmed = value.Trim();
    return Regex.Replace(trimmed, @"\s+", " ");
}
|
||||
|
||||
/// <summary>
/// Replaces non-breaking spaces, carriage returns and line feeds with ordinary spaces,
/// then trims the result. Interior runs of spaces are left as they are.
/// </summary>
private static string NormalizeText(string value)
{
    var flattened = value.Replace('\u00a0', ' ');
    flattened = flattened.Replace('\r', ' ');
    flattened = flattened.Replace('\n', ' ');
    return flattened.Trim();
}
|
||||
|
||||
/// <summary>
/// Pairs a column key with the horizontal center coordinate of that column.
/// </summary>
private sealed record ColumnAnchor(string Key, double CenterX);
|
||||
|
||||
/// <summary>
/// Describes a table row by its roll-band label, the Top coordinate of that label,
/// and the row's sort order.
/// </summary>
private sealed record RowAnchor(string Label, int Top, int SortOrder);
|
||||
}
|
||||
|
||||
18
src/RolemasterDb.ImportTool/Parsing/XmlTextFragment.cs
Normal file
18
src/RolemasterDb.ImportTool/Parsing/XmlTextFragment.cs
Normal file
@@ -0,0 +1,18 @@
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
/// <summary>
/// Immutable positioned piece of text: the page number it was found on, its
/// top/left/width/height box values, and the text itself.
/// </summary>
public sealed class XmlTextFragment
{
    /// <summary>Creates a fragment from the supplied values; nothing is checked here.</summary>
    /// <param name="pageNumber">Number of the page the fragment is on.</param>
    /// <param name="top">Top coordinate of the fragment.</param>
    /// <param name="left">Left coordinate of the fragment.</param>
    /// <param name="width">Width of the fragment.</param>
    /// <param name="height">Height of the fragment.</param>
    /// <param name="text">Text content of the fragment.</param>
    public XmlTextFragment(int pageNumber, int top, int left, int width, int height, string text)
    {
        PageNumber = pageNumber;
        Top = top;
        Left = left;
        Width = width;
        Height = height;
        Text = text;
    }

    /// <summary>Number of the page the fragment is on.</summary>
    public int PageNumber { get; }

    /// <summary>Top coordinate of the fragment.</summary>
    public int Top { get; }

    /// <summary>Left coordinate of the fragment.</summary>
    public int Left { get; }

    /// <summary>Width of the fragment.</summary>
    public int Width { get; }

    /// <summary>Height of the fragment.</summary>
    public int Height { get; }

    /// <summary>Text content of the fragment.</summary>
    public string Text { get; }

    /// <summary>Horizontal midpoint: Left plus half of Width, computed in floating point.</summary>
    public double CenterX
    {
        get { return Left + (Width / 2.0); }
    }
}
|
||||
@@ -2,7 +2,7 @@ using System.Diagnostics;
|
||||
|
||||
namespace RolemasterDb.ImportTool;
|
||||
|
||||
public sealed class PdfTextExtractor
|
||||
public sealed class PdfXmlExtractor
|
||||
{
|
||||
public async Task ExtractAsync(string pdfPath, string outputPath, CancellationToken cancellationToken = default)
|
||||
{
|
||||
@@ -10,14 +10,16 @@ public sealed class PdfTextExtractor
|
||||
|
||||
var startInfo = new ProcessStartInfo
|
||||
{
|
||||
FileName = "pdftotext",
|
||||
FileName = "pdftohtml",
|
||||
RedirectStandardError = true,
|
||||
RedirectStandardOutput = true,
|
||||
UseShellExecute = false,
|
||||
CreateNoWindow = true
|
||||
};
|
||||
|
||||
startInfo.ArgumentList.Add("-layout");
|
||||
startInfo.ArgumentList.Add("-xml");
|
||||
startInfo.ArgumentList.Add("-i");
|
||||
startInfo.ArgumentList.Add("-noframes");
|
||||
startInfo.ArgumentList.Add(pdfPath);
|
||||
startInfo.ArgumentList.Add(outputPath);
|
||||
|
||||
@@ -28,7 +30,7 @@ public sealed class PdfTextExtractor
|
||||
if (process.ExitCode != 0)
|
||||
{
|
||||
var error = await process.StandardError.ReadToEndAsync(cancellationToken);
|
||||
throw new InvalidOperationException($"pdftotext failed for '{pdfPath}': {error}");
|
||||
throw new InvalidOperationException($"pdftohtml failed for '{pdfPath}': {error}");
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user