Add phase 1 critical import tool

This commit is contained in:
2026-03-14 01:10:44 +01:00
parent 44af81cc38
commit f70d610c92
25 changed files with 851 additions and 166 deletions

View File

@@ -0,0 +1,9 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable description of a single column of a parsed critical table.
/// </summary>
public sealed class ParsedCriticalColumn
{
    /// <summary>
    /// Creates a column description; every argument is stored unchanged.
    /// </summary>
    /// <param name="columnKey">Key that identifies the column.</param>
    /// <param name="label">Label of the column.</param>
    /// <param name="role">Role assigned to the column.</param>
    /// <param name="sortOrder">Position of the column when columns are ordered.</param>
    public ParsedCriticalColumn(string columnKey, string label, string role, int sortOrder)
    {
        ColumnKey = columnKey;
        Label = label;
        Role = role;
        SortOrder = sortOrder;
    }

    /// <summary>Key that identifies the column.</summary>
    public string ColumnKey { get; }

    /// <summary>Label of the column.</summary>
    public string Label { get; }

    /// <summary>Role assigned to the column.</summary>
    public string Role { get; }

    /// <summary>Position of the column when columns are ordered.</summary>
    public int SortOrder { get; }
}

View File

@@ -0,0 +1,15 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable content of one parsed table cell, addressed by column key and roll band label.
/// </summary>
public sealed class ParsedCriticalResult
{
    /// <summary>
    /// Creates a cell result; every argument is stored unchanged.
    /// </summary>
    /// <param name="columnKey">Key of the column the cell belongs to.</param>
    /// <param name="rollBandLabel">Label of the roll band (row) the cell belongs to.</param>
    /// <param name="rawCellText">Raw text of the cell.</param>
    /// <param name="descriptionText">Description text of the cell.</param>
    /// <param name="rawAffixText">Raw affix text of the cell; may be <c>null</c>.</param>
    public ParsedCriticalResult(
        string columnKey,
        string rollBandLabel,
        string rawCellText,
        string descriptionText,
        string? rawAffixText)
    {
        ColumnKey = columnKey;
        RollBandLabel = rollBandLabel;
        RawCellText = rawCellText;
        DescriptionText = descriptionText;
        RawAffixText = rawAffixText;
    }

    /// <summary>Key of the column the cell belongs to.</summary>
    public string ColumnKey { get; }

    /// <summary>Label of the roll band (row) the cell belongs to.</summary>
    public string RollBandLabel { get; }

    /// <summary>Raw text of the cell.</summary>
    public string RawCellText { get; }

    /// <summary>Description text of the cell.</summary>
    public string DescriptionText { get; }

    /// <summary>Raw affix text of the cell, or <c>null</c> when none was supplied.</summary>
    public string? RawAffixText { get; }
}

View File

@@ -0,0 +1,9 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable description of one roll band (row) of a parsed critical table.
/// </summary>
public sealed class ParsedCriticalRollBand
{
    /// <summary>
    /// Creates a roll band; every argument is stored unchanged.
    /// </summary>
    /// <param name="label">Label of the roll band.</param>
    /// <param name="minRoll">Lowest roll of the band.</param>
    /// <param name="maxRoll">Highest roll of the band; may be <c>null</c>.</param>
    /// <param name="sortOrder">Position of the band when bands are ordered.</param>
    public ParsedCriticalRollBand(string label, int minRoll, int? maxRoll, int sortOrder)
    {
        Label = label;
        MinRoll = minRoll;
        MaxRoll = maxRoll;
        SortOrder = sortOrder;
    }

    /// <summary>Label of the roll band.</summary>
    public string Label { get; }

    /// <summary>Lowest roll of the band.</summary>
    public int MinRoll { get; }

    /// <summary>Highest roll of the band, or <c>null</c> when none was supplied.</summary>
    public int? MaxRoll { get; }

    /// <summary>Position of the band when bands are ordered.</summary>
    public int SortOrder { get; }
}

View File

@@ -0,0 +1,21 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable aggregate of a parsed critical table: its identifying metadata plus the
/// columns, roll bands and cell results that make it up.
/// </summary>
public sealed class ParsedCriticalTable
{
    /// <summary>
    /// Creates a parsed table; every argument is stored unchanged (the lists are not copied).
    /// </summary>
    /// <param name="slug">Slug of the table.</param>
    /// <param name="displayName">Display name of the table.</param>
    /// <param name="family">Family the table belongs to.</param>
    /// <param name="sourceDocument">Name of the document the table came from.</param>
    /// <param name="notes">Optional notes; may be <c>null</c>.</param>
    /// <param name="columns">Columns of the table.</param>
    /// <param name="rollBands">Roll bands (rows) of the table.</param>
    /// <param name="results">Cell results of the table.</param>
    public ParsedCriticalTable(
        string slug,
        string displayName,
        string family,
        string sourceDocument,
        string? notes,
        IReadOnlyList<ParsedCriticalColumn> columns,
        IReadOnlyList<ParsedCriticalRollBand> rollBands,
        IReadOnlyList<ParsedCriticalResult> results)
    {
        Slug = slug;
        DisplayName = displayName;
        Family = family;
        SourceDocument = sourceDocument;
        Notes = notes;
        Columns = columns;
        RollBands = rollBands;
        Results = results;
    }

    /// <summary>Slug of the table.</summary>
    public string Slug { get; }

    /// <summary>Display name of the table.</summary>
    public string DisplayName { get; }

    /// <summary>Family the table belongs to.</summary>
    public string Family { get; }

    /// <summary>Name of the document the table came from.</summary>
    public string SourceDocument { get; }

    /// <summary>Optional notes, or <c>null</c> when none were supplied.</summary>
    public string? Notes { get; }

    /// <summary>Columns of the table.</summary>
    public IReadOnlyList<ParsedCriticalColumn> Columns { get; }

    /// <summary>Roll bands (rows) of the table.</summary>
    public IReadOnlyList<ParsedCriticalRollBand> RollBands { get; }

    /// <summary>Cell results of the table.</summary>
    public IReadOnlyList<ParsedCriticalResult> Results { get; }
}

View File

@@ -0,0 +1,285 @@
using System.Text.RegularExpressions;
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Parses the layout-preserving text extracted from a standard critical table into a
/// <see cref="ParsedCriticalTable"/>.
/// </summary>
/// <remarks>
/// The parser locates the header line holding the column letters A–E, derives fixed
/// character boundaries between the columns from the positions of those letters, and then
/// slices every row line at those boundaries. Rows are delimited by lines that start with a
/// roll band label such as <c>01-05</c>, <c>66</c> or <c>100+</c>; parsing stops at the
/// first line starting with <c>Key:</c> (or at the end of the text).
/// </remarks>
public sealed class StandardCriticalTableParser
{
    /// <summary>Role assigned to every column produced by this parser.</summary>
    private const string SeverityRole = "severity";

    /// <summary>Column letters, in order, that a header line must contain.</summary>
    private const string ExpectedColumnLabels = "ABCDE";

    /// <summary>Notes text stored on every table produced by this parser.</summary>
    private const string ImportNotes = "Imported from PDF text extraction.";

    /// <summary>Matches a standalone column letter A–E (case-insensitive).</summary>
    private static readonly Regex ColumnRegex = new(@"\b([A-E])\b", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Matches a line that starts with a roll band label, optionally followed by text.
    /// Uses <c>[0-9]</c> rather than <c>\d</c>: in .NET <c>\d</c> also matches non-ASCII
    /// decimal digits, which <see cref="int.Parse(string)"/> would later reject.
    /// </summary>
    private static readonly Regex RollBandLineRegex = new(@"^\s*(?<label>[0-9]{2,3}(?:-[0-9]{2,3})?|[0-9]{2,3}\+)(?<rest>\s+.*)?$", RegexOptions.Compiled);

    /// <summary>Matches any run of whitespace.</summary>
    private static readonly Regex WhitespaceRegex = new(@"\s+", RegexOptions.Compiled);

    /// <summary>
    /// Parses <paramref name="extractedText"/> into columns, roll bands and one result per
    /// (roll band, column) pair.
    /// </summary>
    /// <param name="entry">Manifest entry supplying slug, display name, family and PDF path.</param>
    /// <param name="extractedText">Text of the table with column alignment preserved by spaces.</param>
    /// <returns>The parsed table.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="entry"/> or <paramref name="extractedText"/> is <c>null</c>.
    /// </exception>
    /// <exception cref="InvalidOperationException">
    /// No header line or no roll band line could be found.
    /// </exception>
    public ParsedCriticalTable Parse(CriticalImportManifestEntry entry, string extractedText)
    {
        ArgumentNullException.ThrowIfNull(entry);
        ArgumentNullException.ThrowIfNull(extractedText);

        // Form feeds (page breaks) are treated as ordinary line breaks.
        var lines = extractedText.Replace("\r\n", "\n", StringComparison.Ordinal)
            .Replace('\f', '\n')
            .Split('\n');

        var headerIndex = Array.FindIndex(lines, IsColumnHeaderLine);
        if (headerIndex < 0)
        {
            throw new InvalidOperationException("The standard table header could not be found in the extracted text.");
        }

        var columnStarts = GetColumnStarts(lines[headerIndex]);
        var boundaries = GetColumnBoundaries(columnStarts);
        var columns = columnStarts
            .Select((item, index) => new ParsedCriticalColumn(item.Label, item.Label, SeverityRole, index + 1))
            .ToList();

        var firstRollBandIndex = FindNextRollBandIndex(lines, headerIndex + 1);
        if (firstRollBandIndex < 0)
        {
            throw new InvalidOperationException("No roll bands were found in the extracted text.");
        }

        // Everything from the "Key:" line onwards is outside the table body.
        var keyLineIndex = Array.FindIndex(
            lines,
            firstRollBandIndex,
            item => item.TrimStart().StartsWith("Key:", StringComparison.OrdinalIgnoreCase));
        if (keyLineIndex < 0)
        {
            keyLineIndex = lines.Length;
        }

        // Non-blank lines between the header and the first label belong to the first row.
        var leadingLines = lines[(headerIndex + 1)..firstRollBandIndex]
            .Where(item => !string.IsNullOrWhiteSpace(item))
            .ToList();

        var rollBands = new List<ParsedCriticalRollBand>();
        var results = new List<ParsedCriticalResult>();
        var currentLabel = string.Empty;
        var currentRowLines = new List<string>();
        var rowIndex = 0;

        // Converts the lines collected for the open row into one roll band plus one result
        // per column, then resets the row state. Does nothing when no row is open.
        void FlushCurrentRow()
        {
            if (string.IsNullOrEmpty(currentLabel))
            {
                return;
            }

            rowIndex++;
            var rollBand = CreateRollBand(currentLabel, rowIndex);
            rollBands.Add(rollBand);

            var cellLines = SplitRowLines(currentRowLines, boundaries, columns.Count);
            for (var columnIndex = 0; columnIndex < columns.Count; columnIndex++)
            {
                var rawCellLines = cellLines[columnIndex]
                    .Where(item => !string.IsNullOrWhiteSpace(item))
                    .ToList();
                var rawAffixLines = rawCellLines
                    .Where(IsAffixLikeLine)
                    .ToList();
                var descriptionLines = rawCellLines
                    .Where(item => !IsAffixLikeLine(item))
                    .ToList();

                results.Add(new ParsedCriticalResult(
                    columns[columnIndex].ColumnKey,
                    rollBand.Label,
                    string.Join(Environment.NewLine, rawCellLines),
                    CollapseWhitespace(string.Join(' ', descriptionLines)),
                    rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines)));
            }

            currentLabel = string.Empty;
            currentRowLines = new List<string>();
        }

        for (var lineIndex = firstRollBandIndex; lineIndex < keyLineIndex; lineIndex++)
        {
            var line = lines[lineIndex];
            if (!TryParseRollBandLine(line, out var label, out var trailingText))
            {
                if (!string.IsNullOrWhiteSpace(line))
                {
                    currentRowLines.Add(line);
                }

                continue;
            }

            // Text sharing a line with a label goes to the row that label opens when it looks
            // like an affix; otherwise it is treated as the tail of the row being closed.
            // When no row is open yet (first label) there is nothing to close, so the text is
            // kept on the new row instead of being discarded.
            var hasTrailingText = !string.IsNullOrWhiteSpace(trailingText);
            var hasOpenRow = !string.IsNullOrEmpty(currentLabel);
            var trailingTextStartsNewRow = hasTrailingText && (IsAffixLikeLine(trailingText) || !hasOpenRow);

            if (hasTrailingText && !trailingTextStartsNewRow)
            {
                currentRowLines.Add(trailingText);
            }

            FlushCurrentRow();
            currentLabel = label;

            if (rowIndex == 0)
            {
                currentRowLines.AddRange(leadingLines);
            }

            if (trailingTextStartsNewRow)
            {
                currentRowLines.Add(trailingText);
            }
        }

        FlushCurrentRow();

        return new ParsedCriticalTable(
            entry.Slug,
            entry.DisplayName,
            entry.Family,
            Path.GetFileName(entry.PdfPath),
            ImportNotes,
            columns,
            rollBands,
            results);
    }

    /// <summary>
    /// Returns <c>true</c> when the line contains exactly the standalone letters A, B, C, D
    /// and E, in that order (case-insensitive). Requiring the full ordered sequence keeps
    /// prose lines that merely contain five standalone letters from the a–e range (for
    /// example the article "a") from being mistaken for the header.
    /// </summary>
    private static bool IsColumnHeaderLine(string line)
    {
        var matches = ColumnRegex.Matches(line);
        if (matches.Count != ExpectedColumnLabels.Length)
        {
            return false;
        }

        for (var index = 0; index < matches.Count; index++)
        {
            if (char.ToUpperInvariant(matches[index].Value[0]) != ExpectedColumnLabels[index])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Returns the upper-cased column letters of the header line together with the character
    /// index at which each letter starts.
    /// </summary>
    private static List<(string Label, int Start)> GetColumnStarts(string headerLine)
    {
        var matches = ColumnRegex.Matches(headerLine);
        return matches
            .Select(match => (match.Groups[1].Value.ToUpperInvariant(), match.Index))
            .ToList();
    }

    /// <summary>
    /// Returns the character index separating each pair of adjacent columns: the midpoint
    /// between the two header letters. The result has one entry fewer than
    /// <paramref name="columns"/>.
    /// </summary>
    private static int[] GetColumnBoundaries(IReadOnlyList<(string Label, int Start)> columns)
    {
        var boundaries = new int[columns.Count - 1];
        for (var index = 0; index < boundaries.Length; index++)
        {
            boundaries[index] = (columns[index].Start + columns[index + 1].Start) / 2;
        }

        return boundaries;
    }

    /// <summary>
    /// Returns the index of the first line at or after <paramref name="startIndex"/> that
    /// starts with a roll band label, or -1 when there is none.
    /// </summary>
    private static int FindNextRollBandIndex(IReadOnlyList<string> lines, int startIndex)
    {
        for (var index = startIndex; index < lines.Count; index++)
        {
            if (TryParseRollBandLine(lines[index], out _, out _))
            {
                return index;
            }
        }

        return -1;
    }

    /// <summary>
    /// Tries to read a roll band label from the start of <paramref name="line"/>.
    /// </summary>
    /// <param name="line">Line to inspect.</param>
    /// <param name="label">The label, or an empty string when the line has none.</param>
    /// <param name="trailingText">
    /// Text following the label, left-padded with spaces so that its characters keep the
    /// column positions they had in <paramref name="line"/>; empty when nothing follows.
    /// </param>
    /// <returns><c>true</c> when the line starts with a roll band label.</returns>
    private static bool TryParseRollBandLine(string line, out string label, out string trailingText)
    {
        var match = RollBandLineRegex.Match(line);
        if (!match.Success)
        {
            label = string.Empty;
            trailingText = string.Empty;
            return false;
        }

        label = match.Groups["label"].Value;

        var restGroup = match.Groups["rest"];
        trailingText = restGroup.Success
            ? string.Concat(new string(' ', restGroup.Index), restGroup.Value.TrimEnd())
            : string.Empty;
        return true;
    }

    /// <summary>
    /// Builds a roll band from a label of the form <c>NN</c>, <c>NN-MM</c> or <c>NN+</c>.
    /// A single number is used as both bounds; a trailing <c>+</c> yields no upper bound.
    /// </summary>
    private static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder)
    {
        if (label.EndsWith('+'))
        {
            return new ParsedCriticalRollBand(label, ParseRoll(label[..^1]), null, sortOrder);
        }

        var parts = label.Split('-', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
        var minRoll = ParseRoll(parts[0]);
        var maxRoll = parts.Length == 1 ? minRoll : ParseRoll(parts[1]);
        return new ParsedCriticalRollBand(label, minRoll, maxRoll, sortOrder);
    }

    /// <summary>Parses a run of ASCII digits independently of the current culture.</summary>
    private static int ParseRoll(string digits) =>
        int.Parse(
            digits,
            System.Globalization.NumberStyles.None,
            System.Globalization.CultureInfo.InvariantCulture);

    /// <summary>
    /// Slices every row line at the column boundaries and returns, per column, the trimmed
    /// non-empty segments in line order.
    /// </summary>
    private static List<string>[] SplitRowLines(IReadOnlyList<string> rowLines, int[] boundaries, int columnCount)
    {
        var result = Enumerable.Range(0, columnCount)
            .Select(_ => new List<string>())
            .ToArray();

        foreach (var line in rowLines)
        {
            for (var columnIndex = 0; columnIndex < columnCount; columnIndex++)
            {
                // The first column starts at the line start; the last runs to the line end.
                var start = columnIndex == 0 ? 0 : boundaries[columnIndex - 1];
                var end = columnIndex == columnCount - 1
                    ? line.Length
                    : Math.Min(boundaries[columnIndex], line.Length);

                // The line is too short to reach this column.
                if (start >= line.Length || end <= start)
                {
                    continue;
                }

                var segment = line[start..end].Trim();
                if (!string.IsNullOrWhiteSpace(segment))
                {
                    result[columnIndex].Add(segment);
                }
            }
        }

        return result;
    }

    /// <summary>
    /// Heuristically decides whether a line is an affix line rather than description text.
    /// </summary>
    private static bool IsAffixLikeLine(string line)
    {
        var value = line.Trim();
        if (value.Length == 0)
        {
            return false;
        }

        if (value == "—")
        {
            return true;
        }

        // Conditional prefixes only count as affixes when the line also contains a colon.
        if (value.StartsWith("with ", StringComparison.OrdinalIgnoreCase) ||
            value.StartsWith("w/o ", StringComparison.OrdinalIgnoreCase) ||
            value.StartsWith("without ", StringComparison.OrdinalIgnoreCase) ||
            value.StartsWith("if ", StringComparison.OrdinalIgnoreCase))
        {
            return value.Contains(':', StringComparison.Ordinal);
        }

        // NOTE(review): as written, the " " check below classifies every trimmed line that
        // contains a space as an affix, which covers nearly all multi-word description
        // lines. The literal may have been intended to be something more specific (for
        // example a run of several spaces) — confirm against the intended affix format.
        return value.StartsWith("+", StringComparison.Ordinal) ||
            value.StartsWith('∑') ||
            value.StartsWith('∏') ||
            value.StartsWith('π') ||
            value.StartsWith('∫') ||
            char.IsDigit(value[0]) ||
            value.Contains(" ", StringComparison.Ordinal) ||
            value.Contains("(-", StringComparison.Ordinal) ||
            value.Contains("(+", StringComparison.Ordinal);
    }

    /// <summary>Trims the value and replaces every run of whitespace with a single space.</summary>
    private static string CollapseWhitespace(string value) =>
        WhitespaceRegex.Replace(value.Trim(), " ");
}