Add phase 1 critical import tool
This commit is contained in:
@@ -0,0 +1,9 @@
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
/// <summary>
/// Immutable description of one column of a parsed critical table.
/// </summary>
public sealed class ParsedCriticalColumn
{
    /// <summary>
    /// Creates a column description.
    /// </summary>
    /// <param name="columnKey">Key that result cells use to refer to this column.</param>
    /// <param name="label">Label of the column as shown in the table header.</param>
    /// <param name="role">Role of the column (the standard parser passes "severity").</param>
    /// <param name="sortOrder">Position of the column within the table.</param>
    public ParsedCriticalColumn(string columnKey, string label, string role, int sortOrder)
    {
        ColumnKey = columnKey;
        Label = label;
        Role = role;
        SortOrder = sortOrder;
    }

    /// <summary>Key that result cells use to refer to this column.</summary>
    public string ColumnKey { get; }

    /// <summary>Label of the column as shown in the table header.</summary>
    public string Label { get; }

    /// <summary>Role of the column.</summary>
    public string Role { get; }

    /// <summary>Position of the column within the table.</summary>
    public int SortOrder { get; }
}
|
||||
15
src/RolemasterDb.ImportTool/Parsing/ParsedCriticalResult.cs
Normal file
15
src/RolemasterDb.ImportTool/Parsing/ParsedCriticalResult.cs
Normal file
@@ -0,0 +1,15 @@
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
/// <summary>
/// Immutable content of a single table cell: the result found where one roll band meets one column.
/// </summary>
public sealed class ParsedCriticalResult
{
    /// <summary>
    /// Creates a parsed cell.
    /// </summary>
    /// <param name="columnKey">Key of the column the cell belongs to.</param>
    /// <param name="rollBandLabel">Label of the roll band (row) the cell belongs to.</param>
    /// <param name="rawCellText">All text lines of the cell, unprocessed.</param>
    /// <param name="descriptionText">Descriptive part of the cell as a single line of text.</param>
    /// <param name="rawAffixText">Affix lines of the cell, or <c>null</c> when the cell has none.</param>
    public ParsedCriticalResult(
        string columnKey,
        string rollBandLabel,
        string rawCellText,
        string descriptionText,
        string? rawAffixText)
    {
        ColumnKey = columnKey;
        RollBandLabel = rollBandLabel;
        RawCellText = rawCellText;
        DescriptionText = descriptionText;
        RawAffixText = rawAffixText;
    }

    /// <summary>Key of the column the cell belongs to.</summary>
    public string ColumnKey { get; }

    /// <summary>Label of the roll band (row) the cell belongs to.</summary>
    public string RollBandLabel { get; }

    /// <summary>All text lines of the cell, unprocessed.</summary>
    public string RawCellText { get; }

    /// <summary>Descriptive part of the cell as a single line of text.</summary>
    public string DescriptionText { get; }

    /// <summary>Affix lines of the cell, or <c>null</c> when the cell has none.</summary>
    public string? RawAffixText { get; }
}
|
||||
@@ -0,0 +1,9 @@
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
/// <summary>
/// Immutable description of one roll band (table row) of a parsed critical table.
/// </summary>
public sealed class ParsedCriticalRollBand
{
    /// <summary>
    /// Creates a roll band.
    /// </summary>
    /// <param name="label">Label of the band as it appears in the table.</param>
    /// <param name="minRoll">Lowest roll covered by the band.</param>
    /// <param name="maxRoll">Highest roll covered by the band, or <c>null</c> when the band has no upper limit.</param>
    /// <param name="sortOrder">Position of the band within the table.</param>
    public ParsedCriticalRollBand(string label, int minRoll, int? maxRoll, int sortOrder)
    {
        Label = label;
        MinRoll = minRoll;
        MaxRoll = maxRoll;
        SortOrder = sortOrder;
    }

    /// <summary>Label of the band as it appears in the table.</summary>
    public string Label { get; }

    /// <summary>Lowest roll covered by the band.</summary>
    public int MinRoll { get; }

    /// <summary>Highest roll covered by the band, or <c>null</c> when the band has no upper limit.</summary>
    public int? MaxRoll { get; }

    /// <summary>Position of the band within the table.</summary>
    public int SortOrder { get; }
}
|
||||
21
src/RolemasterDb.ImportTool/Parsing/ParsedCriticalTable.cs
Normal file
21
src/RolemasterDb.ImportTool/Parsing/ParsedCriticalTable.cs
Normal file
@@ -0,0 +1,21 @@
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
/// <summary>
/// Immutable result of parsing one critical table: its identifying metadata plus the parsed
/// columns, roll bands and result cells.
/// </summary>
public sealed class ParsedCriticalTable
{
    /// <summary>
    /// Creates a parsed table.
    /// </summary>
    /// <param name="slug">Slug identifying the table.</param>
    /// <param name="displayName">Human-readable table name.</param>
    /// <param name="family">Family the table belongs to.</param>
    /// <param name="sourceDocument">Name of the document the table was imported from.</param>
    /// <param name="notes">Optional free-text notes about the table.</param>
    /// <param name="columns">Columns of the table.</param>
    /// <param name="rollBands">Roll bands (rows) of the table.</param>
    /// <param name="results">Result cells of the table.</param>
    public ParsedCriticalTable(
        string slug,
        string displayName,
        string family,
        string sourceDocument,
        string? notes,
        IReadOnlyList<ParsedCriticalColumn> columns,
        IReadOnlyList<ParsedCriticalRollBand> rollBands,
        IReadOnlyList<ParsedCriticalResult> results)
    {
        Slug = slug;
        DisplayName = displayName;
        Family = family;
        SourceDocument = sourceDocument;
        Notes = notes;
        Columns = columns;
        RollBands = rollBands;
        Results = results;
    }

    /// <summary>Slug identifying the table.</summary>
    public string Slug { get; }

    /// <summary>Human-readable table name.</summary>
    public string DisplayName { get; }

    /// <summary>Family the table belongs to.</summary>
    public string Family { get; }

    /// <summary>Name of the document the table was imported from.</summary>
    public string SourceDocument { get; }

    /// <summary>Optional free-text notes about the table.</summary>
    public string? Notes { get; }

    /// <summary>Columns of the table.</summary>
    public IReadOnlyList<ParsedCriticalColumn> Columns { get; }

    /// <summary>Roll bands (rows) of the table.</summary>
    public IReadOnlyList<ParsedCriticalRollBand> RollBands { get; }

    /// <summary>Result cells of the table.</summary>
    public IReadOnlyList<ParsedCriticalResult> Results { get; }
}
|
||||
@@ -0,0 +1,285 @@
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace RolemasterDb.ImportTool.Parsing;
|
||||
|
||||
public sealed class StandardCriticalTableParser
|
||||
{
|
||||
/// <summary>Matches a standalone column letter A to E (case-insensitive); group 1 holds the letter.</summary>
private static readonly Regex ColumnRegex = new Regex(
    @"\b([A-E])\b",
    RegexOptions.Compiled | RegexOptions.IgnoreCase);

/// <summary>
/// Matches a line consisting only of a roll band label: "NN", "NN-NN" or "NN+" with two or three
/// digits per number. The label is captured in the "label" group.
/// </summary>
private static readonly Regex RollBandRegex = new Regex(
    @"^\s*(?<label>\d{2,3}(?:-\d{2,3})?|\d{2,3}\+)\s*$",
    RegexOptions.Compiled);

/// <summary>
/// Matches a line that starts with a roll band label and may continue with further text. The label
/// is captured in "label"; the remainder, including its leading whitespace, in "rest".
/// </summary>
private static readonly Regex RollBandLineRegex = new Regex(
    @"^\s*(?<label>\d{2,3}(?:-\d{2,3})?|\d{2,3}\+)(?<rest>\s+.*)?$",
    RegexOptions.Compiled);
|
||||
|
||||
/// <summary>
/// Parses the text extracted from a standard critical-table PDF into columns, roll bands and one
/// result per (roll band, column) pair. Cells are cut out of each line by character position, so
/// the extracted text must keep the horizontal layout of the table.
/// </summary>
/// <param name="entry">Manifest entry supplying slug, display name, family and PDF path.</param>
/// <param name="extractedText">Text extracted from the PDF.</param>
/// <returns>The parsed table.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="entry"/> or <paramref name="extractedText"/> is <c>null</c>.
/// </exception>
/// <exception cref="InvalidOperationException">
/// The column header line or the first roll band could not be found.
/// </exception>
public ParsedCriticalTable Parse(CriticalImportManifestEntry entry, string extractedText)
{
    ArgumentNullException.ThrowIfNull(entry);
    ArgumentNullException.ThrowIfNull(extractedText);

    // Form feeds are treated as ordinary line breaks.
    var lines = extractedText.Replace("\r\n", "\n", StringComparison.Ordinal)
        .Replace('\f', '\n')
        .Split('\n');

    var headerIndex = Array.FindIndex(lines, IsColumnHeaderLine);
    if (headerIndex < 0)
    {
        throw new InvalidOperationException("The standard table header could not be found in the extracted text.");
    }

    // The character positions of the header letters define where each column is cut.
    var columnStarts = GetColumnStarts(lines[headerIndex]);
    var boundaries = GetColumnBoundaries(columnStarts);
    var columns = columnStarts
        .Select((item, index) => new ParsedCriticalColumn(item.Label, item.Label, "severity", index + 1))
        .ToList();

    var firstRollBandIndex = FindNextRollBandIndex(lines, headerIndex + 1);
    if (firstRollBandIndex < 0)
    {
        throw new InvalidOperationException("No roll bands were found in the extracted text.");
    }

    // Parsing stops at the first "Key:" line after the first roll band, or at the end of the text.
    var keyLineIndex = Array.FindIndex(
        lines,
        firstRollBandIndex,
        item => item.TrimStart().StartsWith("Key:", StringComparison.OrdinalIgnoreCase));
    if (keyLineIndex < 0)
    {
        keyLineIndex = lines.Length;
    }

    // Non-blank lines between the header and the first roll band label are given to the first row.
    var leadingLines = lines[(headerIndex + 1)..firstRollBandIndex]
        .Where(item => !string.IsNullOrWhiteSpace(item))
        .ToList();

    var rollBands = new List<ParsedCriticalRollBand>();
    var results = new List<ParsedCriticalResult>();
    var currentLabel = string.Empty;
    var currentRowLines = new List<string>();
    var rowIndex = 0;

    // Turns the lines collected for the open row into one roll band plus one result per column,
    // then resets the row state. Does nothing when no row is open.
    void FlushCurrentRow()
    {
        if (string.IsNullOrEmpty(currentLabel))
        {
            return;
        }

        rowIndex++;
        var rollBand = CreateRollBand(currentLabel, rowIndex);
        rollBands.Add(rollBand);

        var cellLines = SplitRowLines(currentRowLines, boundaries, columns.Count);
        for (var columnIndex = 0; columnIndex < columns.Count; columnIndex++)
        {
            var rawCellLines = cellLines[columnIndex]
                .Where(item => !string.IsNullOrWhiteSpace(item))
                .ToList();

            var rawAffixLines = rawCellLines
                .Where(IsAffixLikeLine)
                .ToList();

            var descriptionLines = rawCellLines
                .Where(item => !IsAffixLikeLine(item))
                .ToList();

            results.Add(new ParsedCriticalResult(
                columns[columnIndex].ColumnKey,
                rollBand.Label,
                string.Join(Environment.NewLine, rawCellLines),
                CollapseWhitespace(string.Join(' ', descriptionLines)),
                rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines)));
        }

        currentLabel = string.Empty;
        currentRowLines = new List<string>();
    }

    for (var lineIndex = firstRollBandIndex; lineIndex < keyLineIndex; lineIndex++)
    {
        if (TryParseRollBandLine(lines[lineIndex], out var label, out var trailingText))
        {
            var hasTrailingText = !string.IsNullOrWhiteSpace(trailingText);
            var hasOpenRow = !string.IsNullOrEmpty(currentLabel);

            // Affix-like text on a label line belongs to the row that starts here; any other text
            // continues the row that is still open. When no row is open yet (first label), the text
            // is kept on the new row instead of being discarded.
            var trailingTextStartsNewRow =
                hasTrailingText && (IsAffixLikeLine(trailingText) || !hasOpenRow);

            if (hasTrailingText && !trailingTextStartsNewRow)
            {
                currentRowLines.Add(trailingText);
            }

            FlushCurrentRow();
            currentLabel = label;

            // rowIndex only advances when a row is flushed, so it is still 0 for the first label.
            if (rowIndex == 0)
            {
                currentRowLines.AddRange(leadingLines);
            }

            if (trailingTextStartsNewRow)
            {
                currentRowLines.Add(trailingText);
            }

            continue;
        }

        if (!string.IsNullOrWhiteSpace(lines[lineIndex]))
        {
            currentRowLines.Add(lines[lineIndex]);
        }
    }

    // Emit the last row, which has no following label to trigger its flush.
    FlushCurrentRow();

    return new ParsedCriticalTable(
        entry.Slug,
        entry.DisplayName,
        entry.Family,
        Path.GetFileName(entry.PdfPath),
        "Imported from PDF text extraction.",
        columns,
        rollBands,
        results);
}
|
||||
|
||||
/// <summary>
/// Determines whether a line is the table's column header: exactly the standalone letters
/// A, B, C, D and E (in any case), in that order.
/// </summary>
/// <param name="line">Line of extracted text.</param>
/// <returns><c>true</c> when the line is the column header; otherwise <c>false</c>.</returns>
private static bool IsColumnHeaderLine(string line)
{
    const string expectedLabels = "ABCDE";

    var matches = ColumnRegex.Matches(line);
    if (matches.Count != expectedLabels.Length)
    {
        return false;
    }

    // Counting alone would accept any line with five standalone a-e letters (for example prose
    // containing five "a" words) and yield duplicate column keys, so the order is checked too.
    for (var index = 0; index < matches.Count; index++)
    {
        if (char.ToUpperInvariant(matches[index].Value[0]) != expectedLabels[index])
        {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
/// <summary>
/// Lists the column letters found on the header line together with the character index at which
/// each one starts.
/// </summary>
/// <param name="headerLine">The column header line.</param>
/// <returns>Upper-cased column letters and their start indices, in order of appearance.</returns>
private static List<(string Label, int Start)> GetColumnStarts(string headerLine)
{
    var starts = new List<(string Label, int Start)>();

    foreach (Match letter in ColumnRegex.Matches(headerLine))
    {
        starts.Add((letter.Groups[1].Value.ToUpperInvariant(), letter.Index));
    }

    return starts;
}
|
||||
|
||||
/// <summary>
/// Computes the character positions at which a line is cut into columns: the midpoint (integer
/// division) between the start positions of each pair of neighbouring columns.
/// </summary>
/// <param name="columns">Column labels with their start positions, in left-to-right order.</param>
/// <returns>
/// One boundary per pair of neighbouring columns; an empty array when there are fewer than two columns.
/// </returns>
private static int[] GetColumnBoundaries(IReadOnlyList<(string Label, int Start)> columns)
{
    // Without this guard an empty list would request an array of length -1 and throw.
    if (columns.Count < 2)
    {
        return Array.Empty<int>();
    }

    var boundaries = new int[columns.Count - 1];
    for (var index = 0; index < boundaries.Length; index++)
    {
        boundaries[index] = (columns[index].Start + columns[index + 1].Start) / 2;
    }

    return boundaries;
}
|
||||
|
||||
/// <summary>
/// Finds the first line at or after <paramref name="startIndex"/> that begins with a roll band label.
/// </summary>
/// <param name="lines">Lines of extracted text.</param>
/// <param name="startIndex">Index of the first line to examine.</param>
/// <returns>The index of the matching line, or -1 when there is none.</returns>
private static int FindNextRollBandIndex(IReadOnlyList<string> lines, int startIndex)
{
    var candidate = startIndex;
    while (candidate < lines.Count)
    {
        if (TryParseRollBandLine(lines[candidate], out _, out _))
        {
            return candidate;
        }

        candidate++;
    }

    return -1;
}
|
||||
|
||||
/// <summary>
/// Tries to read a roll band label from a line that contains nothing but the label.
/// </summary>
/// <param name="line">Line of extracted text.</param>
/// <param name="label">The label with spaces removed, or an empty string when the line does not match.</param>
/// <returns><c>true</c> when the line is a bare roll band label; otherwise <c>false</c>.</returns>
private static bool TryParseRollBandLabel(string line, out string label)
{
    var parsed = RollBandRegex.Match(line);

    // "label" is the pattern's only capturing group, i.e. group 1.
    label = parsed.Success
        ? parsed.Groups["label"].Value.Replace(" ", string.Empty, StringComparison.Ordinal)
        : string.Empty;

    return parsed.Success;
}
|
||||
|
||||
/// <summary>
/// Tries to read a roll band label from the start of a line, together with any text that follows it.
/// </summary>
/// <param name="line">Line of extracted text.</param>
/// <param name="label">The label with spaces removed, or an empty string when the line does not match.</param>
/// <param name="trailingText">
/// The text after the label, right-trimmed and left-padded with spaces so that it keeps the
/// character positions it had in <paramref name="line"/>; an empty string when nothing follows the
/// label or the line does not match.
/// </param>
/// <returns><c>true</c> when the line starts with a roll band label; otherwise <c>false</c>.</returns>
private static bool TryParseRollBandLine(string line, out string label, out string trailingText)
{
    label = string.Empty;
    trailingText = string.Empty;

    var parsed = RollBandLineRegex.Match(line);
    if (parsed.Success)
    {
        label = parsed.Groups["label"].Value.Replace(" ", string.Empty, StringComparison.Ordinal);

        var rest = parsed.Groups["rest"];
        if (rest.Success)
        {
            // Padding to the original offset keeps the text aligned with the column boundaries.
            var text = rest.Value.TrimEnd();
            trailingText = text.PadLeft(rest.Index + text.Length);
        }
    }

    return parsed.Success;
}
|
||||
|
||||
/// <summary>
/// Builds a roll band from its label. "NN+" gives an open-ended band (no maximum), "NN-NN" gives a
/// closed range, and a single number gives a band whose minimum and maximum are that number.
/// </summary>
/// <param name="label">Roll band label.</param>
/// <param name="sortOrder">Position of the band within the table.</param>
/// <returns>The roll band described by <paramref name="label"/>.</returns>
/// <exception cref="FormatException">A numeric part of the label is not a valid integer.</exception>
private static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder)
{
    if (label.EndsWith('+'))
    {
        return new ParsedCriticalRollBand(label, ParseRoll(label[..^1]), null, sortOrder);
    }

    var parts = label.Split('-', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    var minRoll = ParseRoll(parts[0]);
    var maxRoll = parts.Length == 1 ? minRoll : ParseRoll(parts[1]);

    return new ParsedCriticalRollBand(label, minRoll, maxRoll, sortOrder);

    // Labels are machine data, so they are parsed independently of the current culture.
    static int ParseRoll(string text) =>
        int.Parse(
            text,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture);
}
|
||||
|
||||
/// <summary>
/// Cuts every line of a row into per-column segments at the given character boundaries.
/// </summary>
/// <param name="rowLines">Lines that make up one table row.</param>
/// <param name="boundaries">Character positions separating neighbouring columns.</param>
/// <param name="columnCount">Number of columns.</param>
/// <returns>
/// For each column, the trimmed, non-empty segments found in that column, in line order.
/// </returns>
private static List<string>[] SplitRowLines(IReadOnlyList<string> rowLines, int[] boundaries, int columnCount)
{
    var cells = Enumerable.Repeat(0, columnCount)
        .Select(_ => new List<string>())
        .ToArray();

    var lastColumn = columnCount - 1;

    foreach (var rowLine in rowLines)
    {
        var length = rowLine.Length;

        for (var column = 0; column <= lastColumn; column++)
        {
            // The first column starts at the line start; the last one runs to the line end.
            var from = column > 0 ? boundaries[column - 1] : 0;
            var to = column < lastColumn ? Math.Min(boundaries[column], length) : length;

            // Only slice when the line actually reaches into this column.
            if (from < length && to > from)
            {
                var text = rowLine[from..to].Trim();
                if (text.Length > 0)
                {
                    cells[column].Add(text);
                }
            }
        }
    }

    return cells;
}
|
||||
|
||||
/// <summary>
/// Heuristically decides whether a cell line is an affix line rather than descriptive text.
/// </summary>
/// <remarks>
/// After trimming: an empty line is not an affix; a lone em dash is; a line starting with
/// "with ", "w/o ", "without " or "if " (any case) is an affix only when it contains a colon; any
/// other line is an affix when it starts with '+', '∑', '∏', 'π', '∫' or a digit, or contains
/// " – ", "(-" or "(+".
/// </remarks>
/// <param name="line">Cell line to classify.</param>
/// <returns><c>true</c> when the line looks like an affix; otherwise <c>false</c>.</returns>
private static bool IsAffixLikeLine(string line)
{
    var text = line.Trim();
    if (text.Length == 0)
    {
        return false;
    }

    if (text == "—")
    {
        return true;
    }

    var conditionalPrefixes = new[] { "with ", "w/o ", "without ", "if " };
    if (conditionalPrefixes.Any(prefix => text.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)))
    {
        // Conditional phrases count as affixes only in their "condition: effect" form.
        return text.Contains(':', StringComparison.Ordinal);
    }

    switch (text[0])
    {
        case '+':
        case '∑':
        case '∏':
        case 'π':
        case '∫':
            return true;
    }

    if (char.IsDigit(text[0]))
    {
        return true;
    }

    var markers = new[] { " – ", "(-", "(+" };
    return markers.Any(marker => text.Contains(marker, StringComparison.Ordinal));
}
|
||||
|
||||
/// <summary>
/// Trims the value and replaces every run of whitespace inside it with a single space.
/// </summary>
/// <param name="value">Text to normalise.</param>
/// <returns>The normalised text.</returns>
private static string CollapseWhitespace(string value)
{
    var trimmed = value.Trim();
    return Regex.Replace(trimmed, @"\s+", " ");
}
|
||||
}
|
||||
Reference in New Issue
Block a user