Implement phase 3 standard critical imports

This commit is contained in:
2026-03-14 02:03:37 +01:00
parent 5c4d540246
commit 6870aa2aef
7 changed files with 465 additions and 45 deletions

View File

@@ -0,0 +1,29 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- xUnit test project that references RolemasterDb.ImportTool. -->
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<!-- This project is not meant to be packed as a NuGet package. -->
<IsPackable>false</IsPackable>
</PropertyGroup>
<!-- Test framework, test host/runner and coverage collector packages. -->
<ItemGroup>
<PackageReference Include="coverlet.collector" Version="6.0.4" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.14.1" />
<PackageReference Include="xunit" Version="2.9.3" />
<PackageReference Include="xunit.runner.visualstudio" Version="3.1.4" />
</ItemGroup>
<!-- Adds a project-wide using for the Xunit namespace, so test files do not need their own directive. -->
<ItemGroup>
<Using Include="Xunit" />
</ItemGroup>
<!-- The project under test. -->
<ItemGroup>
<ProjectReference Include="..\RolemasterDb.ImportTool\RolemasterDb.ImportTool.csproj" />
</ItemGroup>
<!-- Excludes UnitTest1.cs from compilation. NOTE(review): presumably the template placeholder test; consider deleting the file instead. -->
<ItemGroup>
<Compile Remove="UnitTest1.cs" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,146 @@
using RolemasterDb.ImportTool.Parsing;
namespace RolemasterDb.ImportTool.Tests;
/// <summary>
/// Integration tests that run the enabled entries of the critical import manifest through
/// <see cref="PdfXmlExtractor"/> and <see cref="StandardCriticalTableParser"/> and check the parsed output.
/// </summary>
public sealed class StandardCriticalTableParserIntegrationTests
{
    /// <summary>Slugs that are expected to be enabled in the manifest, listed in ordinal order.</summary>
    private static readonly string[] ExpectedPhase3Slugs =
    [
        "arcane-aether",
        "arcane-nether",
        "ballistic-shrapnel",
        "brawling",
        "cold",
        "electricity",
        "grapple",
        "heat",
        "impact",
        "krush",
        "ma-strikes",
        "ma-sweeps",
        "puncture",
        "slash",
        "subdual",
        "tiny",
        "unbalance"
    ];

    private static readonly PdfXmlExtractor Extractor = new();
    private static readonly StandardCriticalTableParser Parser = new();

    /// <summary>Theory data: every enabled manifest entry, ordered by slug (ordinal).</summary>
    public static IEnumerable<object[]> EnabledStandardTables() =>
        LoadManifest().Tables
            .Where(item => item.Enabled)
            .OrderBy(item => item.Slug, StringComparer.Ordinal)
            .Select(item => new object[] { item });

    /// <summary>
    /// Theory data: (slug, roll-band label, column key, expected description snippet) for hand-picked cells.
    /// </summary>
    public static IEnumerable<object[]> RepresentativeCells()
    {
        yield return ["slash", "71-75", "A", "Blow falls on lower leg"];
        yield return ["puncture", "66", "C", "Strike shatters foe's knee"];
        yield return ["ballistic-shrapnel", "86-90", "E", "destroy his heart"];
        yield return ["arcane-aether", "96-99", "E", "smoking pulp"];
        yield return ["ma-strikes", "96-99", "E", "drives bone into brain"];
        yield return ["tiny", "100", "E", "Vein and artery severed"];
    }

    /// <summary>
    /// The manifest enables exactly the expected slugs, and each enabled entry is a "standard" / "xml"
    /// entry whose source PDF exists in the repository.
    /// </summary>
    [Fact]
    public void Manifest_enables_the_phase_3_standard_table_set()
    {
        var manifest = LoadManifest();
        var repositoryRoot = GetRepositoryRoot();

        var enabledTables = manifest.Tables
            .Where(item => item.Enabled)
            .OrderBy(item => item.Slug, StringComparer.Ordinal)
            .ToList();

        Assert.Equal(ExpectedPhase3Slugs, enabledTables.Select(item => item.Slug));
        Assert.All(enabledTables, entry =>
        {
            Assert.Equal("standard", entry.Family);
            Assert.Equal("xml", entry.ExtractionMethod);
            Assert.True(File.Exists(Path.Combine(repositoryRoot, entry.PdfPath)), $"Missing source PDF for '{entry.Slug}'.");
        });
    }

    /// <summary>
    /// Every enabled table parses without validation errors into five columns, at least one roll band,
    /// and exactly one result per (row, column) cell.
    /// </summary>
    [Theory]
    [MemberData(nameof(EnabledStandardTables))]
    public async Task Enabled_standard_tables_extract_and_parse_successfully(CriticalImportManifestEntry entry)
    {
        var parseResult = await LoadParseResultAsync(entry);

        Assert.True(parseResult.ValidationReport.IsValid, string.Join(Environment.NewLine, parseResult.ValidationReport.Errors));
        Assert.Equal(5, parseResult.Table.Columns.Count);
        Assert.NotEmpty(parseResult.Table.RollBands);
        Assert.Equal(parseResult.ValidationReport.RowCount * 5, parseResult.ValidationReport.CellCount);
        Assert.Equal(parseResult.ValidationReport.CellCount, parseResult.Table.Results.Count);
    }

    /// <summary>
    /// The cell identified by roll band and column contains the expected snippet (case-insensitive).
    /// </summary>
    [Theory]
    [MemberData(nameof(RepresentativeCells))]
    public async Task Representative_cells_keep_expected_descriptions(
        string slug,
        string rollBandLabel,
        string columnKey,
        string expectedSnippet)
    {
        var parseResult = await LoadParseResultAsync(FindEntry(slug));

        var result = parseResult.Table.Results.Single(item =>
            string.Equals(item.RollBandLabel, rollBandLabel, StringComparison.Ordinal) &&
            string.Equals(item.ColumnKey, columnKey, StringComparison.Ordinal));

        Assert.Contains(expectedSnippet, result.DescriptionText, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// The raw text of slash cell 56-60/A starts with its prose sentence.
    /// </summary>
    [Fact]
    public async Task Slash_boundary_repair_keeps_56_60_a_prose_first()
    {
        var parseResult = await LoadParseResultAsync(FindEntry("slash"));

        var result = parseResult.Table.Results.Single(item =>
            string.Equals(item.RollBandLabel, "56-60", StringComparison.Ordinal) &&
            string.Equals(item.ColumnKey, "A", StringComparison.Ordinal));

        Assert.StartsWith("You recover from your initial swing", result.RawCellText, StringComparison.Ordinal);
    }

    /// <summary>
    /// Parses the XML artifact for <paramref name="entry"/>, extracting it from the source PDF first
    /// when no usable cached artifact exists.
    /// </summary>
    /// <param name="entry">Manifest entry whose PDF should be extracted and parsed.</param>
    /// <returns>The parse result produced by <see cref="StandardCriticalTableParser.Parse"/>.</returns>
    private static async Task<StandardCriticalTableParseResult> LoadParseResultAsync(CriticalImportManifestEntry entry)
    {
        var pdfPath = Path.Combine(GetRepositoryRoot(), entry.PdfPath);
        var xmlPath = Path.Combine(GetArtifactCacheRoot(), $"{entry.Slug}.xml");

        if (!IsCachedXmlUsable(pdfPath, xmlPath))
        {
            // Remove any stale or empty artifact first so a failed extraction cannot leave it behind
            // to be picked up by a later run. File.Delete is a no-op when the file does not exist.
            File.Delete(xmlPath);
            await Extractor.ExtractAsync(pdfPath, xmlPath);
        }

        var xmlContent = await File.ReadAllTextAsync(xmlPath);
        return Parser.Parse(entry, xmlContent);
    }

    /// <summary>
    /// Returns <see langword="true"/> when the cached XML exists, is non-empty and is at least as new as
    /// the source PDF. The cache lives in the temp directory and outlives a test run, so without this
    /// check a changed PDF or an interrupted extraction would be reused indefinitely.
    /// </summary>
    private static bool IsCachedXmlUsable(string pdfPath, string xmlPath)
    {
        var xmlInfo = new FileInfo(xmlPath);
        if (!xmlInfo.Exists || xmlInfo.Length == 0)
        {
            return false;
        }

        // For a missing PDF, GetLastWriteTimeUtc returns the 1601 epoch, so an existing cache is still used.
        return xmlInfo.LastWriteTimeUtc >= File.GetLastWriteTimeUtc(pdfPath);
    }

    /// <summary>Returns the single manifest entry whose slug equals <paramref name="slug"/> (ordinal).</summary>
    private static CriticalImportManifestEntry FindEntry(string slug) =>
        LoadManifest().Tables.Single(item => string.Equals(item.Slug, slug, StringComparison.Ordinal));

    /// <summary>Loads <c>sources/critical-import-manifest.json</c> from the repository root.</summary>
    private static CriticalImportManifest LoadManifest() =>
        new CriticalImportManifestLoader().Load(Path.Combine(GetRepositoryRoot(), "sources", "critical-import-manifest.json"));

    /// <summary>Returns the temp-directory folder used to cache extracted XML, creating it if needed.</summary>
    private static string GetArtifactCacheRoot()
    {
        var cacheRoot = Path.Combine(Path.GetTempPath(), "RolemasterDb.ImportTool.Tests");
        Directory.CreateDirectory(cacheRoot);
        return cacheRoot;
    }

    /// <summary>
    /// Walks up from the test binaries' directory until a folder containing <c>RolemasterDB.slnx</c> is found.
    /// </summary>
    /// <exception cref="InvalidOperationException">No ancestor directory contains the solution file.</exception>
    private static string GetRepositoryRoot()
    {
        var probe = new DirectoryInfo(AppContext.BaseDirectory);
        while (probe is not null)
        {
            if (File.Exists(Path.Combine(probe.FullName, "RolemasterDB.slnx")))
            {
                return probe.FullName;
            }

            probe = probe.Parent;
        }

        throw new InvalidOperationException("Could not find the repository root for integration tests.");
    }
}

View File

@@ -0,0 +1,3 @@
using Xunit;
// Turns off xUnit test parallelization for the whole test assembly.
// NOTE(review): presumably required because the integration tests share XML artifacts cached in the temp directory; confirm.
[assembly: CollectionBehavior(DisableTestParallelization = true)]

View File

@@ -7,14 +7,17 @@ namespace RolemasterDb.ImportTool.Parsing;
public sealed class StandardCriticalTableParser
{
private const int HeaderToBodyMinimumGap = 20;
private const int FooterLabelExclusionGap = 15;
private const int FooterPageNumberExclusionGap = 80;
private const int RowLabelDuplicateTolerance = 15;
private const int TopGroupingTolerance = 2;
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[-])", RegexOptions.Compiled);
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-)\d+\)$", RegexOptions.Compiled);
public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
var fragments = LoadFragments(xmlContent);
var headerFragments = FindHeaderFragments(fragments);
var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments);
var validationErrors = new List<string>();
var columnCenters = headerFragments
@@ -22,6 +25,16 @@ public sealed class StandardCriticalTableParser
.Select(item => new ColumnAnchor(item.Text.ToUpperInvariant(), item.CenterX))
.ToList();
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
var keyTop = fragments
.Where(item =>
string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) ||
item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
.Select(item => (int?)item.Top)
.Min() ?? int.MaxValue;
var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments, keyTop);
var rowAnchors = rowLabelFragments
.OrderBy(item => item.Top)
.Select((item, index) => new RowAnchor(item.Text, item.Top, index + 1))
@@ -32,16 +45,11 @@ public sealed class StandardCriticalTableParser
validationErrors.Add("No roll-band labels were found in the XML artifact.");
}
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
var keyTop = fragments
.Where(item => string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase))
.Select(item => (int?)item.Top)
.Min() ?? int.MaxValue;
var bodyFragments = fragments
.Where(item =>
item.Top >= bodyStartTop &&
item.Top < keyTop - 1 &&
!IsFooterPageNumberFragment(item, keyTop) &&
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
!headerFragments.Contains(item))
.ToList();
@@ -56,11 +64,11 @@ public sealed class StandardCriticalTableParser
{
var rowStart = rowIndex == 0
? bodyStartTop
: (int)Math.Floor((rowAnchors[rowIndex - 1].Top + rowAnchors[rowIndex].Top) / 2.0);
: (int)Math.Floor((rowAnchors[rowIndex - 1].Top + rowAnchors[rowIndex].Top) / 2.0) + 1;
var rowEnd = rowIndex == rowAnchors.Count - 1
? keyTop - 1
: (int)Math.Floor((rowAnchors[rowIndex].Top + rowAnchors[rowIndex + 1].Top) / 2.0);
: (int)Math.Floor((rowAnchors[rowIndex].Top + rowAnchors[rowIndex + 1].Top) / 2.0) + 1;
var rowFragments = bodyFragments
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
@@ -95,26 +103,12 @@ public sealed class StandardCriticalTableParser
foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey))
{
var firstProseIndex = cellEntry.Lines.FindIndex(line => !IsAffixLikeLine(line));
var firstAffixIndex = cellEntry.Lines.FindIndex(IsAffixLikeLine);
var segmentCount = CountLineTypeSegments(cellEntry.Lines);
if (firstProseIndex > 0)
if (segmentCount > 2)
{
validationErrors.Add(
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' begins with affix-like lines before prose.");
}
if (firstAffixIndex >= 0)
{
var proseAfterAffix = cellEntry.Lines
.Skip(firstAffixIndex + 1)
.Any(line => !IsAffixLikeLine(line));
if (proseAfterAffix)
{
validationErrors.Add(
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' contains prose after affix lines.");
}
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
}
var rawAffixLines = cellEntry.Lines.Where(IsAffixLikeLine).ToList();
@@ -200,12 +194,13 @@ public sealed class StandardCriticalTableParser
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
{
var groupedByTop = fragments
var headerCandidates = fragments
.Where(item => item.Text.Length == 1 && char.IsLetter(item.Text[0]))
.GroupBy(item => item.Top)
.OrderBy(group => group.Key);
.OrderBy(item => item.Top)
.ThenBy(item => item.Left)
.ToList();
foreach (var group in groupedByTop)
foreach (var group in GroupByTop(headerCandidates))
{
var ordered = group.OrderBy(item => item.Left).ToList();
var labels = ordered.Select(item => item.Text.ToUpperInvariant()).ToList();
@@ -220,18 +215,37 @@ public sealed class StandardCriticalTableParser
private static List<XmlTextFragment> FindRowLabelFragments(
IReadOnlyList<XmlTextFragment> fragments,
IReadOnlyList<XmlTextFragment> headerFragments)
IReadOnlyList<XmlTextFragment> headerFragments,
int keyTop)
{
var leftCutoff = headerFragments.Min(item => item.Left) - 10;
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
return fragments
var candidates = fragments
.Where(item =>
item.Left < leftCutoff &&
item.Top >= bodyStartTop &&
item.Top < keyTop - FooterLabelExclusionGap &&
IsRollBandLabel(item.Text))
.OrderBy(item => item.Top)
.ToList();
var deduped = new List<XmlTextFragment>();
foreach (var candidate in candidates)
{
var previous = deduped.LastOrDefault();
if (previous is not null &&
string.Equals(previous.Text, candidate.Text, StringComparison.OrdinalIgnoreCase) &&
Math.Abs(previous.Top - candidate.Top) <= RowLabelDuplicateTolerance)
{
continue;
}
deduped.Add(candidate);
}
return deduped;
}
private static bool IsRollBandLabel(string value) =>
@@ -293,7 +307,7 @@ public sealed class StandardCriticalTableParser
return false;
}
if (value == "-" || value == "\u2014")
if (value == "-" || value == "\u2013" || value == "\u2014")
{
return true;
}
@@ -301,7 +315,10 @@ public sealed class StandardCriticalTableParser
if (value.StartsWith("with ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("w/o ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("without ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("if ", StringComparison.OrdinalIgnoreCase))
value.StartsWith("if ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("while ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("until ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("unless ", StringComparison.OrdinalIgnoreCase))
{
return value.Contains(':', StringComparison.Ordinal);
}
@@ -311,10 +328,9 @@ public sealed class StandardCriticalTableParser
value.StartsWith("\u220F", StringComparison.Ordinal) ||
value.StartsWith("\u03C0", StringComparison.Ordinal) ||
value.StartsWith("\u222B", StringComparison.Ordinal) ||
StandaloneModifierAffixLineRegex.IsMatch(value) ||
NumericAffixLineRegex.IsMatch(value) ||
value.Contains(" - ", StringComparison.Ordinal) ||
value.Contains("(-", StringComparison.Ordinal) ||
value.Contains("(+", StringComparison.Ordinal);
value.Contains(" - ", StringComparison.Ordinal);
}
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries)
@@ -361,6 +377,55 @@ public sealed class StandardCriticalTableParser
.Replace('\n', ' ')
.Trim();
/// <summary>
/// Counts the runs of consecutive lines that share the same classification
/// (affix-like or not, as decided by <c>IsAffixLikeLine</c>).
/// </summary>
/// <param name="lines">Cell lines in their stored order.</param>
/// <returns>The number of runs; 0 for an empty list.</returns>
private static int CountLineTypeSegments(IReadOnlyList<string> lines)
{
    var runs = 0;
    bool? lastKind = null;

    for (var index = 0; index < lines.Count; index++)
    {
        var isAffix = IsAffixLikeLine(lines[index]);

        // A new run starts on the first line and whenever the classification flips.
        if (lastKind != isAffix)
        {
            runs++;
            lastKind = isAffix;
        }
    }

    return runs;
}
// Matches text that consists solely of a two- or three-digit number.
// Kept as a compiled static field, like the other patterns of this class.
private static readonly Regex FooterPageNumberRegex = new(@"^\d{2,3}$", RegexOptions.Compiled);

/// <summary>
/// Decides whether <paramref name="fragment"/> is a bare two- or three-digit number located at or below
/// <c>FooterPageNumberExclusionGap</c> units above <paramref name="keyTop"/>.
/// </summary>
/// <param name="fragment">The text fragment to classify.</param>
/// <param name="keyTop">Top coordinate of the footer anchor, or <see cref="int.MaxValue"/> when none was found.</param>
/// <returns><see langword="true"/> when the fragment should be treated as a footer page number.</returns>
private static bool IsFooterPageNumberFragment(XmlTextFragment fragment, int keyTop)
{
    // Without a footer anchor there is no reference position, so nothing is classified as a page number.
    if (keyTop == int.MaxValue)
    {
        return false;
    }

    return fragment.Top >= keyTop - FooterPageNumberExclusionGap &&
        FooterPageNumberRegex.IsMatch(fragment.Text);
}
/// <summary>
/// Splits <paramref name="fragments"/> into consecutive groups. A fragment joins the current group when
/// its Top is within <c>TopGroupingTolerance</c> of the group's first fragment; otherwise it opens a new group.
/// </summary>
/// <param name="fragments">Fragments in the order they should be grouped; the order is not changed here.</param>
/// <returns>The groups in encounter order; empty when there are no fragments.</returns>
private static IEnumerable<List<XmlTextFragment>> GroupByTop(IReadOnlyList<XmlTextFragment> fragments)
{
    var buckets = new List<List<XmlTextFragment>>();

    foreach (var item in fragments)
    {
        var current = buckets.LastOrDefault();

        // The comparison is against the bucket's first member, not the most recently added one.
        if (current is not null && Math.Abs(current[0].Top - item.Top) <= TopGroupingTolerance)
        {
            current.Add(item);
        }
        else
        {
            buckets.Add(new List<XmlTextFragment> { item });
        }
    }

    return buckets;
}
// Column key (the upper-cased header text) together with the header fragment's horizontal center.
private sealed record ColumnAnchor(string Key, double CenterX);
// Roll-band label, its Top coordinate, and its 1-based position when labels are ordered by Top.
private sealed record RowAnchor(string Label, int Top, int SortOrder);