Implement phase 3 standard critical imports
This commit is contained in:
@@ -2,5 +2,6 @@
|
|||||||
<Folder Name="/src/">
|
<Folder Name="/src/">
|
||||||
<Project Path="src/RolemasterDb.App/RolemasterDb.App.csproj" />
|
<Project Path="src/RolemasterDb.App/RolemasterDb.App.csproj" />
|
||||||
<Project Path="src/RolemasterDb.ImportTool/RolemasterDb.ImportTool.csproj" />
|
<Project Path="src/RolemasterDb.ImportTool/RolemasterDb.ImportTool.csproj" />
|
||||||
|
<Project Path="src/RolemasterDb.ImportTool.Tests/RolemasterDb.ImportTool.Tests.csproj" />
|
||||||
</Folder>
|
</Folder>
|
||||||
</Solution>
|
</Solution>
|
||||||
|
|||||||
@@ -31,14 +31,33 @@ The current implementation supports:
|
|||||||
- manifest-driven source selection
|
- manifest-driven source selection
|
||||||
- `standard` critical tables with columns `A-E`
|
- `standard` critical tables with columns `A-E`
|
||||||
- XML-based extraction using `pdftohtml -xml`
|
- XML-based extraction using `pdftohtml -xml`
|
||||||
- geometry-based parsing for `Slash.pdf`
|
- geometry-based parsing across the currently enabled phase-3 tables:
|
||||||
|
- `arcane-aether`
|
||||||
|
- `arcane-nether`
|
||||||
|
- `ballistic-shrapnel`
|
||||||
|
- `brawling`
|
||||||
|
- `cold`
|
||||||
|
- `electricity`
|
||||||
|
- `grapple`
|
||||||
|
- `heat`
|
||||||
|
- `impact`
|
||||||
|
- `krush`
|
||||||
|
- `ma-strikes`
|
||||||
|
- `ma-sweeps`
|
||||||
|
- `puncture`
|
||||||
|
- `slash`
|
||||||
|
- `subdual`
|
||||||
|
- `tiny`
|
||||||
|
- `unbalance`
|
||||||
- row-boundary repair for trailing affix leakage
|
- row-boundary repair for trailing affix leakage
|
||||||
|
- footer/page-number filtering during body parsing
|
||||||
- transactional loading into SQLite
|
- transactional loading into SQLite
|
||||||
|
|
||||||
The current implementation does not yet support:
|
The current implementation does not yet support:
|
||||||
|
|
||||||
- variant-column critical tables
|
- variant-column critical tables
|
||||||
- grouped variant tables
|
- grouped variant tables
|
||||||
|
- `Mana.pdf`, whose current XML layout and affix notation still need a dedicated parser pass
|
||||||
- OCR/image-based PDFs such as `Void.pdf`
|
- OCR/image-based PDFs such as `Void.pdf`
|
||||||
- normalized `critical_branch` population
|
- normalized `critical_branch` population
|
||||||
- normalized `critical_effect` population
|
- normalized `critical_effect` population
|
||||||
@@ -183,10 +202,9 @@ The parser was hardened in two ways:
|
|||||||
|
|
||||||
The importer now explicitly rejects cells that still look structurally wrong after repair:
|
The importer now explicitly rejects cells that still look structurally wrong after repair:
|
||||||
|
|
||||||
- a cell may not begin with affix-like lines before prose
|
- prose and affix segments may not alternate more than once inside a cell (at most one contiguous prose block and one contiguous affix block, in either order)
|
||||||
- a cell may not contain prose after affix lines
|
|
||||||
|
|
||||||
This hardening step is important because it closed a class of row-boundary bugs that simple row/cell counts could not detect.
|
This keeps the phase-2.1 safety goal in place while allowing broader standard-table layouts that render a single affix block either before or after the prose block.
|
||||||
|
|
||||||
## Planned Future Phases
|
## Planned Future Phases
|
||||||
|
|
||||||
@@ -194,9 +212,34 @@ The current architecture is intended to support additional phases:
|
|||||||
|
|
||||||
### Phase 3: Broader Table Coverage
|
### Phase 3: Broader Table Coverage
|
||||||
|
|
||||||
- add more `standard` critical PDFs
|
Phase 3 expands the manifest and validates the shared `standard` parser across a broader set of `A-E` tables.
|
||||||
- expand the manifest
|
|
||||||
- verify parser stability across more source layouts
|
The currently enabled phase-3 table set is:
|
||||||
|
|
||||||
|
- `arcane-aether`
|
||||||
|
- `arcane-nether`
|
||||||
|
- `ballistic-shrapnel`
|
||||||
|
- `brawling`
|
||||||
|
- `cold`
|
||||||
|
- `electricity`
|
||||||
|
- `grapple`
|
||||||
|
- `heat`
|
||||||
|
- `impact`
|
||||||
|
- `krush`
|
||||||
|
- `ma-strikes`
|
||||||
|
- `ma-sweeps`
|
||||||
|
- `puncture`
|
||||||
|
- `slash`
|
||||||
|
- `subdual`
|
||||||
|
- `tiny`
|
||||||
|
- `unbalance`
|
||||||
|
|
||||||
|
Current phase-3 notes:
|
||||||
|
|
||||||
|
- header detection now tolerates minor `top` misalignment across the `A-E` header glyphs
|
||||||
|
- footer page numbers are filtered out before body parsing
|
||||||
|
- validation allows a single contiguous affix block either before or after prose
|
||||||
|
- `Mana.pdf` is intentionally left out for now because its row-anchor geometry and notation still need dedicated handling
|
||||||
|
|
||||||
### Phase 4: Variant and Grouped Tables
|
### Phase 4: Variant and Grouped Tables
|
||||||
|
|
||||||
@@ -289,6 +332,11 @@ Each entry declares:
|
|||||||
|
|
||||||
The manifest is intentionally the control point for enabling importer support one table at a time.
|
The manifest is intentionally the control point for enabling importer support one table at a time.
|
||||||
|
|
||||||
|
For the currently enabled phase-3 entries:
|
||||||
|
|
||||||
|
- `family` is `standard`
|
||||||
|
- `extractionMethod` is `xml`
|
||||||
|
|
||||||
## Artifact Layout
|
## Artifact Layout
|
||||||
|
|
||||||
Artifacts are written under:
|
Artifacts are written under:
|
||||||
|
|||||||
@@ -1,12 +1,140 @@
|
|||||||
{
|
{
|
||||||
"tables": [
|
"tables": [
|
||||||
|
{
|
||||||
|
"slug": "arcane-aether",
|
||||||
|
"displayName": "Arcane Aether Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Arcane Aether.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "arcane-nether",
|
||||||
|
"displayName": "Arcane Nether Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Arcane Nether.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "ballistic-shrapnel",
|
||||||
|
"displayName": "Ballistic Shrapnel Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Ballistic Shrapnel.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "brawling",
|
||||||
|
"displayName": "Brawling Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Brawling.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "cold",
|
||||||
|
"displayName": "Cold Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Cold.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "electricity",
|
||||||
|
"displayName": "Electricity Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Electricity.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "grapple",
|
||||||
|
"displayName": "Grapple Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Grapple.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "heat",
|
||||||
|
"displayName": "Heat Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Heat.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "impact",
|
||||||
|
"displayName": "Impact Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Impact.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "krush",
|
||||||
|
"displayName": "Krush Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Krush.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "ma-strikes",
|
||||||
|
"displayName": "Martial Arts Strikes Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/MA Strikes.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "ma-sweeps",
|
||||||
|
"displayName": "Martial Arts Sweeps Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/MA Sweeps.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "puncture",
|
||||||
|
"displayName": "Puncture Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Puncture.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"slug": "slash",
|
"slug": "slash",
|
||||||
"displayName": "Slash Critical Strike Table",
|
"displayName": "Slash Critical Strike Table",
|
||||||
"family": "standard",
|
"family": "standard",
|
||||||
"extractionMethod": "text",
|
"extractionMethod": "xml",
|
||||||
"pdfPath": "sources/Slash.pdf",
|
"pdfPath": "sources/Slash.pdf",
|
||||||
"enabled": true
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "subdual",
|
||||||
|
"displayName": "Subdual Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Subdual.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "tiny",
|
||||||
|
"displayName": "Tiny Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Tiny.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "unbalance",
|
||||||
|
"displayName": "Unbalance Critical Strike Table",
|
||||||
|
"family": "standard",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Unbalance.pdf",
|
||||||
|
"enabled": true
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,29 @@
|
|||||||
|
<Project Sdk="Microsoft.NET.Sdk">
|
||||||
|
|
||||||
|
<PropertyGroup>
|
||||||
|
<TargetFramework>net10.0</TargetFramework>
|
||||||
|
<ImplicitUsings>enable</ImplicitUsings>
|
||||||
|
<Nullable>enable</Nullable>
|
||||||
|
<IsPackable>false</IsPackable>
|
||||||
|
</PropertyGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<PackageReference Include="coverlet.collector" Version="6.0.4" />
|
||||||
|
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.14.1" />
|
||||||
|
<PackageReference Include="xunit" Version="2.9.3" />
|
||||||
|
<PackageReference Include="xunit.runner.visualstudio" Version="3.1.4" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<Using Include="Xunit" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<ProjectReference Include="..\RolemasterDb.ImportTool\RolemasterDb.ImportTool.csproj" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<Compile Remove="UnitTest1.cs" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
</Project>
|
||||||
@@ -0,0 +1,146 @@
|
|||||||
|
using RolemasterDb.ImportTool.Parsing;
|
||||||
|
|
||||||
|
namespace RolemasterDb.ImportTool.Tests;
|
||||||
|
|
||||||
|
/// <summary>
/// Integration tests that run the XML extractor and the standard critical-table parser
/// against the source PDFs listed in <c>sources/critical-import-manifest.json</c>.
/// </summary>
public sealed class StandardCriticalTableParserIntegrationTests
{
    /// <summary>Slugs expected to be enabled in the manifest, in ordinal order.</summary>
    private static readonly string[] ExpectedPhase3Slugs =
    [
        "arcane-aether",
        "arcane-nether",
        "ballistic-shrapnel",
        "brawling",
        "cold",
        "electricity",
        "grapple",
        "heat",
        "impact",
        "krush",
        "ma-strikes",
        "ma-sweeps",
        "puncture",
        "slash",
        "subdual",
        "tiny",
        "unbalance"
    ];

    private static readonly PdfXmlExtractor Extractor = new();
    private static readonly StandardCriticalTableParser Parser = new();

    /// <summary>
    /// Theory data: every enabled manifest entry of family <c>standard</c>, ordered by slug.
    /// The family filter keeps entries of other families away from the standard parser.
    /// </summary>
    public static IEnumerable<object[]> EnabledStandardTables() =>
        LoadManifest().Tables
            .Where(item => item.Enabled && string.Equals(item.Family, "standard", StringComparison.Ordinal))
            .OrderBy(item => item.Slug, StringComparer.Ordinal)
            .Select(item => new object[] { item });

    /// <summary>
    /// Theory data: (slug, roll-band label, column key, expected description snippet) spot checks.
    /// </summary>
    public static IEnumerable<object[]> RepresentativeCells()
    {
        yield return ["slash", "71-75", "A", "Blow falls on lower leg"];
        yield return ["puncture", "66", "C", "Strike shatters foe's knee"];
        yield return ["ballistic-shrapnel", "86-90", "E", "destroy his heart"];
        yield return ["arcane-aether", "96-99", "E", "smoking pulp"];
        yield return ["ma-strikes", "96-99", "E", "drives bone into brain"];
        yield return ["tiny", "100", "E", "Vein and artery severed"];
    }

    /// <summary>
    /// The manifest enables exactly the expected slug set, and every enabled entry is a
    /// <c>standard</c>/<c>xml</c> entry whose source PDF exists in the repository.
    /// </summary>
    [Fact]
    public void Manifest_enables_the_phase_3_standard_table_set()
    {
        var manifest = LoadManifest();
        var enabledTables = manifest.Tables
            .Where(item => item.Enabled)
            .OrderBy(item => item.Slug, StringComparer.Ordinal)
            .ToList();

        Assert.Equal(ExpectedPhase3Slugs, enabledTables.Select(item => item.Slug));
        Assert.All(enabledTables, entry =>
        {
            Assert.Equal("standard", entry.Family);
            Assert.Equal("xml", entry.ExtractionMethod);
            Assert.True(File.Exists(Path.Combine(GetRepositoryRoot(), entry.PdfPath)), $"Missing source PDF for '{entry.Slug}'.");
        });
    }

    /// <summary>
    /// Each enabled standard table parses without validation errors into five columns and a
    /// full rows-by-columns grid of results.
    /// </summary>
    [Theory]
    [MemberData(nameof(EnabledStandardTables))]
    public async Task Enabled_standard_tables_extract_and_parse_successfully(CriticalImportManifestEntry entry)
    {
        var parseResult = await LoadParseResultAsync(entry);

        Assert.True(parseResult.ValidationReport.IsValid, string.Join(Environment.NewLine, parseResult.ValidationReport.Errors));
        Assert.Equal(5, parseResult.Table.Columns.Count);
        Assert.NotEmpty(parseResult.Table.RollBands);
        Assert.Equal(parseResult.ValidationReport.RowCount * 5, parseResult.ValidationReport.CellCount);
        Assert.Equal(parseResult.ValidationReport.CellCount, parseResult.Table.Results.Count);
    }

    /// <summary>
    /// The cell addressed by roll band and column contains the expected text (case-insensitive).
    /// </summary>
    [Theory]
    [MemberData(nameof(RepresentativeCells))]
    public async Task Representative_cells_keep_expected_descriptions(
        string slug,
        string rollBandLabel,
        string columnKey,
        string expectedSnippet)
    {
        var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, slug, StringComparison.Ordinal));
        var parseResult = await LoadParseResultAsync(entry);
        var result = parseResult.Table.Results.Single(item =>
            string.Equals(item.RollBandLabel, rollBandLabel, StringComparison.Ordinal) &&
            string.Equals(item.ColumnKey, columnKey, StringComparison.Ordinal));

        Assert.Contains(expectedSnippet, result.DescriptionText, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Regression check: the raw text of Slash 56-60/A starts with its prose sentence.
    /// </summary>
    [Fact]
    public async Task Slash_boundary_repair_keeps_56_60_a_prose_first()
    {
        var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "slash", StringComparison.Ordinal));
        var parseResult = await LoadParseResultAsync(entry);
        var result = parseResult.Table.Results.Single(item =>
            string.Equals(item.RollBandLabel, "56-60", StringComparison.Ordinal) &&
            string.Equals(item.ColumnKey, "A", StringComparison.Ordinal));

        Assert.StartsWith("You recover from your initial swing", result.RawCellText, StringComparison.Ordinal);
    }

    /// <summary>
    /// Extracts the entry's PDF to a cached XML artifact (when needed) and parses it.
    /// </summary>
    /// <param name="entry">Manifest entry whose <c>PdfPath</c> is relative to the repository root.</param>
    /// <returns>The parser result for the entry's XML artifact.</returns>
    private static async Task<StandardCriticalTableParseResult> LoadParseResultAsync(CriticalImportManifestEntry entry)
    {
        var pdfPath = Path.Combine(GetRepositoryRoot(), entry.PdfPath);
        var xmlPath = Path.Combine(GetArtifactCacheRoot(), $"{entry.Slug}.xml");

        // The cache lives in the temp directory and outlives test runs, so an artifact
        // older than its source PDF is stale and must be regenerated rather than reused.
        var cacheIsFresh = File.Exists(xmlPath) &&
            File.GetLastWriteTimeUtc(xmlPath) >= File.GetLastWriteTimeUtc(pdfPath);

        if (!cacheIsFresh)
        {
            // Remove any stale artifact first so the extractor always writes a new file.
            File.Delete(xmlPath);
            await Extractor.ExtractAsync(pdfPath, xmlPath);
        }

        var xmlContent = await File.ReadAllTextAsync(xmlPath);
        return Parser.Parse(entry, xmlContent);
    }

    /// <summary>Loads the import manifest from <c>sources/critical-import-manifest.json</c>.</summary>
    private static CriticalImportManifest LoadManifest() =>
        new CriticalImportManifestLoader().Load(Path.Combine(GetRepositoryRoot(), "sources", "critical-import-manifest.json"));

    /// <summary>Returns (creating it if necessary) the temp directory used to cache XML artifacts.</summary>
    private static string GetArtifactCacheRoot()
    {
        var cacheRoot = Path.Combine(Path.GetTempPath(), "RolemasterDb.ImportTool.Tests");
        Directory.CreateDirectory(cacheRoot);
        return cacheRoot;
    }

    /// <summary>
    /// Walks up from the test binary directory until a directory containing
    /// <c>RolemasterDB.slnx</c> is found.
    /// </summary>
    /// <exception cref="InvalidOperationException">No ancestor directory contains the solution file.</exception>
    private static string GetRepositoryRoot()
    {
        for (var probe = new DirectoryInfo(AppContext.BaseDirectory); probe is not null; probe = probe.Parent)
        {
            if (File.Exists(Path.Combine(probe.FullName, "RolemasterDB.slnx")))
            {
                return probe.FullName;
            }
        }

        throw new InvalidOperationException("Could not find the repository root for integration tests.");
    }
}
|
||||||
3
src/RolemasterDb.ImportTool.Tests/TestAssembly.cs
Normal file
3
src/RolemasterDb.ImportTool.Tests/TestAssembly.cs
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
using Xunit;
|
||||||
|
|
||||||
|
// Turns off xUnit's parallel execution of test collections for this assembly.
// NOTE(review): presumably needed because the integration tests share one on-disk XML
// artifact cache in the temp directory — confirm before re-enabling parallelization.
[assembly: CollectionBehavior(DisableTestParallelization = true)]
|
||||||
@@ -7,14 +7,17 @@ namespace RolemasterDb.ImportTool.Parsing;
|
|||||||
public sealed class StandardCriticalTableParser
|
public sealed class StandardCriticalTableParser
|
||||||
{
|
{
|
||||||
private const int HeaderToBodyMinimumGap = 20;
|
private const int HeaderToBodyMinimumGap = 20;
|
||||||
|
private const int FooterLabelExclusionGap = 15;
|
||||||
|
private const int FooterPageNumberExclusionGap = 80;
|
||||||
|
private const int RowLabelDuplicateTolerance = 15;
|
||||||
private const int TopGroupingTolerance = 2;
|
private const int TopGroupingTolerance = 2;
|
||||||
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled);
|
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled);
|
||||||
|
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-)\d+\)$", RegexOptions.Compiled);
|
||||||
|
|
||||||
public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
|
public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
|
||||||
{
|
{
|
||||||
var fragments = LoadFragments(xmlContent);
|
var fragments = LoadFragments(xmlContent);
|
||||||
var headerFragments = FindHeaderFragments(fragments);
|
var headerFragments = FindHeaderFragments(fragments);
|
||||||
var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments);
|
|
||||||
var validationErrors = new List<string>();
|
var validationErrors = new List<string>();
|
||||||
|
|
||||||
var columnCenters = headerFragments
|
var columnCenters = headerFragments
|
||||||
@@ -22,6 +25,16 @@ public sealed class StandardCriticalTableParser
|
|||||||
.Select(item => new ColumnAnchor(item.Text.ToUpperInvariant(), item.CenterX))
|
.Select(item => new ColumnAnchor(item.Text.ToUpperInvariant(), item.CenterX))
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
|
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
|
||||||
|
var keyTop = fragments
|
||||||
|
.Where(item =>
|
||||||
|
string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
|
||||||
|
item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) ||
|
||||||
|
item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
|
||||||
|
.Select(item => (int?)item.Top)
|
||||||
|
.Min() ?? int.MaxValue;
|
||||||
|
var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments, keyTop);
|
||||||
|
|
||||||
var rowAnchors = rowLabelFragments
|
var rowAnchors = rowLabelFragments
|
||||||
.OrderBy(item => item.Top)
|
.OrderBy(item => item.Top)
|
||||||
.Select((item, index) => new RowAnchor(item.Text, item.Top, index + 1))
|
.Select((item, index) => new RowAnchor(item.Text, item.Top, index + 1))
|
||||||
@@ -32,16 +45,11 @@ public sealed class StandardCriticalTableParser
|
|||||||
validationErrors.Add("No roll-band labels were found in the XML artifact.");
|
validationErrors.Add("No roll-band labels were found in the XML artifact.");
|
||||||
}
|
}
|
||||||
|
|
||||||
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
|
|
||||||
var keyTop = fragments
|
|
||||||
.Where(item => string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase))
|
|
||||||
.Select(item => (int?)item.Top)
|
|
||||||
.Min() ?? int.MaxValue;
|
|
||||||
|
|
||||||
var bodyFragments = fragments
|
var bodyFragments = fragments
|
||||||
.Where(item =>
|
.Where(item =>
|
||||||
item.Top >= bodyStartTop &&
|
item.Top >= bodyStartTop &&
|
||||||
item.Top < keyTop - 1 &&
|
item.Top < keyTop - 1 &&
|
||||||
|
!IsFooterPageNumberFragment(item, keyTop) &&
|
||||||
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
|
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
|
||||||
!headerFragments.Contains(item))
|
!headerFragments.Contains(item))
|
||||||
.ToList();
|
.ToList();
|
||||||
@@ -56,11 +64,11 @@ public sealed class StandardCriticalTableParser
|
|||||||
{
|
{
|
||||||
var rowStart = rowIndex == 0
|
var rowStart = rowIndex == 0
|
||||||
? bodyStartTop
|
? bodyStartTop
|
||||||
: (int)Math.Floor((rowAnchors[rowIndex - 1].Top + rowAnchors[rowIndex].Top) / 2.0);
|
: (int)Math.Floor((rowAnchors[rowIndex - 1].Top + rowAnchors[rowIndex].Top) / 2.0) + 1;
|
||||||
|
|
||||||
var rowEnd = rowIndex == rowAnchors.Count - 1
|
var rowEnd = rowIndex == rowAnchors.Count - 1
|
||||||
? keyTop - 1
|
? keyTop - 1
|
||||||
: (int)Math.Floor((rowAnchors[rowIndex].Top + rowAnchors[rowIndex + 1].Top) / 2.0);
|
: (int)Math.Floor((rowAnchors[rowIndex].Top + rowAnchors[rowIndex + 1].Top) / 2.0) + 1;
|
||||||
|
|
||||||
var rowFragments = bodyFragments
|
var rowFragments = bodyFragments
|
||||||
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
|
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
|
||||||
@@ -95,26 +103,12 @@ public sealed class StandardCriticalTableParser
|
|||||||
|
|
||||||
foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey))
|
foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey))
|
||||||
{
|
{
|
||||||
var firstProseIndex = cellEntry.Lines.FindIndex(line => !IsAffixLikeLine(line));
|
var segmentCount = CountLineTypeSegments(cellEntry.Lines);
|
||||||
var firstAffixIndex = cellEntry.Lines.FindIndex(IsAffixLikeLine);
|
|
||||||
|
|
||||||
if (firstProseIndex > 0)
|
if (segmentCount > 2)
|
||||||
{
|
{
|
||||||
validationErrors.Add(
|
validationErrors.Add(
|
||||||
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' begins with affix-like lines before prose.");
|
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
|
||||||
}
|
|
||||||
|
|
||||||
if (firstAffixIndex >= 0)
|
|
||||||
{
|
|
||||||
var proseAfterAffix = cellEntry.Lines
|
|
||||||
.Skip(firstAffixIndex + 1)
|
|
||||||
.Any(line => !IsAffixLikeLine(line));
|
|
||||||
|
|
||||||
if (proseAfterAffix)
|
|
||||||
{
|
|
||||||
validationErrors.Add(
|
|
||||||
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' contains prose after affix lines.");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var rawAffixLines = cellEntry.Lines.Where(IsAffixLikeLine).ToList();
|
var rawAffixLines = cellEntry.Lines.Where(IsAffixLikeLine).ToList();
|
||||||
@@ -200,12 +194,13 @@ public sealed class StandardCriticalTableParser
|
|||||||
|
|
||||||
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
|
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
|
||||||
{
|
{
|
||||||
var groupedByTop = fragments
|
var headerCandidates = fragments
|
||||||
.Where(item => item.Text.Length == 1 && char.IsLetter(item.Text[0]))
|
.Where(item => item.Text.Length == 1 && char.IsLetter(item.Text[0]))
|
||||||
.GroupBy(item => item.Top)
|
.OrderBy(item => item.Top)
|
||||||
.OrderBy(group => group.Key);
|
.ThenBy(item => item.Left)
|
||||||
|
.ToList();
|
||||||
|
|
||||||
foreach (var group in groupedByTop)
|
foreach (var group in GroupByTop(headerCandidates))
|
||||||
{
|
{
|
||||||
var ordered = group.OrderBy(item => item.Left).ToList();
|
var ordered = group.OrderBy(item => item.Left).ToList();
|
||||||
var labels = ordered.Select(item => item.Text.ToUpperInvariant()).ToList();
|
var labels = ordered.Select(item => item.Text.ToUpperInvariant()).ToList();
|
||||||
@@ -220,18 +215,37 @@ public sealed class StandardCriticalTableParser
|
|||||||
|
|
||||||
private static List<XmlTextFragment> FindRowLabelFragments(
|
private static List<XmlTextFragment> FindRowLabelFragments(
|
||||||
IReadOnlyList<XmlTextFragment> fragments,
|
IReadOnlyList<XmlTextFragment> fragments,
|
||||||
IReadOnlyList<XmlTextFragment> headerFragments)
|
IReadOnlyList<XmlTextFragment> headerFragments,
|
||||||
|
int keyTop)
|
||||||
{
|
{
|
||||||
var leftCutoff = headerFragments.Min(item => item.Left) - 10;
|
var leftCutoff = headerFragments.Min(item => item.Left) - 10;
|
||||||
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
|
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
|
||||||
|
|
||||||
return fragments
|
var candidates = fragments
|
||||||
.Where(item =>
|
.Where(item =>
|
||||||
item.Left < leftCutoff &&
|
item.Left < leftCutoff &&
|
||||||
item.Top >= bodyStartTop &&
|
item.Top >= bodyStartTop &&
|
||||||
|
item.Top < keyTop - FooterLabelExclusionGap &&
|
||||||
IsRollBandLabel(item.Text))
|
IsRollBandLabel(item.Text))
|
||||||
.OrderBy(item => item.Top)
|
.OrderBy(item => item.Top)
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
|
var deduped = new List<XmlTextFragment>();
|
||||||
|
|
||||||
|
foreach (var candidate in candidates)
|
||||||
|
{
|
||||||
|
var previous = deduped.LastOrDefault();
|
||||||
|
if (previous is not null &&
|
||||||
|
string.Equals(previous.Text, candidate.Text, StringComparison.OrdinalIgnoreCase) &&
|
||||||
|
Math.Abs(previous.Top - candidate.Top) <= RowLabelDuplicateTolerance)
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
deduped.Add(candidate);
|
||||||
|
}
|
||||||
|
|
||||||
|
return deduped;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static bool IsRollBandLabel(string value) =>
|
private static bool IsRollBandLabel(string value) =>
|
||||||
@@ -293,7 +307,7 @@ public sealed class StandardCriticalTableParser
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (value == "-" || value == "\u2014")
|
if (value == "-" || value == "\u2013" || value == "\u2014")
|
||||||
{
|
{
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -301,7 +315,10 @@ public sealed class StandardCriticalTableParser
|
|||||||
if (value.StartsWith("with ", StringComparison.OrdinalIgnoreCase) ||
|
if (value.StartsWith("with ", StringComparison.OrdinalIgnoreCase) ||
|
||||||
value.StartsWith("w/o ", StringComparison.OrdinalIgnoreCase) ||
|
value.StartsWith("w/o ", StringComparison.OrdinalIgnoreCase) ||
|
||||||
value.StartsWith("without ", StringComparison.OrdinalIgnoreCase) ||
|
value.StartsWith("without ", StringComparison.OrdinalIgnoreCase) ||
|
||||||
value.StartsWith("if ", StringComparison.OrdinalIgnoreCase))
|
value.StartsWith("if ", StringComparison.OrdinalIgnoreCase) ||
|
||||||
|
value.StartsWith("while ", StringComparison.OrdinalIgnoreCase) ||
|
||||||
|
value.StartsWith("until ", StringComparison.OrdinalIgnoreCase) ||
|
||||||
|
value.StartsWith("unless ", StringComparison.OrdinalIgnoreCase))
|
||||||
{
|
{
|
||||||
return value.Contains(':', StringComparison.Ordinal);
|
return value.Contains(':', StringComparison.Ordinal);
|
||||||
}
|
}
|
||||||
@@ -311,10 +328,9 @@ public sealed class StandardCriticalTableParser
|
|||||||
value.StartsWith("\u220F", StringComparison.Ordinal) ||
|
value.StartsWith("\u220F", StringComparison.Ordinal) ||
|
||||||
value.StartsWith("\u03C0", StringComparison.Ordinal) ||
|
value.StartsWith("\u03C0", StringComparison.Ordinal) ||
|
||||||
value.StartsWith("\u222B", StringComparison.Ordinal) ||
|
value.StartsWith("\u222B", StringComparison.Ordinal) ||
|
||||||
|
StandaloneModifierAffixLineRegex.IsMatch(value) ||
|
||||||
NumericAffixLineRegex.IsMatch(value) ||
|
NumericAffixLineRegex.IsMatch(value) ||
|
||||||
value.Contains(" - ", StringComparison.Ordinal) ||
|
value.Contains(" - ", StringComparison.Ordinal);
|
||||||
value.Contains("(-", StringComparison.Ordinal) ||
|
|
||||||
value.Contains("(+", StringComparison.Ordinal);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries)
|
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries)
|
||||||
@@ -361,6 +377,55 @@ public sealed class StandardCriticalTableParser
|
|||||||
.Replace('\n', ' ')
|
.Replace('\n', ' ')
|
||||||
.Trim();
|
.Trim();
|
||||||
|
|
||||||
|
/// <summary>
/// Counts the contiguous runs of same-kind lines in a cell, where each line is either
/// affix-like (per <c>IsAffixLikeLine</c>) or prose.
/// </summary>
/// <param name="lines">Cell lines in reading order.</param>
/// <returns>
/// 0 for an empty cell, 1 when every line is of one kind, 2 for a single prose/affix switch,
/// and so on. The parser reports cells with more than two runs as interleaved.
/// </returns>
private static int CountLineTypeSegments(IReadOnlyList<string> lines)
{
    if (lines.Count == 0)
    {
        return 0;
    }

    // The first line always opens the first run.
    var runKindIsAffix = IsAffixLikeLine(lines[0]);
    var runs = 1;

    for (var index = 1; index < lines.Count; index++)
    {
        var lineIsAffix = IsAffixLikeLine(lines[index]);
        if (lineIsAffix != runKindIsAffix)
        {
            // Kind flipped: a new run starts here.
            runs++;
            runKindIsAffix = lineIsAffix;
        }
    }

    return runs;
}
|
||||||
|
|
||||||
|
// Matches text that is exactly a two- or three-digit number. Kept as a cached compiled
// instance to match the other line-classification patterns in this parser.
private static readonly Regex FooterPageNumberRegex = new(@"^\d{2,3}$", RegexOptions.Compiled);

/// <summary>
/// Tells whether a fragment looks like a page number printed in the footer zone, so it can
/// be excluded from the table body.
/// </summary>
/// <param name="fragment">The text fragment to classify.</param>
/// <param name="keyTop">
/// Top coordinate of the footer/key marker; <see cref="int.MaxValue"/> means no marker was
/// found, in which case nothing is treated as a footer page number.
/// </param>
/// <returns>
/// <c>true</c> when the fragment lies at or below <c>keyTop - FooterPageNumberExclusionGap</c>
/// and its text is a two- or three-digit number; otherwise <c>false</c>.
/// </returns>
private static bool IsFooterPageNumberFragment(XmlTextFragment fragment, int keyTop)
{
    if (keyTop == int.MaxValue)
    {
        // No footer reference point, so there is no footer zone to test against.
        return false;
    }

    var footerZoneTop = keyTop - FooterPageNumberExclusionGap;
    return fragment.Top >= footerZoneTop && FooterPageNumberRegex.IsMatch(fragment.Text);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Splits fragments into consecutive groups whose <c>Top</c> values lie within
/// <c>TopGroupingTolerance</c> of the group's first fragment.
/// </summary>
/// <param name="fragments">
/// Fragments in the order they should be scanned. Only adjacent items are merged, so the
/// header-detection caller sorts by <c>Top</c> before calling.
/// </param>
/// <returns>The groups in encounter order; empty when <paramref name="fragments"/> is empty.</returns>
private static IEnumerable<List<XmlTextFragment>> GroupByTop(IReadOnlyList<XmlTextFragment> fragments)
{
    var buckets = new List<List<XmlTextFragment>>();

    foreach (var candidate in fragments)
    {
        // Distance is measured from the first member of the latest group, not its last,
        // so a group never drifts further than the tolerance from where it started.
        var startsNewGroup = buckets.Count == 0
            || Math.Abs(buckets[^1][0].Top - candidate.Top) > TopGroupingTolerance;

        if (startsNewGroup)
        {
            buckets.Add(new List<XmlTextFragment>());
        }

        buckets[^1].Add(candidate);
    }

    return buckets;
}
|
||||||
|
|
||||||
private sealed record ColumnAnchor(string Key, double CenterX);
|
private sealed record ColumnAnchor(string Key, double CenterX);
|
||||||
|
|
||||||
private sealed record RowAnchor(string Label, int Top, int SortOrder);
|
private sealed record RowAnchor(string Label, int Top, int SortOrder);
|
||||||
|
|||||||
Reference in New Issue
Block a user