Implement phase 4 critical table imports
This commit is contained in:
@@ -30,8 +30,10 @@ The current implementation supports:
|
|||||||
- explicit CLI commands for reset, extraction, and import
|
- explicit CLI commands for reset, extraction, and import
|
||||||
- manifest-driven source selection
|
- manifest-driven source selection
|
||||||
- `standard` critical tables with columns `A-E`
|
- `standard` critical tables with columns `A-E`
|
||||||
|
- `variant_column` critical tables with non-severity columns
|
||||||
|
- `grouped_variant` critical tables with a group axis plus variant columns
|
||||||
- XML-based extraction using `pdftohtml -xml`
|
- XML-based extraction using `pdftohtml -xml`
|
||||||
- geometry-based parsing across the currently enabled phase-3 tables:
|
- geometry-based parsing across the currently enabled table set:
|
||||||
- `arcane-aether`
|
- `arcane-aether`
|
||||||
- `arcane-nether`
|
- `arcane-nether`
|
||||||
- `ballistic-shrapnel`
|
- `ballistic-shrapnel`
|
||||||
@@ -42,22 +44,24 @@ The current implementation supports:
|
|||||||
- `heat`
|
- `heat`
|
||||||
- `impact`
|
- `impact`
|
||||||
- `krush`
|
- `krush`
|
||||||
|
- `large_creature_magic`
|
||||||
|
- `large_creature_weapon`
|
||||||
- `ma-strikes`
|
- `ma-strikes`
|
||||||
- `ma-sweeps`
|
- `ma-sweeps`
|
||||||
- `mana`
|
- `mana`
|
||||||
- `puncture`
|
- `puncture`
|
||||||
- `slash`
|
- `slash`
|
||||||
- `subdual`
|
- `subdual`
|
||||||
|
- `super_large_creature_weapon`
|
||||||
- `tiny`
|
- `tiny`
|
||||||
- `unbalance`
|
- `unbalance`
|
||||||
- row-boundary repair for trailing affix leakage
|
- row-boundary repair for trailing affix leakage
|
||||||
|
- split row-label reconstruction for tables that render labels such as `99-` / `100` as two fragments
|
||||||
- footer/page-number filtering during body parsing
|
- footer/page-number filtering during body parsing
|
||||||
- transactional loading into SQLite
|
- transactional loading into SQLite
|
||||||
|
|
||||||
The current implementation does not yet support:
|
The current implementation does not yet support:
|
||||||
|
|
||||||
- variant-column critical tables
|
|
||||||
- grouped variant tables
|
|
||||||
- OCR/image-based PDFs such as `Void.pdf`
|
- OCR/image-based PDFs such as `Void.pdf`
|
||||||
- normalized `critical_branch` population
|
- normalized `critical_branch` population
|
||||||
- normalized `critical_effect` population
|
- normalized `critical_effect` population
|
||||||
@@ -246,9 +250,28 @@ Current phase-3 notes:
|
|||||||
|
|
||||||
### Phase 4: Variant and Grouped Tables
|
### Phase 4: Variant and Grouped Tables
|
||||||
|
|
||||||
- support `variant_column` tables such as `Large Creature - Weapon.pdf`
|
Phase 4 extended the importer beyond the `standard` family's `A-E` tables to the `variant_column` and `grouped_variant` families.
|
||||||
- support `grouped_variant` tables such as `Large Creature - Magic.pdf`
|
|
||||||
- add parser strategies for additional table families
|
The currently enabled phase-4 table set is:
|
||||||
|
|
||||||
|
- `large_creature_weapon`
|
||||||
|
- `family`: `variant_column`
|
||||||
|
- columns: `NORMAL`, `MAGIC`, `MITHRIL`, `HOLY_ARMS`, `SLAYING`
|
||||||
|
- `super_large_creature_weapon`
|
||||||
|
- `family`: `variant_column`
|
||||||
|
- columns: `NORMAL`, `MAGIC`, `MITHRIL`, `HOLY_ARMS`, `SLAYING`
|
||||||
|
- `large_creature_magic`
|
||||||
|
- `family`: `grouped_variant`
|
||||||
|
- groups: `large`, `super_large`
|
||||||
|
- columns: `NORMAL`, `SLAYING`
|
||||||
|
|
||||||
|
Phase-4 notes:
|
||||||
|
|
||||||
|
- grouped results now populate `critical_group` during SQLite load
|
||||||
|
- parser dispatch is now selected by the manifest `family` value rather than being limited to `standard` tables
|
||||||
|
- left-margin row labels can be reconstructed from split fragments such as `151-` / `175`
|
||||||
|
- the grouped magic PDF is imported once as `large_creature_magic`
|
||||||
|
- `sources/Large Creature - Magic.pdf` and `sources/Super Large Creature - Magic.pdf` are duplicate files
|
||||||
|
|
||||||
### Phase 5: Conditional Branch Extraction
|
### Phase 5: Conditional Branch Extraction
|
||||||
|
|
||||||
@@ -335,10 +358,12 @@ Each entry declares:
|
|||||||
|
|
||||||
The manifest is intentionally the control point for enabling importer support one table at a time.
|
The manifest is intentionally the control point for enabling importer support one table at a time.
|
||||||
|
|
||||||
For the currently enabled phase-3 entries:
|
For the currently enabled entries:
|
||||||
|
|
||||||
- `family` is `standard`
|
- standard tables use `family: standard`
|
||||||
- `extractionMethod` is `xml`
|
- creature weapon tables use `family: variant_column`
|
||||||
|
- grouped creature magic uses `family: grouped_variant`
|
||||||
|
- all enabled entries currently use `extractionMethod: xml`
|
||||||
|
|
||||||
## Artifact Layout
|
## Artifact Layout
|
||||||
|
|
||||||
|
|||||||
@@ -19,11 +19,12 @@ The PDFs are not one uniform table shape. I found three families:
|
|||||||
- Example: `Large Creature - Magic.pdf` has:
|
- Example: `Large Creature - Magic.pdf` has:
|
||||||
- group: `large`, `super_large`
|
- group: `large`, `super_large`
|
||||||
- column: `normal`, `slaying`
|
- column: `normal`, `slaying`
|
||||||
|
- In the current importer manifest, the grouped magic PDF is loaded once as `large_creature_magic` because the `Large Creature - Magic.pdf` and `Super Large Creature - Magic.pdf` source files are duplicates.
|
||||||
- row: roll band
|
- row: roll band
|
||||||
|
|
||||||
There are also extraction constraints:
|
There are also extraction constraints:
|
||||||
|
|
||||||
- Most PDFs are text extractable with `pdftotext -layout`.
|
- Most PDFs contain extractable text and can be processed with `pdftohtml -xml`.
|
||||||
- `Void.pdf` appears image-based and will need OCR or manual transcription.
|
- `Void.pdf` appears image-based and will need OCR or manual transcription.
|
||||||
- A single cell can contain:
|
- A single cell can contain:
|
||||||
- base description text
|
- base description text
|
||||||
@@ -282,4 +283,3 @@ Recommended import flow:
|
|||||||
6. Route image PDFs like `Void.pdf` through OCR before the same parser.
|
6. Route image PDFs like `Void.pdf` through OCR before the same parser.
|
||||||
|
|
||||||
The important design decision is: never throw away the original text. The prose is too irregular to rely on normalized fields alone.
|
The important design decision is: never throw away the original text. The prose is too irregular to rely on normalized fields alone.
|
||||||
|
|
||||||
|
|||||||
@@ -80,6 +80,22 @@
|
|||||||
"pdfPath": "sources/Krush.pdf",
|
"pdfPath": "sources/Krush.pdf",
|
||||||
"enabled": true
|
"enabled": true
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"slug": "large_creature_magic",
|
||||||
|
"displayName": "Spells Against Creatures Critical Strike Table",
|
||||||
|
"family": "grouped_variant",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Large Creature - Magic.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "large_creature_weapon",
|
||||||
|
"displayName": "Large Creature Critical Strike Table",
|
||||||
|
"family": "variant_column",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Large Creature - Weapon.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"slug": "ma-strikes",
|
"slug": "ma-strikes",
|
||||||
"displayName": "Martial Arts Strikes Critical Strike Table",
|
"displayName": "Martial Arts Strikes Critical Strike Table",
|
||||||
@@ -128,6 +144,14 @@
|
|||||||
"pdfPath": "sources/Subdual.pdf",
|
"pdfPath": "sources/Subdual.pdf",
|
||||||
"enabled": true
|
"enabled": true
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"slug": "super_large_creature_weapon",
|
||||||
|
"displayName": "Super Large Creature Critical Strike Table",
|
||||||
|
"family": "variant_column",
|
||||||
|
"extractionMethod": "xml",
|
||||||
|
"pdfPath": "sources/Super Large Creature - Weapon.pdf",
|
||||||
|
"enabled": true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"slug": "tiny",
|
"slug": "tiny",
|
||||||
"displayName": "Tiny Critical Strike Table",
|
"displayName": "Tiny Critical Strike Table",
|
||||||
|
|||||||
Binary file not shown.
@@ -4,7 +4,7 @@ namespace RolemasterDb.ImportTool.Tests;
|
|||||||
|
|
||||||
public sealed class StandardCriticalTableParserIntegrationTests
|
public sealed class StandardCriticalTableParserIntegrationTests
|
||||||
{
|
{
|
||||||
private static readonly string[] ExpectedPhase3Slugs =
|
private static readonly string[] ExpectedEnabledSlugs =
|
||||||
[
|
[
|
||||||
"arcane-aether",
|
"arcane-aether",
|
||||||
"arcane-nether",
|
"arcane-nether",
|
||||||
@@ -16,20 +16,25 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
"heat",
|
"heat",
|
||||||
"impact",
|
"impact",
|
||||||
"krush",
|
"krush",
|
||||||
|
"large_creature_magic",
|
||||||
|
"large_creature_weapon",
|
||||||
"ma-strikes",
|
"ma-strikes",
|
||||||
"ma-sweeps",
|
"ma-sweeps",
|
||||||
"mana",
|
"mana",
|
||||||
"puncture",
|
"puncture",
|
||||||
"slash",
|
"slash",
|
||||||
"subdual",
|
"subdual",
|
||||||
|
"super_large_creature_weapon",
|
||||||
"tiny",
|
"tiny",
|
||||||
"unbalance"
|
"unbalance"
|
||||||
];
|
];
|
||||||
|
|
||||||
private static readonly PdfXmlExtractor Extractor = new();
|
private static readonly PdfXmlExtractor Extractor = new();
|
||||||
private static readonly StandardCriticalTableParser Parser = new();
|
private static readonly StandardCriticalTableParser StandardParser = new();
|
||||||
|
private static readonly VariantColumnCriticalTableParser VariantColumnParser = new();
|
||||||
|
private static readonly GroupedVariantCriticalTableParser GroupedVariantParser = new();
|
||||||
|
|
||||||
public static IEnumerable<object[]> EnabledStandardTables() =>
|
public static IEnumerable<object[]> EnabledTables() =>
|
||||||
LoadManifest().Tables
|
LoadManifest().Tables
|
||||||
.Where(item => item.Enabled)
|
.Where(item => item.Enabled)
|
||||||
.OrderBy(item => item.Slug, StringComparer.Ordinal)
|
.OrderBy(item => item.Slug, StringComparer.Ordinal)
|
||||||
@@ -37,18 +42,22 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
|
|
||||||
public static IEnumerable<object[]> RepresentativeCells()
|
public static IEnumerable<object[]> RepresentativeCells()
|
||||||
{
|
{
|
||||||
yield return ["slash", "71-75", "A", "Blow falls on lower leg"];
|
yield return new object[] { "slash", null!, "71-75", "A", "Blow falls on lower leg" };
|
||||||
yield return ["puncture", "66", "C", "Strike shatters foe's knee"];
|
yield return new object[] { "puncture", null!, "66", "C", "Strike shatters foe's knee" };
|
||||||
yield return ["ballistic-shrapnel", "86-90", "E", "destroy his heart"];
|
yield return new object[] { "ballistic-shrapnel", null!, "86-90", "E", "destroy his heart" };
|
||||||
yield return ["arcane-aether", "96-99", "E", "smoking pulp"];
|
yield return new object[] { "arcane-aether", null!, "96-99", "E", "smoking pulp" };
|
||||||
yield return ["ma-strikes", "96-99", "E", "drives bone into brain"];
|
yield return new object[] { "ma-strikes", null!, "96-99", "E", "drives bone into brain" };
|
||||||
yield return ["mana", "96-99", "E", "momentarily transformed"];
|
yield return new object[] { "mana", null!, "96-99", "E", "momentarily transformed" };
|
||||||
yield return ["mana", "100", "E", "Mana consumes everything"];
|
yield return new object[] { "mana", null!, "100", "E", "Mana consumes everything" };
|
||||||
yield return ["tiny", "100", "E", "Vein and artery severed"];
|
yield return new object[] { "tiny", null!, "100", "E", "Vein and artery severed" };
|
||||||
|
yield return new object[] { "large_creature_weapon", null!, "01-05", "NORMAL", "Weapon shatters" };
|
||||||
|
yield return new object[] { "super_large_creature_weapon", null!, "31-40", "SLAYING", "Boom! Solid without question" };
|
||||||
|
yield return new object[] { "large_creature_magic", "large", "251+", "NORMAL", "Foe lowers his eyes within your reach" };
|
||||||
|
yield return new object[] { "large_creature_magic", "super_large", "251+", "SLAYING", "Blast goes in through foe's eye" };
|
||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void Manifest_enables_the_phase_3_standard_table_set()
|
public void Manifest_enables_the_phase_4_table_set()
|
||||||
{
|
{
|
||||||
var manifest = LoadManifest();
|
var manifest = LoadManifest();
|
||||||
var enabledTables = manifest.Tables
|
var enabledTables = manifest.Tables
|
||||||
@@ -56,25 +65,29 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
.OrderBy(item => item.Slug, StringComparer.Ordinal)
|
.OrderBy(item => item.Slug, StringComparer.Ordinal)
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
Assert.Equal(ExpectedPhase3Slugs, enabledTables.Select(item => item.Slug));
|
Assert.Equal(ExpectedEnabledSlugs, enabledTables.Select(item => item.Slug));
|
||||||
Assert.All(enabledTables, entry =>
|
Assert.All(enabledTables, entry =>
|
||||||
{
|
{
|
||||||
Assert.Equal("standard", entry.Family);
|
|
||||||
Assert.Equal("xml", entry.ExtractionMethod);
|
Assert.Equal("xml", entry.ExtractionMethod);
|
||||||
Assert.True(File.Exists(Path.Combine(GetRepositoryRoot(), entry.PdfPath)), $"Missing source PDF for '{entry.Slug}'.");
|
Assert.True(File.Exists(Path.Combine(GetRepositoryRoot(), entry.PdfPath)), $"Missing source PDF for '{entry.Slug}'.");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
Assert.Equal("variant_column", enabledTables.Single(item => item.Slug == "large_creature_weapon").Family);
|
||||||
|
Assert.Equal("variant_column", enabledTables.Single(item => item.Slug == "super_large_creature_weapon").Family);
|
||||||
|
Assert.Equal("grouped_variant", enabledTables.Single(item => item.Slug == "large_creature_magic").Family);
|
||||||
}
|
}
|
||||||
|
|
||||||
[Theory]
|
[Theory]
|
||||||
[MemberData(nameof(EnabledStandardTables))]
|
[MemberData(nameof(EnabledTables))]
|
||||||
public async Task Enabled_standard_tables_extract_and_parse_successfully(CriticalImportManifestEntry entry)
|
public async Task Enabled_tables_extract_and_parse_successfully(CriticalImportManifestEntry entry)
|
||||||
{
|
{
|
||||||
var parseResult = await LoadParseResultAsync(entry);
|
var parseResult = await LoadParseResultAsync(entry);
|
||||||
|
var expectedGroupCount = Math.Max(parseResult.Table.Groups.Count, 1);
|
||||||
|
|
||||||
Assert.True(parseResult.ValidationReport.IsValid, string.Join(Environment.NewLine, parseResult.ValidationReport.Errors));
|
Assert.True(parseResult.ValidationReport.IsValid, string.Join(Environment.NewLine, parseResult.ValidationReport.Errors));
|
||||||
Assert.Equal(5, parseResult.Table.Columns.Count);
|
Assert.NotEmpty(parseResult.Table.Columns);
|
||||||
Assert.NotEmpty(parseResult.Table.RollBands);
|
Assert.NotEmpty(parseResult.Table.RollBands);
|
||||||
Assert.Equal(parseResult.ValidationReport.RowCount * 5, parseResult.ValidationReport.CellCount);
|
Assert.Equal(parseResult.ValidationReport.RowCount * parseResult.Table.Columns.Count * expectedGroupCount, parseResult.ValidationReport.CellCount);
|
||||||
Assert.Equal(parseResult.ValidationReport.CellCount, parseResult.Table.Results.Count);
|
Assert.Equal(parseResult.ValidationReport.CellCount, parseResult.Table.Results.Count);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -82,6 +95,7 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
[MemberData(nameof(RepresentativeCells))]
|
[MemberData(nameof(RepresentativeCells))]
|
||||||
public async Task Representative_cells_keep_expected_descriptions(
|
public async Task Representative_cells_keep_expected_descriptions(
|
||||||
string slug,
|
string slug,
|
||||||
|
string? groupKey,
|
||||||
string rollBandLabel,
|
string rollBandLabel,
|
||||||
string columnKey,
|
string columnKey,
|
||||||
string expectedSnippet)
|
string expectedSnippet)
|
||||||
@@ -89,6 +103,7 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, slug, StringComparison.Ordinal));
|
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, slug, StringComparison.Ordinal));
|
||||||
var parseResult = await LoadParseResultAsync(entry);
|
var parseResult = await LoadParseResultAsync(entry);
|
||||||
var result = parseResult.Table.Results.Single(item =>
|
var result = parseResult.Table.Results.Single(item =>
|
||||||
|
string.Equals(item.GroupKey, groupKey, StringComparison.Ordinal) &&
|
||||||
string.Equals(item.RollBandLabel, rollBandLabel, StringComparison.Ordinal) &&
|
string.Equals(item.RollBandLabel, rollBandLabel, StringComparison.Ordinal) &&
|
||||||
string.Equals(item.ColumnKey, columnKey, StringComparison.Ordinal));
|
string.Equals(item.ColumnKey, columnKey, StringComparison.Ordinal));
|
||||||
|
|
||||||
@@ -101,6 +116,7 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "slash", StringComparison.Ordinal));
|
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "slash", StringComparison.Ordinal));
|
||||||
var parseResult = await LoadParseResultAsync(entry);
|
var parseResult = await LoadParseResultAsync(entry);
|
||||||
var result = parseResult.Table.Results.Single(item =>
|
var result = parseResult.Table.Results.Single(item =>
|
||||||
|
item.GroupKey is null &&
|
||||||
string.Equals(item.RollBandLabel, "56-60", StringComparison.Ordinal) &&
|
string.Equals(item.RollBandLabel, "56-60", StringComparison.Ordinal) &&
|
||||||
string.Equals(item.ColumnKey, "A", StringComparison.Ordinal));
|
string.Equals(item.ColumnKey, "A", StringComparison.Ordinal));
|
||||||
|
|
||||||
@@ -113,9 +129,11 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
|
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
|
||||||
var parseResult = await LoadParseResultAsync(entry);
|
var parseResult = await LoadParseResultAsync(entry);
|
||||||
var row96E = parseResult.Table.Results.Single(item =>
|
var row96E = parseResult.Table.Results.Single(item =>
|
||||||
|
item.GroupKey is null &&
|
||||||
string.Equals(item.RollBandLabel, "96-99", StringComparison.Ordinal) &&
|
string.Equals(item.RollBandLabel, "96-99", StringComparison.Ordinal) &&
|
||||||
string.Equals(item.ColumnKey, "E", StringComparison.Ordinal));
|
string.Equals(item.ColumnKey, "E", StringComparison.Ordinal));
|
||||||
var row100E = parseResult.Table.Results.Single(item =>
|
var row100E = parseResult.Table.Results.Single(item =>
|
||||||
|
item.GroupKey is null &&
|
||||||
string.Equals(item.RollBandLabel, "100", StringComparison.Ordinal) &&
|
string.Equals(item.RollBandLabel, "100", StringComparison.Ordinal) &&
|
||||||
string.Equals(item.ColumnKey, "E", StringComparison.Ordinal));
|
string.Equals(item.ColumnKey, "E", StringComparison.Ordinal));
|
||||||
|
|
||||||
@@ -130,6 +148,7 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
|
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
|
||||||
var parseResult = await LoadParseResultAsync(entry);
|
var parseResult = await LoadParseResultAsync(entry);
|
||||||
var row100C = parseResult.Table.Results.Single(item =>
|
var row100C = parseResult.Table.Results.Single(item =>
|
||||||
|
item.GroupKey is null &&
|
||||||
string.Equals(item.RollBandLabel, "100", StringComparison.Ordinal) &&
|
string.Equals(item.RollBandLabel, "100", StringComparison.Ordinal) &&
|
||||||
string.Equals(item.ColumnKey, "C", StringComparison.Ordinal));
|
string.Equals(item.ColumnKey, "C", StringComparison.Ordinal));
|
||||||
|
|
||||||
@@ -143,9 +162,11 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
|
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
|
||||||
var parseResult = await LoadParseResultAsync(entry);
|
var parseResult = await LoadParseResultAsync(entry);
|
||||||
var row71A = parseResult.Table.Results.Single(item =>
|
var row71A = parseResult.Table.Results.Single(item =>
|
||||||
|
item.GroupKey is null &&
|
||||||
string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) &&
|
string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) &&
|
||||||
string.Equals(item.ColumnKey, "A", StringComparison.Ordinal));
|
string.Equals(item.ColumnKey, "A", StringComparison.Ordinal));
|
||||||
var row71B = parseResult.Table.Results.Single(item =>
|
var row71B = parseResult.Table.Results.Single(item =>
|
||||||
|
item.GroupKey is null &&
|
||||||
string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) &&
|
string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) &&
|
||||||
string.Equals(item.ColumnKey, "B", StringComparison.Ordinal));
|
string.Equals(item.ColumnKey, "B", StringComparison.Ordinal));
|
||||||
|
|
||||||
@@ -159,9 +180,11 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
|
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
|
||||||
var parseResult = await LoadParseResultAsync(entry);
|
var parseResult = await LoadParseResultAsync(entry);
|
||||||
var row71D = parseResult.Table.Results.Single(item =>
|
var row71D = parseResult.Table.Results.Single(item =>
|
||||||
|
item.GroupKey is null &&
|
||||||
string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) &&
|
string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) &&
|
||||||
string.Equals(item.ColumnKey, "D", StringComparison.Ordinal));
|
string.Equals(item.ColumnKey, "D", StringComparison.Ordinal));
|
||||||
var row71E = parseResult.Table.Results.Single(item =>
|
var row71E = parseResult.Table.Results.Single(item =>
|
||||||
|
item.GroupKey is null &&
|
||||||
string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) &&
|
string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) &&
|
||||||
string.Equals(item.ColumnKey, "E", StringComparison.Ordinal));
|
string.Equals(item.ColumnKey, "E", StringComparison.Ordinal));
|
||||||
|
|
||||||
@@ -175,9 +198,11 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
|
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
|
||||||
var parseResult = await LoadParseResultAsync(entry);
|
var parseResult = await LoadParseResultAsync(entry);
|
||||||
var row91B = parseResult.Table.Results.Single(item =>
|
var row91B = parseResult.Table.Results.Single(item =>
|
||||||
|
item.GroupKey is null &&
|
||||||
string.Equals(item.RollBandLabel, "91-95", StringComparison.Ordinal) &&
|
string.Equals(item.RollBandLabel, "91-95", StringComparison.Ordinal) &&
|
||||||
string.Equals(item.ColumnKey, "B", StringComparison.Ordinal));
|
string.Equals(item.ColumnKey, "B", StringComparison.Ordinal));
|
||||||
var row91C = parseResult.Table.Results.Single(item =>
|
var row91C = parseResult.Table.Results.Single(item =>
|
||||||
|
item.GroupKey is null &&
|
||||||
string.Equals(item.RollBandLabel, "91-95", StringComparison.Ordinal) &&
|
string.Equals(item.RollBandLabel, "91-95", StringComparison.Ordinal) &&
|
||||||
string.Equals(item.ColumnKey, "C", StringComparison.Ordinal));
|
string.Equals(item.ColumnKey, "C", StringComparison.Ordinal));
|
||||||
|
|
||||||
@@ -191,9 +216,11 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
|
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
|
||||||
var parseResult = await LoadParseResultAsync(entry);
|
var parseResult = await LoadParseResultAsync(entry);
|
||||||
var row86B = parseResult.Table.Results.Single(item =>
|
var row86B = parseResult.Table.Results.Single(item =>
|
||||||
|
item.GroupKey is null &&
|
||||||
string.Equals(item.RollBandLabel, "86-90", StringComparison.Ordinal) &&
|
string.Equals(item.RollBandLabel, "86-90", StringComparison.Ordinal) &&
|
||||||
string.Equals(item.ColumnKey, "B", StringComparison.Ordinal));
|
string.Equals(item.ColumnKey, "B", StringComparison.Ordinal));
|
||||||
var row86C = parseResult.Table.Results.Single(item =>
|
var row86C = parseResult.Table.Results.Single(item =>
|
||||||
|
item.GroupKey is null &&
|
||||||
string.Equals(item.RollBandLabel, "86-90", StringComparison.Ordinal) &&
|
string.Equals(item.RollBandLabel, "86-90", StringComparison.Ordinal) &&
|
||||||
string.Equals(item.ColumnKey, "C", StringComparison.Ordinal));
|
string.Equals(item.ColumnKey, "C", StringComparison.Ordinal));
|
||||||
|
|
||||||
@@ -201,7 +228,28 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
Assert.Contains("+16H - 8", row86C.RawAffixText, StringComparison.Ordinal);
|
Assert.Contains("+16H - 8", row86C.RawAffixText, StringComparison.Ordinal);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static async Task<StandardCriticalTableParseResult> LoadParseResultAsync(CriticalImportManifestEntry entry)
|
[Fact]
|
||||||
|
public async Task Grouped_magic_table_keeps_large_and_super_large_groups_distinct()
|
||||||
|
{
|
||||||
|
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "large_creature_magic", StringComparison.Ordinal));
|
||||||
|
var parseResult = await LoadParseResultAsync(entry);
|
||||||
|
|
||||||
|
Assert.Equal(["large", "super_large"], parseResult.Table.Groups.Select(item => item.GroupKey));
|
||||||
|
|
||||||
|
var largeNormal = parseResult.Table.Results.Single(item =>
|
||||||
|
string.Equals(item.GroupKey, "large", StringComparison.Ordinal) &&
|
||||||
|
string.Equals(item.RollBandLabel, "251+", StringComparison.Ordinal) &&
|
||||||
|
string.Equals(item.ColumnKey, "NORMAL", StringComparison.Ordinal));
|
||||||
|
var superSlaying = parseResult.Table.Results.Single(item =>
|
||||||
|
string.Equals(item.GroupKey, "super_large", StringComparison.Ordinal) &&
|
||||||
|
string.Equals(item.RollBandLabel, "251+", StringComparison.Ordinal) &&
|
||||||
|
string.Equals(item.ColumnKey, "SLAYING", StringComparison.Ordinal));
|
||||||
|
|
||||||
|
Assert.DoesNotContain("Blast goes in through foe's eye", largeNormal.DescriptionText, StringComparison.OrdinalIgnoreCase);
|
||||||
|
Assert.Contains("Blast goes in through foe's eye", superSlaying.DescriptionText, StringComparison.OrdinalIgnoreCase);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static async Task<CriticalTableParseResult> LoadParseResultAsync(CriticalImportManifestEntry entry)
|
||||||
{
|
{
|
||||||
var xmlPath = Path.Combine(GetArtifactCacheRoot(), $"{entry.Slug}.xml");
|
var xmlPath = Path.Combine(GetArtifactCacheRoot(), $"{entry.Slug}.xml");
|
||||||
|
|
||||||
@@ -211,7 +259,13 @@ public sealed class StandardCriticalTableParserIntegrationTests
|
|||||||
}
|
}
|
||||||
|
|
||||||
var xmlContent = await File.ReadAllTextAsync(xmlPath);
|
var xmlContent = await File.ReadAllTextAsync(xmlPath);
|
||||||
return Parser.Parse(entry, xmlContent);
|
return entry.Family switch
|
||||||
|
{
|
||||||
|
"standard" => StandardParser.Parse(entry, xmlContent),
|
||||||
|
"variant_column" => VariantColumnParser.Parse(entry, xmlContent),
|
||||||
|
"grouped_variant" => GroupedVariantParser.Parse(entry, xmlContent),
|
||||||
|
_ => throw new InvalidOperationException($"Unsupported manifest family '{entry.Family}'.")
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
private static CriticalImportManifest LoadManifest() =>
|
private static CriticalImportManifest LoadManifest() =>
|
||||||
|
|||||||
@@ -8,6 +8,8 @@ public sealed class CriticalImportCommandRunner
|
|||||||
private readonly ImportArtifactWriter artifactWriter = new();
|
private readonly ImportArtifactWriter artifactWriter = new();
|
||||||
private readonly PdfXmlExtractor pdfXmlExtractor = new();
|
private readonly PdfXmlExtractor pdfXmlExtractor = new();
|
||||||
private readonly StandardCriticalTableParser standardParser = new();
|
private readonly StandardCriticalTableParser standardParser = new();
|
||||||
|
private readonly VariantColumnCriticalTableParser variantColumnParser = new();
|
||||||
|
private readonly GroupedVariantCriticalTableParser groupedVariantParser = new();
|
||||||
|
|
||||||
public async Task<int> RunAsync(ResetOptions options)
|
public async Task<int> RunAsync(ResetOptions options)
|
||||||
{
|
{
|
||||||
@@ -96,14 +98,24 @@ public sealed class CriticalImportCommandRunner
|
|||||||
?? throw new InvalidOperationException($"No enabled manifest entry was found for '{tableSlug}'.");
|
?? throw new InvalidOperationException($"No enabled manifest entry was found for '{tableSlug}'.");
|
||||||
}
|
}
|
||||||
|
|
||||||
private StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
|
private CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
|
||||||
{
|
{
|
||||||
if (!string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase))
|
if (string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase))
|
||||||
{
|
{
|
||||||
throw new InvalidOperationException($"Family '{entry.Family}' is not supported by phase 2.");
|
return standardParser.Parse(entry, xmlContent);
|
||||||
}
|
}
|
||||||
|
|
||||||
return standardParser.Parse(entry, xmlContent);
|
if (string.Equals(entry.Family, "variant_column", StringComparison.OrdinalIgnoreCase))
|
||||||
|
{
|
||||||
|
return variantColumnParser.Parse(entry, xmlContent);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (string.Equals(entry.Family, "grouped_variant", StringComparison.OrdinalIgnoreCase))
|
||||||
|
{
|
||||||
|
return groupedVariantParser.Parse(entry, xmlContent);
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new InvalidOperationException($"Family '{entry.Family}' is not supported by the importer.");
|
||||||
}
|
}
|
||||||
|
|
||||||
private static ImportArtifactPaths CreateArtifactPaths(string slug) =>
|
private static ImportArtifactPaths CreateArtifactPaths(string slug) =>
|
||||||
|
|||||||
@@ -43,6 +43,15 @@ public sealed class CriticalImportLoader(string databasePath)
|
|||||||
Notes = table.Notes
|
Notes = table.Notes
|
||||||
};
|
};
|
||||||
|
|
||||||
|
entity.Groups = table.Groups
|
||||||
|
.Select(item => new CriticalGroup
|
||||||
|
{
|
||||||
|
GroupKey = item.GroupKey,
|
||||||
|
Label = item.Label,
|
||||||
|
SortOrder = item.SortOrder
|
||||||
|
})
|
||||||
|
.ToList();
|
||||||
|
|
||||||
entity.Columns = table.Columns
|
entity.Columns = table.Columns
|
||||||
.Select(item => new CriticalColumn
|
.Select(item => new CriticalColumn
|
||||||
{
|
{
|
||||||
@@ -63,12 +72,14 @@ public sealed class CriticalImportLoader(string databasePath)
|
|||||||
})
|
})
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
|
var groupsByKey = entity.Groups.ToDictionary(item => item.GroupKey, StringComparer.OrdinalIgnoreCase);
|
||||||
var columnsByKey = entity.Columns.ToDictionary(item => item.ColumnKey, StringComparer.OrdinalIgnoreCase);
|
var columnsByKey = entity.Columns.ToDictionary(item => item.ColumnKey, StringComparer.OrdinalIgnoreCase);
|
||||||
var rollBandsByLabel = entity.RollBands.ToDictionary(item => item.Label, StringComparer.OrdinalIgnoreCase);
|
var rollBandsByLabel = entity.RollBands.ToDictionary(item => item.Label, StringComparer.OrdinalIgnoreCase);
|
||||||
|
|
||||||
entity.Results = table.Results
|
entity.Results = table.Results
|
||||||
.Select(item => new CriticalResult
|
.Select(item => new CriticalResult
|
||||||
{
|
{
|
||||||
|
CriticalGroup = item.GroupKey is null ? null : groupsByKey[item.GroupKey],
|
||||||
CriticalColumn = columnsByKey[item.ColumnKey],
|
CriticalColumn = columnsByKey[item.ColumnKey],
|
||||||
CriticalRollBand = rollBandsByLabel[item.RollBandLabel],
|
CriticalRollBand = rollBandsByLabel[item.RollBandLabel],
|
||||||
RawCellText = item.RawCellText,
|
RawCellText = item.RawCellText,
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ public sealed class ImportArtifactWriter
|
|||||||
WriteIndented = true
|
WriteIndented = true
|
||||||
};
|
};
|
||||||
|
|
||||||
public async Task WriteAsync(ImportArtifactPaths artifactPaths, StandardCriticalTableParseResult parseResult, CancellationToken cancellationToken = default)
|
public async Task WriteAsync(ImportArtifactPaths artifactPaths, CriticalTableParseResult parseResult, CancellationToken cancellationToken = default)
|
||||||
{
|
{
|
||||||
Directory.CreateDirectory(artifactPaths.DirectoryPath);
|
Directory.CreateDirectory(artifactPaths.DirectoryPath);
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,13 @@
|
|||||||
|
namespace RolemasterDb.ImportTool.Parsing;
|
||||||
|
|
||||||
|
/// <summary>
/// Immutable bundle of everything a critical-table parser produced for one table:
/// the parsed table, the XML text fragments it was built from, the per-cell
/// artifacts, and the validation report.
/// </summary>
public sealed class CriticalTableParseResult
{
    /// <summary>Creates a parse result from its four components.</summary>
    /// <param name="table">The parsed table.</param>
    /// <param name="fragments">The text fragments loaded from the XML input.</param>
    /// <param name="cells">The per-cell parse artifacts.</param>
    /// <param name="validationReport">The validation outcome for this parse.</param>
    public CriticalTableParseResult(
        ParsedCriticalTable table,
        IReadOnlyList<XmlTextFragment> fragments,
        IReadOnlyList<ParsedCriticalCellArtifact> cells,
        ImportValidationReport validationReport)
    {
        Table = table;
        Fragments = fragments;
        Cells = cells;
        ValidationReport = validationReport;
    }

    /// <summary>The parsed table.</summary>
    public ParsedCriticalTable Table { get; }

    /// <summary>The text fragments loaded from the XML input.</summary>
    public IReadOnlyList<XmlTextFragment> Fragments { get; }

    /// <summary>The per-cell parse artifacts.</summary>
    public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; }

    /// <summary>The validation outcome for this parse.</summary>
    public ImportValidationReport ValidationReport { get; }
}
|
||||||
@@ -0,0 +1,477 @@
|
|||||||
|
using System.Text.RegularExpressions;
|
||||||
|
using System.Xml;
|
||||||
|
using System.Xml.Linq;
|
||||||
|
|
||||||
|
namespace RolemasterDb.ImportTool.Parsing;
|
||||||
|
|
||||||
|
internal static class CriticalTableParserSupport
|
||||||
|
{
|
||||||
|
internal const int HeaderToBodyMinimumGap = 20;
|
||||||
|
internal const int FooterLabelExclusionGap = 15;
|
||||||
|
internal const int FooterPageNumberExclusionGap = 80;
|
||||||
|
internal const int RowLabelDuplicateTolerance = 15;
|
||||||
|
internal const int TopGroupingTolerance = 2;
|
||||||
|
|
||||||
|
private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled);
|
||||||
|
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled);
|
||||||
|
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|–)\d+\)$", RegexOptions.Compiled);
|
||||||
|
|
||||||
|
/// <summary>
/// Parses the XML text into a flat list of positioned text fragments, one per
/// non-blank <c>text</c> element of every <c>page</c> element.
/// </summary>
/// <param name="xmlContent">The XML document as a string.</param>
/// <returns>All fragments whose normalized text is not blank, in document order.</returns>
/// <exception cref="InvalidOperationException">
/// A <c>text</c> element lacks a <c>top</c>, <c>left</c>, <c>width</c> or <c>height</c> attribute.
/// </exception>
/// <exception cref="FormatException">An attribute value is not a valid integer.</exception>
internal static List<XmlTextFragment> LoadFragments(string xmlContent)
{
    // The XML is machine-generated, so numbers are parsed with the invariant culture.
    // A culture-sensitive int.Parse can reject signed values on machines whose
    // current culture uses a different negative sign.
    static int ParseInvariant(string text) =>
        int.Parse(
            text,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture);

    static int ReadGeometry(XElement element, string attributeName) =>
        ParseInvariant(
            element.Attribute(attributeName)?.Value
            ?? throw new InvalidOperationException($"Missing text {attributeName} attribute."));

    using var stringReader = new StringReader(xmlContent);
    using var xmlReader = XmlReader.Create(
        stringReader,
        new XmlReaderSettings
        {
            // Skip any DOCTYPE declaration instead of processing or rejecting it.
            DtdProcessing = DtdProcessing.Ignore
        });

    var document = XDocument.Load(xmlReader);

    return document.Descendants("page")
        .SelectMany(page =>
        {
            // A page without a number attribute is treated as page 1.
            var pageNumber = ParseInvariant(page.Attribute("number")?.Value ?? "1");

            return page.Elements("text")
                .Select(item => new XmlTextFragment(
                    pageNumber,
                    ReadGeometry(item, "top"),
                    ReadGeometry(item, "left"),
                    ReadGeometry(item, "width"),
                    ReadGeometry(item, "height"),
                    // Concatenate every nested text node so inline markup does not drop text.
                    NormalizeText(string.Concat(item.DescendantNodes().OfType<XText>().Select(node => node.Value)))))
                .Where(item => !string.IsNullOrWhiteSpace(item.Text));
        })
        .ToList();
}
|
||||||
|
|
||||||
|
/// <summary>
/// Collects the roll-band label fragments from the left margin of the table body.
/// Labels split over two fragments are merged, and a label repeated within
/// <see cref="RowLabelDuplicateTolerance"/> vertical units of the previous one is dropped.
/// </summary>
/// <param name="fragments">All fragments of the document.</param>
/// <param name="leftCutoff">Only fragments whose left edge is below this value are considered.</param>
/// <param name="bodyStartTop">Smallest top coordinate that belongs to the table body.</param>
/// <param name="keyTop">Top coordinate of the footer key; labels must end above it.</param>
/// <returns>The label fragments ordered by top, then left.</returns>
internal static List<XmlTextFragment> FindRowLabelFragments(
    IReadOnlyList<XmlTextFragment> fragments,
    int leftCutoff,
    int bodyStartTop,
    int keyTop)
{
    var ordered = fragments
        .Where(item => item.Left < leftCutoff)
        .Where(item => item.Top >= bodyStartTop && item.Top < keyTop - FooterLabelExclusionGap)
        .Where(item => IsRollBandLabel(item.Text) || LooksLikeSplitRollBandStart(item.Text))
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    // Pass 1: stitch split labels together and keep complete labels as they are.
    var labels = new List<XmlTextFragment>();
    var position = 0;
    while (position < ordered.Count)
    {
        if (TryMergeSplitRollBand(ordered, position, out var combined))
        {
            labels.Add(combined);
            position += 2; // the follow-up fragment was consumed by the merge
            continue;
        }

        // A dangling split start that could not be merged is not a usable label.
        if (IsRollBandLabel(ordered[position].Text))
        {
            labels.Add(ordered[position]);
        }

        position++;
    }

    // Pass 2: drop a label that repeats the previously kept one at nearly the same height.
    var distinctLabels = new List<XmlTextFragment>();
    foreach (var label in labels)
    {
        var repeatsPrevious =
            distinctLabels.Count > 0 &&
            string.Equals(
                NormalizeRollBandLabel(distinctLabels[^1].Text),
                NormalizeRollBandLabel(label.Text),
                StringComparison.OrdinalIgnoreCase) &&
            Math.Abs(distinctLabels[^1].Top - label.Top) <= RowLabelDuplicateTolerance;

        if (!repeatsPrevious)
        {
            distinctLabels.Add(label);
        }
    }

    return distinctLabels;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Tells whether the trimmed text is a complete roll-band label: a 2-3 digit number,
/// a hyphenated range of two such numbers, or a 2-3 digit number followed by <c>+</c>.
/// </summary>
internal static bool IsRollBandLabel(string value)
{
    const string rollBandPattern = @"^\d{2,3}(?:\s*-\s*\d{2,3})?$|^\d{2,3}\+$";
    var candidate = value.Trim();
    return Regex.IsMatch(candidate, rollBandPattern);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Tells whether a fragment sits left of <paramref name="leftCutoff"/> and reads as
/// either a complete roll-band label or the first half of a split one.
/// </summary>
internal static bool IsPotentialRowLabelFragment(XmlTextFragment fragment, int leftCutoff)
{
    if (fragment.Left >= leftCutoff)
    {
        return false;
    }

    return IsRollBandLabel(fragment.Text) || LooksLikeSplitRollBandStart(fragment.Text);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Puts a roll-band label into canonical form: whitespace collapsed and trimmed,
/// and no whitespace around hyphens.
/// </summary>
internal static string NormalizeRollBandLabel(string label)
{
    var collapsed = CollapseWhitespace(label);
    return Regex.Replace(collapsed, @"\s*-\s*", "-");
}
|
||||||
|
|
||||||
|
/// <summary>
/// Builds a roll band from its label. <c>"N+"</c> yields a band with a lower bound only,
/// <c>"N"</c> a band whose bounds are both N, and <c>"N-M"</c> a band from N to M.
/// </summary>
/// <param name="label">Raw label text; it is normalized before parsing.</param>
/// <param name="sortOrder">Sort order passed through to the roll band.</param>
internal static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder)
{
    var normalizedLabel = NormalizeRollBandLabel(label);

    if (normalizedLabel.EndsWith('+'))
    {
        // Open-ended band: everything before the trailing '+' is the lower bound.
        var openEndedMinimum = int.Parse(normalizedLabel[..^1]);
        return new ParsedCriticalRollBand(normalizedLabel, openEndedMinimum, null, sortOrder);
    }

    var bounds = normalizedLabel.Split('-', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    var lowerBound = int.Parse(bounds[0]);
    var upperBound = bounds.Length == 1 ? lowerBound : int.Parse(bounds[1]);

    return new ParsedCriticalRollBand(normalizedLabel, lowerBound, upperBound, sortOrder);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Maps a horizontal position to a column key. The boundary between two neighbouring
/// columns is the midpoint of their centres; a position exactly on a boundary belongs
/// to the column on the right.
/// </summary>
/// <param name="centerX">The horizontal position to classify.</param>
/// <param name="columns">Column keys with their centre positions, ordered left to right.</param>
/// <returns>The key of the column that contains <paramref name="centerX"/>.</returns>
internal static string ResolveColumn(double centerX, IReadOnlyList<(string Key, double CenterX)> columns)
{
    for (var rightIndex = 1; rightIndex < columns.Count; rightIndex++)
    {
        var leftColumn = columns[rightIndex - 1];
        var midpoint = (leftColumn.CenterX + columns[rightIndex].CenterX) / 2.0;

        if (centerX < midpoint)
        {
            return leftColumn.Key;
        }
    }

    // Right of every boundary: the position belongs to the last column.
    return columns[^1].Key;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Joins fragments into text lines. Fragments are sorted by top then left, grouped
/// into a line while their top stays within <see cref="TopGroupingTolerance"/> of the
/// line's first fragment, and each line's texts are joined left to right with single spaces.
/// </summary>
/// <param name="fragments">The fragments of one cell or region, in any order.</param>
/// <returns>The non-blank lines from top to bottom.</returns>
internal static IReadOnlyList<string> BuildLines(IReadOnlyList<XmlTextFragment> fragments)
{
    var ordered = fragments
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    var lines = new List<string>();

    foreach (var lineFragments in GroupByTop(ordered))
    {
        var lineText = CollapseWhitespace(
            string.Join(' ', lineFragments.OrderBy(item => item.Left).Select(item => item.Text)));

        if (!string.IsNullOrWhiteSpace(lineText))
        {
            lines.Add(lineText);
        }
    }

    return lines;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Heuristically decides whether a line is an affix line rather than descriptive prose.
/// </summary>
/// <remarks>
/// Rules, evaluated in order:
/// an empty line is not an affix; a lone hyphen, en dash or em dash is;
/// a line starting with a conditional word ("with", "w/o", "without", "if", "while",
/// "until", "unless") is an affix only when it contains a colon;
/// a line containing a legend symbol is an affix when it also contains a digit or when
/// nothing but whitespace remains after removing the symbols and <c>+ - – ( ) /</c>;
/// otherwise the line is an affix when it starts with <c>+</c> or one of ∑ ∏ π ∫,
/// matches one of the affix regexes, or contains a spaced hyphen or en dash.
/// </remarks>
/// <param name="line">The line to classify.</param>
/// <param name="affixLegendSymbols">Symbols read from the table's footer legend; may be empty.</param>
internal static bool IsAffixLikeLine(string line, ISet<string> affixLegendSymbols)
{
    var text = line.Trim();

    if (text.Length == 0)
    {
        return false;
    }

    if (text is "-" or "\u2013" or "\u2014")
    {
        return true;
    }

    string[] conditionalPrefixes = ["with ", "w/o ", "without ", "if ", "while ", "until ", "unless "];
    if (conditionalPrefixes.Any(prefix => text.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)))
    {
        // Conditional clauses count as affixes only in their "condition: effect" form.
        return text.Contains(':', StringComparison.Ordinal);
    }

    var mentionsLegendSymbol =
        affixLegendSymbols.Count > 0 &&
        affixLegendSymbols.Any(symbol => text.Contains(symbol, StringComparison.Ordinal));

    if (mentionsLegendSymbol)
    {
        if (text.Any(char.IsDigit))
        {
            return true;
        }

        // Remove the legend symbols (longest first) and affix punctuation.
        // If nothing is left, the line consisted of affix notation only.
        var leftover = affixLegendSymbols
            .OrderByDescending(item => item.Length)
            .Aggregate(text, (current, symbol) => current.Replace(symbol, string.Empty, StringComparison.Ordinal));

        string[] affixPunctuation = ["+", "-", "–", "(", ")", "/"];
        foreach (var mark in affixPunctuation)
        {
            leftover = leftover.Replace(mark, string.Empty, StringComparison.Ordinal);
        }

        if (string.IsNullOrWhiteSpace(leftover))
        {
            return true;
        }
    }

    string[] affixLeadIns = ["+", "\u2211", "\u220F", "\u03C0", "\u222B"];

    return affixLeadIns.Any(leadIn => text.StartsWith(leadIn, StringComparison.Ordinal)) ||
           StandaloneModifierAffixLineRegex.IsMatch(text) ||
           NumericAffixLineRegex.IsMatch(text) ||
           text.Contains(" - ", StringComparison.Ordinal) ||
           text.Contains(" – ", StringComparison.Ordinal);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Counts the runs of consecutive lines that share the same classification
/// (affix-like or not). An empty list yields 0.
/// </summary>
/// <param name="lines">The lines of one cell, top to bottom.</param>
/// <param name="affixLegendSymbols">Legend symbols forwarded to <see cref="IsAffixLikeLine"/>.</param>
internal static int CountLineTypeSegments(IReadOnlyList<string> lines, ISet<string> affixLegendSymbols)
{
    var runs = 0;
    bool? lastKind = null; // null until the first line has been classified

    foreach (var line in lines)
    {
        var isAffix = IsAffixLikeLine(line, affixLegendSymbols);

        if (lastKind != isAffix)
        {
            // The classification flipped, or this is the first line: a new run starts.
            runs++;
            lastKind = isAffix;
        }
    }

    return runs;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Trims the value and replaces every run of whitespace inside it with one space.
/// </summary>
internal static string CollapseWhitespace(string value)
{
    var trimmed = value.Trim();
    return Regex.Replace(trimmed, @"\s+", " ");
}
|
||||||
|
|
||||||
|
/// <summary>
/// Normalizes fragment text: non-breaking spaces, carriage returns and line feeds become
/// plain spaces, the typographic apostrophe becomes <c>'</c>, and the result is trimmed.
/// Inner whitespace runs are left as they are.
/// </summary>
internal static string NormalizeText(string value)
{
    static char Map(char character) => character switch
    {
        '\u00a0' or '\r' or '\n' => ' ',
        '’' => '\'',
        _ => character
    };

    return new string(value.Select(Map).ToArray()).Trim();
}
|
||||||
|
|
||||||
|
/// <summary>
/// Reads the footer legend and returns the single-character symbols it assigns to
/// "must parry", "no parry", "stun"/"stunned", "bleed" and "powerpoint modification".
/// </summary>
/// <param name="fragments">All fragments of the document.</param>
/// <param name="keyTop">
/// Top coordinate where the footer key starts; <see cref="int.MaxValue"/> means no key was found.
/// </param>
/// <returns>The detected symbols, or an empty set when there is no key.</returns>
internal static HashSet<string> DetectAffixLegendSymbols(IReadOnlyList<XmlTextFragment> fragments, int keyTop)
{
    if (keyTop == int.MaxValue)
    {
        return [];
    }

    // Each pattern captures the first non-whitespace character after the '=' sign.
    string[] legendPatterns =
    [
        @"must parry\s*=\s*(\S)",
        @"no parry\s*=\s*(\S)",
        @"stun(?:ned)?\s*=\s*(\S)",
        @"bleed\s*=\s*(\S)",
        @"powerpoint modification.*=\s*(\S)"
    ];

    var footerFragments = fragments
        .Where(item => item.Top >= keyTop - TopGroupingTolerance)
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    var symbols = new HashSet<string>(StringComparer.Ordinal);

    foreach (var lineFragments in GroupByTop(footerFragments))
    {
        var footerLine = CollapseWhitespace(
            string.Join(' ', lineFragments.OrderBy(item => item.Left).Select(item => item.Text)));

        foreach (var pattern in legendPatterns)
        {
            AddLegendMatch(symbols, footerLine, pattern);
        }
    }

    return symbols;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Applies the single-fragment boundary split to every body fragment and flattens the
/// results into one list, preserving the input order.
/// </summary>
/// <param name="bodyFragments">The fragments of the table body.</param>
/// <param name="columnCenters">Column keys with their centre positions, ordered left to right.</param>
/// <param name="affixLegendSymbols">Legend symbols used to recognise affix text.</param>
internal static List<XmlTextFragment> SplitBoundaryCrossingAffixFragments(
    IReadOnlyList<XmlTextFragment> bodyFragments,
    IReadOnlyList<(string Key, double CenterX)> columnCenters,
    ISet<string> affixLegendSymbols) =>
    bodyFragments
        .SelectMany(fragment => SplitBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
        .ToList();
|
||||||
|
|
||||||
|
/// <summary>
/// Groups the body fragments into visual lines and classifies each line. A line is
/// affix-like only when it has at least one non-blank column text and every column's
/// text on that line is affix-like.
/// </summary>
/// <param name="bodyFragments">The fragments of the table body.</param>
/// <param name="columnCenters">Column keys with their centre positions, ordered left to right.</param>
/// <param name="affixLegendSymbols">Legend symbols used to recognise affix text.</param>
/// <returns>One entry per line: the top of its first fragment and its classification.</returns>
internal static List<(int Top, bool IsAffixLike)> BuildBodyLines(
    IReadOnlyList<XmlTextFragment> bodyFragments,
    IReadOnlyList<(string Key, double CenterX)> columnCenters,
    ISet<string> affixLegendSymbols)
{
    var ordered = bodyFragments
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    return GroupByTop(ordered)
        .Select(lineFragments =>
        {
            // Text of this line, per column, read left to right.
            var textPerColumn = lineFragments
                .GroupBy(item => ResolveColumn(item.CenterX, columnCenters), StringComparer.OrdinalIgnoreCase)
                .Select(column => CollapseWhitespace(string.Join(' ', column.OrderBy(item => item.Left).Select(item => item.Text))))
                .Where(text => !string.IsNullOrWhiteSpace(text))
                .ToList();

            var everyColumnIsAffix =
                textPerColumn.Count > 0 &&
                textPerColumn.All(text => IsAffixLikeLine(text, affixLegendSymbols));

            return (Top: lineFragments[0].Top, IsAffixLike: everyColumnIsAffix);
        })
        .ToList();
}
|
||||||
|
|
||||||
|
/// <summary>
/// Tells whether a fragment is a bare 2-3 digit number located no more than
/// <see cref="FooterPageNumberExclusionGap"/> above the footer key, or anywhere below it.
/// Always false when no key was found (<paramref name="keyTop"/> is <see cref="int.MaxValue"/>).
/// </summary>
internal static bool IsFooterPageNumberFragment(XmlTextFragment fragment, int keyTop) =>
    keyTop != int.MaxValue &&
    fragment.Top >= keyTop - FooterPageNumberExclusionGap &&
    Regex.IsMatch(fragment.Text, @"^\d{2,3}$");
|
||||||
|
|
||||||
|
/// <summary>
/// Splits fragments into consecutive groups. A fragment joins the current group while
/// its top is within <see cref="TopGroupingTolerance"/> of the group's first fragment;
/// otherwise it starts a new group. The input order is kept, so the caller decides the
/// ordering (the callers in this class sort by top, then left).
/// </summary>
internal static IEnumerable<List<XmlTextFragment>> GroupByTop(IReadOnlyList<XmlTextFragment> fragments)
{
    var groups = new List<List<XmlTextFragment>>();
    List<XmlTextFragment>? openGroup = null;

    foreach (var fragment in fragments)
    {
        var startsNewGroup =
            openGroup is null ||
            Math.Abs(openGroup[0].Top - fragment.Top) > TopGroupingTolerance;

        if (startsNewGroup)
        {
            openGroup = [fragment];
            groups.Add(openGroup);
        }
        else
        {
            openGroup!.Add(fragment);
        }
    }

    return groups;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Tells whether the trimmed text is the first half of a split range label:
/// a 2-3 digit number followed by a hyphen, such as <c>99-</c>.
/// </summary>
private static bool LooksLikeSplitRollBandStart(string value)
{
    const string splitStartPattern = @"^\d{2,3}\s*-$";
    var candidate = value.Trim();
    return Regex.IsMatch(candidate, splitStartPattern);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Tries to merge the candidate at <paramref name="index"/> with the one after it into
/// a single range label such as <c>99-100</c>.
/// </summary>
/// <remarks>
/// The merge happens only when the first candidate reads like a split start, the second
/// is a bare 2-3 digit number on the same page, strictly below the first by at most
/// <see cref="RowLabelDuplicateTolerance"/> + 5, and their left edges differ by at most 20.
/// The merged fragment keeps the first candidate's page and top and covers both boxes
/// horizontally.
/// </remarks>
/// <param name="candidates">Label candidates ordered by top, then left.</param>
/// <param name="index">Position of the potential split start.</param>
/// <param name="mergedCandidate">The merged fragment on success; null otherwise.</param>
/// <returns>True when the two candidates were merged.</returns>
private static bool TryMergeSplitRollBand(IReadOnlyList<XmlTextFragment> candidates, int index, out XmlTextFragment mergedCandidate)
{
    mergedCandidate = null!;

    var head = candidates[index];
    if (index + 1 >= candidates.Count || !LooksLikeSplitRollBandStart(head.Text))
    {
        return false;
    }

    var tail = candidates[index + 1];
    var verticalGap = tail.Top - head.Top;

    var tailContinuesLabel =
        head.PageNumber == tail.PageNumber &&
        Regex.IsMatch(tail.Text.Trim(), @"^\d{2,3}$") &&
        verticalGap > 0 &&
        verticalGap <= RowLabelDuplicateTolerance + 5 &&
        Math.Abs(tail.Left - head.Left) <= 20;

    if (!tailContinuesLabel)
    {
        return false;
    }

    var leftEdge = Math.Min(head.Left, tail.Left);
    var rightEdge = Math.Max(head.Left + head.Width, tail.Left + tail.Width);
    var startDigits = Regex.Match(head.Text, @"\d{2,3}").Value;

    mergedCandidate = new XmlTextFragment(
        head.PageNumber,
        head.Top,
        leftEdge,
        rightEdge - leftEdge,
        Math.Max(head.Height, tail.Height),
        $"{startDigits}-{tail.Text.Trim()}");
    return true;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Splits one affix fragment that straddles a column boundary into its individual
/// segments, estimating each segment's position from the fragment's average character width.
/// </summary>
/// <remarks>
/// The original fragment is returned unchanged, as a one-element list, when it does not
/// look like a boundary-crossing affix, when fewer than two segments result, or when all
/// segments resolve to the column the whole fragment already resolves to.
/// </remarks>
/// <param name="fragment">The fragment to inspect.</param>
/// <param name="columnCenters">Column keys with their centre positions, ordered left to right.</param>
/// <param name="affixLegendSymbols">Legend symbols used to recognise affix text.</param>
private static IReadOnlyList<XmlTextFragment> SplitBoundaryCrossingAffixFragment(
    XmlTextFragment fragment,
    IReadOnlyList<(string Key, double CenterX)> columnCenters,
    ISet<string> affixLegendSymbols)
{
    if (!LooksLikeBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
    {
        return [fragment];
    }

    var matches = MultiFragmentSplitRegex.Matches(fragment.Text);
    if (matches.Count < 2)
    {
        return [fragment];
    }

    // No per-character geometry is available, so spread the width evenly over the text.
    var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);

    var segments = matches
        .Select(match => (Match: match, Text: CollapseWhitespace(match.Value)))
        .Where(segment => segment.Text.Length != 0)
        .Select(segment => new XmlTextFragment(
            fragment.PageNumber,
            fragment.Top,
            fragment.Left + (int)Math.Round(characterWidth * segment.Match.Index),
            Math.Max(1, (int)Math.Round(characterWidth * segment.Match.Length)),
            fragment.Height,
            segment.Text))
        .ToList();

    if (segments.Count < 2)
    {
        return [fragment];
    }

    var originalColumn = ResolveColumn(fragment.CenterX, columnCenters);
    var segmentColumns = segments
        .Select(item => ResolveColumn(item.CenterX, columnCenters))
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .ToList();

    // Splitting only pays off when it moves at least one segment to another column.
    var splitChangesColumns =
        segmentColumns.Count > 1 ||
        segmentColumns.Any(item => !string.Equals(item, originalColumn, StringComparison.OrdinalIgnoreCase));

    if (splitChangesColumns)
    {
        return segments;
    }

    return [fragment];
}
|
||||||
|
|
||||||
|
/// <summary>
/// Tells whether a fragment is affix-like text containing a space whose box strictly
/// spans the midpoint between two neighbouring column centres.
/// </summary>
/// <param name="fragment">The fragment to inspect.</param>
/// <param name="columnCenters">Column keys with their centre positions, ordered left to right.</param>
/// <param name="affixLegendSymbols">Legend symbols used to recognise affix text.</param>
private static bool LooksLikeBoundaryCrossingAffixFragment(
    XmlTextFragment fragment,
    IReadOnlyList<(string Key, double CenterX)> columnCenters,
    ISet<string> affixLegendSymbols)
{
    // NOTE(review): MultiFragmentSplitRegex only separates on runs of two or more
    // whitespace characters, while this guard accepts a single space - confirm intended.
    var isSpacedAffix =
        IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
        fragment.Text.Contains(" ", StringComparison.Ordinal);

    if (!isSpacedAffix)
    {
        return false;
    }

    var fragmentRight = fragment.Left + fragment.Width;

    return Enumerable
        .Range(0, Math.Max(columnCenters.Count - 1, 0))
        .Select(index => (columnCenters[index].CenterX + columnCenters[index + 1].CenterX) / 2.0)
        .Any(boundary => fragment.Left < boundary && fragmentRight > boundary);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Adds the first capture group of every case-insensitive match of
/// <paramref name="pattern"/> in <paramref name="value"/> to <paramref name="symbols"/>.
/// </summary>
private static void AddLegendMatch(HashSet<string> symbols, string value, string pattern)
{
    var capturedSymbols = Regex.Matches(value, pattern, RegexOptions.IgnoreCase)
        .Where(match => match.Groups.Count > 1) // skip patterns that declare no capture group
        .Select(match => match.Groups[1].Value);

    symbols.UnionWith(capturedSymbols);
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,306 @@
|
|||||||
|
namespace RolemasterDb.ImportTool.Parsing;
|
||||||
|
|
||||||
|
public sealed class GroupedVariantCriticalTableParser
|
||||||
|
{
|
||||||
|
private static readonly ParsedCriticalGroup[] ExpectedGroups =
|
||||||
|
[
|
||||||
|
new("large", "Large Creatures", 1),
|
||||||
|
new("super_large", "Super Large Creatures", 2)
|
||||||
|
];
|
||||||
|
|
||||||
|
private static readonly ParsedCriticalColumn[] ExpectedColumns =
|
||||||
|
[
|
||||||
|
new("NORMAL", "Normal", "variant", 1),
|
||||||
|
new("SLAYING", "Slaying", "variant", 2)
|
||||||
|
];
|
||||||
|
|
||||||
|
/// <summary>
/// Parses a grouped-variant critical table (a group axis with Normal/Slaying variant
/// columns under each group) from the XML text of its source document.
/// </summary>
/// <param name="entry">Manifest entry supplying slug, display name, family and PDF path.</param>
/// <param name="xmlContent">The XML document as a string.</param>
/// <returns>
/// The parsed table together with the loaded fragments, per-cell artifacts and a
/// validation report. Content problems are reported through the validation report.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The group header row or the column header row cannot be found.
/// </exception>
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
    var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
    var groupHeaders = FindGroupHeaders(fragments);
    var columnHeaders = FindColumnHeaders(fragments);
    var validationErrors = new List<string>();
    var validationWarnings = new List<string>();

    // Pair each column header, left to right, with its (group, variant) combination:
    // the first ExpectedColumns.Length headers belong to the first group, and so on.
    var combinedColumnAnchors = columnHeaders
        .OrderBy(item => item.Left)
        .Select((item, index) =>
        {
            var group = ExpectedGroups[index / ExpectedColumns.Length];
            var column = ExpectedColumns[index % ExpectedColumns.Length];
            return (group.GroupKey, column.ColumnKey, CompositeKey: $"{group.GroupKey}:{column.ColumnKey}", item.CenterX);
        })
        .ToList();

    // The body starts a fixed gap below the lower of the two header rows.
    var bodyStartTop = Math.Max(
        groupHeaders.Max(item => item.Top),
        columnHeaders.Max(item => item.Top))
        + CriticalTableParserSupport.HeaderToBodyMinimumGap;
    // The footer key starts at the topmost fragment that looks like legend text;
    // int.MaxValue means no key was found.
    var keyTop = fragments
        .Where(item =>
            string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
            item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) ||
            item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
        .Select(item => (int?)item.Top)
        .Min() ?? int.MaxValue;
    var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop);
    // Row labels are searched left of the leftmost column header, with a 10-unit margin.
    var leftCutoff = columnHeaders.Min(item => item.Left) - 10;
    var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
        fragments,
        leftCutoff,
        bodyStartTop,
        keyTop);

    var rowAnchors = rowLabelFragments
        .OrderBy(item => item.Top)
        .Select((item, index) => new RowAnchor(CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), item.Top, index + 1))
        .ToList();

    if (rowAnchors.Count == 0)
    {
        validationErrors.Add("No roll-band labels were found in the XML artifact.");
    }

    // Column resolution works on composite "group:column" keys.
    var columnCenters = combinedColumnAnchors
        .Select(item => (item.CompositeKey, item.CenterX))
        .ToList();

    // Body content: everything between the headers and the key that is not a page number,
    // a row label (complete or split), or one of the header fragments.
    var bodyFragments = fragments
        .Where(item =>
            item.Top >= bodyStartTop &&
            item.Top < keyTop - CriticalTableParserSupport.TopGroupingTolerance &&
            !CriticalTableParserSupport.IsFooterPageNumberFragment(item, keyTop) &&
            !CriticalTableParserSupport.IsPotentialRowLabelFragment(item, leftCutoff) &&
            !rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) &&
            !groupHeaders.Contains(item) &&
            !columnHeaders.Contains(item))
        .ToList();
    bodyFragments = CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
    var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);

    var parsedRollBands = rowAnchors
        .Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
        .ToList();

    var cellEntries = new List<CellEntry>();

    for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
    {
        // A row spans from the boundary with the previous row to the boundary with the
        // next one. The first row starts at the body start; the last ends just above the key.
        var rowStart = rowIndex == 0
            ? bodyStartTop
            : ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines);

        var rowEnd = rowIndex == rowAnchors.Count - 1
            ? keyTop - 1
            : ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines);

        var rowFragments = bodyFragments
            .Where(item => item.Top >= rowStart && item.Top < rowEnd)
            .ToList();

        foreach (var anchor in combinedColumnAnchors)
        {
            var cellFragments = rowFragments
                .Where(item => CriticalTableParserSupport.ResolveColumn(item.CenterX, columnCenters) == anchor.CompositeKey)
                .OrderBy(item => item.Top)
                .ThenBy(item => item.Left)
                .ToList();

            if (cellFragments.Count == 0)
            {
                // An empty cell is a validation error; no cell entry is produced for it.
                validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', group '{anchor.GroupKey}', column '{anchor.ColumnKey}'.");
                continue;
            }

            cellEntries.Add(new CellEntry(
                anchor.GroupKey,
                rowAnchors[rowIndex].Label,
                rowIndex,
                anchor.ColumnKey,
                CriticalTableParserSupport.BuildLines(cellFragments).ToList()));
        }
    }

    // Move affix lines found at the top of a cell back to the cell above it.
    RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);

    var parsedCells = new List<ParsedCriticalCellArtifact>();
    var parsedResults = new List<ParsedCriticalResult>();

    foreach (var cellEntry in cellEntries
        .OrderBy(item => item.RowIndex)
        .ThenBy(item => item.GroupKey, StringComparer.Ordinal)
        .ThenBy(item => item.ColumnKey, StringComparer.Ordinal))
    {
        // More than two runs means prose and affix lines alternate more than once.
        var segmentCount = CriticalTableParserSupport.CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols);
        if (segmentCount > 2)
        {
            validationErrors.Add($"Cell '{cellEntry.RollBandLabel}/{cellEntry.GroupKey}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
        }

        var rawAffixLines = cellEntry.Lines.Where(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
        var descriptionLines = cellEntry.Lines.Where(line => !CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
        var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines);
        var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', descriptionLines));
        // A cell without affix lines stores null rather than an empty string.
        var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);

        parsedCells.Add(new ParsedCriticalCellArtifact(
            cellEntry.GroupKey,
            cellEntry.RollBandLabel,
            cellEntry.ColumnKey,
            cellEntry.Lines,
            rawCellText,
            descriptionText,
            rawAffixText));

        parsedResults.Add(new ParsedCriticalResult(
            cellEntry.GroupKey,
            cellEntry.ColumnKey,
            cellEntry.RollBandLabel,
            rawCellText,
            descriptionText,
            rawAffixText));
    }

    // Every row must yield one cell per (group, column) combination.
    var expectedCellCount = rowAnchors.Count * ExpectedGroups.Length * ExpectedColumns.Length;
    if (parsedCells.Count != expectedCellCount)
    {
        validationErrors.Add($"Expected {expectedCellCount} parsed cells but produced {parsedCells.Count}.");
    }

    var validationReport = new ImportValidationReport(
        validationErrors.Count == 0,
        validationErrors,
        validationWarnings,
        rowAnchors.Count,
        parsedCells.Count);

    var table = new ParsedCriticalTable(
        entry.Slug,
        entry.DisplayName,
        entry.Family,
        Path.GetFileName(entry.PdfPath),
        "Imported from PDF XML extraction.",
        ExpectedGroups,
        ExpectedColumns,
        parsedRollBands,
        parsedResults);

    return new CriticalTableParseResult(table, fragments, parsedCells, validationReport);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Finds the row of group header fragments: the topmost visual line whose matching
/// fragments, read left to right, are exactly the expected group labels (ignoring case).
/// </summary>
/// <param name="fragments">All fragments of the document.</param>
/// <returns>The header fragments ordered left to right.</returns>
/// <exception cref="InvalidOperationException">No such header row exists.</exception>
private static List<XmlTextFragment> FindGroupHeaders(IReadOnlyList<XmlTextFragment> fragments)
{
    var expectedLabels = ExpectedGroups.Select(item => item.Label).ToList();

    var labelFragments = fragments
        .Where(item => expectedLabels.Contains(item.Text.Trim(), StringComparer.OrdinalIgnoreCase))
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    var headerRow = CriticalTableParserSupport.GroupByTop(labelFragments)
        .Select(row => row.OrderBy(item => item.Left).ToList())
        .FirstOrDefault(row => row
            .Select(item => item.Text.Trim())
            .SequenceEqual(expectedLabels, StringComparer.OrdinalIgnoreCase));

    return headerRow
        ?? throw new InvalidOperationException("Could not find the grouped-variant section headers in the XML artifact.");
}
|
||||||
|
|
||||||
|
/// <summary>
/// Finds the row of variant column headers: the topmost visual line whose matching
/// fragments, read left to right, are "normal", "slaying" repeated once per expected
/// group (ignoring case).
/// </summary>
/// <param name="fragments">All fragments of the document.</param>
/// <returns>The header fragments ordered left to right.</returns>
/// <exception cref="InvalidOperationException">No such header row exists.</exception>
private static List<XmlTextFragment> FindColumnHeaders(IReadOnlyList<XmlTextFragment> fragments)
{
    string[] variantLabels = ["normal", "slaying"];

    // One normal/slaying pair per group, so the header count always matches the
    // group-by-column indexing done by the caller.
    var expectedLabels = Enumerable
        .Repeat(variantLabels, ExpectedGroups.Length)
        .SelectMany(labels => labels)
        .ToList();

    // Compare with OrdinalIgnoreCase rather than lower-casing every fragment's text:
    // no per-fragment allocation, and the same comparison FindGroupHeaders uses.
    var headerCandidates = fragments
        .Where(item => variantLabels.Contains(item.Text.Trim(), StringComparer.OrdinalIgnoreCase))
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    foreach (var group in CriticalTableParserSupport.GroupByTop(headerCandidates))
    {
        var ordered = group.OrderBy(item => item.Left).ToList();
        var labels = ordered.Select(item => item.Text.Trim());

        if (labels.SequenceEqual(expectedLabels, StringComparer.OrdinalIgnoreCase))
        {
            return ordered;
        }
    }

    throw new InvalidOperationException("Could not find the grouped-variant column header row in the XML artifact.");
}
|
||||||
|
|
||||||
|
/// <summary>
/// Repairs cells whose first lines are affix lines that belong to the cell directly
/// above them (same group and column, previous row): those leading affix lines are
/// moved to the end of the upper cell.
/// </summary>
/// <remarks>
/// A cell is left alone when it has no leading affix lines or consists of affix lines
/// only. Rows are processed top to bottom, and the cell lists are modified in place.
/// </remarks>
/// <param name="cellEntries">All cells of the table; their line lists are mutated.</param>
/// <param name="affixLegendSymbols">Legend symbols used to recognise affix lines.</param>
/// <exception cref="InvalidOperationException">
/// Two cells share the same row, group and column.
/// </exception>
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries, ISet<string> affixLegendSymbols)
{
    var maxRowIndex = cellEntries.Count == 0 ? -1 : cellEntries.Max(item => item.RowIndex);
    if (maxRowIndex < 1)
    {
        // Fewer than two rows: there is no cell above any other cell.
        return;
    }

    // Index the cells once instead of scanning the whole list twice per row and axis.
    var cellsByPosition = new Dictionary<(int RowIndex, string GroupKey, string ColumnKey), CellEntry>(cellEntries.Count);
    foreach (var entry in cellEntries)
    {
        if (!cellsByPosition.TryAdd((entry.RowIndex, entry.GroupKey, entry.ColumnKey), entry))
        {
            throw new InvalidOperationException(
                $"More than one cell exists for row {entry.RowIndex}, group '{entry.GroupKey}', column '{entry.ColumnKey}'.");
        }
    }

    var axes = cellEntries
        .Select(item => (item.GroupKey, item.ColumnKey))
        .Distinct()
        .ToList();

    for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++)
    {
        foreach (var (groupKey, columnKey) in axes)
        {
            if (!cellsByPosition.TryGetValue((rowIndex, groupKey, columnKey), out var current) ||
                !cellsByPosition.TryGetValue((rowIndex + 1, groupKey, columnKey), out var next))
            {
                continue;
            }

            var leadingAffixCount = next.Lines
                .TakeWhile(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols))
                .Count();

            // Nothing leaked, or the lower cell is all affixes and there is no prose to anchor on.
            if (leadingAffixCount == 0 || leadingAffixCount == next.Lines.Count)
            {
                continue;
            }

            current.Lines.AddRange(next.Lines.Take(leadingAffixCount));
            next.Lines.RemoveRange(0, leadingAffixCount);
        }
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Computes the <c>Top</c> coordinate at which the row of <paramref name="current"/> ends and
/// the row of <paramref name="next"/> begins.
/// </summary>
/// <remarks>
/// Body lines with <c>current.Top &lt;= Top &lt; next.Top</c> are scanned from the bottom up for
/// the last place where an affix-like line is immediately followed by a non-affix line; the
/// boundary is placed between those two lines. If no such transition exists, the boundary is
/// placed between the two row labels. In both cases the result is
/// <c>floor((upper + lower) / 2) + 1</c>.
/// </remarks>
/// <param name="current">Anchor of the upper row.</param>
/// <param name="next">Anchor of the lower row.</param>
/// <param name="bodyLines">Body lines described by their top coordinate and affix classification.</param>
/// <returns>The boundary top coordinate.</returns>
private static int ResolveRowBoundaryTop(
    RowAnchor current,
    RowAnchor next,
    IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines)
{
    static int BoundaryBetween(int upperTop, int lowerTop) =>
        (int)Math.Floor((upperTop + lowerTop) / 2.0) + 1;

    var between = bodyLines
        .Where(line => line.Top >= current.Top && line.Top < next.Top)
        .OrderBy(line => line.Top)
        .ToArray();

    // Walk adjacent pairs (above, below) starting with the lowest pair.
    var lower = between.Length - 1;
    while (lower > 0)
    {
        var above = between[lower - 1];
        var below = between[lower];
        if (above.IsAffixLike && !below.IsAffixLike)
        {
            return BoundaryBetween(above.Top, below.Top);
        }

        lower--;
    }

    return BoundaryBetween(current.Top, next.Top);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Immutable positional record tying a row label to its <c>Top</c> coordinate and a sort order.
/// <c>ResolveRowBoundaryTop</c> reads <c>Top</c> of two consecutive anchors to place a row boundary.
/// </summary>
private sealed record RowAnchor(string Label, int Top, int SortOrder);
|
||||||
|
|
||||||
|
/// <summary>
/// One table cell while it is being assembled: its position (group, row index, column) and
/// its text lines.
/// </summary>
/// <remarks>
/// The properties cannot be reassigned, but <see cref="Lines"/> is a mutable list that
/// <c>RepairLeadingAffixLeakage</c> edits in place.
/// </remarks>
private sealed class CellEntry
{
    public CellEntry(string groupKey, string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
    {
        GroupKey = groupKey;
        RollBandLabel = rollBandLabel;
        RowIndex = rowIndex;
        ColumnKey = columnKey;
        Lines = lines;
    }

    /// <summary>Key of the group the cell belongs to.</summary>
    public string GroupKey { get; }

    /// <summary>Label of the roll band (row) the cell belongs to.</summary>
    public string RollBandLabel { get; }

    /// <summary>Row index used to find the cell directly above or below.</summary>
    public int RowIndex { get; }

    /// <summary>Key of the column the cell belongs to.</summary>
    public string ColumnKey { get; }

    /// <summary>Text lines of the cell; the same list instance that was passed to the constructor.</summary>
    public List<string> Lines { get; }
}
|
||||||
|
}
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
namespace RolemasterDb.ImportTool.Parsing;
|
namespace RolemasterDb.ImportTool.Parsing;
|
||||||
|
|
||||||
public sealed class ParsedCriticalCellArtifact(
|
public sealed class ParsedCriticalCellArtifact(
|
||||||
|
string? groupKey,
|
||||||
string rollBandLabel,
|
string rollBandLabel,
|
||||||
string columnKey,
|
string columnKey,
|
||||||
IReadOnlyList<string> lines,
|
IReadOnlyList<string> lines,
|
||||||
@@ -8,6 +9,7 @@ public sealed class ParsedCriticalCellArtifact(
|
|||||||
string descriptionText,
|
string descriptionText,
|
||||||
string? rawAffixText)
|
string? rawAffixText)
|
||||||
{
|
{
|
||||||
|
public string? GroupKey { get; } = groupKey;
|
||||||
public string RollBandLabel { get; } = rollBandLabel;
|
public string RollBandLabel { get; } = rollBandLabel;
|
||||||
public string ColumnKey { get; } = columnKey;
|
public string ColumnKey { get; } = columnKey;
|
||||||
public IReadOnlyList<string> Lines { get; } = lines;
|
public IReadOnlyList<string> Lines { get; } = lines;
|
||||||
|
|||||||
@@ -0,0 +1,8 @@
|
|||||||
|
namespace RolemasterDb.ImportTool.Parsing;
|
||||||
|
|
||||||
|
/// <summary>
/// A group parsed from a critical table, described by its key, its label and its sort order.
/// </summary>
public sealed class ParsedCriticalGroup
{
    /// <summary>Creates a group from its key, label and sort order.</summary>
    public ParsedCriticalGroup(string groupKey, string label, int sortOrder)
    {
        GroupKey = groupKey;
        Label = label;
        SortOrder = sortOrder;
    }

    /// <summary>Key identifying the group.</summary>
    public string GroupKey { get; }

    /// <summary>Label of the group.</summary>
    public string Label { get; }

    /// <summary>Position of the group relative to the other groups.</summary>
    public int SortOrder { get; }
}
|
||||||
@@ -1,12 +1,14 @@
|
|||||||
namespace RolemasterDb.ImportTool.Parsing;
|
namespace RolemasterDb.ImportTool.Parsing;
|
||||||
|
|
||||||
public sealed class ParsedCriticalResult(
|
public sealed class ParsedCriticalResult(
|
||||||
|
string? groupKey,
|
||||||
string columnKey,
|
string columnKey,
|
||||||
string rollBandLabel,
|
string rollBandLabel,
|
||||||
string rawCellText,
|
string rawCellText,
|
||||||
string descriptionText,
|
string descriptionText,
|
||||||
string? rawAffixText)
|
string? rawAffixText)
|
||||||
{
|
{
|
||||||
|
public string? GroupKey { get; } = groupKey;
|
||||||
public string ColumnKey { get; } = columnKey;
|
public string ColumnKey { get; } = columnKey;
|
||||||
public string RollBandLabel { get; } = rollBandLabel;
|
public string RollBandLabel { get; } = rollBandLabel;
|
||||||
public string RawCellText { get; } = rawCellText;
|
public string RawCellText { get; } = rawCellText;
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ public sealed class ParsedCriticalTable(
|
|||||||
string family,
|
string family,
|
||||||
string sourceDocument,
|
string sourceDocument,
|
||||||
string? notes,
|
string? notes,
|
||||||
|
IReadOnlyList<ParsedCriticalGroup> groups,
|
||||||
IReadOnlyList<ParsedCriticalColumn> columns,
|
IReadOnlyList<ParsedCriticalColumn> columns,
|
||||||
IReadOnlyList<ParsedCriticalRollBand> rollBands,
|
IReadOnlyList<ParsedCriticalRollBand> rollBands,
|
||||||
IReadOnlyList<ParsedCriticalResult> results)
|
IReadOnlyList<ParsedCriticalResult> results)
|
||||||
@@ -15,6 +16,7 @@ public sealed class ParsedCriticalTable(
|
|||||||
public string Family { get; } = family;
|
public string Family { get; } = family;
|
||||||
public string SourceDocument { get; } = sourceDocument;
|
public string SourceDocument { get; } = sourceDocument;
|
||||||
public string? Notes { get; } = notes;
|
public string? Notes { get; } = notes;
|
||||||
|
public IReadOnlyList<ParsedCriticalGroup> Groups { get; } = groups;
|
||||||
public IReadOnlyList<ParsedCriticalColumn> Columns { get; } = columns;
|
public IReadOnlyList<ParsedCriticalColumn> Columns { get; } = columns;
|
||||||
public IReadOnlyList<ParsedCriticalRollBand> RollBands { get; } = rollBands;
|
public IReadOnlyList<ParsedCriticalRollBand> RollBands { get; } = rollBands;
|
||||||
public IReadOnlyList<ParsedCriticalResult> Results { get; } = results;
|
public IReadOnlyList<ParsedCriticalResult> Results { get; } = results;
|
||||||
|
|||||||
@@ -1,33 +1,20 @@
|
|||||||
using System.Text.RegularExpressions;
|
|
||||||
using System.Xml;
|
|
||||||
using System.Xml.Linq;
|
|
||||||
|
|
||||||
namespace RolemasterDb.ImportTool.Parsing;
|
namespace RolemasterDb.ImportTool.Parsing;
|
||||||
|
|
||||||
public sealed class StandardCriticalTableParser
|
public sealed class StandardCriticalTableParser
|
||||||
{
|
{
|
||||||
private const int HeaderToBodyMinimumGap = 20;
|
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
|
||||||
private const int FooterLabelExclusionGap = 15;
|
|
||||||
private const int FooterPageNumberExclusionGap = 80;
|
|
||||||
private const int RowLabelDuplicateTolerance = 15;
|
|
||||||
private const int TopGroupingTolerance = 2;
|
|
||||||
private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled);
|
|
||||||
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled);
|
|
||||||
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-)\d+\)$", RegexOptions.Compiled);
|
|
||||||
|
|
||||||
public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
|
|
||||||
{
|
{
|
||||||
var fragments = LoadFragments(xmlContent);
|
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
|
||||||
var headerFragments = FindHeaderFragments(fragments);
|
var headerFragments = FindHeaderFragments(fragments);
|
||||||
var validationErrors = new List<string>();
|
var validationErrors = new List<string>();
|
||||||
var validationWarnings = new List<string>();
|
var validationWarnings = new List<string>();
|
||||||
|
|
||||||
var columnCenters = headerFragments
|
var columnCenters = headerFragments
|
||||||
.OrderBy(item => item.Left)
|
.OrderBy(item => item.Left)
|
||||||
.Select(item => new ColumnAnchor(item.Text.ToUpperInvariant(), item.CenterX))
|
.Select(item => (Key: item.Text.ToUpperInvariant(), CenterX: item.CenterX))
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
|
var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap;
|
||||||
var keyTop = fragments
|
var keyTop = fragments
|
||||||
.Where(item =>
|
.Where(item =>
|
||||||
string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
|
string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
|
||||||
@@ -35,12 +22,17 @@ public sealed class StandardCriticalTableParser
|
|||||||
item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
|
item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
|
||||||
.Select(item => (int?)item.Top)
|
.Select(item => (int?)item.Top)
|
||||||
.Min() ?? int.MaxValue;
|
.Min() ?? int.MaxValue;
|
||||||
var affixLegendSymbols = DetectAffixLegendSymbols(fragments, keyTop);
|
var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop);
|
||||||
var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments, keyTop);
|
var leftCutoff = headerFragments.Min(item => item.Left) - 10;
|
||||||
|
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
|
||||||
|
fragments,
|
||||||
|
leftCutoff,
|
||||||
|
bodyStartTop,
|
||||||
|
keyTop);
|
||||||
|
|
||||||
var rowAnchors = rowLabelFragments
|
var rowAnchors = rowLabelFragments
|
||||||
.OrderBy(item => item.Top)
|
.OrderBy(item => item.Top)
|
||||||
.Select((item, index) => new RowAnchor(item.Text, item.Top, index + 1))
|
.Select((item, index) => new RowAnchor(CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), item.Top, index + 1))
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
if (rowAnchors.Count == 0)
|
if (rowAnchors.Count == 0)
|
||||||
@@ -51,16 +43,17 @@ public sealed class StandardCriticalTableParser
|
|||||||
var bodyFragments = fragments
|
var bodyFragments = fragments
|
||||||
.Where(item =>
|
.Where(item =>
|
||||||
item.Top >= bodyStartTop &&
|
item.Top >= bodyStartTop &&
|
||||||
item.Top < keyTop - TopGroupingTolerance &&
|
item.Top < keyTop - CriticalTableParserSupport.TopGroupingTolerance &&
|
||||||
!IsFooterPageNumberFragment(item, keyTop) &&
|
!CriticalTableParserSupport.IsFooterPageNumberFragment(item, keyTop) &&
|
||||||
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
|
!CriticalTableParserSupport.IsPotentialRowLabelFragment(item, leftCutoff) &&
|
||||||
|
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) &&
|
||||||
!headerFragments.Contains(item))
|
!headerFragments.Contains(item))
|
||||||
.ToList();
|
.ToList();
|
||||||
bodyFragments = SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
|
bodyFragments = CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
|
||||||
var bodyLines = BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
|
var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
|
||||||
|
|
||||||
var parsedRollBands = rowAnchors
|
var parsedRollBands = rowAnchors
|
||||||
.Select(anchor => CreateRollBand(anchor.Label, anchor.SortOrder))
|
.Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
var cellEntries = new List<CellEntry>();
|
var cellEntries = new List<CellEntry>();
|
||||||
@@ -82,7 +75,7 @@ public sealed class StandardCriticalTableParser
|
|||||||
foreach (var columnAnchor in columnCenters)
|
foreach (var columnAnchor in columnCenters)
|
||||||
{
|
{
|
||||||
var cellFragments = rowFragments
|
var cellFragments = rowFragments
|
||||||
.Where(item => ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key)
|
.Where(item => CriticalTableParserSupport.ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key)
|
||||||
.OrderBy(item => item.Top)
|
.OrderBy(item => item.Top)
|
||||||
.ThenBy(item => item.Left)
|
.ThenBy(item => item.Left)
|
||||||
.ToList();
|
.ToList();
|
||||||
@@ -97,7 +90,7 @@ public sealed class StandardCriticalTableParser
|
|||||||
rowAnchors[rowIndex].Label,
|
rowAnchors[rowIndex].Label,
|
||||||
rowIndex,
|
rowIndex,
|
||||||
columnAnchor.Key,
|
columnAnchor.Key,
|
||||||
BuildLines(cellFragments).ToList()));
|
CriticalTableParserSupport.BuildLines(cellFragments).ToList()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -108,7 +101,7 @@ public sealed class StandardCriticalTableParser
|
|||||||
|
|
||||||
foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey))
|
foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey))
|
||||||
{
|
{
|
||||||
var segmentCount = CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols);
|
var segmentCount = CriticalTableParserSupport.CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols);
|
||||||
|
|
||||||
if (segmentCount > 2)
|
if (segmentCount > 2)
|
||||||
{
|
{
|
||||||
@@ -116,13 +109,14 @@ public sealed class StandardCriticalTableParser
|
|||||||
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
|
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
|
||||||
}
|
}
|
||||||
|
|
||||||
var rawAffixLines = cellEntry.Lines.Where(line => IsAffixLikeLine(line, affixLegendSymbols)).ToList();
|
var rawAffixLines = cellEntry.Lines.Where(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
|
||||||
var descriptionLines = cellEntry.Lines.Where(line => !IsAffixLikeLine(line, affixLegendSymbols)).ToList();
|
var descriptionLines = cellEntry.Lines.Where(line => !CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
|
||||||
var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines);
|
var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines);
|
||||||
var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines));
|
var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', descriptionLines));
|
||||||
var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
|
var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
|
||||||
|
|
||||||
parsedCells.Add(new ParsedCriticalCellArtifact(
|
parsedCells.Add(new ParsedCriticalCellArtifact(
|
||||||
|
null,
|
||||||
cellEntry.RollBandLabel,
|
cellEntry.RollBandLabel,
|
||||||
cellEntry.ColumnKey,
|
cellEntry.ColumnKey,
|
||||||
cellEntry.Lines,
|
cellEntry.Lines,
|
||||||
@@ -131,6 +125,7 @@ public sealed class StandardCriticalTableParser
|
|||||||
rawAffixText));
|
rawAffixText));
|
||||||
|
|
||||||
parsedResults.Add(new ParsedCriticalResult(
|
parsedResults.Add(new ParsedCriticalResult(
|
||||||
|
null,
|
||||||
cellEntry.ColumnKey,
|
cellEntry.ColumnKey,
|
||||||
cellEntry.RollBandLabel,
|
cellEntry.RollBandLabel,
|
||||||
rawCellText,
|
rawCellText,
|
||||||
@@ -162,40 +157,12 @@ public sealed class StandardCriticalTableParser
|
|||||||
entry.Family,
|
entry.Family,
|
||||||
Path.GetFileName(entry.PdfPath),
|
Path.GetFileName(entry.PdfPath),
|
||||||
"Imported from PDF XML extraction.",
|
"Imported from PDF XML extraction.",
|
||||||
|
[],
|
||||||
columnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(),
|
columnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(),
|
||||||
parsedRollBands,
|
parsedRollBands,
|
||||||
parsedResults);
|
parsedResults);
|
||||||
|
|
||||||
return new StandardCriticalTableParseResult(table, fragments, parsedCells, validationReport);
|
return new CriticalTableParseResult(table, fragments, parsedCells, validationReport);
|
||||||
}
|
|
||||||
|
|
||||||
/// <summary>
/// Reads every non-blank <c>text</c> element under each <c>page</c> element of the XML and
/// turns it into an <see cref="XmlTextFragment"/>.
/// </summary>
/// <remarks>
/// DTD processing is ignored while reading. A page without a <c>number</c> attribute is treated
/// as page 1. All integer attributes are parsed with the invariant culture because they are
/// machine-written values, not user-facing text.
/// </remarks>
/// <param name="xmlContent">The XML document as a string.</param>
/// <returns>Fragments in document order, excluding those whose normalized text is blank.</returns>
/// <exception cref="InvalidOperationException">
/// A <c>text</c> element lacks a <c>top</c>, <c>left</c>, <c>width</c> or <c>height</c> attribute.
/// </exception>
/// <exception cref="FormatException">An attribute value is not a valid integer.</exception>
private static List<XmlTextFragment> LoadFragments(string xmlContent)
{
    // Fully qualified so the block does not depend on an extra using directive.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Reads a mandatory integer attribute of a <text> element.
    int ReadRequiredInt(XElement element, string attributeName) =>
        int.Parse(
            element.Attribute(attributeName)?.Value
                ?? throw new InvalidOperationException($"Missing text {attributeName} attribute."),
            invariant);

    using var stringReader = new StringReader(xmlContent);
    using var xmlReader = XmlReader.Create(
        stringReader,
        new XmlReaderSettings
        {
            DtdProcessing = DtdProcessing.Ignore
        });

    var document = XDocument.Load(xmlReader);

    return document.Descendants("page")
        .SelectMany(page =>
        {
            var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1", invariant);
            return page.Elements("text")
                .Select(item => new XmlTextFragment(
                    pageNumber,
                    ReadRequiredInt(item, "top"),
                    ReadRequiredInt(item, "left"),
                    ReadRequiredInt(item, "width"),
                    ReadRequiredInt(item, "height"),
                    // Concatenate all nested text nodes so inline markup inside <text> is flattened.
                    NormalizeText(string.Concat(item.DescendantNodes().OfType<XText>().Select(node => node.Value)))))
                .Where(item => !string.IsNullOrWhiteSpace(item.Text));
        })
        .ToList();
}
|
}
|
||||||
|
|
||||||
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
|
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
|
||||||
@@ -206,7 +173,7 @@ public sealed class StandardCriticalTableParser
|
|||||||
.ThenBy(item => item.Left)
|
.ThenBy(item => item.Left)
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
foreach (var group in GroupByTop(headerCandidates))
|
foreach (var group in CriticalTableParserSupport.GroupByTop(headerCandidates))
|
||||||
{
|
{
|
||||||
var ordered = group.OrderBy(item => item.Left).ToList();
|
var ordered = group.OrderBy(item => item.Left).ToList();
|
||||||
var labels = ordered.Select(item => item.Text.ToUpperInvariant()).ToList();
|
var labels = ordered.Select(item => item.Text.ToUpperInvariant()).ToList();
|
||||||
@@ -219,156 +186,6 @@ public sealed class StandardCriticalTableParser
|
|||||||
throw new InvalidOperationException("Could not find the standard-table A-E header row in the XML artifact.");
|
throw new InvalidOperationException("Could not find the standard-table A-E header row in the XML artifact.");
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
/// Collects the fragments that act as row labels: roll-band-shaped text located left of the
/// header columns, below the header row and above the footer key.
/// </summary>
/// <remarks>
/// A candidate is dropped when it repeats the text of the previously kept label
/// (case-insensitively) within <c>RowLabelDuplicateTolerance</c> of its top coordinate.
/// </remarks>
/// <param name="fragments">All text fragments of the XML artifact.</param>
/// <param name="headerFragments">The header fragments; must not be empty.</param>
/// <param name="keyTop">Top coordinate of the footer key.</param>
/// <returns>The kept row-label fragments ordered by top coordinate.</returns>
private static List<XmlTextFragment> FindRowLabelFragments(
    IReadOnlyList<XmlTextFragment> fragments,
    IReadOnlyList<XmlTextFragment> headerFragments,
    int keyTop)
{
    // Labels must start at least 10 units left of the left-most header fragment.
    var leftCutoff = headerFragments.Min(item => item.Left) - 10;
    var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
    var labelBottomLimit = keyTop - FooterLabelExclusionGap;

    var labelCandidates = fragments
        .Where(fragment =>
            fragment.Left < leftCutoff &&
            fragment.Top >= bodyStartTop &&
            fragment.Top < labelBottomLimit &&
            IsRollBandLabel(fragment.Text))
        .OrderBy(fragment => fragment.Top);

    var kept = new List<XmlTextFragment>();
    foreach (var fragment in labelCandidates)
    {
        if (kept.Count > 0)
        {
            var last = kept[^1];
            var repeatsLast =
                string.Equals(last.Text, fragment.Text, StringComparison.OrdinalIgnoreCase) &&
                Math.Abs(last.Top - fragment.Top) <= RowLabelDuplicateTolerance;
            if (repeatsLast)
            {
                continue;
            }
        }

        kept.Add(fragment);
    }

    return kept;
}
|
|
||||||
|
|
||||||
/// <summary>
/// Tells whether the trimmed text has the shape of a roll-band label: two or three digits,
/// optionally followed by <c>-</c> and two or three more digits, or two or three digits
/// followed by <c>+</c>.
/// </summary>
/// <param name="value">Text to test.</param>
/// <returns><c>true</c> when the trimmed text matches one of the label shapes.</returns>
private static bool IsRollBandLabel(string value)
{
    var candidate = value.Trim();
    return Regex.IsMatch(candidate, @"^\d{2,3}(?:-\d{2,3})?$|^\d{2,3}\+$");
}
|
|
||||||
|
|
||||||
/// <summary>
/// Builds a roll band from its label.
/// </summary>
/// <remarks>
/// <list type="bullet">
/// <item><description><c>"N+"</c> yields minimum N and no maximum.</description></item>
/// <item><description><c>"N"</c> yields minimum and maximum N.</description></item>
/// <item><description><c>"N-M"</c> yields minimum N and maximum M.</description></item>
/// </list>
/// </remarks>
/// <param name="label">The roll-band label; stored unchanged on the result.</param>
/// <param name="sortOrder">Sort order stored on the result.</param>
/// <returns>The parsed roll band.</returns>
private static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder)
{
    if (label.EndsWith('+'))
    {
        // Open-ended band: everything before the trailing '+' is the minimum.
        var openMinimum = int.Parse(label[..^1]);
        return new ParsedCriticalRollBand(label, openMinimum, null, sortOrder);
    }

    var bounds = label.Split('-', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    var minimum = int.Parse(bounds[0]);
    var maximum = bounds.Length == 1 ? minimum : int.Parse(bounds[1]);
    return new ParsedCriticalRollBand(label, minimum, maximum, sortOrder);
}
|
|
||||||
|
|
||||||
/// <summary>
/// Maps a horizontal center position to the key of the column it falls into.
/// </summary>
/// <remarks>
/// The boundary between two neighbouring columns is the midpoint of their centers. The first
/// column whose right-hand boundary lies beyond <paramref name="centerX"/> wins; positions
/// beyond every boundary belong to the last column.
/// </remarks>
/// <param name="centerX">Horizontal center of the item to place.</param>
/// <param name="columns">Column anchors, expected in left-to-right order; must not be empty.</param>
/// <returns>The key of the resolved column.</returns>
private static string ResolveColumn(double centerX, IReadOnlyList<ColumnAnchor> columns)
{
    var lastIndex = columns.Count - 1;
    var position = 0;

    while (position < lastIndex)
    {
        var midpoint = (columns[position].CenterX + columns[position + 1].CenterX) / 2.0;
        if (centerX < midpoint)
        {
            return columns[position].Key;
        }

        position++;
    }

    return columns[^1].Key;
}
|
|
||||||
|
|
||||||
/// <summary>
/// Joins fragments into text lines.
/// </summary>
/// <remarks>
/// Fragments are visited top to bottom, then left to right. A fragment starts a new line when
/// its top coordinate differs from the first fragment of the current line by more than
/// <c>TopGroupingTolerance</c>. Each line is joined left to right with single spaces, has its
/// whitespace collapsed, and is dropped if blank.
/// </remarks>
/// <param name="fragments">Fragments of a single cell.</param>
/// <returns>The non-blank text lines from top to bottom.</returns>
private static IReadOnlyList<string> BuildLines(IReadOnlyList<XmlTextFragment> fragments)
{
    var grouped = new List<List<XmlTextFragment>>();
    List<XmlTextFragment>? openLine = null;

    foreach (var fragment in fragments.OrderBy(item => item.Top).ThenBy(item => item.Left))
    {
        if (openLine is null || Math.Abs(openLine[0].Top - fragment.Top) > TopGroupingTolerance)
        {
            openLine = new List<XmlTextFragment> { fragment };
            grouped.Add(openLine);
        }
        else
        {
            openLine.Add(fragment);
        }
    }

    var result = new List<string>(grouped.Count);
    foreach (var lineFragments in grouped)
    {
        var words = lineFragments.OrderBy(item => item.Left).Select(item => item.Text);
        var text = CollapseWhitespace(string.Join(' ', words));
        if (!string.IsNullOrWhiteSpace(text))
        {
            result.Add(text);
        }
    }

    return result;
}
|
|
||||||
|
|
||||||
/// <summary>
/// Classifies a text line as "affix-like" (as opposed to descriptive prose).
/// </summary>
/// <remarks>
/// Checks, in order, on the trimmed line:
/// <list type="number">
/// <item><description>Empty text is not an affix.</description></item>
/// <item><description>A lone hyphen, en dash or em dash is an affix.</description></item>
/// <item><description>A line starting with a conditional word (<c>with</c>, <c>w/o</c>,
/// <c>without</c>, <c>if</c>, <c>while</c>, <c>until</c>, <c>unless</c>) is an affix exactly
/// when it contains a colon.</description></item>
/// <item><description>A line containing a legend symbol is an affix when it also contains a
/// digit, or when nothing but whitespace remains after removing the symbols and the characters
/// <c>+ - ( ) /</c>.</description></item>
/// <item><description>Otherwise the line is an affix when it starts with <c>+</c> or one of
/// the symbols U+2211, U+220F, U+03C0, U+222B, matches one of the two affix regexes, or
/// contains <c>" - "</c>.</description></item>
/// </list>
/// </remarks>
/// <param name="line">The line to classify.</param>
/// <param name="affixLegendSymbols">Symbols detected in the table's legend; may be empty.</param>
/// <returns><c>true</c> when the line is affix-like.</returns>
private static bool IsAffixLikeLine(string line, ISet<string> affixLegendSymbols)
{
    var text = line.Trim();

    switch (text)
    {
        case "":
            return false;
        case "-":
        case "\u2013":
        case "\u2014":
            return true;
    }

    string[] conditionalPrefixes = ["with ", "w/o ", "without ", "if ", "while ", "until ", "unless "];
    if (conditionalPrefixes.Any(prefix => text.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)))
    {
        // Conditional clauses only count as affixes in their "condition: effect" form.
        return text.Contains(':', StringComparison.Ordinal);
    }

    var mentionsLegendSymbol =
        affixLegendSymbols.Count > 0 &&
        affixLegendSymbols.Any(symbol => text.Contains(symbol, StringComparison.Ordinal));

    if (mentionsLegendSymbol)
    {
        if (text.Any(char.IsDigit))
        {
            return true;
        }

        // Remove longer symbols first so a short symbol cannot break up a longer one.
        var stripped = affixLegendSymbols
            .OrderByDescending(symbol => symbol.Length)
            .Aggregate(text, (remaining, symbol) => remaining.Replace(symbol, string.Empty, StringComparison.Ordinal));

        foreach (var punctuation in new[] { "+", "-", "(", ")", "/" })
        {
            stripped = stripped.Replace(punctuation, string.Empty, StringComparison.Ordinal);
        }

        if (string.IsNullOrWhiteSpace(stripped))
        {
            return true;
        }
    }

    string[] affixLeads = ["+", "\u2211", "\u220F", "\u03C0", "\u222B"];
    if (affixLeads.Any(lead => text.StartsWith(lead, StringComparison.Ordinal)))
    {
        return true;
    }

    return StandaloneModifierAffixLineRegex.IsMatch(text)
        || NumericAffixLineRegex.IsMatch(text)
        || text.Contains(" - ", StringComparison.Ordinal);
}
|
|
||||||
|
|
||||||
/// <summary>
/// Convenience overload that runs the repair with an empty, ordinal-compared set of legend
/// symbols.
/// </summary>
/// <param name="cellEntries">Cells to repair in place.</param>
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries)
{
    var noLegendSymbols = new HashSet<string>(StringComparer.Ordinal);
    RepairLeadingAffixLeakage(cellEntries, noLegendSymbols);
}
|
|
||||||
|
|
||||||
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries, ISet<string> affixLegendSymbols)
|
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries, ISet<string> affixLegendSymbols)
|
||||||
{
|
{
|
||||||
var maxRowIndex = cellEntries.Count == 0 ? -1 : cellEntries.Max(item => item.RowIndex);
|
var maxRowIndex = cellEntries.Count == 0 ? -1 : cellEntries.Max(item => item.RowIndex);
|
||||||
@@ -380,14 +197,13 @@ public sealed class StandardCriticalTableParser
|
|||||||
{
|
{
|
||||||
var current = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex && item.ColumnKey == columnKey);
|
var current = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex && item.ColumnKey == columnKey);
|
||||||
var next = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex + 1 && item.ColumnKey == columnKey);
|
var next = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex + 1 && item.ColumnKey == columnKey);
|
||||||
|
|
||||||
if (current is null || next is null)
|
if (current is null || next is null)
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
var leadingAffixCount = 0;
|
var leadingAffixCount = 0;
|
||||||
while (leadingAffixCount < next.Lines.Count && IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols))
|
while (leadingAffixCount < next.Lines.Count && CriticalTableParserSupport.IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols))
|
||||||
{
|
{
|
||||||
leadingAffixCount++;
|
leadingAffixCount++;
|
||||||
}
|
}
|
||||||
@@ -403,199 +219,10 @@ public sealed class StandardCriticalTableParser
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
/// Trims the text and replaces every run of whitespace with a single space.
/// </summary>
/// <param name="value">Text to clean up.</param>
/// <returns>The trimmed text with single-space separators.</returns>
private static string CollapseWhitespace(string value)
{
    var trimmed = value.Trim();
    return Regex.Replace(trimmed, @"\s+", " ");
}
|
|
||||||
|
|
||||||
/// <summary>
/// Replaces each non-breaking space, carriage return and line feed with a regular space and
/// trims the result. Inner runs of spaces are not collapsed.
/// </summary>
/// <param name="value">Raw text.</param>
/// <returns>The normalized text.</returns>
private static string NormalizeText(string value)
{
    char[] spaceLike = ['\u00a0', '\r', '\n'];

    var normalized = value;
    foreach (var character in spaceLike)
    {
        normalized = normalized.Replace(character, ' ');
    }

    return normalized.Trim();
}
|
|
||||||
|
|
||||||
/// <summary>
/// Counts the runs of consecutive lines that share the same classification (affix-like or not).
/// </summary>
/// <param name="lines">Lines of a cell, top to bottom.</param>
/// <param name="affixLegendSymbols">Legend symbols forwarded to the line classifier.</param>
/// <returns>
/// 0 for no lines, 1 when all lines are of one kind, and one more for every change of kind.
/// </returns>
private static int CountLineTypeSegments(IReadOnlyList<string> lines, ISet<string> affixLegendSymbols)
{
    var runs = 0;
    bool? runIsAffix = null;

    for (var index = 0; index < lines.Count; index++)
    {
        var isAffix = IsAffixLikeLine(lines[index], affixLegendSymbols);
        if (runIsAffix != isAffix)
        {
            // The first line, or a change of kind, opens a new run.
            runs++;
            runIsAffix = isAffix;
        }
    }

    return runs;
}
|
|
||||||
|
|
||||||
/// <summary>
/// Extracts the one-character symbols that the footer legend assigns to the affix kinds
/// "must parry", "no parry", "stun"/"stunned", "bleed" and "powerpoint modification".
/// </summary>
/// <param name="fragments">All text fragments of the XML artifact.</param>
/// <param name="keyTop">
/// Top coordinate of the footer key; <see cref="int.MaxValue"/> means no key was found.
/// </param>
/// <returns>The detected symbols; empty when there is no key or nothing matches.</returns>
private static HashSet<string> DetectAffixLegendSymbols(IReadOnlyList<XmlTextFragment> fragments, int keyTop)
{
    if (keyTop == int.MaxValue)
    {
        return new HashSet<string>();
    }

    string[] legendPatterns =
    [
        @"must parry\s*=\s*(\S)",
        @"no parry\s*=\s*(\S)",
        @"stun(?:ned)?\s*=\s*(\S)",
        @"bleed\s*=\s*(\S)",
        @"powerpoint modification.*=\s*(\S)",
    ];

    // Everything from the key line downwards (with grouping tolerance) is treated as footer.
    var footerFragments = fragments
        .Where(item => item.Top >= keyTop - TopGroupingTolerance)
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    var symbols = new HashSet<string>(StringComparer.Ordinal);

    foreach (var footerRow in GroupByTop(footerFragments))
    {
        var footerLine = CollapseWhitespace(
            string.Join(' ', footerRow.OrderBy(item => item.Left).Select(item => item.Text)));

        foreach (var pattern in legendPatterns)
        {
            AddLegendMatch(symbols, footerLine, pattern);
        }
    }

    return symbols;
}
|
|
||||||
|
|
||||||
/// <summary>
/// Applies <c>SplitBoundaryCrossingAffixFragment</c> to every body fragment and flattens the
/// results into one list, keeping the input order.
/// </summary>
/// <param name="bodyFragments">Body fragments to inspect.</param>
/// <param name="columnCenters">Column anchors used to find column boundaries.</param>
/// <param name="affixLegendSymbols">Legend symbols forwarded to the affix classifier.</param>
/// <returns>A new list in which splittable fragments are replaced by their pieces.</returns>
private static List<XmlTextFragment> SplitBoundaryCrossingAffixFragments(
    IReadOnlyList<XmlTextFragment> bodyFragments,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    return bodyFragments
        .SelectMany(fragment => SplitBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
        .ToList();
}
|
|
||||||
|
|
||||||
/// <summary>
/// Splits a single affix fragment that stretches across a column boundary into one fragment
/// per whitespace-separated segment.
/// </summary>
/// <remarks>
/// The horizontal position and width of each piece are estimated by spreading the fragment's
/// width evenly over its characters. The original fragment is returned unchanged when it does
/// not look like a boundary-crossing affix, when fewer than two segments result, or when all
/// pieces still resolve to the column of the original fragment.
/// </remarks>
/// <param name="fragment">Fragment to inspect.</param>
/// <param name="columnCenters">Column anchors used to resolve columns.</param>
/// <param name="affixLegendSymbols">Legend symbols forwarded to the affix classifier.</param>
/// <returns>Either the pieces, or a one-element list holding the original fragment.</returns>
private static IReadOnlyList<XmlTextFragment> SplitBoundaryCrossingAffixFragment(
    XmlTextFragment fragment,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    if (!LooksLikeBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
    {
        return [fragment];
    }

    var segments = MultiFragmentSplitRegex.Matches(fragment.Text);
    if (segments.Count < 2)
    {
        return [fragment];
    }

    // Average width of one character; the guard avoids dividing by zero for empty text.
    var widthPerCharacter = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
    var pieces = new List<XmlTextFragment>(segments.Count);

    foreach (Match segment in segments)
    {
        var pieceText = CollapseWhitespace(segment.Value);
        if (pieceText.Length > 0)
        {
            var offset = (int)Math.Round(widthPerCharacter * segment.Index);
            var estimatedWidth = (int)Math.Round(widthPerCharacter * segment.Length);

            pieces.Add(new XmlTextFragment(
                fragment.PageNumber,
                fragment.Top,
                fragment.Left + offset,
                Math.Max(1, estimatedWidth),
                fragment.Height,
                pieceText));
        }
    }

    if (pieces.Count < 2)
    {
        return [fragment];
    }

    var homeColumn = ResolveColumn(fragment.CenterX, columnCenters);
    var pieceColumns = pieces
        .Select(piece => ResolveColumn(piece.CenterX, columnCenters))
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .ToList();

    // Splitting only pays off when at least one piece lands in a different column.
    var changesColumnAssignment =
        pieceColumns.Count > 1 ||
        pieceColumns.Any(column => !string.Equals(column, homeColumn, StringComparison.OrdinalIgnoreCase));

    if (changesColumnAssignment)
    {
        return pieces;
    }

    return [fragment];
}
|
|
||||||
|
|
||||||
/// <summary>
/// Reports whether a fragment is affix-like text containing a space and
/// horizontally overlaps the midpoint between two adjacent column centers.
/// </summary>
/// <param name="fragment">The fragment being tested.</param>
/// <param name="columnCenters">Column anchors; boundaries are taken midway between neighbours.</param>
/// <param name="affixLegendSymbols">Legend symbols used to recognise affix-like text.</param>
/// <returns><c>true</c> when the fragment's left edge is before a boundary and its right edge is past it.</returns>
private static bool LooksLikeBoundaryCrossingAffixFragment(
    XmlTextFragment fragment,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    var isSpacedAffixText =
        IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
        fragment.Text.Contains(" ", StringComparison.Ordinal);

    if (!isSpacedAffixText)
    {
        return false;
    }

    var leftEdge = fragment.Left;
    var rightEdge = fragment.Left + fragment.Width;

    // Pair every column with its right-hand neighbour; the boundary sits halfway between them.
    return columnCenters
        .Zip(columnCenters.Skip(1), (near, far) => (near.CenterX + far.CenterX) / 2.0)
        .Any(boundary => leftEdge < boundary && rightEdge > boundary);
}
|
|
||||||
|
|
||||||
/// <summary>
/// Adds the text captured by group 1 of every case-insensitive match of
/// <paramref name="pattern"/> in <paramref name="value"/> to <paramref name="symbols"/>.
/// </summary>
/// <param name="symbols">Set that receives the captured symbols.</param>
/// <param name="value">Text to search.</param>
/// <param name="pattern">Regular expression whose first capture group holds the symbol.</param>
private static void AddLegendMatch(HashSet<string> symbols, string value, string pattern)
{
    foreach (Match match in Regex.Matches(value, pattern, RegexOptions.IgnoreCase))
    {
        // Groups[1] yields an unsuccessful, empty group when the pattern has no
        // group 1 or when that group did not take part in this match. Skipping
        // those keeps the empty string out of the symbol set.
        var symbolGroup = match.Groups[1];
        if (symbolGroup.Success && symbolGroup.Value.Length > 0)
        {
            symbols.Add(symbolGroup.Value);
        }
    }
}
|
|
||||||
|
|
||||||
/// <summary>
/// Groups body fragments into visual lines and records, for each line, its top
/// coordinate and whether every non-blank column text on it is affix-like.
/// </summary>
/// <param name="bodyFragments">Fragments that make up the table body.</param>
/// <param name="columnCenters">Column anchors used to assign fragments to columns.</param>
/// <param name="affixLegendSymbols">Legend symbols used to recognise affix-like text.</param>
/// <returns>One <c>BodyLine</c> per group produced by <c>GroupByTop</c>, in that order.</returns>
private static List<BodyLine> BuildBodyLines(
    IReadOnlyList<XmlTextFragment> bodyFragments,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    var sortedFragments = bodyFragments
        .OrderBy(fragment => fragment.Top)
        .ThenBy(fragment => fragment.Left)
        .ToList();

    var result = new List<BodyLine>();
    foreach (var line in GroupByTop(sortedFragments))
    {
        var affixOnly = IsAffixOnly(line);
        result.Add(new BodyLine(line[0].Top, affixOnly));
    }

    return result;

    // A line counts as affix-like only when it has text and all of its per-column texts are affix-like.
    bool IsAffixOnly(List<XmlTextFragment> line)
    {
        var textsPerColumn = line
            .GroupBy(fragment => ResolveColumn(fragment.CenterX, columnCenters), StringComparer.OrdinalIgnoreCase)
            .Select(column => CollapseWhitespace(
                string.Join(' ', column.OrderBy(fragment => fragment.Left).Select(fragment => fragment.Text))))
            .Where(text => !string.IsNullOrWhiteSpace(text))
            .ToList();

        return textsPerColumn.Count != 0 &&
               textsPerColumn.All(text => IsAffixLikeLine(text, affixLegendSymbols));
    }
}
|
|
||||||
|
|
||||||
private static int ResolveRowBoundaryTop(
|
private static int ResolveRowBoundaryTop(
|
||||||
RowAnchor current,
|
RowAnchor current,
|
||||||
RowAnchor next,
|
RowAnchor next,
|
||||||
IReadOnlyList<BodyLine> bodyLines)
|
IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines)
|
||||||
{
|
{
|
||||||
var linesBetweenLabels = bodyLines
|
var linesBetweenLabels = bodyLines
|
||||||
.Where(item => item.Top >= current.Top && item.Top < next.Top)
|
.Where(item => item.Top >= current.Top && item.Top < next.Top)
|
||||||
@@ -613,41 +240,8 @@ public sealed class StandardCriticalTableParser
|
|||||||
return (int)Math.Floor((current.Top + next.Top) / 2.0) + 1;
|
return (int)Math.Floor((current.Top + next.Top) / 2.0) + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
/// Reports whether a fragment is a two- or three-digit number positioned at or
/// below <c>keyTop - FooterPageNumberExclusionGap</c>.
/// </summary>
/// <param name="fragment">The fragment being tested.</param>
/// <param name="keyTop">Top of the legend; <see cref="int.MaxValue"/> disables the check.</param>
/// <returns><c>true</c> when the fragment matches both the position and the digit pattern.</returns>
private static bool IsFooterPageNumberFragment(XmlTextFragment fragment, int keyTop) =>
    keyTop != int.MaxValue &&
    fragment.Top >= keyTop - FooterPageNumberExclusionGap &&
    Regex.IsMatch(fragment.Text, @"^\d{2,3}$");
|
|
||||||
|
|
||||||
/// <summary>
/// Partitions fragments, in the order given, into consecutive runs whose top
/// coordinate stays within <c>TopGroupingTolerance</c> of the run's first fragment.
/// </summary>
/// <param name="fragments">Fragments to group; the caller decides the ordering.</param>
/// <returns>The runs, each as its own list, in encounter order.</returns>
private static IEnumerable<List<XmlTextFragment>> GroupByTop(IReadOnlyList<XmlTextFragment> fragments)
{
    var lines = new List<List<XmlTextFragment>>();

    foreach (var candidate in fragments)
    {
        // The comparison is always against the first fragment of the open run, not the latest one.
        var continuesOpenLine =
            lines.Count > 0 &&
            Math.Abs(lines[^1][0].Top - candidate.Top) <= TopGroupingTolerance;

        if (!continuesOpenLine)
        {
            lines.Add(new List<XmlTextFragment>());
        }

        lines[^1].Add(candidate);
    }

    return lines;
}
|
|
||||||
|
|
||||||
/// <summary>
/// A table column identified by <c>Key</c>, with <c>CenterX</c> as its horizontal
/// center; boundaries between columns are taken midway between adjacent centers.
/// </summary>
private sealed record ColumnAnchor(string Key, double CenterX);
|
|
||||||
|
|
||||||
private sealed record RowAnchor(string Label, int Top, int SortOrder);
|
private sealed record RowAnchor(string Label, int Top, int SortOrder);
|
||||||
|
|
||||||
/// <summary>
/// One visual line of the table body: the top coordinate of its first fragment and
/// whether all of its per-column texts were classified as affix-like.
/// </summary>
private sealed record BodyLine(int Top, bool IsAffixLike);
|
|
||||||
|
|
||||||
private sealed class CellEntry(string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
|
private sealed class CellEntry(string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
|
||||||
{
|
{
|
||||||
public string RollBandLabel { get; } = rollBandLabel;
|
public string RollBandLabel { get; } = rollBandLabel;
|
||||||
|
|||||||
@@ -0,0 +1,276 @@
|
|||||||
|
namespace RolemasterDb.ImportTool.Parsing;
|
||||||
|
|
||||||
|
public sealed class VariantColumnCriticalTableParser
|
||||||
|
{
|
||||||
|
/// <summary>
/// The variant columns this parser expects, in left-to-right header order.
/// Each entry pairs the stored column key with the label matched against header text.
/// </summary>
private static readonly ColumnDefinition[] ExpectedColumns =
[
    new ColumnDefinition("NORMAL", "Normal"),
    new ColumnDefinition("MAGIC", "Magic"),
    new ColumnDefinition("MITHRIL", "Mithril"),
    new ColumnDefinition("HOLY_ARMS", "Holy Arms"),
    new ColumnDefinition("SLAYING", "Slaying"),
];
|
||||||
|
|
||||||
|
/// <summary>
/// Parses the XML of a variant-column critical table into roll bands, per-cell
/// results, and a validation report.
/// </summary>
/// <param name="entry">Manifest entry whose slug, display name, family, and PDF path are copied onto the parsed table.</param>
/// <param name="xmlContent">XML text passed to <c>CriticalTableParserSupport.LoadFragments</c>.</param>
/// <returns>The parsed table together with the loaded fragments, the cell artifacts, and the validation report.</returns>
/// <exception cref="InvalidOperationException">Thrown when the variant-column header row cannot be found.</exception>
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
    var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
    var headerFragments = FindHeaderFragments(fragments);
    var errors = new List<string>();
    var warnings = new List<string>();

    // Header fragments, read left to right, define the column anchors.
    var columnAnchors = (
        from header in headerFragments
        orderby header.Left
        let definition = ResolveColumnDefinition(header.Text)
        select (definition.Key, definition.Label, header.CenterX)).ToList();

    var bodyStartTop = headerFragments.Max(header => header.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap;

    // The body ends at the topmost legend marker; int.MaxValue means no marker was found.
    static bool IsLegendMarker(XmlTextFragment candidate) =>
        string.Equals(candidate.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
        candidate.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) ||
        candidate.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase);

    var keyTop = fragments
        .Where(candidate => IsLegendMarker(candidate))
        .Select(candidate => (int?)candidate.Top)
        .Min() ?? int.MaxValue;

    var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop);
    var leftCutoff = headerFragments.Min(header => header.Left) - 10;

    var rowAnchors = CriticalTableParserSupport
        .FindRowLabelFragments(fragments, leftCutoff, bodyStartTop, keyTop)
        .OrderBy(label => label.Top)
        .Select((label, position) => new RowAnchor(
            CriticalTableParserSupport.NormalizeRollBandLabel(label.Text),
            label.Top,
            position + 1))
        .ToList();

    if (rowAnchors.Count == 0)
    {
        errors.Add("No roll-band labels were found in the XML artifact.");
    }

    var columnCenters = columnAnchors
        .Select(item => (item.Key, item.CenterX))
        .ToList();

    // Body content is everything between header and legend that is neither a
    // footer page number, a row label, nor part of the header itself.
    bool IsBodyFragment(XmlTextFragment candidate)
    {
        if (candidate.Top < bodyStartTop ||
            candidate.Top >= keyTop - CriticalTableParserSupport.TopGroupingTolerance)
        {
            return false;
        }

        if (CriticalTableParserSupport.IsFooterPageNumberFragment(candidate, keyTop) ||
            CriticalTableParserSupport.IsPotentialRowLabelFragment(candidate, leftCutoff))
        {
            return false;
        }

        var isAnchoredLabel = rowAnchors.Any(anchor =>
            anchor.Top == candidate.Top &&
            string.Equals(
                anchor.Label,
                CriticalTableParserSupport.NormalizeRollBandLabel(candidate.Text),
                StringComparison.OrdinalIgnoreCase));

        return !isAnchoredLabel && !headerFragments.Contains(candidate);
    }

    var unsplitBodyFragments = fragments
        .Where(candidate => IsBodyFragment(candidate))
        .ToList();
    List<XmlTextFragment> bodyFragments = CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(
        unsplitBodyFragments,
        columnCenters,
        affixLegendSymbols);
    var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);

    var parsedRollBands = rowAnchors
        .Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
        .ToList();

    var cellEntries = new List<CellEntry>();
    var lastRowIndex = rowAnchors.Count - 1;

    for (var rowIndex = 0; rowIndex <= lastRowIndex; rowIndex++)
    {
        var rowAnchor = rowAnchors[rowIndex];

        // The first row starts at the body top and the last row ends just above the legend;
        // interior boundaries are resolved between neighbouring row labels.
        var rowStart = rowIndex == 0
            ? bodyStartTop
            : ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchor, bodyLines);
        var rowEnd = rowIndex == lastRowIndex
            ? keyTop - 1
            : ResolveRowBoundaryTop(rowAnchor, rowAnchors[rowIndex + 1], bodyLines);

        var rowFragments = bodyFragments
            .Where(candidate => candidate.Top >= rowStart && candidate.Top < rowEnd)
            .ToList();

        foreach (var column in columnAnchors)
        {
            var cellFragments = rowFragments
                .Where(candidate => CriticalTableParserSupport.ResolveColumn(candidate.CenterX, columnCenters) == column.Key)
                .OrderBy(candidate => candidate.Top)
                .ThenBy(candidate => candidate.Left)
                .ToList();

            if (cellFragments.Count > 0)
            {
                cellEntries.Add(new CellEntry(
                    rowAnchor.Label,
                    rowIndex,
                    column.Key,
                    CriticalTableParserSupport.BuildLines(cellFragments).ToList()));
            }
            else
            {
                errors.Add($"Missing content for roll band '{rowAnchor.Label}', column '{column.Key}'.");
            }
        }
    }

    RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);

    bool IsAffixLine(string line) => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols);

    var parsedCells = new List<ParsedCriticalCellArtifact>();
    var parsedResults = new List<ParsedCriticalResult>();

    var orderedCells = cellEntries
        .OrderBy(cell => cell.RowIndex)
        .ThenBy(cell => cell.ColumnKey, StringComparer.Ordinal);

    foreach (var cell in orderedCells)
    {
        // More than two runs of alternating line types means prose and affix lines are interleaved.
        if (CriticalTableParserSupport.CountLineTypeSegments(cell.Lines, affixLegendSymbols) > 2)
        {
            errors.Add($"Cell '{cell.RollBandLabel}/{cell.ColumnKey}' interleaves prose and affix lines.");
        }

        var affixLines = cell.Lines.Where(line => IsAffixLine(line)).ToList();
        var proseLines = cell.Lines.Where(line => !IsAffixLine(line)).ToList();

        var rawCellText = string.Join(Environment.NewLine, cell.Lines);
        var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', proseLines));
        var rawAffixText = affixLines.Count > 0
            ? string.Join(Environment.NewLine, affixLines)
            : null;

        parsedCells.Add(new ParsedCriticalCellArtifact(
            null,
            cell.RollBandLabel,
            cell.ColumnKey,
            cell.Lines,
            rawCellText,
            descriptionText,
            rawAffixText));

        parsedResults.Add(new ParsedCriticalResult(
            null,
            cell.ColumnKey,
            cell.RollBandLabel,
            rawCellText,
            descriptionText,
            rawAffixText));
    }

    if (columnAnchors.Count != ExpectedColumns.Length)
    {
        errors.Add($"Expected {ExpectedColumns.Length} variant columns but found {columnAnchors.Count}.");
    }

    var expectedCellCount = rowAnchors.Count * columnAnchors.Count;
    if (parsedCells.Count != expectedCellCount)
    {
        errors.Add($"Expected {expectedCellCount} parsed cells but produced {parsedCells.Count}.");
    }

    var validationReport = new ImportValidationReport(
        errors.Count == 0,
        errors,
        warnings,
        rowAnchors.Count,
        parsedCells.Count);

    var parsedColumns = ExpectedColumns
        .Select((definition, position) => new ParsedCriticalColumn(definition.Key, definition.Label, "variant", position + 1))
        .ToList();

    var table = new ParsedCriticalTable(
        entry.Slug,
        entry.DisplayName,
        entry.Family,
        Path.GetFileName(entry.PdfPath),
        "Imported from PDF XML extraction.",
        [],
        parsedColumns,
        parsedRollBands,
        parsedResults);

    return new CriticalTableParseResult(table, fragments, parsedCells, validationReport);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Locates the header row: the first group of same-line fragments whose trimmed,
/// lower-cased texts, read left to right, equal the expected column labels in order.
/// </summary>
/// <param name="fragments">All fragments loaded from the XML artifact.</param>
/// <returns>The header fragments ordered by their left coordinate.</returns>
/// <exception cref="InvalidOperationException">Thrown when no group matches the expected labels.</exception>
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
{
    static string Normalize(XmlTextFragment fragment) => fragment.Text.Trim().ToLowerInvariant();

    var expectedSequence = ExpectedColumns
        .Select(definition => definition.Label.ToLowerInvariant())
        .ToList();

    // Only fragments whose text is one of the expected labels can be part of the header row.
    var candidates = (
        from fragment in fragments
        where expectedSequence.Contains(Normalize(fragment), StringComparer.Ordinal)
        orderby fragment.Top, fragment.Left
        select fragment).ToList();

    var headerRow = CriticalTableParserSupport.GroupByTop(candidates)
        .Select(line => line.OrderBy(fragment => fragment.Left).ToList())
        .FirstOrDefault(line => line.Select(fragment => Normalize(fragment)).SequenceEqual(expectedSequence));

    return headerRow
        ?? throw new InvalidOperationException("Could not find the variant-column header row in the XML artifact.");
}
|
||||||
|
|
||||||
|
/// <summary>
/// Maps header text to its expected column definition by comparing the trimmed text
/// with each expected label, ignoring case.
/// </summary>
/// <param name="value">Header text as it appears in the XML artifact.</param>
/// <returns>The matching column definition.</returns>
/// <exception cref="InvalidOperationException">Thrown when no expected label matches.</exception>
private static ColumnDefinition ResolveColumnDefinition(string value)
{
    var trimmedLabel = value.Trim();
    var definition = ExpectedColumns.SingleOrDefault(
        candidate => string.Equals(candidate.Label, trimmedLabel, StringComparison.OrdinalIgnoreCase));

    if (definition is null)
    {
        throw new InvalidOperationException($"Unsupported variant column label '{value}'.");
    }

    return definition;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Moves affix-like lines found at the start of a cell to the end of the cell
/// directly above it in the same column.
/// </summary>
/// <remarks>
/// A cell is left untouched when it has no leading affix-like lines, or when
/// all of its lines are affix-like.
/// </remarks>
/// <param name="cellEntries">Cells to repair in place; their <c>Lines</c> lists are mutated.</param>
/// <param name="affixLegendSymbols">Legend symbols used to recognise affix-like lines.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when more than one cell shares a visited (row, column) position.
/// </exception>
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries, ISet<string> affixLegendSymbols)
{
    if (cellEntries.Count == 0)
    {
        return;
    }

    var maxRowIndex = cellEntries.Max(item => item.RowIndex);
    var columnKeys = cellEntries
        .Select(item => item.ColumnKey)
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .ToList();

    // Index the cells once by (row, column) so each probe below is a hash lookup
    // instead of a scan over the whole list. Tuple equality compares the column
    // key ordinally, matching the previous `==` comparison.
    var cellsByPosition = cellEntries.ToLookup(item => (item.RowIndex, item.ColumnKey));

    for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++)
    {
        foreach (var columnKey in columnKeys)
        {
            // SingleOrDefault keeps the original contract: absent cells are skipped, duplicates throw.
            var current = cellsByPosition[(rowIndex, columnKey)].SingleOrDefault();
            var next = cellsByPosition[(rowIndex + 1, columnKey)].SingleOrDefault();
            if (current is null || next is null)
            {
                continue;
            }

            var leadingAffixCount = next.Lines
                .TakeWhile(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols))
                .Count();

            if (leadingAffixCount == 0 || leadingAffixCount == next.Lines.Count)
            {
                continue;
            }

            current.Lines.AddRange(next.Lines.Take(leadingAffixCount));
            next.Lines.RemoveRange(0, leadingAffixCount);
        }
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Picks the top coordinate at which the row labelled <paramref name="next"/> begins.
/// </summary>
/// <remarks>
/// Scanning upward from <paramref name="next"/>, the first affix-like line that is
/// immediately followed by a non-affix line marks the split, and the boundary falls
/// between those two lines. When no such pair exists the boundary falls between
/// the two row labels.
/// </remarks>
/// <param name="current">Anchor of the upper row.</param>
/// <param name="next">Anchor of the lower row.</param>
/// <param name="bodyLines">Body lines with their top coordinate and affix classification.</param>
/// <returns>The floor of the midpoint of the chosen pair of tops, plus one.</returns>
private static int ResolveRowBoundaryTop(
    RowAnchor current,
    RowAnchor next,
    IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines)
{
    static int JustBelowMidpoint(int upperTop, int lowerTop) =>
        (int)Math.Floor((upperTop + lowerTop) / 2.0) + 1;

    var between = bodyLines
        .Where(line => line.Top >= current.Top && line.Top < next.Top)
        .OrderBy(line => line.Top)
        .ToArray();

    // Walk adjacent pairs from the bottom up so the split closest to the next label wins.
    for (var lower = between.Length - 1; lower >= 1; lower--)
    {
        var upper = lower - 1;
        if (between[upper].IsAffixLike && !between[lower].IsAffixLike)
        {
            return JustBelowMidpoint(between[upper].Top, between[lower].Top);
        }
    }

    return JustBelowMidpoint(current.Top, next.Top);
}
|
||||||
|
|
||||||
|
/// <summary>
/// An expected variant column: <c>Key</c> identifies the column in parsed output and
/// <c>Label</c> is the header text it is matched against.
/// </summary>
private sealed record ColumnDefinition(string Key, string Label);
|
||||||
|
|
||||||
|
/// <summary>
/// A roll-band row label: its normalized label text, the top coordinate of the label
/// fragment, and its 1-based position when labels are ordered by top.
/// </summary>
private sealed record RowAnchor(string Label, int Top, int SortOrder);
|
||||||
|
|
||||||
|
/// <summary>
/// The text lines collected for one table cell, identified by roll band and column.
/// The <c>Lines</c> list is the instance passed in and may be mutated by later repair steps.
/// </summary>
private sealed class CellEntry
{
    public CellEntry(string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
    {
        RollBandLabel = rollBandLabel;
        RowIndex = rowIndex;
        ColumnKey = columnKey;
        Lines = lines;
    }

    /// <summary>Label of the roll band (row) this cell belongs to.</summary>
    public string RollBandLabel { get; }

    /// <summary>Zero-based index of the row this cell belongs to.</summary>
    public int RowIndex { get; }

    /// <summary>Key of the column this cell belongs to.</summary>
    public string ColumnKey { get; }

    /// <summary>Text lines of the cell, top to bottom.</summary>
    public List<string> Lines { get; }
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user