diff --git a/docs/critical_import_tool.md b/docs/critical_import_tool.md index 5e481c9..cb746c0 100644 --- a/docs/critical_import_tool.md +++ b/docs/critical_import_tool.md @@ -30,8 +30,10 @@ The current implementation supports: - explicit CLI commands for reset, extraction, and import - manifest-driven source selection - `standard` critical tables with columns `A-E` +- `variant_column` critical tables with non-severity columns +- `grouped_variant` critical tables with a group axis plus variant columns - XML-based extraction using `pdftohtml -xml` -- geometry-based parsing across the currently enabled phase-3 tables: +- geometry-based parsing across the currently enabled table set: - `arcane-aether` - `arcane-nether` - `ballistic-shrapnel` @@ -42,22 +44,24 @@ The current implementation supports: - `heat` - `impact` - `krush` + - `large_creature_magic` + - `large_creature_weapon` - `ma-strikes` - `ma-sweeps` - `mana` - `puncture` - `slash` - `subdual` + - `super_large_creature_weapon` - `tiny` - `unbalance` - row-boundary repair for trailing affix leakage +- split row-label reconstruction for tables that render labels such as `99-` / `100` as two fragments - footer/page-number filtering during body parsing - transactional loading into SQLite The current implementation does not yet support: -- variant-column critical tables -- grouped variant tables - OCR/image-based PDFs such as `Void.pdf` - normalized `critical_branch` population - normalized `critical_effect` population @@ -246,9 +250,28 @@ Current phase-3 notes: ### Phase 4: Variant and Grouped Tables -- support `variant_column` tables such as `Large Creature - Weapon.pdf` -- support `grouped_variant` tables such as `Large Creature - Magic.pdf` -- add parser strategies for additional table families +Phase 4 extended the importer beyond `A-E` tables. 
+ +The currently enabled phase-4 table set is: + +- `large_creature_weapon` + - `family`: `variant_column` + - columns: `NORMAL`, `MAGIC`, `MITHRIL`, `HOLY_ARMS`, `SLAYING` +- `super_large_creature_weapon` + - `family`: `variant_column` + - columns: `NORMAL`, `MAGIC`, `MITHRIL`, `HOLY_ARMS`, `SLAYING` +- `large_creature_magic` + - `family`: `grouped_variant` + - groups: `large`, `super_large` + - columns: `NORMAL`, `SLAYING` + +Phase-4 notes: + +- grouped results now populate `critical_group` during SQLite load +- parser dispatch is family-based instead of standard-table only +- left-margin row labels can be reconstructed from split fragments such as `151-` / `175` +- the grouped magic PDF is imported once as `large_creature_magic` + - `sources/Large Creature - Magic.pdf` and `sources/Super Large Creature - Magic.pdf` are duplicate files ### Phase 5: Conditional Branch Extraction @@ -335,10 +358,12 @@ Each entry declares: The manifest is intentionally the control point for enabling importer support one table at a time. -For the currently enabled phase-3 entries: +For the currently enabled entries: -- `family` is `standard` -- `extractionMethod` is `xml` +- standard tables use `family: standard` +- creature weapon tables use `family: variant_column` +- grouped creature magic uses `family: grouped_variant` +- all enabled entries currently use `extractionMethod: xml` ## Artifact Layout diff --git a/docs/critical_tables_db_model.md b/docs/critical_tables_db_model.md index e0ce75d..7746c23 100644 --- a/docs/critical_tables_db_model.md +++ b/docs/critical_tables_db_model.md @@ -19,11 +19,12 @@ The PDFs are not one uniform table shape. I found three families: - Example: `Large Creature - Magic.pdf` has: - group: `large`, `super_large` - column: `normal`, `slaying` + - In the current importer manifest, the grouped magic PDF is loaded once as `large_creature_magic` because the `Large Creature - Magic.pdf` and `Super Large Creature - Magic.pdf` source files are duplicates. 
- row: roll band There are also extraction constraints: -- Most PDFs are text extractable with `pdftotext -layout`. +- Most PDFs are text extractable with `pdftohtml -xml`. - `Void.pdf` appears image-based and will need OCR or manual transcription. - A single cell can contain: - base description text @@ -282,4 +283,3 @@ Recommended import flow: 6. Route image PDFs like `Void.pdf` through OCR before the same parser. The important design decision is: never throw away the original text. The prose is too irregular to rely on normalized fields alone. - diff --git a/sources/critical-import-manifest.json b/sources/critical-import-manifest.json index 0c1953e..9721b5a 100644 --- a/sources/critical-import-manifest.json +++ b/sources/critical-import-manifest.json @@ -80,6 +80,22 @@ "pdfPath": "sources/Krush.pdf", "enabled": true }, + { + "slug": "large_creature_magic", + "displayName": "Spells Against Creatures Critical Strike Table", + "family": "grouped_variant", + "extractionMethod": "xml", + "pdfPath": "sources/Large Creature - Magic.pdf", + "enabled": true + }, + { + "slug": "large_creature_weapon", + "displayName": "Large Creature Critical Strike Table", + "family": "variant_column", + "extractionMethod": "xml", + "pdfPath": "sources/Large Creature - Weapon.pdf", + "enabled": true + }, { "slug": "ma-strikes", "displayName": "Martial Arts Strikes Critical Strike Table", @@ -128,6 +144,14 @@ "pdfPath": "sources/Subdual.pdf", "enabled": true }, + { + "slug": "super_large_creature_weapon", + "displayName": "Super Large Creature Critical Strike Table", + "family": "variant_column", + "extractionMethod": "xml", + "pdfPath": "sources/Super Large Creature - Weapon.pdf", + "enabled": true + }, { "slug": "tiny", "displayName": "Tiny Critical Strike Table", diff --git a/src/RolemasterDb.App/rolemaster.db b/src/RolemasterDb.App/rolemaster.db index f9fa532..208d4c6 100644 Binary files a/src/RolemasterDb.App/rolemaster.db and b/src/RolemasterDb.App/rolemaster.db differ diff --git 
a/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs b/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs index 3c44cfa..7f598f9 100644 --- a/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs +++ b/src/RolemasterDb.ImportTool.Tests/StandardCriticalTableParserIntegrationTests.cs @@ -4,7 +4,7 @@ namespace RolemasterDb.ImportTool.Tests; public sealed class StandardCriticalTableParserIntegrationTests { - private static readonly string[] ExpectedPhase3Slugs = + private static readonly string[] ExpectedEnabledSlugs = [ "arcane-aether", "arcane-nether", @@ -16,20 +16,25 @@ public sealed class StandardCriticalTableParserIntegrationTests "heat", "impact", "krush", + "large_creature_magic", + "large_creature_weapon", "ma-strikes", "ma-sweeps", "mana", "puncture", "slash", "subdual", + "super_large_creature_weapon", "tiny", "unbalance" ]; private static readonly PdfXmlExtractor Extractor = new(); - private static readonly StandardCriticalTableParser Parser = new(); + private static readonly StandardCriticalTableParser StandardParser = new(); + private static readonly VariantColumnCriticalTableParser VariantColumnParser = new(); + private static readonly GroupedVariantCriticalTableParser GroupedVariantParser = new(); - public static IEnumerable EnabledStandardTables() => + public static IEnumerable EnabledTables() => LoadManifest().Tables .Where(item => item.Enabled) .OrderBy(item => item.Slug, StringComparer.Ordinal) @@ -37,18 +42,22 @@ public sealed class StandardCriticalTableParserIntegrationTests public static IEnumerable RepresentativeCells() { - yield return ["slash", "71-75", "A", "Blow falls on lower leg"]; - yield return ["puncture", "66", "C", "Strike shatters foe's knee"]; - yield return ["ballistic-shrapnel", "86-90", "E", "destroy his heart"]; - yield return ["arcane-aether", "96-99", "E", "smoking pulp"]; - yield return ["ma-strikes", "96-99", "E", "drives bone into 
brain"]; - yield return ["mana", "96-99", "E", "momentarily transformed"]; - yield return ["mana", "100", "E", "Mana consumes everything"]; - yield return ["tiny", "100", "E", "Vein and artery severed"]; + yield return new object[] { "slash", null!, "71-75", "A", "Blow falls on lower leg" }; + yield return new object[] { "puncture", null!, "66", "C", "Strike shatters foe's knee" }; + yield return new object[] { "ballistic-shrapnel", null!, "86-90", "E", "destroy his heart" }; + yield return new object[] { "arcane-aether", null!, "96-99", "E", "smoking pulp" }; + yield return new object[] { "ma-strikes", null!, "96-99", "E", "drives bone into brain" }; + yield return new object[] { "mana", null!, "96-99", "E", "momentarily transformed" }; + yield return new object[] { "mana", null!, "100", "E", "Mana consumes everything" }; + yield return new object[] { "tiny", null!, "100", "E", "Vein and artery severed" }; + yield return new object[] { "large_creature_weapon", null!, "01-05", "NORMAL", "Weapon shatters" }; + yield return new object[] { "super_large_creature_weapon", null!, "31-40", "SLAYING", "Boom! 
Solid without question" }; + yield return new object[] { "large_creature_magic", "large", "251+", "NORMAL", "Foe lowers his eyes within your reach" }; + yield return new object[] { "large_creature_magic", "super_large", "251+", "SLAYING", "Blast goes in through foe's eye" }; } [Fact] - public void Manifest_enables_the_phase_3_standard_table_set() + public void Manifest_enables_the_phase_4_table_set() { var manifest = LoadManifest(); var enabledTables = manifest.Tables @@ -56,25 +65,29 @@ public sealed class StandardCriticalTableParserIntegrationTests .OrderBy(item => item.Slug, StringComparer.Ordinal) .ToList(); - Assert.Equal(ExpectedPhase3Slugs, enabledTables.Select(item => item.Slug)); + Assert.Equal(ExpectedEnabledSlugs, enabledTables.Select(item => item.Slug)); Assert.All(enabledTables, entry => { - Assert.Equal("standard", entry.Family); Assert.Equal("xml", entry.ExtractionMethod); Assert.True(File.Exists(Path.Combine(GetRepositoryRoot(), entry.PdfPath)), $"Missing source PDF for '{entry.Slug}'."); }); + + Assert.Equal("variant_column", enabledTables.Single(item => item.Slug == "large_creature_weapon").Family); + Assert.Equal("variant_column", enabledTables.Single(item => item.Slug == "super_large_creature_weapon").Family); + Assert.Equal("grouped_variant", enabledTables.Single(item => item.Slug == "large_creature_magic").Family); } [Theory] - [MemberData(nameof(EnabledStandardTables))] - public async Task Enabled_standard_tables_extract_and_parse_successfully(CriticalImportManifestEntry entry) + [MemberData(nameof(EnabledTables))] + public async Task Enabled_tables_extract_and_parse_successfully(CriticalImportManifestEntry entry) { var parseResult = await LoadParseResultAsync(entry); + var expectedGroupCount = Math.Max(parseResult.Table.Groups.Count, 1); Assert.True(parseResult.ValidationReport.IsValid, string.Join(Environment.NewLine, parseResult.ValidationReport.Errors)); - Assert.Equal(5, parseResult.Table.Columns.Count); + 
Assert.NotEmpty(parseResult.Table.Columns); Assert.NotEmpty(parseResult.Table.RollBands); - Assert.Equal(parseResult.ValidationReport.RowCount * 5, parseResult.ValidationReport.CellCount); + Assert.Equal(parseResult.ValidationReport.RowCount * parseResult.Table.Columns.Count * expectedGroupCount, parseResult.ValidationReport.CellCount); Assert.Equal(parseResult.ValidationReport.CellCount, parseResult.Table.Results.Count); } @@ -82,6 +95,7 @@ public sealed class StandardCriticalTableParserIntegrationTests [MemberData(nameof(RepresentativeCells))] public async Task Representative_cells_keep_expected_descriptions( string slug, + string? groupKey, string rollBandLabel, string columnKey, string expectedSnippet) @@ -89,6 +103,7 @@ public sealed class StandardCriticalTableParserIntegrationTests var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, slug, StringComparison.Ordinal)); var parseResult = await LoadParseResultAsync(entry); var result = parseResult.Table.Results.Single(item => + string.Equals(item.GroupKey, groupKey, StringComparison.Ordinal) && string.Equals(item.RollBandLabel, rollBandLabel, StringComparison.Ordinal) && string.Equals(item.ColumnKey, columnKey, StringComparison.Ordinal)); @@ -101,6 +116,7 @@ public sealed class StandardCriticalTableParserIntegrationTests var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "slash", StringComparison.Ordinal)); var parseResult = await LoadParseResultAsync(entry); var result = parseResult.Table.Results.Single(item => + item.GroupKey is null && string.Equals(item.RollBandLabel, "56-60", StringComparison.Ordinal) && string.Equals(item.ColumnKey, "A", StringComparison.Ordinal)); @@ -113,9 +129,11 @@ public sealed class StandardCriticalTableParserIntegrationTests var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal)); var parseResult = await LoadParseResultAsync(entry); var row96E = parseResult.Table.Results.Single(item => + 
item.GroupKey is null && string.Equals(item.RollBandLabel, "96-99", StringComparison.Ordinal) && string.Equals(item.ColumnKey, "E", StringComparison.Ordinal)); var row100E = parseResult.Table.Results.Single(item => + item.GroupKey is null && string.Equals(item.RollBandLabel, "100", StringComparison.Ordinal) && string.Equals(item.ColumnKey, "E", StringComparison.Ordinal)); @@ -130,6 +148,7 @@ public sealed class StandardCriticalTableParserIntegrationTests var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal)); var parseResult = await LoadParseResultAsync(entry); var row100C = parseResult.Table.Results.Single(item => + item.GroupKey is null && string.Equals(item.RollBandLabel, "100", StringComparison.Ordinal) && string.Equals(item.ColumnKey, "C", StringComparison.Ordinal)); @@ -143,9 +162,11 @@ public sealed class StandardCriticalTableParserIntegrationTests var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal)); var parseResult = await LoadParseResultAsync(entry); var row71A = parseResult.Table.Results.Single(item => + item.GroupKey is null && string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) && string.Equals(item.ColumnKey, "A", StringComparison.Ordinal)); var row71B = parseResult.Table.Results.Single(item => + item.GroupKey is null && string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) && string.Equals(item.ColumnKey, "B", StringComparison.Ordinal)); @@ -159,9 +180,11 @@ public sealed class StandardCriticalTableParserIntegrationTests var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal)); var parseResult = await LoadParseResultAsync(entry); var row71D = parseResult.Table.Results.Single(item => + item.GroupKey is null && string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) && string.Equals(item.ColumnKey, "D", StringComparison.Ordinal)); var row71E = 
parseResult.Table.Results.Single(item => + item.GroupKey is null && string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) && string.Equals(item.ColumnKey, "E", StringComparison.Ordinal)); @@ -175,9 +198,11 @@ public sealed class StandardCriticalTableParserIntegrationTests var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal)); var parseResult = await LoadParseResultAsync(entry); var row91B = parseResult.Table.Results.Single(item => + item.GroupKey is null && string.Equals(item.RollBandLabel, "91-95", StringComparison.Ordinal) && string.Equals(item.ColumnKey, "B", StringComparison.Ordinal)); var row91C = parseResult.Table.Results.Single(item => + item.GroupKey is null && string.Equals(item.RollBandLabel, "91-95", StringComparison.Ordinal) && string.Equals(item.ColumnKey, "C", StringComparison.Ordinal)); @@ -191,9 +216,11 @@ public sealed class StandardCriticalTableParserIntegrationTests var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal)); var parseResult = await LoadParseResultAsync(entry); var row86B = parseResult.Table.Results.Single(item => + item.GroupKey is null && string.Equals(item.RollBandLabel, "86-90", StringComparison.Ordinal) && string.Equals(item.ColumnKey, "B", StringComparison.Ordinal)); var row86C = parseResult.Table.Results.Single(item => + item.GroupKey is null && string.Equals(item.RollBandLabel, "86-90", StringComparison.Ordinal) && string.Equals(item.ColumnKey, "C", StringComparison.Ordinal)); @@ -201,7 +228,28 @@ public sealed class StandardCriticalTableParserIntegrationTests Assert.Contains("+16H - 8", row86C.RawAffixText, StringComparison.Ordinal); } - private static async Task LoadParseResultAsync(CriticalImportManifestEntry entry) + [Fact] + public async Task Grouped_magic_table_keeps_large_and_super_large_groups_distinct() + { + var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, 
"large_creature_magic", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + + Assert.Equal(["large", "super_large"], parseResult.Table.Groups.Select(item => item.GroupKey)); + + var largeNormal = parseResult.Table.Results.Single(item => + string.Equals(item.GroupKey, "large", StringComparison.Ordinal) && + string.Equals(item.RollBandLabel, "251+", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "NORMAL", StringComparison.Ordinal)); + var superSlaying = parseResult.Table.Results.Single(item => + string.Equals(item.GroupKey, "super_large", StringComparison.Ordinal) && + string.Equals(item.RollBandLabel, "251+", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "SLAYING", StringComparison.Ordinal)); + + Assert.DoesNotContain("Blast goes in through foe's eye", largeNormal.DescriptionText, StringComparison.OrdinalIgnoreCase); + Assert.Contains("Blast goes in through foe's eye", superSlaying.DescriptionText, StringComparison.OrdinalIgnoreCase); + } + + private static async Task LoadParseResultAsync(CriticalImportManifestEntry entry) { var xmlPath = Path.Combine(GetArtifactCacheRoot(), $"{entry.Slug}.xml"); @@ -211,7 +259,13 @@ public sealed class StandardCriticalTableParserIntegrationTests } var xmlContent = await File.ReadAllTextAsync(xmlPath); - return Parser.Parse(entry, xmlContent); + return entry.Family switch + { + "standard" => StandardParser.Parse(entry, xmlContent), + "variant_column" => VariantColumnParser.Parse(entry, xmlContent), + "grouped_variant" => GroupedVariantParser.Parse(entry, xmlContent), + _ => throw new InvalidOperationException($"Unsupported manifest family '{entry.Family}'.") + }; } private static CriticalImportManifest LoadManifest() => diff --git a/src/RolemasterDb.ImportTool/CriticalImportCommandRunner.cs b/src/RolemasterDb.ImportTool/CriticalImportCommandRunner.cs index ba5b8e1..8810a17 100644 --- a/src/RolemasterDb.ImportTool/CriticalImportCommandRunner.cs +++ 
b/src/RolemasterDb.ImportTool/CriticalImportCommandRunner.cs @@ -8,6 +8,8 @@ public sealed class CriticalImportCommandRunner private readonly ImportArtifactWriter artifactWriter = new(); private readonly PdfXmlExtractor pdfXmlExtractor = new(); private readonly StandardCriticalTableParser standardParser = new(); + private readonly VariantColumnCriticalTableParser variantColumnParser = new(); + private readonly GroupedVariantCriticalTableParser groupedVariantParser = new(); public async Task RunAsync(ResetOptions options) { @@ -96,14 +98,24 @@ public sealed class CriticalImportCommandRunner ?? throw new InvalidOperationException($"No enabled manifest entry was found for '{tableSlug}'."); } - private StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) + private CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) { - if (!string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase)) + if (string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase)) { - throw new InvalidOperationException($"Family '{entry.Family}' is not supported by phase 2."); + return standardParser.Parse(entry, xmlContent); } - return standardParser.Parse(entry, xmlContent); + if (string.Equals(entry.Family, "variant_column", StringComparison.OrdinalIgnoreCase)) + { + return variantColumnParser.Parse(entry, xmlContent); + } + + if (string.Equals(entry.Family, "grouped_variant", StringComparison.OrdinalIgnoreCase)) + { + return groupedVariantParser.Parse(entry, xmlContent); + } + + throw new InvalidOperationException($"Family '{entry.Family}' is not supported by the importer."); } private static ImportArtifactPaths CreateArtifactPaths(string slug) => diff --git a/src/RolemasterDb.ImportTool/CriticalImportLoader.cs b/src/RolemasterDb.ImportTool/CriticalImportLoader.cs index 080e0f6..f7a0417 100644 --- a/src/RolemasterDb.ImportTool/CriticalImportLoader.cs +++ 
b/src/RolemasterDb.ImportTool/CriticalImportLoader.cs @@ -43,6 +43,15 @@ public sealed class CriticalImportLoader(string databasePath) Notes = table.Notes }; + entity.Groups = table.Groups + .Select(item => new CriticalGroup + { + GroupKey = item.GroupKey, + Label = item.Label, + SortOrder = item.SortOrder + }) + .ToList(); + entity.Columns = table.Columns .Select(item => new CriticalColumn { @@ -63,12 +72,14 @@ public sealed class CriticalImportLoader(string databasePath) }) .ToList(); + var groupsByKey = entity.Groups.ToDictionary(item => item.GroupKey, StringComparer.OrdinalIgnoreCase); var columnsByKey = entity.Columns.ToDictionary(item => item.ColumnKey, StringComparer.OrdinalIgnoreCase); var rollBandsByLabel = entity.RollBands.ToDictionary(item => item.Label, StringComparer.OrdinalIgnoreCase); entity.Results = table.Results .Select(item => new CriticalResult { + CriticalGroup = item.GroupKey is null ? null : groupsByKey[item.GroupKey], CriticalColumn = columnsByKey[item.ColumnKey], CriticalRollBand = rollBandsByLabel[item.RollBandLabel], RawCellText = item.RawCellText, diff --git a/src/RolemasterDb.ImportTool/ImportArtifactWriter.cs b/src/RolemasterDb.ImportTool/ImportArtifactWriter.cs index 53c3cc4..c0c6c34 100644 --- a/src/RolemasterDb.ImportTool/ImportArtifactWriter.cs +++ b/src/RolemasterDb.ImportTool/ImportArtifactWriter.cs @@ -11,7 +11,7 @@ public sealed class ImportArtifactWriter WriteIndented = true }; - public async Task WriteAsync(ImportArtifactPaths artifactPaths, StandardCriticalTableParseResult parseResult, CancellationToken cancellationToken = default) + public async Task WriteAsync(ImportArtifactPaths artifactPaths, CriticalTableParseResult parseResult, CancellationToken cancellationToken = default) { Directory.CreateDirectory(artifactPaths.DirectoryPath); diff --git a/src/RolemasterDb.ImportTool/Parsing/CriticalTableParseResult.cs b/src/RolemasterDb.ImportTool/Parsing/CriticalTableParseResult.cs new file mode 100644 index 0000000..79bbdcb --- 
/dev/null +++ b/src/RolemasterDb.ImportTool/Parsing/CriticalTableParseResult.cs @@ -0,0 +1,13 @@ +namespace RolemasterDb.ImportTool.Parsing; + +public sealed class CriticalTableParseResult( + ParsedCriticalTable table, + IReadOnlyList fragments, + IReadOnlyList cells, + ImportValidationReport validationReport) +{ + public ParsedCriticalTable Table { get; } = table; + public IReadOnlyList Fragments { get; } = fragments; + public IReadOnlyList Cells { get; } = cells; + public ImportValidationReport ValidationReport { get; } = validationReport; +} diff --git a/src/RolemasterDb.ImportTool/Parsing/CriticalTableParserSupport.cs b/src/RolemasterDb.ImportTool/Parsing/CriticalTableParserSupport.cs new file mode 100644 index 0000000..983ff2e --- /dev/null +++ b/src/RolemasterDb.ImportTool/Parsing/CriticalTableParserSupport.cs @@ -0,0 +1,477 @@ +using System.Text.RegularExpressions; +using System.Xml; +using System.Xml.Linq; + +namespace RolemasterDb.ImportTool.Parsing; + +internal static class CriticalTableParserSupport +{ + internal const int HeaderToBodyMinimumGap = 20; + internal const int FooterLabelExclusionGap = 15; + internal const int FooterPageNumberExclusionGap = 80; + internal const int RowLabelDuplicateTolerance = 15; + internal const int TopGroupingTolerance = 2; + + private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled); + private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled); + private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|–)\d+\)$", RegexOptions.Compiled); + + internal static List LoadFragments(string xmlContent) + { + using var stringReader = new StringReader(xmlContent); + using var xmlReader = XmlReader.Create( + stringReader, + new XmlReaderSettings + { + DtdProcessing = DtdProcessing.Ignore + }); + + var document = XDocument.Load(xmlReader); + + return document.Descendants("page") + 
.SelectMany(page => + { + var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1"); + return page.Elements("text") + .Select(item => new XmlTextFragment( + pageNumber, + int.Parse(item.Attribute("top")?.Value ?? throw new InvalidOperationException("Missing text top attribute.")), + int.Parse(item.Attribute("left")?.Value ?? throw new InvalidOperationException("Missing text left attribute.")), + int.Parse(item.Attribute("width")?.Value ?? throw new InvalidOperationException("Missing text width attribute.")), + int.Parse(item.Attribute("height")?.Value ?? throw new InvalidOperationException("Missing text height attribute.")), + NormalizeText(string.Concat(item.DescendantNodes().OfType().Select(node => node.Value))))) + .Where(item => !string.IsNullOrWhiteSpace(item.Text)); + }) + .ToList(); + } + + internal static List FindRowLabelFragments( + IReadOnlyList fragments, + int leftCutoff, + int bodyStartTop, + int keyTop) + { + var candidates = fragments + .Where(item => + item.Left < leftCutoff && + item.Top >= bodyStartTop && + item.Top < keyTop - FooterLabelExclusionGap && + (IsRollBandLabel(item.Text) || LooksLikeSplitRollBandStart(item.Text))) + .OrderBy(item => item.Top) + .ThenBy(item => item.Left) + .ToList(); + + var merged = new List(); + + for (var index = 0; index < candidates.Count; index++) + { + var candidate = candidates[index]; + if (TryMergeSplitRollBand(candidates, index, out var mergedCandidate)) + { + merged.Add(mergedCandidate); + index++; + continue; + } + + if (IsRollBandLabel(candidate.Text)) + { + merged.Add(candidate); + } + } + + var deduped = new List(); + + foreach (var candidate in merged) + { + var previous = deduped.LastOrDefault(); + if (previous is not null && + string.Equals(NormalizeRollBandLabel(previous.Text), NormalizeRollBandLabel(candidate.Text), StringComparison.OrdinalIgnoreCase) && + Math.Abs(previous.Top - candidate.Top) <= RowLabelDuplicateTolerance) + { + continue; + } + + deduped.Add(candidate); + } + + return 
deduped; + } + + internal static bool IsRollBandLabel(string value) => + Regex.IsMatch(value.Trim(), @"^\d{2,3}(?:\s*-\s*\d{2,3})?$|^\d{2,3}\+$"); + + internal static bool IsPotentialRowLabelFragment(XmlTextFragment fragment, int leftCutoff) => + fragment.Left < leftCutoff && + (IsRollBandLabel(fragment.Text) || LooksLikeSplitRollBandStart(fragment.Text)); + + internal static string NormalizeRollBandLabel(string label) => + Regex.Replace(CollapseWhitespace(label), @"\s*-\s*", "-"); + + internal static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder) + { + var normalizedLabel = NormalizeRollBandLabel(label); + if (normalizedLabel.EndsWith('+')) + { + return new ParsedCriticalRollBand(normalizedLabel, int.Parse(normalizedLabel[..^1]), null, sortOrder); + } + + var parts = normalizedLabel.Split('-', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries); + return parts.Length == 1 + ? new ParsedCriticalRollBand(normalizedLabel, int.Parse(parts[0]), int.Parse(parts[0]), sortOrder) + : new ParsedCriticalRollBand(normalizedLabel, int.Parse(parts[0]), int.Parse(parts[1]), sortOrder); + } + + internal static string ResolveColumn(double centerX, IReadOnlyList<(string Key, double CenterX)> columns) + { + for (var index = 0; index < columns.Count - 1; index++) + { + var boundary = (columns[index].CenterX + columns[index + 1].CenterX) / 2.0; + if (centerX < boundary) + { + return columns[index].Key; + } + } + + return columns[^1].Key; + } + + internal static IReadOnlyList BuildLines(IReadOnlyList fragments) + { + var lines = new List>(); + + foreach (var fragment in fragments.OrderBy(item => item.Top).ThenBy(item => item.Left)) + { + if (lines.Count == 0 || Math.Abs(lines[^1][0].Top - fragment.Top) > TopGroupingTolerance) + { + lines.Add([fragment]); + continue; + } + + lines[^1].Add(fragment); + } + + return lines + .Select(line => CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text)))) + .Where(item 
=> !string.IsNullOrWhiteSpace(item)) + .ToList(); + } + + internal static bool IsAffixLikeLine(string line, ISet affixLegendSymbols) + { + var value = line.Trim(); + if (value.Length == 0) + { + return false; + } + + if (value is "-" or "\u2013" or "\u2014") + { + return true; + } + + if (value.StartsWith("with ", StringComparison.OrdinalIgnoreCase) || + value.StartsWith("w/o ", StringComparison.OrdinalIgnoreCase) || + value.StartsWith("without ", StringComparison.OrdinalIgnoreCase) || + value.StartsWith("if ", StringComparison.OrdinalIgnoreCase) || + value.StartsWith("while ", StringComparison.OrdinalIgnoreCase) || + value.StartsWith("until ", StringComparison.OrdinalIgnoreCase) || + value.StartsWith("unless ", StringComparison.OrdinalIgnoreCase)) + { + return value.Contains(':', StringComparison.Ordinal); + } + + if (affixLegendSymbols.Count > 0 && + affixLegendSymbols.Any(symbol => value.Contains(symbol, StringComparison.Ordinal))) + { + if (value.Any(char.IsDigit)) + { + return true; + } + + var remainder = value; + foreach (var symbol in affixLegendSymbols.OrderByDescending(item => item.Length)) + { + remainder = remainder.Replace(symbol, string.Empty, StringComparison.Ordinal); + } + + remainder = remainder + .Replace("+", string.Empty, StringComparison.Ordinal) + .Replace("-", string.Empty, StringComparison.Ordinal) + .Replace("–", string.Empty, StringComparison.Ordinal) + .Replace("(", string.Empty, StringComparison.Ordinal) + .Replace(")", string.Empty, StringComparison.Ordinal) + .Replace("/", string.Empty, StringComparison.Ordinal); + + if (string.IsNullOrWhiteSpace(remainder)) + { + return true; + } + } + + return value.StartsWith("+", StringComparison.Ordinal) || + value.StartsWith("\u2211", StringComparison.Ordinal) || + value.StartsWith("\u220F", StringComparison.Ordinal) || + value.StartsWith("\u03C0", StringComparison.Ordinal) || + value.StartsWith("\u222B", StringComparison.Ordinal) || + StandaloneModifierAffixLineRegex.IsMatch(value) || + 
NumericAffixLineRegex.IsMatch(value) || + value.Contains(" - ", StringComparison.Ordinal) || + value.Contains(" – ", StringComparison.Ordinal); + } + + internal static int CountLineTypeSegments(IReadOnlyList lines, ISet affixLegendSymbols) + { + var segmentCount = 0; + bool? previousIsAffix = null; + + foreach (var line in lines) + { + var currentIsAffix = IsAffixLikeLine(line, affixLegendSymbols); + if (previousIsAffix == currentIsAffix) + { + continue; + } + + segmentCount++; + previousIsAffix = currentIsAffix; + } + + return segmentCount; + } + + internal static string CollapseWhitespace(string value) => + Regex.Replace(value.Trim(), @"\s+", " "); + + internal static string NormalizeText(string value) => + value + .Replace('\u00a0', ' ') + .Replace('\r', ' ') + .Replace('\n', ' ') + .Replace('’', '\'') + .Trim(); + + internal static HashSet DetectAffixLegendSymbols(IReadOnlyList fragments, int keyTop) + { + if (keyTop == int.MaxValue) + { + return []; + } + + var footerLines = GroupByTop(fragments + .Where(item => item.Top >= keyTop - TopGroupingTolerance) + .OrderBy(item => item.Top) + .ThenBy(item => item.Left) + .ToList()) + .Select(line => CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text)))) + .ToList(); + + var symbols = new HashSet(StringComparer.Ordinal); + + foreach (var footerLine in footerLines) + { + AddLegendMatch(symbols, footerLine, @"must parry\s*=\s*(\S)"); + AddLegendMatch(symbols, footerLine, @"no parry\s*=\s*(\S)"); + AddLegendMatch(symbols, footerLine, @"stun(?:ned)?\s*=\s*(\S)"); + AddLegendMatch(symbols, footerLine, @"bleed\s*=\s*(\S)"); + AddLegendMatch(symbols, footerLine, @"powerpoint modification.*=\s*(\S)"); + } + + return symbols; + } + + internal static List SplitBoundaryCrossingAffixFragments( + IReadOnlyList bodyFragments, + IReadOnlyList<(string Key, double CenterX)> columnCenters, + ISet affixLegendSymbols) + { + var splitFragments = new List(bodyFragments.Count); + + foreach (var 
fragment in bodyFragments) + { + splitFragments.AddRange(SplitBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols)); + } + + return splitFragments; + } + + internal static List<(int Top, bool IsAffixLike)> BuildBodyLines( + IReadOnlyList bodyFragments, + IReadOnlyList<(string Key, double CenterX)> columnCenters, + ISet affixLegendSymbols) + { + var bodyLines = new List<(int Top, bool IsAffixLike)>(); + + foreach (var lineFragments in GroupByTop(bodyFragments.OrderBy(item => item.Top).ThenBy(item => item.Left).ToList())) + { + var columnTexts = lineFragments + .GroupBy(item => ResolveColumn(item.CenterX, columnCenters), StringComparer.OrdinalIgnoreCase) + .Select(group => CollapseWhitespace(string.Join(' ', group.OrderBy(item => item.Left).Select(item => item.Text)))) + .Where(item => !string.IsNullOrWhiteSpace(item)) + .ToList(); + + var isAffixLike = columnTexts.Count > 0 && + columnTexts.All(text => IsAffixLikeLine(text, affixLegendSymbols)); + + bodyLines.Add((lineFragments[0].Top, isAffixLike)); + } + + return bodyLines; + } + + internal static bool IsFooterPageNumberFragment(XmlTextFragment fragment, int keyTop) + { + if (keyTop == int.MaxValue) + { + return false; + } + + return fragment.Top >= keyTop - FooterPageNumberExclusionGap && + Regex.IsMatch(fragment.Text, @"^\d{2,3}$"); + } + + internal static IEnumerable> GroupByTop(IReadOnlyList fragments) + { + var groups = new List>(); + + foreach (var fragment in fragments) + { + if (groups.Count == 0 || Math.Abs(groups[^1][0].Top - fragment.Top) > TopGroupingTolerance) + { + groups.Add([fragment]); + continue; + } + + groups[^1].Add(fragment); + } + + return groups; + } + + private static bool LooksLikeSplitRollBandStart(string value) => + Regex.IsMatch(value.Trim(), @"^\d{2,3}\s*-$"); + + private static bool TryMergeSplitRollBand(IReadOnlyList candidates, int index, out XmlTextFragment mergedCandidate) + { + var current = candidates[index]; + if 
(!LooksLikeSplitRollBandStart(current.Text) || index + 1 >= candidates.Count) + { + mergedCandidate = null!; + return false; + } + + var next = candidates[index + 1]; + if (current.PageNumber != next.PageNumber || + !Regex.IsMatch(next.Text.Trim(), @"^\d{2,3}$") || + next.Top <= current.Top || + next.Top - current.Top > RowLabelDuplicateTolerance + 5 || + Math.Abs(next.Left - current.Left) > 20) + { + mergedCandidate = null!; + return false; + } + + var startDigits = Regex.Match(current.Text, @"\d{2,3}").Value; + var mergedLabel = $"{startDigits}-{next.Text.Trim()}"; + var right = Math.Max(current.Left + current.Width, next.Left + next.Width); + + mergedCandidate = new XmlTextFragment( + current.PageNumber, + current.Top, + Math.Min(current.Left, next.Left), + right - Math.Min(current.Left, next.Left), + Math.Max(current.Height, next.Height), + mergedLabel); + return true; + } + + private static IReadOnlyList SplitBoundaryCrossingAffixFragment( + XmlTextFragment fragment, + IReadOnlyList<(string Key, double CenterX)> columnCenters, + ISet affixLegendSymbols) + { + if (!LooksLikeBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols)) + { + return [fragment]; + } + + var matches = MultiFragmentSplitRegex.Matches(fragment.Text); + if (matches.Count < 2) + { + return [fragment]; + } + + var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1); + var splitFragments = new List(matches.Count); + + foreach (Match match in matches) + { + var segmentText = CollapseWhitespace(match.Value); + if (segmentText.Length == 0) + { + continue; + } + + var segmentLeft = fragment.Left + (int)Math.Round(characterWidth * match.Index); + var segmentWidth = Math.Max(1, (int)Math.Round(characterWidth * match.Length)); + + splitFragments.Add(new XmlTextFragment( + fragment.PageNumber, + fragment.Top, + segmentLeft, + segmentWidth, + fragment.Height, + segmentText)); + } + + if (splitFragments.Count < 2) + { + return [fragment]; + } + + var 
originalColumn = ResolveColumn(fragment.CenterX, columnCenters); + var distinctColumns = splitFragments + .Select(item => ResolveColumn(item.CenterX, columnCenters)) + .Distinct(StringComparer.OrdinalIgnoreCase) + .ToList(); + + return distinctColumns.Count > 1 || distinctColumns.Any(item => !string.Equals(item, originalColumn, StringComparison.OrdinalIgnoreCase)) + ? splitFragments + : [fragment]; + } + + private static bool LooksLikeBoundaryCrossingAffixFragment( + XmlTextFragment fragment, + IReadOnlyList<(string Key, double CenterX)> columnCenters, + ISet affixLegendSymbols) + { + if (!IsAffixLikeLine(fragment.Text, affixLegendSymbols) || + !fragment.Text.Contains(" ", StringComparison.Ordinal)) + { + return false; + } + + var fragmentRight = fragment.Left + fragment.Width; + + for (var index = 0; index < columnCenters.Count - 1; index++) + { + var boundary = (columnCenters[index].CenterX + columnCenters[index + 1].CenterX) / 2.0; + if (fragment.Left < boundary && fragmentRight > boundary) + { + return true; + } + } + + return false; + } + + private static void AddLegendMatch(HashSet symbols, string value, string pattern) + { + foreach (Match match in Regex.Matches(value, pattern, RegexOptions.IgnoreCase)) + { + if (match.Groups.Count > 1) + { + symbols.Add(match.Groups[1].Value); + } + } + } +} diff --git a/src/RolemasterDb.ImportTool/Parsing/GroupedVariantCriticalTableParser.cs b/src/RolemasterDb.ImportTool/Parsing/GroupedVariantCriticalTableParser.cs new file mode 100644 index 0000000..ed6c3e4 --- /dev/null +++ b/src/RolemasterDb.ImportTool/Parsing/GroupedVariantCriticalTableParser.cs @@ -0,0 +1,306 @@ +namespace RolemasterDb.ImportTool.Parsing; + +public sealed class GroupedVariantCriticalTableParser +{ + private static readonly ParsedCriticalGroup[] ExpectedGroups = + [ + new("large", "Large Creatures", 1), + new("super_large", "Super Large Creatures", 2) + ]; + + private static readonly ParsedCriticalColumn[] ExpectedColumns = + [ + new("NORMAL", 
"Normal", "variant", 1), + new("SLAYING", "Slaying", "variant", 2) + ]; + + public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) + { + var fragments = CriticalTableParserSupport.LoadFragments(xmlContent); + var groupHeaders = FindGroupHeaders(fragments); + var columnHeaders = FindColumnHeaders(fragments); + var validationErrors = new List(); + var validationWarnings = new List(); + + var combinedColumnAnchors = columnHeaders + .OrderBy(item => item.Left) + .Select((item, index) => + { + var group = ExpectedGroups[index / ExpectedColumns.Length]; + var column = ExpectedColumns[index % ExpectedColumns.Length]; + return (group.GroupKey, column.ColumnKey, CompositeKey: $"{group.GroupKey}:{column.ColumnKey}", item.CenterX); + }) + .ToList(); + + var bodyStartTop = Math.Max( + groupHeaders.Max(item => item.Top), + columnHeaders.Max(item => item.Top)) + + CriticalTableParserSupport.HeaderToBodyMinimumGap; + var keyTop = fragments + .Where(item => + string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) || + item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) || + item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase)) + .Select(item => (int?)item.Top) + .Min() ?? 
int.MaxValue; + var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop); + var leftCutoff = columnHeaders.Min(item => item.Left) - 10; + var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments( + fragments, + leftCutoff, + bodyStartTop, + keyTop); + + var rowAnchors = rowLabelFragments + .OrderBy(item => item.Top) + .Select((item, index) => new RowAnchor(CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), item.Top, index + 1)) + .ToList(); + + if (rowAnchors.Count == 0) + { + validationErrors.Add("No roll-band labels were found in the XML artifact."); + } + + var columnCenters = combinedColumnAnchors + .Select(item => (item.CompositeKey, item.CenterX)) + .ToList(); + + var bodyFragments = fragments + .Where(item => + item.Top >= bodyStartTop && + item.Top < keyTop - CriticalTableParserSupport.TopGroupingTolerance && + !CriticalTableParserSupport.IsFooterPageNumberFragment(item, keyTop) && + !CriticalTableParserSupport.IsPotentialRowLabelFragment(item, leftCutoff) && + !rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) && + !groupHeaders.Contains(item) && + !columnHeaders.Contains(item)) + .ToList(); + bodyFragments = CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols); + var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols); + + var parsedRollBands = rowAnchors + .Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder)) + .ToList(); + + var cellEntries = new List(); + + for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++) + { + var rowStart = rowIndex == 0 + ? bodyStartTop + : ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines); + + var rowEnd = rowIndex == rowAnchors.Count - 1 + ? 
keyTop - 1 + : ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines); + + var rowFragments = bodyFragments + .Where(item => item.Top >= rowStart && item.Top < rowEnd) + .ToList(); + + foreach (var anchor in combinedColumnAnchors) + { + var cellFragments = rowFragments + .Where(item => CriticalTableParserSupport.ResolveColumn(item.CenterX, columnCenters) == anchor.CompositeKey) + .OrderBy(item => item.Top) + .ThenBy(item => item.Left) + .ToList(); + + if (cellFragments.Count == 0) + { + validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', group '{anchor.GroupKey}', column '{anchor.ColumnKey}'."); + continue; + } + + cellEntries.Add(new CellEntry( + anchor.GroupKey, + rowAnchors[rowIndex].Label, + rowIndex, + anchor.ColumnKey, + CriticalTableParserSupport.BuildLines(cellFragments).ToList())); + } + } + + RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols); + + var parsedCells = new List(); + var parsedResults = new List(); + + foreach (var cellEntry in cellEntries + .OrderBy(item => item.RowIndex) + .ThenBy(item => item.GroupKey, StringComparer.Ordinal) + .ThenBy(item => item.ColumnKey, StringComparer.Ordinal)) + { + var segmentCount = CriticalTableParserSupport.CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols); + if (segmentCount > 2) + { + validationErrors.Add($"Cell '{cellEntry.RollBandLabel}/{cellEntry.GroupKey}/{cellEntry.ColumnKey}' interleaves prose and affix lines."); + } + + var rawAffixLines = cellEntry.Lines.Where(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList(); + var descriptionLines = cellEntry.Lines.Where(line => !CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList(); + var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines); + var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', descriptionLines)); + var rawAffixText = rawAffixLines.Count == 0 ? 
null : string.Join(Environment.NewLine, rawAffixLines); + + parsedCells.Add(new ParsedCriticalCellArtifact( + cellEntry.GroupKey, + cellEntry.RollBandLabel, + cellEntry.ColumnKey, + cellEntry.Lines, + rawCellText, + descriptionText, + rawAffixText)); + + parsedResults.Add(new ParsedCriticalResult( + cellEntry.GroupKey, + cellEntry.ColumnKey, + cellEntry.RollBandLabel, + rawCellText, + descriptionText, + rawAffixText)); + } + + var expectedCellCount = rowAnchors.Count * ExpectedGroups.Length * ExpectedColumns.Length; + if (parsedCells.Count != expectedCellCount) + { + validationErrors.Add($"Expected {expectedCellCount} parsed cells but produced {parsedCells.Count}."); + } + + var validationReport = new ImportValidationReport( + validationErrors.Count == 0, + validationErrors, + validationWarnings, + rowAnchors.Count, + parsedCells.Count); + + var table = new ParsedCriticalTable( + entry.Slug, + entry.DisplayName, + entry.Family, + Path.GetFileName(entry.PdfPath), + "Imported from PDF XML extraction.", + ExpectedGroups, + ExpectedColumns, + parsedRollBands, + parsedResults); + + return new CriticalTableParseResult(table, fragments, parsedCells, validationReport); + } + + private static List FindGroupHeaders(IReadOnlyList fragments) + { + var expectedLabels = ExpectedGroups.Select(item => item.Label).ToList(); + var headerCandidates = fragments + .Where(item => expectedLabels.Contains(item.Text.Trim(), StringComparer.OrdinalIgnoreCase)) + .OrderBy(item => item.Top) + .ThenBy(item => item.Left) + .ToList(); + + foreach (var group in CriticalTableParserSupport.GroupByTop(headerCandidates)) + { + var ordered = group.OrderBy(item => item.Left).ToList(); + var labels = ordered.Select(item => item.Text.Trim()).ToList(); + if (labels.SequenceEqual(expectedLabels, StringComparer.OrdinalIgnoreCase)) + { + return ordered; + } + } + + throw new InvalidOperationException("Could not find the grouped-variant section headers in the XML artifact."); + } + + private static List 
FindColumnHeaders(IReadOnlyList fragments) + { + var expectedLabels = new[] { "normal", "slaying", "normal", "slaying" }; + var headerCandidates = fragments + .Where(item => + { + var normalized = item.Text.Trim().ToLowerInvariant(); + return normalized is "normal" or "slaying"; + }) + .OrderBy(item => item.Top) + .ThenBy(item => item.Left) + .ToList(); + + foreach (var group in CriticalTableParserSupport.GroupByTop(headerCandidates)) + { + var ordered = group.OrderBy(item => item.Left).ToList(); + var labels = ordered.Select(item => item.Text.Trim().ToLowerInvariant()).ToList(); + if (labels.SequenceEqual(expectedLabels)) + { + return ordered; + } + } + + throw new InvalidOperationException("Could not find the grouped-variant column header row in the XML artifact."); + } + + private static void RepairLeadingAffixLeakage(List cellEntries, ISet affixLegendSymbols) + { + var maxRowIndex = cellEntries.Count == 0 ? -1 : cellEntries.Max(item => item.RowIndex); + var axes = cellEntries + .Select(item => (item.GroupKey, item.ColumnKey)) + .Distinct() + .ToList(); + + for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++) + { + foreach (var (groupKey, columnKey) in axes) + { + var current = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex && item.GroupKey == groupKey && item.ColumnKey == columnKey); + var next = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex + 1 && item.GroupKey == groupKey && item.ColumnKey == columnKey); + if (current is null || next is null) + { + continue; + } + + var leadingAffixCount = 0; + while (leadingAffixCount < next.Lines.Count && CriticalTableParserSupport.IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols)) + { + leadingAffixCount++; + } + + if (leadingAffixCount == 0 || leadingAffixCount == next.Lines.Count) + { + continue; + } + + current.Lines.AddRange(next.Lines.Take(leadingAffixCount)); + next.Lines.RemoveRange(0, leadingAffixCount); + } + } + } + + private static int ResolveRowBoundaryTop( 
+ RowAnchor current, + RowAnchor next, + IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines) + { + var linesBetweenLabels = bodyLines + .Where(item => item.Top >= current.Top && item.Top < next.Top) + .OrderBy(item => item.Top) + .ToList(); + + for (var index = linesBetweenLabels.Count - 2; index >= 0; index--) + { + if (linesBetweenLabels[index].IsAffixLike && !linesBetweenLabels[index + 1].IsAffixLike) + { + return (int)Math.Floor((linesBetweenLabels[index].Top + linesBetweenLabels[index + 1].Top) / 2.0) + 1; + } + } + + return (int)Math.Floor((current.Top + next.Top) / 2.0) + 1; + } + + private sealed record RowAnchor(string Label, int Top, int SortOrder); + + private sealed class CellEntry(string groupKey, string rollBandLabel, int rowIndex, string columnKey, List lines) + { + public string GroupKey { get; } = groupKey; + public string RollBandLabel { get; } = rollBandLabel; + public int RowIndex { get; } = rowIndex; + public string ColumnKey { get; } = columnKey; + public List Lines { get; } = lines; + } +} diff --git a/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalCellArtifact.cs b/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalCellArtifact.cs index 76475a3..ecfdc19 100644 --- a/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalCellArtifact.cs +++ b/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalCellArtifact.cs @@ -1,6 +1,7 @@ namespace RolemasterDb.ImportTool.Parsing; public sealed class ParsedCriticalCellArtifact( + string? groupKey, string rollBandLabel, string columnKey, IReadOnlyList lines, @@ -8,6 +9,7 @@ public sealed class ParsedCriticalCellArtifact( string descriptionText, string? rawAffixText) { + public string? 
GroupKey { get; } = groupKey; public string RollBandLabel { get; } = rollBandLabel; public string ColumnKey { get; } = columnKey; public IReadOnlyList Lines { get; } = lines; diff --git a/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalGroup.cs b/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalGroup.cs new file mode 100644 index 0000000..d61bc3e --- /dev/null +++ b/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalGroup.cs @@ -0,0 +1,8 @@ +namespace RolemasterDb.ImportTool.Parsing; + +public sealed class ParsedCriticalGroup(string groupKey, string label, int sortOrder) +{ + public string GroupKey { get; } = groupKey; + public string Label { get; } = label; + public int SortOrder { get; } = sortOrder; +} diff --git a/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalResult.cs b/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalResult.cs index a8dbb09..7d89774 100644 --- a/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalResult.cs +++ b/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalResult.cs @@ -1,12 +1,14 @@ namespace RolemasterDb.ImportTool.Parsing; public sealed class ParsedCriticalResult( + string? groupKey, string columnKey, string rollBandLabel, string rawCellText, string descriptionText, string? rawAffixText) { + public string? GroupKey { get; } = groupKey; public string ColumnKey { get; } = columnKey; public string RollBandLabel { get; } = rollBandLabel; public string RawCellText { get; } = rawCellText; diff --git a/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalTable.cs b/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalTable.cs index cf9d7f2..927eaad 100644 --- a/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalTable.cs +++ b/src/RolemasterDb.ImportTool/Parsing/ParsedCriticalTable.cs @@ -6,6 +6,7 @@ public sealed class ParsedCriticalTable( string family, string sourceDocument, string? 
notes, + IReadOnlyList groups, IReadOnlyList columns, IReadOnlyList rollBands, IReadOnlyList results) @@ -15,6 +16,7 @@ public sealed class ParsedCriticalTable( public string Family { get; } = family; public string SourceDocument { get; } = sourceDocument; public string? Notes { get; } = notes; + public IReadOnlyList Groups { get; } = groups; public IReadOnlyList Columns { get; } = columns; public IReadOnlyList RollBands { get; } = rollBands; public IReadOnlyList Results { get; } = results; diff --git a/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs b/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs index 15db26e..c250495 100644 --- a/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs +++ b/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs @@ -1,33 +1,20 @@ -using System.Text.RegularExpressions; -using System.Xml; -using System.Xml.Linq; - namespace RolemasterDb.ImportTool.Parsing; public sealed class StandardCriticalTableParser { - private const int HeaderToBodyMinimumGap = 20; - private const int FooterLabelExclusionGap = 15; - private const int FooterPageNumberExclusionGap = 80; - private const int RowLabelDuplicateTolerance = 15; - private const int TopGroupingTolerance = 2; - private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled); - private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled); - private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-)\d+\)$", RegexOptions.Compiled); - - public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) + public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) { - var fragments = LoadFragments(xmlContent); + var fragments = CriticalTableParserSupport.LoadFragments(xmlContent); var headerFragments = FindHeaderFragments(fragments); 
var validationErrors = new List(); var validationWarnings = new List(); var columnCenters = headerFragments .OrderBy(item => item.Left) - .Select(item => new ColumnAnchor(item.Text.ToUpperInvariant(), item.CenterX)) + .Select(item => (Key: item.Text.ToUpperInvariant(), CenterX: item.CenterX)) .ToList(); - var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap; + var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap; var keyTop = fragments .Where(item => string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) || @@ -35,12 +22,17 @@ public sealed class StandardCriticalTableParser item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase)) .Select(item => (int?)item.Top) .Min() ?? int.MaxValue; - var affixLegendSymbols = DetectAffixLegendSymbols(fragments, keyTop); - var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments, keyTop); + var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop); + var leftCutoff = headerFragments.Min(item => item.Left) - 10; + var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments( + fragments, + leftCutoff, + bodyStartTop, + keyTop); var rowAnchors = rowLabelFragments .OrderBy(item => item.Top) - .Select((item, index) => new RowAnchor(item.Text, item.Top, index + 1)) + .Select((item, index) => new RowAnchor(CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), item.Top, index + 1)) .ToList(); if (rowAnchors.Count == 0) @@ -51,16 +43,17 @@ public sealed class StandardCriticalTableParser var bodyFragments = fragments .Where(item => item.Top >= bodyStartTop && - item.Top < keyTop - TopGroupingTolerance && - !IsFooterPageNumberFragment(item, keyTop) && - !rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) && + item.Top < keyTop - CriticalTableParserSupport.TopGroupingTolerance && + 
!CriticalTableParserSupport.IsFooterPageNumberFragment(item, keyTop) && + !CriticalTableParserSupport.IsPotentialRowLabelFragment(item, leftCutoff) && + !rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) && !headerFragments.Contains(item)) .ToList(); - bodyFragments = SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols); - var bodyLines = BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols); + bodyFragments = CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols); + var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols); var parsedRollBands = rowAnchors - .Select(anchor => CreateRollBand(anchor.Label, anchor.SortOrder)) + .Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder)) .ToList(); var cellEntries = new List(); @@ -82,7 +75,7 @@ public sealed class StandardCriticalTableParser foreach (var columnAnchor in columnCenters) { var cellFragments = rowFragments - .Where(item => ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key) + .Where(item => CriticalTableParserSupport.ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key) .OrderBy(item => item.Top) .ThenBy(item => item.Left) .ToList(); @@ -97,7 +90,7 @@ public sealed class StandardCriticalTableParser rowAnchors[rowIndex].Label, rowIndex, columnAnchor.Key, - BuildLines(cellFragments).ToList())); + CriticalTableParserSupport.BuildLines(cellFragments).ToList())); } } @@ -108,7 +101,7 @@ public sealed class StandardCriticalTableParser foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey)) { - var segmentCount = CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols); + var segmentCount = 
CriticalTableParserSupport.CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols); if (segmentCount > 2) { @@ -116,13 +109,14 @@ public sealed class StandardCriticalTableParser $"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' interleaves prose and affix lines."); } - var rawAffixLines = cellEntry.Lines.Where(line => IsAffixLikeLine(line, affixLegendSymbols)).ToList(); - var descriptionLines = cellEntry.Lines.Where(line => !IsAffixLikeLine(line, affixLegendSymbols)).ToList(); + var rawAffixLines = cellEntry.Lines.Where(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList(); + var descriptionLines = cellEntry.Lines.Where(line => !CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList(); var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines); - var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines)); + var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', descriptionLines)); var rawAffixText = rawAffixLines.Count == 0 ? 
null : string.Join(Environment.NewLine, rawAffixLines); parsedCells.Add(new ParsedCriticalCellArtifact( + null, cellEntry.RollBandLabel, cellEntry.ColumnKey, cellEntry.Lines, @@ -131,6 +125,7 @@ public sealed class StandardCriticalTableParser rawAffixText)); parsedResults.Add(new ParsedCriticalResult( + null, cellEntry.ColumnKey, cellEntry.RollBandLabel, rawCellText, @@ -162,40 +157,12 @@ public sealed class StandardCriticalTableParser entry.Family, Path.GetFileName(entry.PdfPath), "Imported from PDF XML extraction.", + [], columnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(), parsedRollBands, parsedResults); - return new StandardCriticalTableParseResult(table, fragments, parsedCells, validationReport); - } - - private static List LoadFragments(string xmlContent) - { - using var stringReader = new StringReader(xmlContent); - using var xmlReader = XmlReader.Create( - stringReader, - new XmlReaderSettings - { - DtdProcessing = DtdProcessing.Ignore - }); - - var document = XDocument.Load(xmlReader); - - return document.Descendants("page") - .SelectMany(page => - { - var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1"); - return page.Elements("text") - .Select(item => new XmlTextFragment( - pageNumber, - int.Parse(item.Attribute("top")?.Value ?? throw new InvalidOperationException("Missing text top attribute.")), - int.Parse(item.Attribute("left")?.Value ?? throw new InvalidOperationException("Missing text left attribute.")), - int.Parse(item.Attribute("width")?.Value ?? throw new InvalidOperationException("Missing text width attribute.")), - int.Parse(item.Attribute("height")?.Value ?? 
throw new InvalidOperationException("Missing text height attribute.")), - NormalizeText(string.Concat(item.DescendantNodes().OfType().Select(node => node.Value))))) - .Where(item => !string.IsNullOrWhiteSpace(item.Text)); - }) - .ToList(); + return new CriticalTableParseResult(table, fragments, parsedCells, validationReport); } private static List FindHeaderFragments(IReadOnlyList fragments) @@ -206,7 +173,7 @@ public sealed class StandardCriticalTableParser .ThenBy(item => item.Left) .ToList(); - foreach (var group in GroupByTop(headerCandidates)) + foreach (var group in CriticalTableParserSupport.GroupByTop(headerCandidates)) { var ordered = group.OrderBy(item => item.Left).ToList(); var labels = ordered.Select(item => item.Text.ToUpperInvariant()).ToList(); @@ -219,156 +186,6 @@ public sealed class StandardCriticalTableParser throw new InvalidOperationException("Could not find the standard-table A-E header row in the XML artifact."); } - private static List FindRowLabelFragments( - IReadOnlyList fragments, - IReadOnlyList headerFragments, - int keyTop) - { - var leftCutoff = headerFragments.Min(item => item.Left) - 10; - var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap; - - var candidates = fragments - .Where(item => - item.Left < leftCutoff && - item.Top >= bodyStartTop && - item.Top < keyTop - FooterLabelExclusionGap && - IsRollBandLabel(item.Text)) - .OrderBy(item => item.Top) - .ToList(); - - var deduped = new List(); - - foreach (var candidate in candidates) - { - var previous = deduped.LastOrDefault(); - if (previous is not null && - string.Equals(previous.Text, candidate.Text, StringComparison.OrdinalIgnoreCase) && - Math.Abs(previous.Top - candidate.Top) <= RowLabelDuplicateTolerance) - { - continue; - } - - deduped.Add(candidate); - } - - return deduped; - } - - private static bool IsRollBandLabel(string value) => - Regex.IsMatch(value.Trim(), @"^\d{2,3}(?:-\d{2,3})?$|^\d{2,3}\+$"); - - private static 
ParsedCriticalRollBand CreateRollBand(string label, int sortOrder) - { - if (label.EndsWith('+')) - { - return new ParsedCriticalRollBand(label, int.Parse(label[..^1]), null, sortOrder); - } - - var parts = label.Split('-', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries); - return parts.Length == 1 - ? new ParsedCriticalRollBand(label, int.Parse(parts[0]), int.Parse(parts[0]), sortOrder) - : new ParsedCriticalRollBand(label, int.Parse(parts[0]), int.Parse(parts[1]), sortOrder); - } - - private static string ResolveColumn(double centerX, IReadOnlyList columns) - { - for (var index = 0; index < columns.Count - 1; index++) - { - var boundary = (columns[index].CenterX + columns[index + 1].CenterX) / 2.0; - if (centerX < boundary) - { - return columns[index].Key; - } - } - - return columns[^1].Key; - } - - private static IReadOnlyList BuildLines(IReadOnlyList fragments) - { - var lines = new List>(); - - foreach (var fragment in fragments.OrderBy(item => item.Top).ThenBy(item => item.Left)) - { - if (lines.Count == 0 || Math.Abs(lines[^1][0].Top - fragment.Top) > TopGroupingTolerance) - { - lines.Add([fragment]); - continue; - } - - lines[^1].Add(fragment); - } - - return lines - .Select(line => CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text)))) - .Where(item => !string.IsNullOrWhiteSpace(item)) - .ToList(); - } - - private static bool IsAffixLikeLine(string line, ISet affixLegendSymbols) - { - var value = line.Trim(); - if (value.Length == 0) - { - return false; - } - - if (value == "-" || value == "\u2013" || value == "\u2014") - { - return true; - } - - if (value.StartsWith("with ", StringComparison.OrdinalIgnoreCase) || - value.StartsWith("w/o ", StringComparison.OrdinalIgnoreCase) || - value.StartsWith("without ", StringComparison.OrdinalIgnoreCase) || - value.StartsWith("if ", StringComparison.OrdinalIgnoreCase) || - value.StartsWith("while ", StringComparison.OrdinalIgnoreCase) || - 
value.StartsWith("until ", StringComparison.OrdinalIgnoreCase) || - value.StartsWith("unless ", StringComparison.OrdinalIgnoreCase)) - { - return value.Contains(':', StringComparison.Ordinal); - } - - if (affixLegendSymbols.Count > 0 && - affixLegendSymbols.Any(symbol => value.Contains(symbol, StringComparison.Ordinal))) - { - if (value.Any(char.IsDigit)) - { - return true; - } - - var remainder = value; - foreach (var symbol in affixLegendSymbols.OrderByDescending(item => item.Length)) - { - remainder = remainder.Replace(symbol, string.Empty, StringComparison.Ordinal); - } - - remainder = remainder - .Replace("+", string.Empty, StringComparison.Ordinal) - .Replace("-", string.Empty, StringComparison.Ordinal) - .Replace("(", string.Empty, StringComparison.Ordinal) - .Replace(")", string.Empty, StringComparison.Ordinal) - .Replace("/", string.Empty, StringComparison.Ordinal); - - if (string.IsNullOrWhiteSpace(remainder)) - { - return true; - } - } - - return value.StartsWith("+", StringComparison.Ordinal) || - value.StartsWith("\u2211", StringComparison.Ordinal) || - value.StartsWith("\u220F", StringComparison.Ordinal) || - value.StartsWith("\u03C0", StringComparison.Ordinal) || - value.StartsWith("\u222B", StringComparison.Ordinal) || - StandaloneModifierAffixLineRegex.IsMatch(value) || - NumericAffixLineRegex.IsMatch(value) || - value.Contains(" - ", StringComparison.Ordinal); - } - - private static void RepairLeadingAffixLeakage(List cellEntries) - => RepairLeadingAffixLeakage(cellEntries, new HashSet(StringComparer.Ordinal)); - private static void RepairLeadingAffixLeakage(List cellEntries, ISet affixLegendSymbols) { var maxRowIndex = cellEntries.Count == 0 ? 
-1 : cellEntries.Max(item => item.RowIndex); @@ -380,14 +197,13 @@ public sealed class StandardCriticalTableParser { var current = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex && item.ColumnKey == columnKey); var next = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex + 1 && item.ColumnKey == columnKey); - if (current is null || next is null) { continue; } var leadingAffixCount = 0; - while (leadingAffixCount < next.Lines.Count && IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols)) + while (leadingAffixCount < next.Lines.Count && CriticalTableParserSupport.IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols)) { leadingAffixCount++; } @@ -403,199 +219,10 @@ public sealed class StandardCriticalTableParser } } - private static string CollapseWhitespace(string value) => - Regex.Replace(value.Trim(), @"\s+", " "); - - private static string NormalizeText(string value) => - value - .Replace('\u00a0', ' ') - .Replace('\r', ' ') - .Replace('\n', ' ') - .Trim(); - - private static int CountLineTypeSegments(IReadOnlyList lines, ISet affixLegendSymbols) - { - var segmentCount = 0; - bool? 
previousIsAffix = null; - - foreach (var line in lines) - { - var currentIsAffix = IsAffixLikeLine(line, affixLegendSymbols); - if (previousIsAffix == currentIsAffix) - { - continue; - } - - segmentCount++; - previousIsAffix = currentIsAffix; - } - - return segmentCount; - } - - private static HashSet DetectAffixLegendSymbols(IReadOnlyList fragments, int keyTop) - { - if (keyTop == int.MaxValue) - { - return []; - } - - var footerLines = GroupByTop(fragments - .Where(item => item.Top >= keyTop - TopGroupingTolerance) - .OrderBy(item => item.Top) - .ThenBy(item => item.Left) - .ToList()) - .Select(line => CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text)))) - .ToList(); - - var symbols = new HashSet(StringComparer.Ordinal); - - foreach (var footerLine in footerLines) - { - AddLegendMatch(symbols, footerLine, @"must parry\s*=\s*(\S)"); - AddLegendMatch(symbols, footerLine, @"no parry\s*=\s*(\S)"); - AddLegendMatch(symbols, footerLine, @"stun(?:ned)?\s*=\s*(\S)"); - AddLegendMatch(symbols, footerLine, @"bleed\s*=\s*(\S)"); - AddLegendMatch(symbols, footerLine, @"powerpoint modification.*=\s*(\S)"); - } - - return symbols; - } - - private static List SplitBoundaryCrossingAffixFragments( - IReadOnlyList bodyFragments, - IReadOnlyList columnCenters, - ISet affixLegendSymbols) - { - var splitFragments = new List(bodyFragments.Count); - - foreach (var fragment in bodyFragments) - { - splitFragments.AddRange(SplitBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols)); - } - - return splitFragments; - } - - private static IReadOnlyList SplitBoundaryCrossingAffixFragment( - XmlTextFragment fragment, - IReadOnlyList columnCenters, - ISet affixLegendSymbols) - { - if (!LooksLikeBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols)) - { - return [fragment]; - } - - var matches = MultiFragmentSplitRegex.Matches(fragment.Text); - if (matches.Count < 2) - { - return [fragment]; - } - - var 
characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1); - var splitFragments = new List(matches.Count); - - foreach (Match match in matches) - { - var segmentText = CollapseWhitespace(match.Value); - if (segmentText.Length == 0) - { - continue; - } - - var segmentLeft = fragment.Left + (int)Math.Round(characterWidth * match.Index); - var segmentWidth = Math.Max(1, (int)Math.Round(characterWidth * match.Length)); - - splitFragments.Add(new XmlTextFragment( - fragment.PageNumber, - fragment.Top, - segmentLeft, - segmentWidth, - fragment.Height, - segmentText)); - } - - if (splitFragments.Count < 2) - { - return [fragment]; - } - - var originalColumn = ResolveColumn(fragment.CenterX, columnCenters); - var distinctColumns = splitFragments - .Select(item => ResolveColumn(item.CenterX, columnCenters)) - .Distinct(StringComparer.OrdinalIgnoreCase) - .ToList(); - - return distinctColumns.Count > 1 || distinctColumns.Any(item => !string.Equals(item, originalColumn, StringComparison.OrdinalIgnoreCase)) - ? 
splitFragments - : [fragment]; - } - - private static bool LooksLikeBoundaryCrossingAffixFragment( - XmlTextFragment fragment, - IReadOnlyList columnCenters, - ISet affixLegendSymbols) - { - if (!IsAffixLikeLine(fragment.Text, affixLegendSymbols) || - !fragment.Text.Contains(" ", StringComparison.Ordinal)) - { - return false; - } - - var fragmentRight = fragment.Left + fragment.Width; - - for (var index = 0; index < columnCenters.Count - 1; index++) - { - var boundary = (columnCenters[index].CenterX + columnCenters[index + 1].CenterX) / 2.0; - if (fragment.Left < boundary && fragmentRight > boundary) - { - return true; - } - } - - return false; - } - - private static void AddLegendMatch(HashSet symbols, string value, string pattern) - { - foreach (Match match in Regex.Matches(value, pattern, RegexOptions.IgnoreCase)) - { - if (match.Groups.Count > 1) - { - symbols.Add(match.Groups[1].Value); - } - } - } - - private static List BuildBodyLines( - IReadOnlyList bodyFragments, - IReadOnlyList columnCenters, - ISet affixLegendSymbols) - { - var bodyLines = new List(); - - foreach (var lineFragments in GroupByTop(bodyFragments.OrderBy(item => item.Top).ThenBy(item => item.Left).ToList())) - { - var columnTexts = lineFragments - .GroupBy(item => ResolveColumn(item.CenterX, columnCenters), StringComparer.OrdinalIgnoreCase) - .Select(group => CollapseWhitespace(string.Join(' ', group.OrderBy(item => item.Left).Select(item => item.Text)))) - .Where(item => !string.IsNullOrWhiteSpace(item)) - .ToList(); - - var isAffixLike = columnTexts.Count > 0 && - columnTexts.All(text => IsAffixLikeLine(text, affixLegendSymbols)); - - bodyLines.Add(new BodyLine(lineFragments[0].Top, isAffixLike)); - } - - return bodyLines; - } - private static int ResolveRowBoundaryTop( RowAnchor current, RowAnchor next, - IReadOnlyList bodyLines) + IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines) { var linesBetweenLabels = bodyLines .Where(item => item.Top >= current.Top && item.Top < next.Top) @@ 
-613,41 +240,8 @@ public sealed class StandardCriticalTableParser return (int)Math.Floor((current.Top + next.Top) / 2.0) + 1; } - private static bool IsFooterPageNumberFragment(XmlTextFragment fragment, int keyTop) - { - if (keyTop == int.MaxValue) - { - return false; - } - - return fragment.Top >= keyTop - FooterPageNumberExclusionGap && - Regex.IsMatch(fragment.Text, @"^\d{2,3}$"); - } - - private static IEnumerable> GroupByTop(IReadOnlyList fragments) - { - var groups = new List>(); - - foreach (var fragment in fragments) - { - if (groups.Count == 0 || Math.Abs(groups[^1][0].Top - fragment.Top) > TopGroupingTolerance) - { - groups.Add([fragment]); - continue; - } - - groups[^1].Add(fragment); - } - - return groups; - } - - private sealed record ColumnAnchor(string Key, double CenterX); - private sealed record RowAnchor(string Label, int Top, int SortOrder); - private sealed record BodyLine(int Top, bool IsAffixLike); - private sealed class CellEntry(string rollBandLabel, int rowIndex, string columnKey, List lines) { public string RollBandLabel { get; } = rollBandLabel; diff --git a/src/RolemasterDb.ImportTool/Parsing/VariantColumnCriticalTableParser.cs b/src/RolemasterDb.ImportTool/Parsing/VariantColumnCriticalTableParser.cs new file mode 100644 index 0000000..5e1a716 --- /dev/null +++ b/src/RolemasterDb.ImportTool/Parsing/VariantColumnCriticalTableParser.cs @@ -0,0 +1,276 @@ +namespace RolemasterDb.ImportTool.Parsing; + +/* Parser for variant-column critical tables: turns the XML text fragments of one table into a ParsedCriticalTable whose columns are the five ExpectedColumns variants. NOTE(review): generic type arguments look stripped throughout this patch text (for example new List() and List lines) - confirm against the repository before applying. */ public sealed class VariantColumnCriticalTableParser +{ + /* Variant columns as (Key, Label) pairs. FindHeaderFragments only accepts a header row whose labels appear in exactly this left-to-right order. */ private static readonly ColumnDefinition[] ExpectedColumns = + [ + new("NORMAL", "Normal"), + new("MAGIC", "Magic"), + new("MITHRIL", "Mithril"), + new("HOLY_ARMS", "Holy Arms"), + new("SLAYING", "Slaying") + ]; + + /* Parses xmlContent for the given manifest entry and returns the parsed table, the loaded fragments, the per-cell artifacts and a validation report. Row and cell problems are collected as validation errors; a missing header row throws InvalidOperationException (raised by FindHeaderFragments). */ public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) + { + var fragments = CriticalTableParserSupport.LoadFragments(xmlContent); + var headerFragments = FindHeaderFragments(fragments); + var validationErrors = new List(); + var 
validationWarnings = new List(); + + var columnAnchors = headerFragments + .OrderBy(item => item.Left) + .Select(item => + { + var definition = ResolveColumnDefinition(item.Text); + return (definition.Key, definition.Label, item.CenterX); + }) + .ToList(); + + var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap; + /* Top of the footer legend: the smallest Top among fragments whose text equals Key: or contains must parry or attacker gets (case-insensitive); int.MaxValue when no such fragment exists. */ var keyTop = fragments + .Where(item => + string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) || + item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) || + item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase)) + .Select(item => (int?)item.Top) + .Min() ?? int.MaxValue; + var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop); + /* Leftmost header position minus a 10-unit margin. It is passed to FindRowLabelFragments and IsPotentialRowLabelFragment; presumably fragments left of it are treated as row labels - confirm in CriticalTableParserSupport. */ var leftCutoff = headerFragments.Min(item => item.Left) - 10; + var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments( + fragments, + leftCutoff, + bodyStartTop, + keyTop); + + var rowAnchors = rowLabelFragments + .OrderBy(item => item.Top) + .Select((item, index) => new RowAnchor(CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), item.Top, index + 1)) + .ToList(); + + if (rowAnchors.Count == 0) + { + validationErrors.Add("No roll-band labels were found in the XML artifact."); + } + + var columnCenters = columnAnchors + .Select(item => (item.Key, item.CenterX)) + .ToList(); + + var bodyFragments = fragments + .Where(item => + item.Top >= bodyStartTop && + item.Top < keyTop - CriticalTableParserSupport.TopGroupingTolerance && + !CriticalTableParserSupport.IsFooterPageNumberFragment(item, keyTop) && + !CriticalTableParserSupport.IsPotentialRowLabelFragment(item, leftCutoff) && + !rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) && + !headerFragments.Contains(item)) + .ToList(); + bodyFragments = 
CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols); + var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols); + + var parsedRollBands = rowAnchors + .Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder)) + .ToList(); + + var cellEntries = new List(); + + /* Each row takes the fragments with rowStart <= Top < rowEnd: the first row starts at bodyStartTop, the last row ends at keyTop - 1, and interior boundaries come from ResolveRowBoundaryTop. */ for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++) + { + var rowStart = rowIndex == 0 + ? bodyStartTop + : ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines); + + var rowEnd = rowIndex == rowAnchors.Count - 1 + ? keyTop - 1 + : ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines); + + var rowFragments = bodyFragments + .Where(item => item.Top >= rowStart && item.Top < rowEnd) + .ToList(); + + foreach (var columnAnchor in columnAnchors) + { + var cellFragments = rowFragments + .Where(item => CriticalTableParserSupport.ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key) + .OrderBy(item => item.Top) + .ThenBy(item => item.Left) + .ToList(); + + /* An empty cell is recorded as a validation error and skipped, so it is also caught by the parsed-cell count check later in this method. */ if (cellFragments.Count == 0) + { + validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', column '{columnAnchor.Key}'."); + continue; + } + + cellEntries.Add(new CellEntry( + rowAnchors[rowIndex].Label, + rowIndex, + columnAnchor.Key, + CriticalTableParserSupport.BuildLines(cellFragments).ToList())); + } + } + + RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols); + + var parsedCells = new List(); + var parsedResults = new List(); + + foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey, StringComparer.Ordinal)) + { + /* A segment count above two is reported as interleaved prose and affix lines; presumably CountLineTypeSegments counts runs of consecutive affix or non-affix lines - confirm in CriticalTableParserSupport. */ var segmentCount = CriticalTableParserSupport.CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols); + if (segmentCount > 2) + { + validationErrors.Add($"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' interleaves prose and affix 
lines."); + } + + var rawAffixLines = cellEntry.Lines.Where(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList(); + var descriptionLines = cellEntry.Lines.Where(line => !CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList(); + var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines); + var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', descriptionLines)); + var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines); + + parsedCells.Add(new ParsedCriticalCellArtifact( + null, + cellEntry.RollBandLabel, + cellEntry.ColumnKey, + cellEntry.Lines, + rawCellText, + descriptionText, + rawAffixText)); + + parsedResults.Add(new ParsedCriticalResult( + null, + cellEntry.ColumnKey, + cellEntry.RollBandLabel, + rawCellText, + descriptionText, + rawAffixText)); + } + + /* NOTE(review): columnAnchors is built one-to-one from the header row, and FindHeaderFragments only returns a row matching every expected label, so this check looks unreachable in practice - confirm before relying on it. */ if (columnAnchors.Count != ExpectedColumns.Length) + { + validationErrors.Add($"Expected {ExpectedColumns.Length} variant columns but found {columnAnchors.Count}."); + } + + if (parsedCells.Count != rowAnchors.Count * columnAnchors.Count) + { + validationErrors.Add($"Expected {rowAnchors.Count * columnAnchors.Count} parsed cells but produced {parsedCells.Count}."); + } + + var validationReport = new ImportValidationReport( + validationErrors.Count == 0, + validationErrors, + validationWarnings, + rowAnchors.Count, + parsedCells.Count); + + var table = new ParsedCriticalTable( + entry.Slug, + entry.DisplayName, + entry.Family, + Path.GetFileName(entry.PdfPath), + "Imported from PDF XML extraction.", + [], + ExpectedColumns.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Label, "variant", index + 1)).ToList(), + parsedRollBands, + parsedResults); + + return new CriticalTableParseResult(table, fragments, parsedCells, validationReport); + } + + /* Finds the header row. Fragments whose trimmed, lower-cased text is one of the expected labels are grouped with CriticalTableParserSupport.GroupByTop, and the first group whose labels, ordered by Left, equal the expected sequence is returned in that order. Throws InvalidOperationException when no group matches. */ private static List FindHeaderFragments(IReadOnlyList fragments) + { + var expectedLabels = ExpectedColumns + 
.Select(item => item.Label.ToLowerInvariant()) + .ToList(); + + var headerCandidates = fragments + .Where(item => expectedLabels.Contains(item.Text.Trim().ToLowerInvariant(), StringComparer.Ordinal)) + .OrderBy(item => item.Top) + .ThenBy(item => item.Left) + .ToList(); + + foreach (var group in CriticalTableParserSupport.GroupByTop(headerCandidates)) + { + var ordered = group.OrderBy(item => item.Left).ToList(); + var labels = ordered.Select(item => item.Text.Trim().ToLowerInvariant()).ToList(); + if (labels.SequenceEqual(expectedLabels)) + { + return ordered; + } + } + + throw new InvalidOperationException("Could not find the variant-column header row in the XML artifact."); + } + + /* Maps header text to its ColumnDefinition by a case-insensitive match on the trimmed label; throws InvalidOperationException for an unknown label. */ private static ColumnDefinition ResolveColumnDefinition(string value) => + ExpectedColumns.SingleOrDefault(item => string.Equals(item.Label, value.Trim(), StringComparison.OrdinalIgnoreCase)) + ?? throw new InvalidOperationException($"Unsupported variant column label '{value}'."); + + /* For each column and each pair of adjacent rows, moves the affix-like lines at the start of the next row cell to the end of the current row cell. Nothing is moved when either cell is missing, when the next cell has no leading affix-like lines, or when it consists only of affix-like lines. NOTE(review): column keys are de-duplicated case-insensitively but matched with ==; harmless while keys share one casing - confirm. */ private static void RepairLeadingAffixLeakage(List cellEntries, ISet affixLegendSymbols) + { + var maxRowIndex = cellEntries.Count == 0 ? 
-1 : cellEntries.Max(item => item.RowIndex); + var columnKeys = cellEntries.Select(item => item.ColumnKey).Distinct(StringComparer.OrdinalIgnoreCase).ToList(); + + for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++) + { + foreach (var columnKey in columnKeys) + { + var current = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex && item.ColumnKey == columnKey); + var next = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex + 1 && item.ColumnKey == columnKey); + if (current is null || next is null) + { + continue; + } + + var leadingAffixCount = 0; + while (leadingAffixCount < next.Lines.Count && CriticalTableParserSupport.IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols)) + { + leadingAffixCount++; + } + + if (leadingAffixCount == 0 || leadingAffixCount == next.Lines.Count) + { + continue; + } + + current.Lines.AddRange(next.Lines.Take(leadingAffixCount)); + next.Lines.RemoveRange(0, leadingAffixCount); + } + } + } + + /* Returns the Top value used as the boundary between two adjacent rows. Body lines lying between the two label tops are scanned from the bottom up for an affix-like line directly followed by a non-affix line; the result is the floored midpoint of that pair plus one. Without such a pair it falls back to the floored midpoint of the two label tops plus one. */ private static int ResolveRowBoundaryTop( + RowAnchor current, + RowAnchor next, + IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines) + { + var linesBetweenLabels = bodyLines + .Where(item => item.Top >= current.Top && item.Top < next.Top) + .OrderBy(item => item.Top) + .ToList(); + + for (var index = linesBetweenLabels.Count - 2; index >= 0; index--) + { + if (linesBetweenLabels[index].IsAffixLike && !linesBetweenLabels[index + 1].IsAffixLike) + { + return (int)Math.Floor((linesBetweenLabels[index].Top + linesBetweenLabels[index + 1].Top) / 2.0) + 1; + } + } + + return (int)Math.Floor((current.Top + next.Top) / 2.0) + 1; + } + + private sealed record ColumnDefinition(string Key, string Label); + + private sealed record RowAnchor(string Label, int Top, int SortOrder); + + /* Per-cell line buffer; Lines is modified in place by RepairLeadingAffixLeakage. */ private sealed class CellEntry(string rollBandLabel, int rowIndex, string columnKey, List lines) + { + public string RollBandLabel { get; } = rollBandLabel; + public int RowIndex { get; } = rowIndex; + public string ColumnKey { get; 
} = columnKey; + public List Lines { get; } = lines; + } +}