Implement phase 4 critical table imports

This commit is contained in:
2026-03-14 03:27:14 +01:00
parent a391a1421a
commit b2f61c3d73
17 changed files with 1280 additions and 474 deletions

View File

@@ -30,8 +30,10 @@ The current implementation supports:
- explicit CLI commands for reset, extraction, and import
- manifest-driven source selection
- `standard` critical tables with columns `A-E`
- `variant_column` critical tables with non-severity columns
- `grouped_variant` critical tables with a group axis plus variant columns
- XML-based extraction using `pdftohtml -xml`
- geometry-based parsing across the currently enabled phase-3 tables:
- geometry-based parsing across the currently enabled table set:
- `arcane-aether`
- `arcane-nether`
- `ballistic-shrapnel`
@@ -42,22 +44,24 @@ The current implementation supports:
- `heat`
- `impact`
- `krush`
- `large_creature_magic`
- `large_creature_weapon`
- `ma-strikes`
- `ma-sweeps`
- `mana`
- `puncture`
- `slash`
- `subdual`
- `super_large_creature_weapon`
- `tiny`
- `unbalance`
- row-boundary repair for trailing affix leakage
- split row-label reconstruction for tables that render labels such as `99-` / `100` as two fragments
- footer/page-number filtering during body parsing
- transactional loading into SQLite
The current implementation does not yet support:
- variant-column critical tables
- grouped variant tables
- OCR/image-based PDFs such as `Void.pdf`
- normalized `critical_branch` population
- normalized `critical_effect` population
@@ -246,9 +250,28 @@ Current phase-3 notes:
### Phase 4: Variant and Grouped Tables
- support `variant_column` tables such as `Large Creature - Weapon.pdf`
- support `grouped_variant` tables such as `Large Creature - Magic.pdf`
- add parser strategies for additional table families
Phase 4 extended the importer beyond `A-E` tables.
The currently enabled phase-4 table set is:
- `large_creature_weapon`
- `family`: `variant_column`
- columns: `NORMAL`, `MAGIC`, `MITHRIL`, `HOLY_ARMS`, `SLAYING`
- `super_large_creature_weapon`
- `family`: `variant_column`
- columns: `NORMAL`, `MAGIC`, `MITHRIL`, `HOLY_ARMS`, `SLAYING`
- `large_creature_magic`
- `family`: `grouped_variant`
- groups: `large`, `super_large`
- columns: `NORMAL`, `SLAYING`
Phase-4 notes:
- grouped results now populate `critical_group` during SQLite load
- parser dispatch is family-based instead of standard-table only
- left-margin row labels can be reconstructed from split fragments such as `151-` / `175`
- the grouped magic PDF is imported once as `large_creature_magic`
- `sources/Large Creature - Magic.pdf` and `sources/Super Large Creature - Magic.pdf` are duplicate files
### Phase 5: Conditional Branch Extraction
@@ -335,10 +358,12 @@ Each entry declares:
The manifest is intentionally the control point for enabling importer support one table at a time.
For the currently enabled phase-3 entries:
For the currently enabled entries:
- `family` is `standard`
- `extractionMethod` is `xml`
- standard tables use `family: standard`
- creature weapon tables use `family: variant_column`
- grouped creature magic uses `family: grouped_variant`
- all enabled entries currently use `extractionMethod: xml`
## Artifact Layout

View File

@@ -19,11 +19,12 @@ The PDFs are not one uniform table shape. I found three families:
- Example: `Large Creature - Magic.pdf` has:
- group: `large`, `super_large`
- column: `normal`, `slaying`
- In the current importer manifest, the grouped magic PDF is loaded once as `large_creature_magic` because the `Large Creature - Magic.pdf` and `Super Large Creature - Magic.pdf` source files are duplicates.
- row: roll band
There are also extraction constraints:
- Most PDFs are text extractable with `pdftotext -layout`.
- Most PDFs are text extractable with `pdftohtml -xml`.
- `Void.pdf` appears image-based and will need OCR or manual transcription.
- A single cell can contain:
- base description text
@@ -282,4 +283,3 @@ Recommended import flow:
6. Route image PDFs like `Void.pdf` through OCR before the same parser.
The important design decision is: never throw away the original text. The prose is too irregular to rely on normalized fields alone.

View File

@@ -80,6 +80,22 @@
"pdfPath": "sources/Krush.pdf",
"enabled": true
},
{
"slug": "large_creature_magic",
"displayName": "Spells Against Creatures Critical Strike Table",
"family": "grouped_variant",
"extractionMethod": "xml",
"pdfPath": "sources/Large Creature - Magic.pdf",
"enabled": true
},
{
"slug": "large_creature_weapon",
"displayName": "Large Creature Critical Strike Table",
"family": "variant_column",
"extractionMethod": "xml",
"pdfPath": "sources/Large Creature - Weapon.pdf",
"enabled": true
},
{
"slug": "ma-strikes",
"displayName": "Martial Arts Strikes Critical Strike Table",
@@ -128,6 +144,14 @@
"pdfPath": "sources/Subdual.pdf",
"enabled": true
},
{
"slug": "super_large_creature_weapon",
"displayName": "Super Large Creature Critical Strike Table",
"family": "variant_column",
"extractionMethod": "xml",
"pdfPath": "sources/Super Large Creature - Weapon.pdf",
"enabled": true
},
{
"slug": "tiny",
"displayName": "Tiny Critical Strike Table",

Binary file not shown.

View File

@@ -4,7 +4,7 @@ namespace RolemasterDb.ImportTool.Tests;
public sealed class StandardCriticalTableParserIntegrationTests
{
private static readonly string[] ExpectedPhase3Slugs =
private static readonly string[] ExpectedEnabledSlugs =
[
"arcane-aether",
"arcane-nether",
@@ -16,20 +16,25 @@ public sealed class StandardCriticalTableParserIntegrationTests
"heat",
"impact",
"krush",
"large_creature_magic",
"large_creature_weapon",
"ma-strikes",
"ma-sweeps",
"mana",
"puncture",
"slash",
"subdual",
"super_large_creature_weapon",
"tiny",
"unbalance"
];
private static readonly PdfXmlExtractor Extractor = new();
private static readonly StandardCriticalTableParser Parser = new();
private static readonly StandardCriticalTableParser StandardParser = new();
private static readonly VariantColumnCriticalTableParser VariantColumnParser = new();
private static readonly GroupedVariantCriticalTableParser GroupedVariantParser = new();
public static IEnumerable<object[]> EnabledStandardTables() =>
public static IEnumerable<object[]> EnabledTables() =>
LoadManifest().Tables
.Where(item => item.Enabled)
.OrderBy(item => item.Slug, StringComparer.Ordinal)
@@ -37,18 +42,22 @@ public sealed class StandardCriticalTableParserIntegrationTests
public static IEnumerable<object[]> RepresentativeCells()
{
yield return ["slash", "71-75", "A", "Blow falls on lower leg"];
yield return ["puncture", "66", "C", "Strike shatters foe's knee"];
yield return ["ballistic-shrapnel", "86-90", "E", "destroy his heart"];
yield return ["arcane-aether", "96-99", "E", "smoking pulp"];
yield return ["ma-strikes", "96-99", "E", "drives bone into brain"];
yield return ["mana", "96-99", "E", "momentarily transformed"];
yield return ["mana", "100", "E", "Mana consumes everything"];
yield return ["tiny", "100", "E", "Vein and artery severed"];
yield return new object[] { "slash", null!, "71-75", "A", "Blow falls on lower leg" };
yield return new object[] { "puncture", null!, "66", "C", "Strike shatters foe's knee" };
yield return new object[] { "ballistic-shrapnel", null!, "86-90", "E", "destroy his heart" };
yield return new object[] { "arcane-aether", null!, "96-99", "E", "smoking pulp" };
yield return new object[] { "ma-strikes", null!, "96-99", "E", "drives bone into brain" };
yield return new object[] { "mana", null!, "96-99", "E", "momentarily transformed" };
yield return new object[] { "mana", null!, "100", "E", "Mana consumes everything" };
yield return new object[] { "tiny", null!, "100", "E", "Vein and artery severed" };
yield return new object[] { "large_creature_weapon", null!, "01-05", "NORMAL", "Weapon shatters" };
yield return new object[] { "super_large_creature_weapon", null!, "31-40", "SLAYING", "Boom! Solid without question" };
yield return new object[] { "large_creature_magic", "large", "251+", "NORMAL", "Foe lowers his eyes within your reach" };
yield return new object[] { "large_creature_magic", "super_large", "251+", "SLAYING", "Blast goes in through foe's eye" };
}
[Fact]
public void Manifest_enables_the_phase_3_standard_table_set()
public void Manifest_enables_the_phase_4_table_set()
{
var manifest = LoadManifest();
var enabledTables = manifest.Tables
@@ -56,25 +65,29 @@ public sealed class StandardCriticalTableParserIntegrationTests
.OrderBy(item => item.Slug, StringComparer.Ordinal)
.ToList();
Assert.Equal(ExpectedPhase3Slugs, enabledTables.Select(item => item.Slug));
Assert.Equal(ExpectedEnabledSlugs, enabledTables.Select(item => item.Slug));
Assert.All(enabledTables, entry =>
{
Assert.Equal("standard", entry.Family);
Assert.Equal("xml", entry.ExtractionMethod);
Assert.True(File.Exists(Path.Combine(GetRepositoryRoot(), entry.PdfPath)), $"Missing source PDF for '{entry.Slug}'.");
});
Assert.Equal("variant_column", enabledTables.Single(item => item.Slug == "large_creature_weapon").Family);
Assert.Equal("variant_column", enabledTables.Single(item => item.Slug == "super_large_creature_weapon").Family);
Assert.Equal("grouped_variant", enabledTables.Single(item => item.Slug == "large_creature_magic").Family);
}
[Theory]
[MemberData(nameof(EnabledStandardTables))]
public async Task Enabled_standard_tables_extract_and_parse_successfully(CriticalImportManifestEntry entry)
[MemberData(nameof(EnabledTables))]
public async Task Enabled_tables_extract_and_parse_successfully(CriticalImportManifestEntry entry)
{
var parseResult = await LoadParseResultAsync(entry);
var expectedGroupCount = Math.Max(parseResult.Table.Groups.Count, 1);
Assert.True(parseResult.ValidationReport.IsValid, string.Join(Environment.NewLine, parseResult.ValidationReport.Errors));
Assert.Equal(5, parseResult.Table.Columns.Count);
Assert.NotEmpty(parseResult.Table.Columns);
Assert.NotEmpty(parseResult.Table.RollBands);
Assert.Equal(parseResult.ValidationReport.RowCount * 5, parseResult.ValidationReport.CellCount);
Assert.Equal(parseResult.ValidationReport.RowCount * parseResult.Table.Columns.Count * expectedGroupCount, parseResult.ValidationReport.CellCount);
Assert.Equal(parseResult.ValidationReport.CellCount, parseResult.Table.Results.Count);
}
@@ -82,6 +95,7 @@ public sealed class StandardCriticalTableParserIntegrationTests
[MemberData(nameof(RepresentativeCells))]
public async Task Representative_cells_keep_expected_descriptions(
string slug,
string? groupKey,
string rollBandLabel,
string columnKey,
string expectedSnippet)
@@ -89,6 +103,7 @@ public sealed class StandardCriticalTableParserIntegrationTests
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, slug, StringComparison.Ordinal));
var parseResult = await LoadParseResultAsync(entry);
var result = parseResult.Table.Results.Single(item =>
string.Equals(item.GroupKey, groupKey, StringComparison.Ordinal) &&
string.Equals(item.RollBandLabel, rollBandLabel, StringComparison.Ordinal) &&
string.Equals(item.ColumnKey, columnKey, StringComparison.Ordinal));
@@ -101,6 +116,7 @@ public sealed class StandardCriticalTableParserIntegrationTests
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "slash", StringComparison.Ordinal));
var parseResult = await LoadParseResultAsync(entry);
var result = parseResult.Table.Results.Single(item =>
item.GroupKey is null &&
string.Equals(item.RollBandLabel, "56-60", StringComparison.Ordinal) &&
string.Equals(item.ColumnKey, "A", StringComparison.Ordinal));
@@ -113,9 +129,11 @@ public sealed class StandardCriticalTableParserIntegrationTests
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
var parseResult = await LoadParseResultAsync(entry);
var row96E = parseResult.Table.Results.Single(item =>
item.GroupKey is null &&
string.Equals(item.RollBandLabel, "96-99", StringComparison.Ordinal) &&
string.Equals(item.ColumnKey, "E", StringComparison.Ordinal));
var row100E = parseResult.Table.Results.Single(item =>
item.GroupKey is null &&
string.Equals(item.RollBandLabel, "100", StringComparison.Ordinal) &&
string.Equals(item.ColumnKey, "E", StringComparison.Ordinal));
@@ -130,6 +148,7 @@ public sealed class StandardCriticalTableParserIntegrationTests
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
var parseResult = await LoadParseResultAsync(entry);
var row100C = parseResult.Table.Results.Single(item =>
item.GroupKey is null &&
string.Equals(item.RollBandLabel, "100", StringComparison.Ordinal) &&
string.Equals(item.ColumnKey, "C", StringComparison.Ordinal));
@@ -143,9 +162,11 @@ public sealed class StandardCriticalTableParserIntegrationTests
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
var parseResult = await LoadParseResultAsync(entry);
var row71A = parseResult.Table.Results.Single(item =>
item.GroupKey is null &&
string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) &&
string.Equals(item.ColumnKey, "A", StringComparison.Ordinal));
var row71B = parseResult.Table.Results.Single(item =>
item.GroupKey is null &&
string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) &&
string.Equals(item.ColumnKey, "B", StringComparison.Ordinal));
@@ -159,9 +180,11 @@ public sealed class StandardCriticalTableParserIntegrationTests
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
var parseResult = await LoadParseResultAsync(entry);
var row71D = parseResult.Table.Results.Single(item =>
item.GroupKey is null &&
string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) &&
string.Equals(item.ColumnKey, "D", StringComparison.Ordinal));
var row71E = parseResult.Table.Results.Single(item =>
item.GroupKey is null &&
string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) &&
string.Equals(item.ColumnKey, "E", StringComparison.Ordinal));
@@ -175,9 +198,11 @@ public sealed class StandardCriticalTableParserIntegrationTests
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
var parseResult = await LoadParseResultAsync(entry);
var row91B = parseResult.Table.Results.Single(item =>
item.GroupKey is null &&
string.Equals(item.RollBandLabel, "91-95", StringComparison.Ordinal) &&
string.Equals(item.ColumnKey, "B", StringComparison.Ordinal));
var row91C = parseResult.Table.Results.Single(item =>
item.GroupKey is null &&
string.Equals(item.RollBandLabel, "91-95", StringComparison.Ordinal) &&
string.Equals(item.ColumnKey, "C", StringComparison.Ordinal));
@@ -191,9 +216,11 @@ public sealed class StandardCriticalTableParserIntegrationTests
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal));
var parseResult = await LoadParseResultAsync(entry);
var row86B = parseResult.Table.Results.Single(item =>
item.GroupKey is null &&
string.Equals(item.RollBandLabel, "86-90", StringComparison.Ordinal) &&
string.Equals(item.ColumnKey, "B", StringComparison.Ordinal));
var row86C = parseResult.Table.Results.Single(item =>
item.GroupKey is null &&
string.Equals(item.RollBandLabel, "86-90", StringComparison.Ordinal) &&
string.Equals(item.ColumnKey, "C", StringComparison.Ordinal));
@@ -201,7 +228,28 @@ public sealed class StandardCriticalTableParserIntegrationTests
Assert.Contains("+16H - 8", row86C.RawAffixText, StringComparison.Ordinal);
}
private static async Task<StandardCriticalTableParseResult> LoadParseResultAsync(CriticalImportManifestEntry entry)
[Fact]
public async Task Grouped_magic_table_keeps_large_and_super_large_groups_distinct()
{
var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "large_creature_magic", StringComparison.Ordinal));
var parseResult = await LoadParseResultAsync(entry);
Assert.Equal(["large", "super_large"], parseResult.Table.Groups.Select(item => item.GroupKey));
var largeNormal = parseResult.Table.Results.Single(item =>
string.Equals(item.GroupKey, "large", StringComparison.Ordinal) &&
string.Equals(item.RollBandLabel, "251+", StringComparison.Ordinal) &&
string.Equals(item.ColumnKey, "NORMAL", StringComparison.Ordinal));
var superSlaying = parseResult.Table.Results.Single(item =>
string.Equals(item.GroupKey, "super_large", StringComparison.Ordinal) &&
string.Equals(item.RollBandLabel, "251+", StringComparison.Ordinal) &&
string.Equals(item.ColumnKey, "SLAYING", StringComparison.Ordinal));
Assert.DoesNotContain("Blast goes in through foe's eye", largeNormal.DescriptionText, StringComparison.OrdinalIgnoreCase);
Assert.Contains("Blast goes in through foe's eye", superSlaying.DescriptionText, StringComparison.OrdinalIgnoreCase);
}
private static async Task<CriticalTableParseResult> LoadParseResultAsync(CriticalImportManifestEntry entry)
{
var xmlPath = Path.Combine(GetArtifactCacheRoot(), $"{entry.Slug}.xml");
@@ -211,7 +259,13 @@ public sealed class StandardCriticalTableParserIntegrationTests
}
var xmlContent = await File.ReadAllTextAsync(xmlPath);
return Parser.Parse(entry, xmlContent);
return entry.Family switch
{
"standard" => StandardParser.Parse(entry, xmlContent),
"variant_column" => VariantColumnParser.Parse(entry, xmlContent),
"grouped_variant" => GroupedVariantParser.Parse(entry, xmlContent),
_ => throw new InvalidOperationException($"Unsupported manifest family '{entry.Family}'.")
};
}
private static CriticalImportManifest LoadManifest() =>

View File

@@ -8,6 +8,8 @@ public sealed class CriticalImportCommandRunner
private readonly ImportArtifactWriter artifactWriter = new();
private readonly PdfXmlExtractor pdfXmlExtractor = new();
private readonly StandardCriticalTableParser standardParser = new();
private readonly VariantColumnCriticalTableParser variantColumnParser = new();
private readonly GroupedVariantCriticalTableParser groupedVariantParser = new();
public async Task<int> RunAsync(ResetOptions options)
{
@@ -96,14 +98,24 @@ public sealed class CriticalImportCommandRunner
?? throw new InvalidOperationException($"No enabled manifest entry was found for '{tableSlug}'.");
}
private StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
private CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
if (!string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase))
if (string.Equals(entry.Family, "standard", StringComparison.OrdinalIgnoreCase))
{
throw new InvalidOperationException($"Family '{entry.Family}' is not supported by phase 2.");
return standardParser.Parse(entry, xmlContent);
}
return standardParser.Parse(entry, xmlContent);
if (string.Equals(entry.Family, "variant_column", StringComparison.OrdinalIgnoreCase))
{
return variantColumnParser.Parse(entry, xmlContent);
}
if (string.Equals(entry.Family, "grouped_variant", StringComparison.OrdinalIgnoreCase))
{
return groupedVariantParser.Parse(entry, xmlContent);
}
throw new InvalidOperationException($"Family '{entry.Family}' is not supported by the importer.");
}
private static ImportArtifactPaths CreateArtifactPaths(string slug) =>

View File

@@ -43,6 +43,15 @@ public sealed class CriticalImportLoader(string databasePath)
Notes = table.Notes
};
entity.Groups = table.Groups
.Select(item => new CriticalGroup
{
GroupKey = item.GroupKey,
Label = item.Label,
SortOrder = item.SortOrder
})
.ToList();
entity.Columns = table.Columns
.Select(item => new CriticalColumn
{
@@ -63,12 +72,14 @@ public sealed class CriticalImportLoader(string databasePath)
})
.ToList();
var groupsByKey = entity.Groups.ToDictionary(item => item.GroupKey, StringComparer.OrdinalIgnoreCase);
var columnsByKey = entity.Columns.ToDictionary(item => item.ColumnKey, StringComparer.OrdinalIgnoreCase);
var rollBandsByLabel = entity.RollBands.ToDictionary(item => item.Label, StringComparer.OrdinalIgnoreCase);
entity.Results = table.Results
.Select(item => new CriticalResult
{
CriticalGroup = item.GroupKey is null ? null : groupsByKey[item.GroupKey],
CriticalColumn = columnsByKey[item.ColumnKey],
CriticalRollBand = rollBandsByLabel[item.RollBandLabel],
RawCellText = item.RawCellText,

View File

@@ -11,7 +11,7 @@ public sealed class ImportArtifactWriter
WriteIndented = true
};
public async Task WriteAsync(ImportArtifactPaths artifactPaths, StandardCriticalTableParseResult parseResult, CancellationToken cancellationToken = default)
public async Task WriteAsync(ImportArtifactPaths artifactPaths, CriticalTableParseResult parseResult, CancellationToken cancellationToken = default)
{
Directory.CreateDirectory(artifactPaths.DirectoryPath);

View File

@@ -0,0 +1,13 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Immutable bundle returned by a critical table parser: the parsed table,
/// the XML text fragments, the per-cell artifacts and the validation report
/// that were supplied at construction time.
/// </summary>
public sealed class CriticalTableParseResult
{
    /// <summary>
    /// Stores the four parse outputs exactly as supplied; no copying or validation is performed.
    /// </summary>
    public CriticalTableParseResult(
        ParsedCriticalTable table,
        IReadOnlyList<XmlTextFragment> fragments,
        IReadOnlyList<ParsedCriticalCellArtifact> cells,
        ImportValidationReport validationReport)
    {
        Table = table;
        Fragments = fragments;
        Cells = cells;
        ValidationReport = validationReport;
    }

    /// <summary>The parsed table passed to the constructor.</summary>
    public ParsedCriticalTable Table { get; }

    /// <summary>The XML text fragments passed to the constructor.</summary>
    public IReadOnlyList<XmlTextFragment> Fragments { get; }

    /// <summary>The per-cell artifacts passed to the constructor.</summary>
    public IReadOnlyList<ParsedCriticalCellArtifact> Cells { get; }

    /// <summary>The validation report passed to the constructor.</summary>
    public ImportValidationReport ValidationReport { get; }
}

View File

@@ -0,0 +1,477 @@
using System.Globalization;
using System.Text.RegularExpressions;
using System.Xml;
using System.Xml.Linq;
namespace RolemasterDb.ImportTool.Parsing;
internal static class CriticalTableParserSupport
{
// Not referenced in the visible part of this file; presumably the minimum vertical gap
// between a header row and the first body row — TODO confirm against the parsers that use it.
internal const int HeaderToBodyMinimumGap = 20;
// FindRowLabelFragments ignores label candidates whose Top is not at least this far above keyTop.
internal const int FooterLabelExclusionGap = 15;
// IsFooterPageNumberFragment treats 2-3 digit fragments at or below (keyTop - this gap) as page numbers.
internal const int FooterPageNumberExclusionGap = 80;
// Maximum Top difference at which two consecutive equal row labels are collapsed into one;
// TryMergeSplitRollBand also uses it (plus 5) as the maximum vertical distance between split label halves.
internal const int RowLabelDuplicateTolerance = 15;
// Fragments whose Top differs from the first fragment of a line by at most this value are grouped into that line.
internal const int TopGroupingTolerance = 2;
// Matches one segment that starts and ends with a non-whitespace character and is followed by
// two or more whitespace characters or the end of the input.
private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled);
// Matches a line starting with digits followed by 'H', one of the symbols ∑ ∏ π ∫, or optional whitespace and a hyphen.
// NOTE(review): the character class "[-]" may have lost a non-printing dash character in extraction — confirm against the repository.
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[-])", RegexOptions.Compiled);
// Matches a whole line made of optional leading digits and a parenthesised, optionally signed number, e.g. "(+10)" or "3(-20)".
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|)\d+\)$", RegexOptions.Compiled);
/// <summary>
/// Parses the extracted XML and returns one <see cref="XmlTextFragment"/> per
/// <c>text</c> element (under any <c>page</c> element) whose normalized text is not blank.
/// </summary>
/// <param name="xmlContent">The complete XML document as a string.</param>
/// <returns>The fragments in document order, tagged with their page number.</returns>
/// <exception cref="InvalidOperationException">
/// A <c>text</c> element lacks a <c>top</c>, <c>left</c>, <c>width</c> or <c>height</c> attribute.
/// </exception>
/// <exception cref="FormatException">A coordinate or page number attribute is not an integer.</exception>
internal static List<XmlTextFragment> LoadFragments(string xmlContent)
{
    using var stringReader = new StringReader(xmlContent);
    using var xmlReader = XmlReader.Create(
        stringReader,
        new XmlReaderSettings
        {
            // Skip any DOCTYPE declaration instead of resolving or rejecting it.
            DtdProcessing = DtdProcessing.Ignore
        });
    var document = XDocument.Load(xmlReader);

    return document.Descendants("page")
        .SelectMany(page =>
        {
            // A page without a number attribute is treated as page 1.
            var pageNumber = ParseInvariantInt(page.Attribute("number")?.Value ?? "1");
            return page.Elements("text")
                .Select(item => new XmlTextFragment(
                    pageNumber,
                    ReadRequiredInt(item, "top"),
                    ReadRequiredInt(item, "left"),
                    ReadRequiredInt(item, "width"),
                    ReadRequiredInt(item, "height"),
                    // Concatenate every nested text node so inline markup (e.g. <b>, <i>) does not drop text.
                    NormalizeText(string.Concat(item.DescendantNodes().OfType<XText>().Select(node => node.Value)))))
                .Where(item => !string.IsNullOrWhiteSpace(item.Text));
        })
        .ToList();

    // The XML is machine-generated, so numbers are parsed with the invariant culture
    // rather than whatever culture the importer happens to run under.
    static int ParseInvariantInt(string value) =>
        int.Parse(value, NumberStyles.Integer, CultureInfo.InvariantCulture);

    // Reads a mandatory integer attribute, keeping the original error message wording.
    static int ReadRequiredInt(XElement element, string attributeName) =>
        ParseInvariantInt(
            element.Attribute(attributeName)?.Value
            ?? throw new InvalidOperationException($"Missing text {attributeName} attribute."));
}
/// <summary>
/// Collects the row-label fragments found left of <paramref name="leftCutoff"/>, between
/// <paramref name="bodyStartTop"/> and the footer exclusion zone above <paramref name="keyTop"/>.
/// Labels split across two fragments are merged, and consecutive repeats of the same label
/// within <see cref="RowLabelDuplicateTolerance"/> are dropped.
/// </summary>
/// <returns>The merged, de-duplicated labels ordered by Top and then Left.</returns>
internal static List<XmlTextFragment> FindRowLabelFragments(
    IReadOnlyList<XmlTextFragment> fragments,
    int leftCutoff,
    int bodyStartTop,
    int keyTop)
{
    var labelBottomLimit = keyTop - FooterLabelExclusionGap;

    var candidates = fragments
        .Where(item => item.Left < leftCutoff)
        .Where(item => item.Top >= bodyStartTop && item.Top < labelBottomLimit)
        .Where(item => IsRollBandLabel(item.Text) || LooksLikeSplitRollBandStart(item.Text))
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    // Pass 1: join split labels; a successful merge consumes two candidates.
    var merged = new List<XmlTextFragment>();
    var position = 0;
    while (position < candidates.Count)
    {
        if (TryMergeSplitRollBand(candidates, position, out var joined))
        {
            merged.Add(joined);
            position += 2;
            continue;
        }

        // An unmerged split start (e.g. a dangling "99-") is not a complete label and is dropped here.
        var single = candidates[position];
        if (IsRollBandLabel(single.Text))
        {
            merged.Add(single);
        }

        position++;
    }

    // Pass 2: skip a label that repeats the previously kept one at nearly the same height.
    var deduped = new List<XmlTextFragment>();
    foreach (var label in merged)
    {
        if (deduped.Count > 0)
        {
            var previous = deduped[^1];
            var sameLabel = string.Equals(
                NormalizeRollBandLabel(previous.Text),
                NormalizeRollBandLabel(label.Text),
                StringComparison.OrdinalIgnoreCase);

            if (sameLabel && Math.Abs(previous.Top - label.Top) <= RowLabelDuplicateTolerance)
            {
                continue;
            }
        }

        deduped.Add(label);
    }

    return deduped;
}
// Cached like the other patterns in this class instead of being resolved on every call.
private static readonly Regex RollBandLabelRegex = new(
    @"^\d{2,3}(?:\s*-\s*\d{2,3})?$|^\d{2,3}\+$",
    RegexOptions.Compiled);

/// <summary>
/// Returns <c>true</c> when the trimmed text is a complete roll-band label: a 2-3 digit number
/// ("100"), a range of two such numbers with optional whitespace around the hyphen ("71-75"),
/// or an open-ended band ("251+").
/// </summary>
internal static bool IsRollBandLabel(string value) =>
    RollBandLabelRegex.IsMatch(value.Trim());
/// <summary>
/// Returns <c>true</c> when the fragment starts left of <paramref name="leftCutoff"/> and its text
/// is either a complete roll-band label or the first half of a split one.
/// </summary>
internal static bool IsPotentialRowLabelFragment(XmlTextFragment fragment, int leftCutoff)
{
    if (fragment.Left >= leftCutoff)
    {
        return false;
    }

    return IsRollBandLabel(fragment.Text) || LooksLikeSplitRollBandStart(fragment.Text);
}
/// <summary>
/// Collapses whitespace in the label and removes any whitespace around hyphens,
/// so "71 - 75" becomes "71-75".
/// </summary>
internal static string NormalizeRollBandLabel(string label)
{
    var collapsed = CollapseWhitespace(label);
    return Regex.Replace(collapsed, @"\s*-\s*", "-");
}
/// <summary>
/// Builds a <see cref="ParsedCriticalRollBand"/> from a row label.
/// </summary>
/// <param name="label">A label such as "66", "71-75" or "251+"; it is normalized before parsing.</param>
/// <param name="sortOrder">Passed through to the created roll band.</param>
/// <returns>
/// A roll band carrying the normalized label and the parsed numbers: "N+" yields N and <c>null</c>,
/// "N" yields N twice, and "N-M" yields N and M.
/// </returns>
/// <exception cref="FormatException">A numeric part of the label is not an integer.</exception>
internal static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder)
{
    var normalizedLabel = NormalizeRollBandLabel(label);

    // Open-ended band: only the number before '+' is known.
    if (normalizedLabel.EndsWith('+'))
    {
        return new ParsedCriticalRollBand(normalizedLabel, ParseBound(normalizedLabel[..^1]), null, sortOrder);
    }

    var parts = normalizedLabel.Split('-', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    var lower = ParseBound(parts[0]);

    // A single number is a one-value band, so the parsed value is reused instead of parsed twice.
    var upper = parts.Length == 1 ? lower : ParseBound(parts[1]);
    return new ParsedCriticalRollBand(normalizedLabel, lower, upper, sortOrder);

    // Labels come from machine-generated XML, so parse them independently of the current culture.
    static int ParseBound(string text) =>
        int.Parse(text, NumberStyles.Integer, CultureInfo.InvariantCulture);
}
/// <summary>
/// Maps a horizontal centre to a column key. Columns are scanned in list order; the first column
/// whose midpoint with its right-hand neighbour lies beyond <paramref name="centerX"/> wins,
/// and anything past the last midpoint falls into the last column.
/// </summary>
/// <param name="centerX">Horizontal centre of the fragment being assigned.</param>
/// <param name="columns">Column keys with their centres, in the order they should be scanned.</param>
internal static string ResolveColumn(double centerX, IReadOnlyList<(string Key, double CenterX)> columns)
{
    for (var right = 1; right < columns.Count; right++)
    {
        var left = columns[right - 1];
        var midpoint = (left.CenterX + columns[right].CenterX) / 2.0;
        if (centerX < midpoint)
        {
            return left.Key;
        }
    }

    return columns[^1].Key;
}
/// <summary>
/// Turns fragments into text lines: fragments are ordered by Top then Left, grouped into lines
/// with <see cref="GroupByTop"/>, joined left-to-right with single spaces, whitespace-collapsed,
/// and blank lines are discarded.
/// </summary>
/// <param name="fragments">The fragments to assemble; the input order does not matter.</param>
/// <returns>The non-blank lines from top to bottom.</returns>
internal static IReadOnlyList<string> BuildLines(IReadOnlyList<XmlTextFragment> fragments)
{
    var ordered = fragments
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    // Reuse the shared grouping helper rather than repeating its loop here.
    return GroupByTop(ordered)
        // Re-sort inside each line: fragments within the Top tolerance may not be in Left order.
        .Select(line => CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text))))
        .Where(item => !string.IsNullOrWhiteSpace(item))
        .ToList();
}
/// <summary>
/// Heuristically decides whether a single line of cell text has the shape of an affix line
/// rather than descriptive prose.
/// </summary>
/// <param name="line">The line to classify; it is trimmed before inspection.</param>
/// <param name="affixLegendSymbols">Symbols detected in the table's footer legend; may be empty.</param>
/// <returns><c>true</c> when one of the affix shapes below matches; <c>false</c> for blank lines and everything else.</returns>
internal static bool IsAffixLikeLine(string line, ISet<string> affixLegendSymbols)
{
var value = line.Trim();
if (value.Length == 0)
{
return false;
}
// A lone hyphen, en dash or em dash counts as affix-like.
if (value is "-" or "\u2013" or "\u2014")
{
return true;
}
// Lines opening with a condition word are affix-like only when they also contain a colon;
// without one they are classified as not affix-like and no further check runs.
if (value.StartsWith("with ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("w/o ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("without ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("if ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("while ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("until ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("unless ", StringComparison.OrdinalIgnoreCase))
{
return value.Contains(':', StringComparison.Ordinal);
}
// Legend-driven check: only applies when the line contains at least one known legend symbol.
if (affixLegendSymbols.Count > 0 &&
affixLegendSymbols.Any(symbol => value.Contains(symbol, StringComparison.Ordinal)))
{
// A legend symbol together with any digit is treated as an affix.
if (value.Any(char.IsDigit))
{
return true;
}
// Otherwise strip the symbols (longest first) and affix punctuation; if nothing is left, it is an affix.
var remainder = value;
foreach (var symbol in affixLegendSymbols.OrderByDescending(item => item.Length))
{
remainder = remainder.Replace(symbol, string.Empty, StringComparison.Ordinal);
}
remainder = remainder
.Replace("+", string.Empty, StringComparison.Ordinal)
.Replace("-", string.Empty, StringComparison.Ordinal)
// NOTE(review): the search string on the next line appears empty in this view; string.Replace rejects an
// empty search value, so a non-printing character was probably lost in extraction — confirm against the repository.
.Replace("", string.Empty, StringComparison.Ordinal)
.Replace("(", string.Empty, StringComparison.Ordinal)
.Replace(")", string.Empty, StringComparison.Ordinal)
.Replace("/", string.Empty, StringComparison.Ordinal);
if (string.IsNullOrWhiteSpace(remainder))
{
return true;
}
}
// Fallback shapes: a leading '+', a leading ∑ / ∏ / π / ∫ symbol, one of the cached affix patterns,
// or a spaced hyphen inside the line.
return value.StartsWith("+", StringComparison.Ordinal) ||
value.StartsWith("\u2211", StringComparison.Ordinal) ||
value.StartsWith("\u220F", StringComparison.Ordinal) ||
value.StartsWith("\u03C0", StringComparison.Ordinal) ||
value.StartsWith("\u222B", StringComparison.Ordinal) ||
StandaloneModifierAffixLineRegex.IsMatch(value) ||
NumericAffixLineRegex.IsMatch(value) ||
value.Contains(" - ", StringComparison.Ordinal) ||
// NOTE(review): as shown, the next check matches any line containing a space; it presumably contained a
// non-printing dash between spaces that was lost in extraction — confirm against the repository.
value.Contains(" ", StringComparison.Ordinal);
}
/// <summary>
/// Counts the runs of consecutive lines that share the same affix/non-affix classification.
/// An empty list yields 0; every change of classification starts a new run.
/// </summary>
internal static int CountLineTypeSegments(IReadOnlyList<string> lines, ISet<string> affixLegendSymbols)
{
    var runs = 0;
    bool? lastKind = null;

    foreach (var line in lines)
    {
        var kind = IsAffixLikeLine(line, affixLegendSymbols);
        if (lastKind != kind)
        {
            // First line, or the classification flipped: a new run begins.
            runs++;
            lastKind = kind;
        }
    }

    return runs;
}
// Cached like the other patterns in this class; this helper runs for every assembled line.
private static readonly Regex WhitespaceRunRegex = new(@"\s+", RegexOptions.Compiled);

/// <summary>
/// Trims the value and replaces every run of whitespace inside it with a single space.
/// </summary>
internal static string CollapseWhitespace(string value) =>
    WhitespaceRunRegex.Replace(value.Trim(), " ");
/// <summary>
/// Normalizes raw fragment text: non-breaking spaces, carriage returns and line feeds become
/// plain spaces, one further character is mapped to a straight apostrophe, and the result is trimmed.
/// </summary>
internal static string NormalizeText(string value) =>
value
.Replace('\u00a0', ' ')
.Replace('\r', ' ')
.Replace('\n', ' ')
// NOTE(review): the first character literal on the next line appears empty in this view, which is not valid C#;
// the original character (presumably a mis-encoded apostrophe) was probably lost in extraction — confirm against the repository.
.Replace('', '\'')
.Trim();
/// <summary>
/// Scans the footer lines (fragments at or below <paramref name="keyTop"/>, minus the grouping
/// tolerance) for the legend entries listed below and collects what <c>AddLegendMatch</c> extracts from them.
/// </summary>
/// <param name="fragments">All fragments of the table.</param>
/// <param name="keyTop">Top of the legend/key area; <see cref="int.MaxValue"/> means no key was found.</param>
/// <returns>The collected symbols, or an empty set when there is no key area.</returns>
internal static HashSet<string> DetectAffixLegendSymbols(IReadOnlyList<XmlTextFragment> fragments, int keyTop)
{
    if (keyTop == int.MaxValue)
    {
        return [];
    }

    // Each pattern captures the single non-whitespace character that follows "<legend name> =".
    string[] legendPatterns =
    [
        @"must parry\s*=\s*(\S)",
        @"no parry\s*=\s*(\S)",
        @"stun(?:ned)?\s*=\s*(\S)",
        @"bleed\s*=\s*(\S)",
        @"powerpoint modification.*=\s*(\S)"
    ];

    var footerFragments = fragments
        .Where(item => item.Top >= keyTop - TopGroupingTolerance)
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    var symbols = new HashSet<string>(StringComparer.Ordinal);
    foreach (var lineFragments in GroupByTop(footerFragments))
    {
        var footerLine = CollapseWhitespace(
            string.Join(' ', lineFragments.OrderBy(item => item.Left).Select(item => item.Text)));

        foreach (var pattern in legendPatterns)
        {
            AddLegendMatch(symbols, footerLine, pattern);
        }
    }

    return symbols;
}
/// <summary>
/// Runs every body fragment through <c>SplitBoundaryCrossingAffixFragment</c> and flattens the
/// results into one list, preserving the input order.
/// </summary>
internal static List<XmlTextFragment> SplitBoundaryCrossingAffixFragments(
    IReadOnlyList<XmlTextFragment> bodyFragments,
    IReadOnlyList<(string Key, double CenterX)> columnCenters,
    ISet<string> affixLegendSymbols)
{
    return bodyFragments
        .SelectMany(fragment => SplitBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
        .ToList();
}
/// <summary>
/// Groups the body fragments into visual lines and classifies each line: a line is affix-like
/// when it has text in at least one column and every column's text is affix-like.
/// </summary>
/// <returns>One entry per line: the Top of the line's first fragment and its classification.</returns>
internal static List<(int Top, bool IsAffixLike)> BuildBodyLines(
    IReadOnlyList<XmlTextFragment> bodyFragments,
    IReadOnlyList<(string Key, double CenterX)> columnCenters,
    ISet<string> affixLegendSymbols)
{
    var orderedFragments = bodyFragments
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    return GroupByTop(orderedFragments)
        .Select(lineFragments =>
        {
            // Assemble the text each column contributes to this line.
            var columnTexts = lineFragments
                .GroupBy(item => ResolveColumn(item.CenterX, columnCenters), StringComparer.OrdinalIgnoreCase)
                .Select(group => CollapseWhitespace(string.Join(' ', group.OrderBy(item => item.Left).Select(item => item.Text))))
                .Where(item => !string.IsNullOrWhiteSpace(item))
                .ToList();

            var isAffixLike = columnTexts.Count > 0 &&
                columnTexts.All(text => IsAffixLikeLine(text, affixLegendSymbols));

            return (Top: lineFragments[0].Top, IsAffixLike: isAffixLike);
        })
        .ToList();
}
/// <summary>
/// Returns <c>true</c> when a key area exists (<paramref name="keyTop"/> is not
/// <see cref="int.MaxValue"/>), the fragment lies at or below the page-number exclusion line
/// above it, and its text is exactly two or three digits.
/// </summary>
internal static bool IsFooterPageNumberFragment(XmlTextFragment fragment, int keyTop)
{
    var hasKeyArea = keyTop != int.MaxValue;

    return hasKeyArea
        && fragment.Top >= keyTop - FooterPageNumberExclusionGap
        && Regex.IsMatch(fragment.Text, @"^\d{2,3}$");
}
/// <summary>
/// Splits fragments into consecutive groups in the order given: a fragment joins the current group
/// when its Top is within <see cref="TopGroupingTolerance"/> of that group's first fragment,
/// otherwise it starts a new group. Callers are expected to pass fragments already sorted by Top.
/// </summary>
internal static IEnumerable<List<XmlTextFragment>> GroupByTop(IReadOnlyList<XmlTextFragment> fragments)
{
    var buckets = new List<List<XmlTextFragment>>();
    List<XmlTextFragment>? open = null;

    foreach (var fragment in fragments)
    {
        // The comparison is always against the first fragment of the open group, not the latest one.
        if (open is null || Math.Abs(open[0].Top - fragment.Top) > TopGroupingTolerance)
        {
            open = [fragment];
            buckets.Add(open);
        }
        else
        {
            open.Add(fragment);
        }
    }

    return buckets;
}
/// <summary>
/// Returns <c>true</c> when the trimmed text is a 2-3 digit number followed by a trailing hyphen
/// (optionally separated by whitespace), i.e. the first half of a split label such as "99-".
/// </summary>
private static bool LooksLikeSplitRollBandStart(string value)
{
    var trimmed = value.Trim();
    return Regex.IsMatch(trimmed, @"^\d{2,3}\s*-$");
}
/// <summary>
/// Tries to join the candidate at <paramref name="index"/> (a split start such as "151-") with the
/// next candidate (a bare 2-3 digit number) into one label fragment such as "151-175".
/// </summary>
/// <param name="candidates">Label candidates in Top/Left order.</param>
/// <param name="index">Position of the potential first half.</param>
/// <param name="mergedCandidate">
/// On success, a fragment at the first half's Top covering the horizontal extent of both halves;
/// on failure it is null and must not be used.
/// </param>
/// <returns><c>true</c> when the two candidates were merged.</returns>
private static bool TryMergeSplitRollBand(IReadOnlyList<XmlTextFragment> candidates, int index, out XmlTextFragment mergedCandidate)
{
    mergedCandidate = null!;

    var head = candidates[index];
    var hasFollower = index + 1 < candidates.Count;
    if (!hasFollower || !LooksLikeSplitRollBandStart(head.Text))
    {
        return false;
    }

    var tail = candidates[index + 1];
    var verticalGap = tail.Top - head.Top;

    // The second half must be a bare number on the same page, strictly below the first half,
    // close enough vertically, and roughly left-aligned with it.
    var isContinuation =
        head.PageNumber == tail.PageNumber &&
        Regex.IsMatch(tail.Text.Trim(), @"^\d{2,3}$") &&
        verticalGap > 0 &&
        verticalGap <= RowLabelDuplicateTolerance + 5 &&
        Math.Abs(tail.Left - head.Left) <= 20;

    if (!isContinuation)
    {
        return false;
    }

    var startDigits = Regex.Match(head.Text, @"\d{2,3}").Value;
    var left = Math.Min(head.Left, tail.Left);
    var right = Math.Max(head.Left + head.Width, tail.Left + tail.Width);

    mergedCandidate = new XmlTextFragment(
        head.PageNumber,
        head.Top,
        left,
        right - left,
        Math.Max(head.Height, tail.Height),
        $"{startDigits}-{tail.Text.Trim()}");
    return true;
}
/// <summary>
/// Splits one affix-like fragment that straddles a column boundary into one fragment
/// per text segment matched by <c>MultiFragmentSplitRegex</c>.
/// </summary>
/// <param name="fragment">Fragment to examine.</param>
/// <param name="columnCenters">Column keys with their horizontal centres.</param>
/// <param name="affixLegendSymbols">Legend symbols forwarded to the affix check.</param>
/// <returns>
/// The segments when they land in more than one column or in a column other than the
/// original's; otherwise a single-element list holding the untouched fragment.
/// </returns>
private static IReadOnlyList<XmlTextFragment> SplitBoundaryCrossingAffixFragment(
    XmlTextFragment fragment,
    IReadOnlyList<(string Key, double CenterX)> columnCenters,
    ISet<string> affixLegendSymbols)
{
    if (!LooksLikeBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
    {
        return [fragment];
    }

    var segmentMatches = MultiFragmentSplitRegex.Matches(fragment.Text);
    if (segmentMatches.Count < 2)
    {
        return [fragment];
    }

    // Segment geometry is estimated by spreading the fragment width evenly over its characters.
    var widthPerCharacter = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);

    var pieces = segmentMatches
        .Cast<Match>()
        .Select(hit => (Hit: hit, Text: CollapseWhitespace(hit.Value)))
        .Where(piece => piece.Text.Length != 0)
        .Select(piece => new XmlTextFragment(
            fragment.PageNumber,
            fragment.Top,
            fragment.Left + (int)Math.Round(widthPerCharacter * piece.Hit.Index),
            Math.Max(1, (int)Math.Round(widthPerCharacter * piece.Hit.Length)),
            fragment.Height,
            piece.Text))
        .ToList();
    if (pieces.Count < 2)
    {
        return [fragment];
    }

    var sourceColumn = ResolveColumn(fragment.CenterX, columnCenters);
    var coveredColumns = pieces
        .Select(piece => ResolveColumn(piece.CenterX, columnCenters))
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .ToList();

    var spreadsAcrossColumns = coveredColumns.Count > 1;
    var leavesSourceColumn = coveredColumns.Any(column => !string.Equals(column, sourceColumn, StringComparison.OrdinalIgnoreCase));

    // Splitting only pays off when it changes which column receives the text.
    return spreadsAcrossColumns || leavesSourceColumn
        ? pieces
        : [fragment];
}
/// <summary>
/// Tells whether a fragment is an affix-like text containing a space whose box spans
/// the midpoint between two neighbouring column centres.
/// </summary>
/// <param name="fragment">Fragment to examine.</param>
/// <param name="columnCenters">Column keys with their horizontal centres, in left-to-right order.</param>
/// <param name="affixLegendSymbols">Legend symbols forwarded to <c>IsAffixLikeLine</c>.</param>
private static bool LooksLikeBoundaryCrossingAffixFragment(
    XmlTextFragment fragment,
    IReadOnlyList<(string Key, double CenterX)> columnCenters,
    ISet<string> affixLegendSymbols)
{
    var isSplittableAffix =
        IsAffixLikeLine(fragment.Text, affixLegendSymbols) &&
        fragment.Text.Contains(" ", StringComparison.Ordinal);
    if (!isSplittableAffix)
    {
        return false;
    }

    var leftEdge = fragment.Left;
    var rightEdge = fragment.Left + fragment.Width;

    // A boundary is the midpoint between two adjacent column centres; the fragment
    // crosses it when its box strictly contains that x position.
    return Enumerable
        .Range(0, Math.Max(columnCenters.Count - 1, 0))
        .Select(position => (columnCenters[position].CenterX + columnCenters[position + 1].CenterX) / 2.0)
        .Any(boundary => leftEdge < boundary && rightEdge > boundary);
}
/// <summary>
/// Adds the first capture group of every case-insensitive match of
/// <paramref name="pattern"/> in <paramref name="value"/> to <paramref name="symbols"/>.
/// </summary>
/// <param name="symbols">Set receiving the captured symbols.</param>
/// <param name="value">Text to search.</param>
/// <param name="pattern">Regular expression; matches without a capture group add nothing.</param>
private static void AddLegendMatch(HashSet<string> symbols, string value, string pattern)
{
    var captured = Regex.Matches(value, pattern, RegexOptions.IgnoreCase)
        .Cast<Match>()
        .Where(hit => hit.Groups.Count > 1)
        .Select(hit => hit.Groups[1].Value);

    symbols.UnionWith(captured);
}
}

View File

@@ -0,0 +1,306 @@
namespace RolemasterDb.ImportTool.Parsing;
public sealed class GroupedVariantCriticalTableParser
{
// Group axis of the table, in left-to-right order. The labels are the header texts
// that FindGroupHeaders searches for (case-insensitively).
private static readonly ParsedCriticalGroup[] ExpectedGroups =
[
new("large", "Large Creatures", 1),
new("super_large", "Super Large Creatures", 2)
];
// Variant columns repeated under every group, in left-to-right order. Parse maps the
// n-th column header (sorted by Left) to a group via n / Length and to a column via n % Length.
private static readonly ParsedCriticalColumn[] ExpectedColumns =
[
new("NORMAL", "Normal", "variant", 1),
new("SLAYING", "Slaying", "variant", 2)
];
/// <summary>
/// Parses the pdftohtml XML of a grouped-variant critical table into roll bands and
/// one result per (roll band, group, column) cell, together with a validation report.
/// </summary>
/// <param name="entry">Manifest entry supplying slug, display name, family and PDF path for the table metadata.</param>
/// <param name="xmlContent">XML text handed to <c>CriticalTableParserSupport.LoadFragments</c>.</param>
/// <returns>The parsed table, all loaded fragments, the per-cell artifacts and the validation report.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown by FindGroupHeaders / FindColumnHeaders when the expected header rows are not present.
/// </exception>
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
var groupHeaders = FindGroupHeaders(fragments);
var columnHeaders = FindColumnHeaders(fragments);
var validationErrors = new List<string>();
var validationWarnings = new List<string>();
// Pair each column header (left to right) with its group and variant column. The
// index arithmetic relies on the header row containing every column once per group.
var combinedColumnAnchors = columnHeaders
.OrderBy(item => item.Left)
.Select((item, index) =>
{
var group = ExpectedGroups[index / ExpectedColumns.Length];
var column = ExpectedColumns[index % ExpectedColumns.Length];
return (group.GroupKey, column.ColumnKey, CompositeKey: $"{group.GroupKey}:{column.ColumnKey}", item.CenterX);
})
.ToList();
// The body begins a fixed gap below the lowest header row.
var bodyStartTop = Math.Max(
groupHeaders.Max(item => item.Top),
columnHeaders.Max(item => item.Top))
+ CriticalTableParserSupport.HeaderToBodyMinimumGap;
// Top of the first footer/legend fragment; int.MaxValue means no footer was detected.
var keyTop = fragments
.Where(item =>
string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) ||
item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
.Select(item => (int?)item.Top)
.Min() ?? int.MaxValue;
var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop);
// Row labels are looked for to the left of the first column header (10 units of slack).
var leftCutoff = columnHeaders.Min(item => item.Left) - 10;
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
fragments,
leftCutoff,
bodyStartTop,
keyTop);
var rowAnchors = rowLabelFragments
.OrderBy(item => item.Top)
.Select((item, index) => new RowAnchor(CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), item.Top, index + 1))
.ToList();
if (rowAnchors.Count == 0)
{
validationErrors.Add("No roll-band labels were found in the XML artifact.");
}
var columnCenters = combinedColumnAnchors
.Select(item => (item.CompositeKey, item.CenterX))
.ToList();
// Body = everything between the headers and the footer that is not a page number,
// a row label, or one of the header fragments themselves.
var bodyFragments = fragments
.Where(item =>
item.Top >= bodyStartTop &&
item.Top < keyTop - CriticalTableParserSupport.TopGroupingTolerance &&
!CriticalTableParserSupport.IsFooterPageNumberFragment(item, keyTop) &&
!CriticalTableParserSupport.IsPotentialRowLabelFragment(item, leftCutoff) &&
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) &&
!groupHeaders.Contains(item) &&
!columnHeaders.Contains(item))
.ToList();
bodyFragments = CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
var parsedRollBands = rowAnchors
.Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
.ToList();
var cellEntries = new List<CellEntry>();
for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
{
// A row spans from the boundary with the previous label to the boundary with the
// next one; the first row starts at the body top and the last ends just above the footer.
var rowStart = rowIndex == 0
? bodyStartTop
: ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines);
var rowEnd = rowIndex == rowAnchors.Count - 1
? keyTop - 1
: ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines);
var rowFragments = bodyFragments
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
.ToList();
foreach (var anchor in combinedColumnAnchors)
{
var cellFragments = rowFragments
.Where(item => CriticalTableParserSupport.ResolveColumn(item.CenterX, columnCenters) == anchor.CompositeKey)
.OrderBy(item => item.Top)
.ThenBy(item => item.Left)
.ToList();
if (cellFragments.Count == 0)
{
// An empty cell is reported as an error and produces no cell entry.
validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', group '{anchor.GroupKey}', column '{anchor.ColumnKey}'.");
continue;
}
cellEntries.Add(new CellEntry(
anchor.GroupKey,
rowAnchors[rowIndex].Label,
rowIndex,
anchor.ColumnKey,
CriticalTableParserSupport.BuildLines(cellFragments).ToList()));
}
}
// Move affix lines that leaked to the top of a cell back to the cell above it.
RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);
var parsedCells = new List<ParsedCriticalCellArtifact>();
var parsedResults = new List<ParsedCriticalResult>();
foreach (var cellEntry in cellEntries
.OrderBy(item => item.RowIndex)
.ThenBy(item => item.GroupKey, StringComparer.Ordinal)
.ThenBy(item => item.ColumnKey, StringComparer.Ordinal))
{
// More than two prose/affix runs means the two line kinds alternate inside the cell.
var segmentCount = CriticalTableParserSupport.CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols);
if (segmentCount > 2)
{
validationErrors.Add($"Cell '{cellEntry.RollBandLabel}/{cellEntry.GroupKey}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
}
var rawAffixLines = cellEntry.Lines.Where(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var descriptionLines = cellEntry.Lines.Where(line => !CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines);
var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', descriptionLines));
// Affix text is null (not empty) when the cell has no affix lines.
var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
parsedCells.Add(new ParsedCriticalCellArtifact(
cellEntry.GroupKey,
cellEntry.RollBandLabel,
cellEntry.ColumnKey,
cellEntry.Lines,
rawCellText,
descriptionText,
rawAffixText));
parsedResults.Add(new ParsedCriticalResult(
cellEntry.GroupKey,
cellEntry.ColumnKey,
cellEntry.RollBandLabel,
rawCellText,
descriptionText,
rawAffixText));
}
// Every roll band must yield one cell per group/column combination.
var expectedCellCount = rowAnchors.Count * ExpectedGroups.Length * ExpectedColumns.Length;
if (parsedCells.Count != expectedCellCount)
{
validationErrors.Add($"Expected {expectedCellCount} parsed cells but produced {parsedCells.Count}.");
}
var validationReport = new ImportValidationReport(
validationErrors.Count == 0,
validationErrors,
validationWarnings,
rowAnchors.Count,
parsedCells.Count);
var table = new ParsedCriticalTable(
entry.Slug,
entry.DisplayName,
entry.Family,
Path.GetFileName(entry.PdfPath),
"Imported from PDF XML extraction.",
ExpectedGroups,
ExpectedColumns,
parsedRollBands,
parsedResults);
return new CriticalTableParseResult(table, fragments, parsedCells, validationReport);
}
/// <summary>
/// Finds the row of fragments whose trimmed texts, read left to right, equal the
/// labels of <c>ExpectedGroups</c> (case-insensitively).
/// </summary>
/// <param name="fragments">All fragments of the XML artifact.</param>
/// <returns>The matching header fragments ordered by their left coordinate.</returns>
/// <exception cref="InvalidOperationException">No such row exists.</exception>
private static List<XmlTextFragment> FindGroupHeaders(IReadOnlyList<XmlTextFragment> fragments)
{
    var wantedLabels = ExpectedGroups.Select(group => group.Label).ToList();

    var candidates = fragments
        .Where(fragment => wantedLabels.Contains(fragment.Text.Trim(), StringComparer.OrdinalIgnoreCase))
        .OrderBy(fragment => fragment.Top)
        .ThenBy(fragment => fragment.Left)
        .ToList();

    // The first row (top to bottom) that spells out the full label sequence wins.
    var headerRow = CriticalTableParserSupport.GroupByTop(candidates)
        .Select(row => row.OrderBy(fragment => fragment.Left).ToList())
        .FirstOrDefault(row => row
            .Select(fragment => fragment.Text.Trim())
            .SequenceEqual(wantedLabels, StringComparer.OrdinalIgnoreCase));

    return headerRow
        ?? throw new InvalidOperationException("Could not find the grouped-variant section headers in the XML artifact.");
}
/// <summary>
/// Finds the column header row: the first row (top to bottom) whose trimmed texts,
/// read left to right, spell out every expected column once per expected group.
/// </summary>
/// <remarks>
/// The expected sequence is derived from <c>ExpectedGroups</c> and
/// <c>ExpectedColumns</c> rather than hard-coded, so it always agrees with the
/// index arithmetic <c>Parse</c> applies to the returned headers. Header texts are
/// compared with the column keys using an ordinal, case-insensitive comparer instead
/// of lower-casing each string.
/// </remarks>
/// <param name="fragments">All fragments of the XML artifact.</param>
/// <returns>The matching header fragments ordered by their left coordinate.</returns>
/// <exception cref="InvalidOperationException">No such row exists.</exception>
private static List<XmlTextFragment> FindColumnHeaders(IReadOnlyList<XmlTextFragment> fragments)
{
    var comparer = StringComparer.OrdinalIgnoreCase;

    // Header cells read "Normal" / "Slaying", which equal the column keys ignoring case.
    var columnKeys = ExpectedColumns.Select(column => column.ColumnKey).ToList();

    // One full run of column keys per group, left to right.
    var expectedLabels = ExpectedGroups.SelectMany(_ => columnKeys).ToList();

    var headerCandidates = fragments
        .Where(item => columnKeys.Contains(item.Text.Trim(), comparer))
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    foreach (var group in CriticalTableParserSupport.GroupByTop(headerCandidates))
    {
        var ordered = group.OrderBy(item => item.Left).ToList();
        var labels = ordered.Select(item => item.Text.Trim());
        if (labels.SequenceEqual(expectedLabels, comparer))
        {
            return ordered;
        }
    }

    throw new InvalidOperationException("Could not find the grouped-variant column header row in the XML artifact.");
}
/// <summary>
/// Moves affix-like lines found at the top of a cell to the end of the cell directly
/// above it (same group and column, previous row).
/// </summary>
/// <remarks>
/// Rows are processed top to bottom, so a cell first gives up its leading affix lines
/// and may afterwards receive lines from the cell below. Cells consisting only of
/// affix lines are left untouched. Cells are indexed once by position instead of
/// scanning the whole list for every row and axis.
/// </remarks>
/// <param name="cellEntries">Cells to repair in place; at most one cell per (row, group, column).</param>
/// <param name="affixLegendSymbols">Legend symbols forwarded to the affix check.</param>
/// <exception cref="ArgumentException">Two cells share the same row, group and column.</exception>
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries, ISet<string> affixLegendSymbols)
{
    if (cellEntries.Count == 0)
    {
        return;
    }

    var cellsByPosition = cellEntries.ToDictionary(item => (item.RowIndex, item.GroupKey, item.ColumnKey));
    var maxRowIndex = cellEntries.Max(item => item.RowIndex);
    var axes = cellEntries
        .Select(item => (item.GroupKey, item.ColumnKey))
        .Distinct()
        .ToList();

    for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++)
    {
        foreach (var (groupKey, columnKey) in axes)
        {
            // Both the cell and the one below it must exist; missing cells were
            // already reported by the caller.
            if (!cellsByPosition.TryGetValue((rowIndex, groupKey, columnKey), out var current) ||
                !cellsByPosition.TryGetValue((rowIndex + 1, groupKey, columnKey), out var next))
            {
                continue;
            }

            var leadingAffixCount = next.Lines
                .TakeWhile(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols))
                .Count();

            // Nothing leaked, or the lower cell is affix-only and keeps its lines.
            if (leadingAffixCount == 0 || leadingAffixCount == next.Lines.Count)
            {
                continue;
            }

            current.Lines.AddRange(next.Lines.GetRange(0, leadingAffixCount));
            next.Lines.RemoveRange(0, leadingAffixCount);
        }
    }
}
/// <summary>
/// Picks the top coordinate at which the row of <paramref name="current"/> ends and the
/// row of <paramref name="next"/> begins.
/// </summary>
/// <param name="current">Anchor of the upper row.</param>
/// <param name="next">Anchor of the lower row.</param>
/// <param name="bodyLines">Body line summaries (top coordinate and affix flag).</param>
/// <returns>
/// One past the midpoint of the lowest affix-to-prose transition between the two labels,
/// or one past the midpoint of the two label tops when no such transition exists.
/// </returns>
private static int ResolveRowBoundaryTop(
    RowAnchor current,
    RowAnchor next,
    IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines)
{
    var between = bodyLines
        .Where(line => line.Top >= current.Top && line.Top < next.Top)
        .OrderBy(line => line.Top)
        .ToList();

    // Walk adjacent pairs from the bottom up; the lowest affix line followed by a
    // prose line marks where the upper cell's content stops.
    for (var lower = between.Count - 1; lower >= 1; lower--)
    {
        var upperLine = between[lower - 1];
        var lowerLine = between[lower];
        if (upperLine.IsAffixLike && !lowerLine.IsAffixLike)
        {
            return JustPastMidpoint(upperLine.Top, lowerLine.Top);
        }
    }

    return JustPastMidpoint(current.Top, next.Top);

    static int JustPastMidpoint(int upperTop, int lowerTop) =>
        (int)Math.Floor((upperTop + lowerTop) / 2.0) + 1;
}
/// <summary>
/// A roll-band row label: its normalized label text, the top coordinate of the label
/// fragment, and its 1-based position among the labels ordered by top.
/// </summary>
private sealed record RowAnchor(string Label, int Top, int SortOrder);
/// <summary>
/// Working representation of one table cell while parsing. <see cref="Lines"/> is a
/// mutable list because RepairLeadingAffixLeakage moves lines between vertically
/// adjacent cells.
/// </summary>
private sealed class CellEntry(string groupKey, string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
{
// Key of the group the cell belongs to.
public string GroupKey { get; } = groupKey;
// Label of the roll band (row) the cell belongs to.
public string RollBandLabel { get; } = rollBandLabel;
// Zero-based index of the row among the row anchors.
public int RowIndex { get; } = rowIndex;
// Key of the variant column the cell belongs to.
public string ColumnKey { get; } = columnKey;
// Text lines of the cell, top to bottom.
public List<string> Lines { get; } = lines;
}
}

View File

@@ -1,6 +1,7 @@
namespace RolemasterDb.ImportTool.Parsing;
public sealed class ParsedCriticalCellArtifact(
string? groupKey,
string rollBandLabel,
string columnKey,
IReadOnlyList<string> lines,
@@ -8,6 +9,7 @@ public sealed class ParsedCriticalCellArtifact(
string descriptionText,
string? rawAffixText)
{
public string? GroupKey { get; } = groupKey;
public string RollBandLabel { get; } = rollBandLabel;
public string ColumnKey { get; } = columnKey;
public IReadOnlyList<string> Lines { get; } = lines;

View File

@@ -0,0 +1,8 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// One entry of a critical table's group axis (for example the "Large Creatures"
/// section of a grouped-variant table).
/// </summary>
public sealed class ParsedCriticalGroup(string groupKey, string label, int sortOrder)
{
/// <summary>Identifier of the group, e.g. <c>large</c>.</summary>
public string GroupKey { get; } = groupKey;
/// <summary>Display label of the group, e.g. <c>Large Creatures</c>.</summary>
public string Label { get; } = label;
/// <summary>Position of the group relative to the table's other groups.</summary>
public int SortOrder { get; } = sortOrder;
}

View File

@@ -1,12 +1,14 @@
namespace RolemasterDb.ImportTool.Parsing;
public sealed class ParsedCriticalResult(
string? groupKey,
string columnKey,
string rollBandLabel,
string rawCellText,
string descriptionText,
string? rawAffixText)
{
public string? GroupKey { get; } = groupKey;
public string ColumnKey { get; } = columnKey;
public string RollBandLabel { get; } = rollBandLabel;
public string RawCellText { get; } = rawCellText;

View File

@@ -6,6 +6,7 @@ public sealed class ParsedCriticalTable(
string family,
string sourceDocument,
string? notes,
IReadOnlyList<ParsedCriticalGroup> groups,
IReadOnlyList<ParsedCriticalColumn> columns,
IReadOnlyList<ParsedCriticalRollBand> rollBands,
IReadOnlyList<ParsedCriticalResult> results)
@@ -15,6 +16,7 @@ public sealed class ParsedCriticalTable(
public string Family { get; } = family;
public string SourceDocument { get; } = sourceDocument;
public string? Notes { get; } = notes;
public IReadOnlyList<ParsedCriticalGroup> Groups { get; } = groups;
public IReadOnlyList<ParsedCriticalColumn> Columns { get; } = columns;
public IReadOnlyList<ParsedCriticalRollBand> RollBands { get; } = rollBands;
public IReadOnlyList<ParsedCriticalResult> Results { get; } = results;

View File

@@ -1,33 +1,20 @@
using System.Text.RegularExpressions;
using System.Xml;
using System.Xml.Linq;
namespace RolemasterDb.ImportTool.Parsing;
public sealed class StandardCriticalTableParser
{
// Added to the lowest header top to obtain the top coordinate where the table body starts.
private const int HeaderToBodyMinimumGap = 20;
// Row-label candidates must lie at least this far above the footer key.
private const int FooterLabelExclusionGap = 15;
// Presumably the band above the footer key in which bare numbers count as page numbers — confirm against IsFooterPageNumberFragment.
private const int FooterPageNumberExclusionGap = 80;
// Identical row labels whose tops differ by no more than this are treated as duplicates.
private const int RowLabelDuplicateTolerance = 15;
// Fragments whose tops differ by no more than this belong to the same visual line.
private const int TopGroupingTolerance = 2;
// Matches each run of text that is followed by two or more whitespace characters or the end of the string.
private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled);
// Matches lines starting with digits followed by H, a legend glyph, or a hyphen.
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[-])", RegexOptions.Compiled);
// Matches a whole line made of an optional number and a signed modifier in parentheses, e.g. (+10) or 2(-20).
private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-)\d+\)$", RegexOptions.Compiled);
public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
var fragments = LoadFragments(xmlContent);
var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
var headerFragments = FindHeaderFragments(fragments);
var validationErrors = new List<string>();
var validationWarnings = new List<string>();
var columnCenters = headerFragments
.OrderBy(item => item.Left)
.Select(item => new ColumnAnchor(item.Text.ToUpperInvariant(), item.CenterX))
.Select(item => (Key: item.Text.ToUpperInvariant(), CenterX: item.CenterX))
.ToList();
var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;
var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap;
var keyTop = fragments
.Where(item =>
string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
@@ -35,12 +22,17 @@ public sealed class StandardCriticalTableParser
item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
.Select(item => (int?)item.Top)
.Min() ?? int.MaxValue;
var affixLegendSymbols = DetectAffixLegendSymbols(fragments, keyTop);
var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments, keyTop);
var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop);
var leftCutoff = headerFragments.Min(item => item.Left) - 10;
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
fragments,
leftCutoff,
bodyStartTop,
keyTop);
var rowAnchors = rowLabelFragments
.OrderBy(item => item.Top)
.Select((item, index) => new RowAnchor(item.Text, item.Top, index + 1))
.Select((item, index) => new RowAnchor(CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), item.Top, index + 1))
.ToList();
if (rowAnchors.Count == 0)
@@ -51,16 +43,17 @@ public sealed class StandardCriticalTableParser
var bodyFragments = fragments
.Where(item =>
item.Top >= bodyStartTop &&
item.Top < keyTop - TopGroupingTolerance &&
!IsFooterPageNumberFragment(item, keyTop) &&
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
item.Top < keyTop - CriticalTableParserSupport.TopGroupingTolerance &&
!CriticalTableParserSupport.IsFooterPageNumberFragment(item, keyTop) &&
!CriticalTableParserSupport.IsPotentialRowLabelFragment(item, leftCutoff) &&
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) &&
!headerFragments.Contains(item))
.ToList();
bodyFragments = SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
var bodyLines = BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
bodyFragments = CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
var parsedRollBands = rowAnchors
.Select(anchor => CreateRollBand(anchor.Label, anchor.SortOrder))
.Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
.ToList();
var cellEntries = new List<CellEntry>();
@@ -82,7 +75,7 @@ public sealed class StandardCriticalTableParser
foreach (var columnAnchor in columnCenters)
{
var cellFragments = rowFragments
.Where(item => ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key)
.Where(item => CriticalTableParserSupport.ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key)
.OrderBy(item => item.Top)
.ThenBy(item => item.Left)
.ToList();
@@ -97,7 +90,7 @@ public sealed class StandardCriticalTableParser
rowAnchors[rowIndex].Label,
rowIndex,
columnAnchor.Key,
BuildLines(cellFragments).ToList()));
CriticalTableParserSupport.BuildLines(cellFragments).ToList()));
}
}
@@ -108,7 +101,7 @@ public sealed class StandardCriticalTableParser
foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey))
{
var segmentCount = CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols);
var segmentCount = CriticalTableParserSupport.CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols);
if (segmentCount > 2)
{
@@ -116,13 +109,14 @@ public sealed class StandardCriticalTableParser
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
}
var rawAffixLines = cellEntry.Lines.Where(line => IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var descriptionLines = cellEntry.Lines.Where(line => !IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var rawAffixLines = cellEntry.Lines.Where(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var descriptionLines = cellEntry.Lines.Where(line => !CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines);
var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines));
var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', descriptionLines));
var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
parsedCells.Add(new ParsedCriticalCellArtifact(
null,
cellEntry.RollBandLabel,
cellEntry.ColumnKey,
cellEntry.Lines,
@@ -131,6 +125,7 @@ public sealed class StandardCriticalTableParser
rawAffixText));
parsedResults.Add(new ParsedCriticalResult(
null,
cellEntry.ColumnKey,
cellEntry.RollBandLabel,
rawCellText,
@@ -162,40 +157,12 @@ public sealed class StandardCriticalTableParser
entry.Family,
Path.GetFileName(entry.PdfPath),
"Imported from PDF XML extraction.",
[],
columnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(),
parsedRollBands,
parsedResults);
return new StandardCriticalTableParseResult(table, fragments, parsedCells, validationReport);
}
private static List<XmlTextFragment> LoadFragments(string xmlContent)
{
using var stringReader = new StringReader(xmlContent);
using var xmlReader = XmlReader.Create(
stringReader,
new XmlReaderSettings
{
DtdProcessing = DtdProcessing.Ignore
});
var document = XDocument.Load(xmlReader);
return document.Descendants("page")
.SelectMany(page =>
{
var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1");
return page.Elements("text")
.Select(item => new XmlTextFragment(
pageNumber,
int.Parse(item.Attribute("top")?.Value ?? throw new InvalidOperationException("Missing text top attribute.")),
int.Parse(item.Attribute("left")?.Value ?? throw new InvalidOperationException("Missing text left attribute.")),
int.Parse(item.Attribute("width")?.Value ?? throw new InvalidOperationException("Missing text width attribute.")),
int.Parse(item.Attribute("height")?.Value ?? throw new InvalidOperationException("Missing text height attribute.")),
NormalizeText(string.Concat(item.DescendantNodes().OfType<XText>().Select(node => node.Value)))))
.Where(item => !string.IsNullOrWhiteSpace(item.Text));
})
.ToList();
return new CriticalTableParseResult(table, fragments, parsedCells, validationReport);
}
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
@@ -206,7 +173,7 @@ public sealed class StandardCriticalTableParser
.ThenBy(item => item.Left)
.ToList();
foreach (var group in GroupByTop(headerCandidates))
foreach (var group in CriticalTableParserSupport.GroupByTop(headerCandidates))
{
var ordered = group.OrderBy(item => item.Left).ToList();
var labels = ordered.Select(item => item.Text.ToUpperInvariant()).ToList();
@@ -219,156 +186,6 @@ public sealed class StandardCriticalTableParser
throw new InvalidOperationException("Could not find the standard-table A-E header row in the XML artifact.");
}
/// <summary>
/// Collects the roll-band label fragments left of the header columns, between the
/// start of the body and the footer, dropping repeated labels that sit close together.
/// </summary>
/// <param name="fragments">All fragments of the XML artifact.</param>
/// <param name="headerFragments">Column header fragments; they define the left cutoff and the body start.</param>
/// <param name="keyTop">Top of the footer key, or <see cref="int.MaxValue"/> when none was found.</param>
/// <returns>The label fragments ordered by top coordinate.</returns>
private static List<XmlTextFragment> FindRowLabelFragments(
    IReadOnlyList<XmlTextFragment> fragments,
    IReadOnlyList<XmlTextFragment> headerFragments,
    int keyTop)
{
    var labelRightLimit = headerFragments.Min(header => header.Left) - 10;
    var firstBodyTop = headerFragments.Max(header => header.Top) + HeaderToBodyMinimumGap;

    var labelsTopDown = fragments
        .Where(fragment =>
            fragment.Left < labelRightLimit &&
            fragment.Top >= firstBodyTop &&
            fragment.Top < keyTop - FooterLabelExclusionGap &&
            IsRollBandLabel(fragment.Text))
        .OrderBy(fragment => fragment.Top);

    var uniqueLabels = new List<XmlTextFragment>();
    foreach (var label in labelsTopDown)
    {
        if (uniqueLabels.Count > 0)
        {
            // A label repeating the previously kept one within the tolerance is a duplicate rendering.
            var lastKept = uniqueLabels[^1];
            var repeatsLastKept =
                string.Equals(lastKept.Text, label.Text, StringComparison.OrdinalIgnoreCase) &&
                Math.Abs(lastKept.Top - label.Top) <= RowLabelDuplicateTolerance;
            if (repeatsLastKept)
            {
                continue;
            }
        }

        uniqueLabels.Add(label);
    }

    return uniqueLabels;
}
/// <summary>
/// Recognises roll-band labels: a two- or three-digit number, a range of two such
/// numbers joined by a hyphen, or such a number followed by a plus sign.
/// </summary>
private static bool IsRollBandLabel(string value)
{
    var trimmed = value.Trim();
    return Regex.IsMatch(trimmed, @"^\d{2,3}(?:-\d{2,3})?$|^\d{2,3}\+$");
}
/// <summary>
/// Builds a roll band from its label: <c>N+</c> gives an open-ended band starting at N,
/// <c>N</c> a single-value band, and <c>N-M</c> a band from N to M.
/// </summary>
/// <param name="label">Roll-band label text.</param>
/// <param name="sortOrder">Position of the band within the table.</param>
private static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder)
{
    if (label.EndsWith('+'))
    {
        // Open-ended band: everything before the plus sign is the minimum, no maximum.
        var openEndedMinimum = int.Parse(label[..^1]);
        return new ParsedCriticalRollBand(label, openEndedMinimum, null, sortOrder);
    }

    var bounds = label.Split('-', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    var minimum = int.Parse(bounds[0]);

    // A lone number is both the minimum and the maximum of its band.
    var maximum = bounds.Length == 1 ? minimum : int.Parse(bounds[1]);
    return new ParsedCriticalRollBand(label, minimum, maximum, sortOrder);
}
/// <summary>
/// Maps a horizontal centre to a column key: the first column whose midpoint with its
/// right neighbour lies to the right of <paramref name="centerX"/>, or the last column
/// when no midpoint does.
/// </summary>
/// <param name="centerX">Horizontal centre of the fragment.</param>
/// <param name="columns">Column anchors in left-to-right order.</param>
private static string ResolveColumn(double centerX, IReadOnlyList<ColumnAnchor> columns)
{
    var firstColumnToTheRight = Enumerable
        .Range(0, Math.Max(columns.Count - 1, 0))
        .Where(position => centerX < (columns[position].CenterX + columns[position + 1].CenterX) / 2.0)
        .Select(position => (int?)position)
        .FirstOrDefault();

    return firstColumnToTheRight is int hit
        ? columns[hit].Key
        : columns[^1].Key;
}
/// <summary>
/// Turns the fragments of one cell into text lines: fragments are grouped into visual
/// lines by top coordinate, joined left to right with single spaces, and blank lines dropped.
/// </summary>
/// <param name="fragments">Fragments of a single cell, in any order.</param>
/// <returns>The non-blank lines, top to bottom.</returns>
private static IReadOnlyList<string> BuildLines(IReadOnlyList<XmlTextFragment> fragments)
{
    var rows = new List<List<XmlTextFragment>>();
    foreach (var piece in fragments.OrderBy(item => item.Top).ThenBy(item => item.Left))
    {
        // A new row starts when the piece is further than the tolerance from the row's first fragment.
        var startsNewRow = rows.Count == 0 || Math.Abs(rows[^1][0].Top - piece.Top) > TopGroupingTolerance;
        if (startsNewRow)
        {
            rows.Add(new List<XmlTextFragment>());
        }

        rows[^1].Add(piece);
    }

    var texts = new List<string>(rows.Count);
    foreach (var row in rows)
    {
        var joined = CollapseWhitespace(string.Join(' ', row.OrderBy(item => item.Left).Select(item => item.Text)));
        if (!string.IsNullOrWhiteSpace(joined))
        {
            texts.Add(joined);
        }
    }

    return texts;
}
/// <summary>
/// Heuristically decides whether a line of cell text is an affix line (modifiers,
/// legend symbols, conditional clauses) rather than descriptive prose.
/// </summary>
/// <param name="line">Line text; surrounding whitespace is ignored.</param>
/// <param name="affixLegendSymbols">Symbols detected in the table's footer legend; may be empty.</param>
/// <returns><c>true</c> when the line looks like an affix line.</returns>
private static bool IsAffixLikeLine(string line, ISet<string> affixLegendSymbols)
{
    var text = line.Trim();
    if (text.Length == 0)
    {
        return false;
    }

    // A lone hyphen, en dash or em dash counts as an affix line.
    if (text is "-" or "\u2013" or "\u2014")
    {
        return true;
    }

    // Conditional clauses are affix lines only when they introduce something with a colon.
    string[] conditionalPrefixes = ["with ", "w/o ", "without ", "if ", "while ", "until ", "unless "];
    if (conditionalPrefixes.Any(prefix => text.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)))
    {
        return text.Contains(':', StringComparison.Ordinal);
    }

    var mentionsLegendSymbol =
        affixLegendSymbols.Count > 0 &&
        affixLegendSymbols.Any(symbol => text.Contains(symbol, StringComparison.Ordinal));
    if (mentionsLegendSymbol)
    {
        if (text.Any(char.IsDigit))
        {
            return true;
        }

        // Without digits the line still qualifies when nothing is left after removing
        // the legend symbols (longest first) and modifier punctuation.
        var leftover = affixLegendSymbols
            .OrderByDescending(symbol => symbol.Length)
            .Aggregate(text, (remaining, symbol) => remaining.Replace(symbol, string.Empty, StringComparison.Ordinal));
        foreach (var punctuation in new[] { "+", "-", "(", ")", "/" })
        {
            leftover = leftover.Replace(punctuation, string.Empty, StringComparison.Ordinal);
        }

        if (string.IsNullOrWhiteSpace(leftover))
        {
            return true;
        }
    }

    // Fallbacks that do not depend on the detected legend.
    string[] leadingMarkers = ["+", "\u2211", "\u220F", "\u03C0", "\u222B"];
    return leadingMarkers.Any(marker => text.StartsWith(marker, StringComparison.Ordinal)) ||
        StandaloneModifierAffixLineRegex.IsMatch(text) ||
        NumericAffixLineRegex.IsMatch(text) ||
        text.Contains(" - ", StringComparison.Ordinal);
}
/// <summary>
/// Runs the affix-leakage repair with an empty, ordinal legend symbol set.
/// </summary>
/// <param name="cellEntries">Cells to repair in place.</param>
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries)
{
    var noLegendSymbols = new HashSet<string>(StringComparer.Ordinal);
    RepairLeadingAffixLeakage(cellEntries, noLegendSymbols);
}
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries, ISet<string> affixLegendSymbols)
{
var maxRowIndex = cellEntries.Count == 0 ? -1 : cellEntries.Max(item => item.RowIndex);
@@ -380,14 +197,13 @@ public sealed class StandardCriticalTableParser
{
var current = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex && item.ColumnKey == columnKey);
var next = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex + 1 && item.ColumnKey == columnKey);
if (current is null || next is null)
{
continue;
}
var leadingAffixCount = 0;
while (leadingAffixCount < next.Lines.Count && IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols))
while (leadingAffixCount < next.Lines.Count && CriticalTableParserSupport.IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols))
{
leadingAffixCount++;
}
@@ -403,199 +219,10 @@ public sealed class StandardCriticalTableParser
}
}
/// <summary>
/// Trims the value and replaces every run of whitespace inside it with a single space.
/// </summary>
private static string CollapseWhitespace(string value)
{
    var trimmed = value.Trim();
    return Regex.Replace(trimmed, @"\s+", " ");
}
/// <summary>
/// Replaces non-breaking spaces, carriage returns and line feeds with plain spaces
/// and trims the result.
/// </summary>
private static string NormalizeText(string value)
{
    var flattened = value.Select(character => character is '\u00a0' or '\r' or '\n' ? ' ' : character);
    return string.Concat(flattened).Trim();
}
/// <summary>
/// Counts the runs of consecutive lines sharing the same kind (affix-like or not).
/// An empty list has zero runs; a cell with prose followed by affixes has two.
/// </summary>
/// <param name="lines">Lines of one cell, top to bottom.</param>
/// <param name="affixLegendSymbols">Legend symbols forwarded to <c>IsAffixLikeLine</c>.</param>
private static int CountLineTypeSegments(IReadOnlyList<string> lines, ISet<string> affixLegendSymbols)
{
    var affixFlags = lines
        .Select(line => IsAffixLikeLine(line, affixLegendSymbols))
        .ToList();

    var runs = 0;
    for (var position = 0; position < affixFlags.Count; position++)
    {
        // The first line opens a run; later lines open one whenever the kind flips.
        if (position == 0 || affixFlags[position] != affixFlags[position - 1])
        {
            runs++;
        }
    }

    return runs;
}
/// <summary>
/// Reads the footer legend (everything from just above <paramref name="keyTop"/>
/// downwards) and extracts the single-character symbols assigned to the known affix kinds.
/// </summary>
/// <param name="fragments">All fragments of the XML artifact.</param>
/// <param name="keyTop">Top of the footer key, or <see cref="int.MaxValue"/> when none was found.</param>
/// <returns>The detected symbols; empty when there is no footer.</returns>
private static HashSet<string> DetectAffixLegendSymbols(IReadOnlyList<XmlTextFragment> fragments, int keyTop)
{
    if (keyTop == int.MaxValue)
    {
        return [];
    }

    string[] legendPatterns =
    [
        @"must parry\s*=\s*(\S)",
        @"no parry\s*=\s*(\S)",
        @"stun(?:ned)?\s*=\s*(\S)",
        @"bleed\s*=\s*(\S)",
        @"powerpoint modification.*=\s*(\S)"
    ];

    var footerFragments = fragments
        .Where(item => item.Top >= keyTop - TopGroupingTolerance)
        .OrderBy(item => item.Top)
        .ThenBy(item => item.Left)
        .ToList();

    var detected = new HashSet<string>(StringComparer.Ordinal);
    foreach (var lineFragments in GroupByTop(footerFragments))
    {
        // Rebuild the footer line as plain text before matching the legend patterns.
        var lineText = CollapseWhitespace(string.Join(' ', lineFragments.OrderBy(item => item.Left).Select(item => item.Text)));
        foreach (var pattern in legendPatterns)
        {
            AddLegendMatch(detected, lineText, pattern);
        }
    }

    return detected;
}
/// <summary>
/// Applies <c>SplitBoundaryCrossingAffixFragment</c> to every body fragment and
/// concatenates the results in input order.
/// </summary>
/// <param name="bodyFragments">Body fragments to process.</param>
/// <param name="columnCenters">Column anchors used to detect boundary crossings.</param>
/// <param name="affixLegendSymbols">Legend symbols forwarded to the affix check.</param>
private static List<XmlTextFragment> SplitBoundaryCrossingAffixFragments(
    IReadOnlyList<XmlTextFragment> bodyFragments,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    return bodyFragments
        .SelectMany(fragment => SplitBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
        .ToList();
}
/// <summary>
/// Breaks an affix-like fragment that spans a column boundary into its text runs,
/// giving each run an estimated position inside the original box.
/// </summary>
/// <param name="fragment">Fragment to examine.</param>
/// <param name="columnCenters">Column anchors in left-to-right order.</param>
/// <param name="affixLegendSymbols">Legend symbols forwarded to the affix check.</param>
/// <returns>
/// The runs when they do not all stay in the fragment's original column; otherwise a
/// single-element list holding the untouched fragment.
/// </returns>
private static IReadOnlyList<XmlTextFragment> SplitBoundaryCrossingAffixFragment(
    XmlTextFragment fragment,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    if (!LooksLikeBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols))
    {
        return [fragment];
    }

    var runs = MultiFragmentSplitRegex.Matches(fragment.Text);
    if (runs.Count < 2)
    {
        return [fragment];
    }

    // Positions are estimated from an average character width.
    var unitWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1);
    var segments = new List<XmlTextFragment>(runs.Count);
    for (var position = 0; position < runs.Count; position++)
    {
        var run = runs[position];
        var runText = CollapseWhitespace(run.Value);
        if (runText.Length > 0)
        {
            var runLeft = fragment.Left + (int)Math.Round(unitWidth * run.Index);
            var runWidth = Math.Max(1, (int)Math.Round(unitWidth * run.Length));
            segments.Add(new XmlTextFragment(
                fragment.PageNumber,
                fragment.Top,
                runLeft,
                runWidth,
                fragment.Height,
                runText));
        }
    }

    if (segments.Count < 2)
    {
        return [fragment];
    }

    var homeColumn = ResolveColumn(fragment.CenterX, columnCenters);
    var touchedColumns = new HashSet<string>(
        segments.Select(segment => ResolveColumn(segment.CenterX, columnCenters)),
        StringComparer.OrdinalIgnoreCase);

    // When every run still resolves to the original column the split changes nothing.
    var staysInHomeColumn = touchedColumns.Count == 1 && touchedColumns.Contains(homeColumn);
    return !staysInHomeColumn
        ? segments
        : [fragment];
}
/// <summary>
/// Tells whether a fragment is affix-like text containing a space whose horizontal
/// extent straddles the midpoint between two neighbouring column centers.
/// </summary>
/// <param name="fragment">Fragment to inspect.</param>
/// <param name="columnCenters">Column anchors, in the order their neighbours are compared.</param>
/// <param name="affixLegendSymbols">Symbols taken from the table legend.</param>
/// <returns><c>true</c> when the fragment is a candidate for splitting.</returns>
private static bool LooksLikeBoundaryCrossingAffixFragment(
    XmlTextFragment fragment,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    if (!IsAffixLikeLine(fragment.Text, affixLegendSymbols))
    {
        return false;
    }

    // A fragment without a space cannot hold content from two cells.
    if (!fragment.Text.Contains(" ", StringComparison.Ordinal))
    {
        return false;
    }

    var leftEdge = fragment.Left;
    var rightEdge = fragment.Left + fragment.Width;
    for (var rightColumn = 1; rightColumn < columnCenters.Count; rightColumn++)
    {
        // The boundary between two columns is taken as the midpoint of their centers.
        var midpoint = (columnCenters[rightColumn - 1].CenterX + columnCenters[rightColumn].CenterX) / 2.0;
        if (leftEdge < midpoint && rightEdge > midpoint)
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Runs a case-insensitive regex over <paramref name="value"/> and stores the first
/// capture group of every match in <paramref name="symbols"/>.
/// </summary>
/// <param name="symbols">Set that receives the captured symbols.</param>
/// <param name="value">Text to search.</param>
/// <param name="pattern">Regular expression; matches from a pattern without a capture group are ignored.</param>
private static void AddLegendMatch(HashSet<string> symbols, string value, string pattern)
{
    var legendMatches = Regex.Matches(value, pattern, RegexOptions.IgnoreCase);
    for (var matchIndex = 0; matchIndex < legendMatches.Count; matchIndex++)
    {
        var captureGroups = legendMatches[matchIndex].Groups;

        // Group 0 is the whole match; only an explicit capture group carries a symbol.
        if (captureGroups.Count <= 1)
        {
            continue;
        }

        symbols.Add(captureGroups[1].Value);
    }
}
/// <summary>
/// Groups body fragments into visual lines and classifies each line as affix-like
/// or not.
/// </summary>
/// <param name="bodyFragments">Fragments that make up the table body.</param>
/// <param name="columnCenters">Column anchors used to resolve columns.</param>
/// <param name="affixLegendSymbols">Symbols taken from the table legend.</param>
/// <returns>
/// One <c>BodyLine</c> per group, carrying the top of the group's first fragment. A line is
/// affix-like only when it has text in at least one column and every column's text is affix-like.
/// </returns>
private static List<BodyLine> BuildBodyLines(
    IReadOnlyList<XmlTextFragment> bodyFragments,
    IReadOnlyList<ColumnAnchor> columnCenters,
    ISet<string> affixLegendSymbols)
{
    var fragmentsInReadingOrder = bodyFragments
        .OrderBy(fragment => fragment.Top)
        .ThenBy(fragment => fragment.Left)
        .ToList();

    return GroupByTop(fragmentsInReadingOrder)
        .Select(lineFragments =>
        {
            // Join the fragments of each column left-to-right so the line is judged per cell.
            var textPerColumn = lineFragments
                .GroupBy(fragment => ResolveColumn(fragment.CenterX, columnCenters), StringComparer.OrdinalIgnoreCase)
                .Select(column => CollapseWhitespace(
                    string.Join(' ', column.OrderBy(fragment => fragment.Left).Select(fragment => fragment.Text))))
                .Where(text => !string.IsNullOrWhiteSpace(text))
                .ToList();

            var everyColumnIsAffix =
                textPerColumn.Count > 0 &&
                textPerColumn.All(text => IsAffixLikeLine(text, affixLegendSymbols));

            return new BodyLine(lineFragments[0].Top, everyColumnIsAffix);
        })
        .ToList();
}
private static int ResolveRowBoundaryTop(
RowAnchor current,
RowAnchor next,
IReadOnlyList<BodyLine> bodyLines)
IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines)
{
var linesBetweenLabels = bodyLines
.Where(item => item.Top >= current.Top && item.Top < next.Top)
@@ -613,41 +240,8 @@ public sealed class StandardCriticalTableParser
return (int)Math.Floor((current.Top + next.Top) / 2.0) + 1;
}
/// <summary>
/// Tells whether a fragment is a bare two- or three-digit number located in the
/// footer zone, i.e. no higher than <c>FooterPageNumberExclusionGap</c> above the legend start.
/// </summary>
/// <param name="fragment">Fragment to inspect.</param>
/// <param name="keyTop">
/// Top coordinate where the legend starts, or <see cref="int.MaxValue"/> when no legend was located.
/// </param>
/// <returns><c>true</c> for a footer page number; always <c>false</c> when no legend was located.</returns>
private static bool IsFooterPageNumberFragment(XmlTextFragment fragment, int keyTop) =>
    keyTop != int.MaxValue &&
    fragment.Top >= keyTop - FooterPageNumberExclusionGap &&
    Regex.IsMatch(fragment.Text, @"^\d{2,3}$");
/// <summary>
/// Partitions fragments into consecutive runs that sit on the same visual line.
/// </summary>
/// <param name="fragments">Fragments in the order they should be grouped.</param>
/// <returns>
/// The groups in encounter order. A fragment joins the most recent group when its top is within
/// <c>TopGroupingTolerance</c> of that group's first fragment; otherwise it starts a new group.
/// </returns>
private static IEnumerable<List<XmlTextFragment>> GroupByTop(IReadOnlyList<XmlTextFragment> fragments)
{
    var lines = new List<List<XmlTextFragment>>();
    foreach (var fragment in fragments)
    {
        var lastLineIndex = lines.Count - 1;

        // Compare against the first fragment of the open line, not the previous fragment,
        // so a line cannot drift downwards fragment by fragment.
        var belongsToLastLine =
            lastLineIndex >= 0 &&
            Math.Abs(lines[lastLineIndex][0].Top - fragment.Top) <= TopGroupingTolerance;

        if (belongsToLastLine)
        {
            lines[lastLineIndex].Add(fragment);
        }
        else
        {
            lines.Add(new List<XmlTextFragment> { fragment });
        }
    }

    return lines;
}
/// <summary>A table column identified by its key and the horizontal center used to resolve fragments to it.</summary>
private sealed record ColumnAnchor(string Key, double CenterX);
/// <summary>A roll-band row label with the top coordinate it was found at and its sort order.</summary>
private sealed record RowAnchor(string Label, int Top, int SortOrder);
/// <summary>A visual body line: its top coordinate and whether all of its column texts are affix-like.</summary>
private sealed record BodyLine(int Top, bool IsAffixLike);
private sealed class CellEntry(string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
{
public string RollBandLabel { get; } = rollBandLabel;

View File

@@ -0,0 +1,276 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Parses <c>variant_column</c> critical tables from <c>pdftohtml -xml</c> output.
/// These tables use the columns Normal, Magic, Mithril, Holy Arms and Slaying.
/// </summary>
public sealed class VariantColumnCriticalTableParser
{
    /// <summary>The column header labels, in the left-to-right order they must appear in the header row.</summary>
    private static readonly ColumnDefinition[] ExpectedColumns =
    [
        new("NORMAL", "Normal"),
        new("MAGIC", "Magic"),
        new("MITHRIL", "Mithril"),
        new("HOLY_ARMS", "Holy Arms"),
        new("SLAYING", "Slaying")
    ];

    /// <summary>
    /// Parses the XML artifact of one variant-column table into roll bands, cells and a validation report.
    /// </summary>
    /// <param name="entry">Manifest entry that supplies slug, display name, family and PDF path.</param>
    /// <param name="xmlContent">XML text of the extracted PDF page.</param>
    /// <returns>The parsed table, the raw fragments, the per-cell artifacts and the validation report.</returns>
    /// <exception cref="InvalidOperationException">
    /// Thrown when the header row with all expected column labels cannot be found.
    /// </exception>
    public CriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
    {
        var fragments = CriticalTableParserSupport.LoadFragments(xmlContent);
        var headerFragments = FindHeaderFragments(fragments);
        var validationErrors = new List<string>();
        var validationWarnings = new List<string>();

        var columnAnchors = headerFragments
            .OrderBy(item => item.Left)
            .Select(item =>
            {
                var definition = ResolveColumnDefinition(item.Text);
                return (definition.Key, definition.Label, item.CenterX);
            })
            .ToList();

        var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap;

        // The legend starts at the topmost fragment that looks like legend text;
        // int.MaxValue marks "no legend found".
        var keyTop = fragments
            .Where(item =>
                string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
                item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) ||
                item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
            .Select(item => (int?)item.Top)
            .Min() ?? int.MaxValue;

        var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop);

        // Row labels sit to the left of the leftmost header, with a small margin.
        var leftCutoff = headerFragments.Min(item => item.Left) - 10;
        var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
            fragments,
            leftCutoff,
            bodyStartTop,
            keyTop);

        var rowAnchors = rowLabelFragments
            .OrderBy(item => item.Top)
            .Select((item, index) => new RowAnchor(CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), item.Top, index + 1))
            .ToList();

        if (rowAnchors.Count == 0)
        {
            validationErrors.Add("No roll-band labels were found in the XML artifact.");
        }

        var columnCenters = columnAnchors
            .Select(item => (item.Key, item.CenterX))
            .ToList();

        // Body = everything between header and legend that is neither a page number,
        // a row label, nor part of the header itself.
        var bodyFragments = fragments
            .Where(item =>
                item.Top >= bodyStartTop &&
                item.Top < keyTop - CriticalTableParserSupport.TopGroupingTolerance &&
                !CriticalTableParserSupport.IsFooterPageNumberFragment(item, keyTop) &&
                !CriticalTableParserSupport.IsPotentialRowLabelFragment(item, leftCutoff) &&
                !rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) &&
                !headerFragments.Contains(item))
            .ToList();

        bodyFragments = CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
        var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);

        var parsedRollBands = rowAnchors
            .Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
            .ToList();

        var cellEntries = new List<CellEntry>();
        for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
        {
            // The first row starts at the body top and the last row ends just above the legend;
            // every other edge is resolved between the two neighbouring row labels.
            var rowStart = rowIndex == 0
                ? bodyStartTop
                : ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines);
            var rowEnd = rowIndex == rowAnchors.Count - 1
                ? keyTop - 1
                : ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines);

            // Resolve each fragment's column once per row rather than once per column anchor.
            var rowFragmentsByColumn = bodyFragments
                .Where(item => item.Top >= rowStart && item.Top < rowEnd)
                .ToLookup(
                    item => CriticalTableParserSupport.ResolveColumn(item.CenterX, columnCenters),
                    StringComparer.Ordinal);

            foreach (var columnAnchor in columnAnchors)
            {
                var cellFragments = rowFragmentsByColumn[columnAnchor.Key]
                    .OrderBy(item => item.Top)
                    .ThenBy(item => item.Left)
                    .ToList();

                if (cellFragments.Count == 0)
                {
                    validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', column '{columnAnchor.Key}'.");
                    continue;
                }

                cellEntries.Add(new CellEntry(
                    rowAnchors[rowIndex].Label,
                    rowIndex,
                    columnAnchor.Key,
                    CriticalTableParserSupport.BuildLines(cellFragments).ToList()));
            }
        }

        RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);

        var parsedCells = new List<ParsedCriticalCellArtifact>();
        var parsedResults = new List<ParsedCriticalResult>();
        foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey, StringComparer.Ordinal))
        {
            // More than two runs of same-typed lines means prose and affix lines alternate.
            var segmentCount = CriticalTableParserSupport.CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols);
            if (segmentCount > 2)
            {
                validationErrors.Add($"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
            }

            var rawAffixLines = cellEntry.Lines.Where(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
            var descriptionLines = cellEntry.Lines.Where(line => !CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
            var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines);
            var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', descriptionLines));
            var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);

            parsedCells.Add(new ParsedCriticalCellArtifact(
                null,
                cellEntry.RollBandLabel,
                cellEntry.ColumnKey,
                cellEntry.Lines,
                rawCellText,
                descriptionText,
                rawAffixText));

            parsedResults.Add(new ParsedCriticalResult(
                null,
                cellEntry.ColumnKey,
                cellEntry.RollBandLabel,
                rawCellText,
                descriptionText,
                rawAffixText));
        }

        if (columnAnchors.Count != ExpectedColumns.Length)
        {
            validationErrors.Add($"Expected {ExpectedColumns.Length} variant columns but found {columnAnchors.Count}.");
        }

        if (parsedCells.Count != rowAnchors.Count * columnAnchors.Count)
        {
            validationErrors.Add($"Expected {rowAnchors.Count * columnAnchors.Count} parsed cells but produced {parsedCells.Count}.");
        }

        var validationReport = new ImportValidationReport(
            validationErrors.Count == 0,
            validationErrors,
            validationWarnings,
            rowAnchors.Count,
            parsedCells.Count);

        var table = new ParsedCriticalTable(
            entry.Slug,
            entry.DisplayName,
            entry.Family,
            Path.GetFileName(entry.PdfPath),
            "Imported from PDF XML extraction.",
            [],
            ExpectedColumns.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Label, "variant", index + 1)).ToList(),
            parsedRollBands,
            parsedResults);

        return new CriticalTableParseResult(table, fragments, parsedCells, validationReport);
    }

    /// <summary>
    /// Locates the header row: the first visual line whose candidate fragments, read left to right,
    /// spell exactly the expected column labels (ignoring case and surrounding whitespace).
    /// </summary>
    /// <param name="fragments">All text fragments of the page.</param>
    /// <returns>The header fragments ordered left to right.</returns>
    /// <exception cref="InvalidOperationException">Thrown when no line matches the expected labels.</exception>
    private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
    {
        var expectedLabels = ExpectedColumns
            .Select(item => item.Label)
            .ToList();

        // Compare with an ordinal ignore-case comparer instead of lower-casing both sides.
        var headerCandidates = fragments
            .Where(item => expectedLabels.Contains(item.Text.Trim(), StringComparer.OrdinalIgnoreCase))
            .OrderBy(item => item.Top)
            .ThenBy(item => item.Left)
            .ToList();

        foreach (var group in CriticalTableParserSupport.GroupByTop(headerCandidates))
        {
            var ordered = group.OrderBy(item => item.Left).ToList();
            var labels = ordered.Select(item => item.Text.Trim());
            if (labels.SequenceEqual(expectedLabels, StringComparer.OrdinalIgnoreCase))
            {
                return ordered;
            }
        }

        throw new InvalidOperationException("Could not find the variant-column header row in the XML artifact.");
    }

    /// <summary>Maps a header text to its column definition, ignoring case and surrounding whitespace.</summary>
    /// <param name="value">Header text as found in the XML.</param>
    /// <returns>The matching column definition.</returns>
    /// <exception cref="InvalidOperationException">Thrown when the label is not one of the expected columns.</exception>
    private static ColumnDefinition ResolveColumnDefinition(string value) =>
        ExpectedColumns.SingleOrDefault(item => string.Equals(item.Label, value.Trim(), StringComparison.OrdinalIgnoreCase))
        ?? throw new InvalidOperationException($"Unsupported variant column label '{value}'.");

    /// <summary>
    /// Moves affix lines found at the start of a cell up into the cell directly above it
    /// in the same column. A cell that consists only of affix lines, or that does not start
    /// with one, is left untouched.
    /// </summary>
    /// <param name="cellEntries">
    /// Cells to repair in place. Each (row index, column key) pair is expected to occur at most once,
    /// which is how <c>Parse</c> builds the list.
    /// </param>
    /// <param name="affixLegendSymbols">Symbols taken from the table legend.</param>
    private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries, ISet<string> affixLegendSymbols)
    {
        if (cellEntries.Count == 0)
        {
            return;
        }

        // Index the cells once so each neighbour lookup is O(1) instead of a scan over all cells.
        var entriesByPosition = cellEntries.ToDictionary(item => (item.RowIndex, item.ColumnKey));
        var maxRowIndex = cellEntries.Max(item => item.RowIndex);
        var columnKeys = cellEntries.Select(item => item.ColumnKey).Distinct(StringComparer.Ordinal).ToList();

        // Rows are visited top to bottom so a cell is repaired before it donates lines upwards.
        for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++)
        {
            foreach (var columnKey in columnKeys)
            {
                if (!entriesByPosition.TryGetValue((rowIndex, columnKey), out var current) ||
                    !entriesByPosition.TryGetValue((rowIndex + 1, columnKey), out var next))
                {
                    continue;
                }

                var leadingAffixCount = 0;
                while (leadingAffixCount < next.Lines.Count && CriticalTableParserSupport.IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols))
                {
                    leadingAffixCount++;
                }

                if (leadingAffixCount == 0 || leadingAffixCount == next.Lines.Count)
                {
                    continue;
                }

                current.Lines.AddRange(next.Lines.Take(leadingAffixCount));
                next.Lines.RemoveRange(0, leadingAffixCount);
            }
        }
    }

    /// <summary>
    /// Chooses the top coordinate that separates two consecutive rows.
    /// </summary>
    /// <param name="current">Anchor of the upper row.</param>
    /// <param name="next">Anchor of the lower row.</param>
    /// <param name="bodyLines">Classified body lines of the whole table.</param>
    /// <returns>
    /// One past the midpoint between the last affix-like line that is directly followed by a
    /// non-affix line (searching the lines between the two labels bottom-up); when no such
    /// transition exists, one past the midpoint between the two row labels.
    /// </returns>
    private static int ResolveRowBoundaryTop(
        RowAnchor current,
        RowAnchor next,
        IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines)
    {
        var linesBetweenLabels = bodyLines
            .Where(item => item.Top >= current.Top && item.Top < next.Top)
            .OrderBy(item => item.Top)
            .ToList();

        for (var index = linesBetweenLabels.Count - 2; index >= 0; index--)
        {
            if (linesBetweenLabels[index].IsAffixLike && !linesBetweenLabels[index + 1].IsAffixLike)
            {
                return (int)Math.Floor((linesBetweenLabels[index].Top + linesBetweenLabels[index + 1].Top) / 2.0) + 1;
            }
        }

        return (int)Math.Floor((current.Top + next.Top) / 2.0) + 1;
    }

    /// <summary>Key and header label of one expected column.</summary>
    private sealed record ColumnDefinition(string Key, string Label);

    /// <summary>A roll-band row label with the top coordinate it was found at and its 1-based sort order.</summary>
    private sealed record RowAnchor(string Label, int Top, int SortOrder);

    /// <summary>Mutable working copy of one table cell: its position and its text lines.</summary>
    private sealed class CellEntry(string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
    {
        public string RollBandLabel { get; } = rollBandLabel;
        public int RowIndex { get; } = rowIndex;
        public string ColumnKey { get; } = columnKey;
        public List<string> Lines { get; } = lines;
    }
}