Phase 2.1 import

2026-03-14 01:44:30 +01:00
parent be5c0a9b54
commit 5c4d540246
4 changed files with 151 additions and 22 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -14,4 +14,5 @@ Also see the other related technical documentation in the docs folder.
 - When asked to begin working on a task, create a detailed implementation plan first, present the plan to the user, and ask for approval before beginning with the actual implementation.
 - When an task is finished, perform a code review to evaluate if the change is clean and maintainable with high software engineering standards. Iterate on the code and repeat the review process until satisfied.
 - After the implementation is finished, verify all changed files, and run `python D:\Code\crlf.py $file1 $file2 ...` only for files you recognize, in order to normalize all line endings of all touched files to CRLF.
+- If there's documnentation present, always keep it updated.
 - At the end perform a git commit with a one-liner summary.
--- a/docs/critical_import_tool.md
+++ b/docs/critical_import_tool.md
@@ -32,6 +32,7 @@ The current implementation supports:
 - `standard` critical tables with columns `A-E`
 - XML-based extraction using `pdftohtml -xml`
 - geometry-based parsing for `Slash.pdf`
+- row-boundary repair for trailing affix leakage
 - transactional loading into SQLite

 The current implementation does not yet support:
@@ -149,6 +150,44 @@ This phase fixed the original `Slash / A / 72` corruption. The same lookup now r

 The important change is not only that the current output is correct, but that the importer now fails fast on structural ambiguity instead of silently loading corrupted rows.

+## Phase 2.1: Boundary Hardening After Manual Validation
+
+After phase 2, a manual validation pass compared:
+
+- the rendered `Slash.pdf`
+- the extracted `source.xml`
+- the imported SQLite rows
+
+That review found a remaining defect around the `51-55` / `56-60` boundary:
+
+- `51-55` lost several affix lines
+- `56-60` gained leading affix lines from the previous row
+
+The root cause was the original row segmentation rule:
+
+- rows were assigned strictly by the midpoint between adjacent roll-label `top` values
+
+That rule was too naive for rows whose affix block sits visually near the next row label.
+
+### Phase 2.1 fix
+
+The parser was hardened in two ways:
+
+1. Leading affix leakage repair
+   - after the initial row assignment, if a cell in the next row starts with affix-like lines and then continues with prose, those leading affix lines are moved back to the previous row
+2. Better affix classification
+   - generic digit-starting lines are no longer assumed to be affixes
+   - this prevents prose such as `25% chance your weapon is stuck...` from being misclassified
+
+### Phase 2.1 validation rules
+
+The importer now explicitly rejects cells that still look structurally wrong after repair:
+
+- a cell may not begin with affix-like lines before prose
+- a cell may not contain prose after affix lines
+
+This hardening step is important because it closed a class of row-boundary bugs that simple row/cell counts could not detect.
+
 ## Planned Future Phases

 The current architecture is intended to support additional phases:
@@ -353,6 +392,14 @@ Fragments inside a cell are grouped into lines by close `top` values and then or

 This produces a stable line list even when PDF text is broken into multiple fragments.

+### Boundary Repair
+
+After the initial midpoint-based row assignment, the parser performs a repair step across adjacent rows in the same column.
+
+If the next row begins with affix-like lines and then continues with prose, those leading affix lines are treated as leaked trailing affixes from the previous row and moved back.
+
+This repair exists because some tables place affix lines close enough to the next row label that midpoint-only segmentation is not reliable.
+
 ### Description vs Affix Splitting

 The parser classifies lines as:
@@ -366,6 +413,8 @@ Affix-like lines include:
 - symbolic lines using the critical glyphs
 - branch-like affix lines such as `with leg greaves: +2H - ...`

+Affix-like classification is intentionally conservative. Numeric prose lines such as `25% chance...` are not treated as affixes unless they match a known affix-like notation pattern.
+
 The current implementation stores:

 - `RawCellText`
@@ -384,6 +433,8 @@ At minimum, a valid `standard` table must satisfy:
 - roll-band labels are found
 - each detected row produces content for all five columns
 - total parsed cell count matches `row_count * 5`
+- no cell begins with affix-like lines before prose
+- no cell contains prose after affix lines

 If validation fails:

--- a/src/RolemasterDb.App/rolemaster.db
+++ b/src/RolemasterDb.App/rolemaster.db
--- a/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs
+++ b/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs
@@ -8,6 +8,7 @@ public sealed class StandardCriticalTableParser
 {
    private const int HeaderToBodyMinimumGap = 20;
    private const int TopGroupingTolerance = 2;
+    private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled);

    public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
    {
@@ -49,8 +50,7 @@ public sealed class StandardCriticalTableParser
            .Select(anchor => CreateRollBand(anchor.Label, anchor.SortOrder))
            .ToList();

-        var parsedCells = new List<ParsedCriticalCellArtifact>();
-        var parsedResults = new List<ParsedCriticalResult>();
+        var cellEntries = new List<CellEntry>();

        for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
        {
@@ -80,30 +80,65 @@ public sealed class StandardCriticalTableParser
                    continue;
                }

-                var lines = BuildLines(cellFragments);
-                var rawAffixLines = lines.Where(IsAffixLikeLine).ToList();
-                var descriptionLines = lines.Where(line => !IsAffixLikeLine(line)).ToList();
-                var rawCellText = string.Join(Environment.NewLine, lines);
-                var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines));
-                var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
-
-                parsedCells.Add(new ParsedCriticalCellArtifact(
+                cellEntries.Add(new CellEntry(
                    rowAnchors[rowIndex].Label,
+                    rowIndex,
                    columnAnchor.Key,
-                    lines,
-                    rawCellText,
-                    descriptionText,
-                    rawAffixText));
-
-                parsedResults.Add(new ParsedCriticalResult(
-                    columnAnchor.Key,
-                    rowAnchors[rowIndex].Label,
-                    rawCellText,
-                    descriptionText,
-                    rawAffixText));
+                    BuildLines(cellFragments).ToList()));
            }
        }

+        RepairLeadingAffixLeakage(cellEntries);
+
+        var parsedCells = new List<ParsedCriticalCellArtifact>();
+        var parsedResults = new List<ParsedCriticalResult>();
+
+        foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey))
+        {
+            var firstProseIndex = cellEntry.Lines.FindIndex(line => !IsAffixLikeLine(line));
+            var firstAffixIndex = cellEntry.Lines.FindIndex(IsAffixLikeLine);
+
+            if (firstProseIndex > 0)
+            {
+                validationErrors.Add(
+                    $"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' begins with affix-like lines before prose.");
+            }
+
+            if (firstAffixIndex >= 0)
+            {
+                var proseAfterAffix = cellEntry.Lines
+                    .Skip(firstAffixIndex + 1)
+                    .Any(line => !IsAffixLikeLine(line));
+
+                if (proseAfterAffix)
+                {
+                    validationErrors.Add(
+                        $"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' contains prose after affix lines.");
+                }
+            }
+
+            var rawAffixLines = cellEntry.Lines.Where(IsAffixLikeLine).ToList();
+            var descriptionLines = cellEntry.Lines.Where(line => !IsAffixLikeLine(line)).ToList();
+            var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines);
+            var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines));
+            var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
+
+            parsedCells.Add(new ParsedCriticalCellArtifact(
+                cellEntry.RollBandLabel,
+                cellEntry.ColumnKey,
+                cellEntry.Lines,
+                rawCellText,
+                descriptionText,
+                rawAffixText));
+
+            parsedResults.Add(new ParsedCriticalResult(
+                cellEntry.ColumnKey,
+                cellEntry.RollBandLabel,
+                rawCellText,
+                descriptionText,
+                rawAffixText));
+        }
+
        if (columnCenters.Count != 5)
        {
            validationErrors.Add($"Expected 5 standard-table columns but found {columnCenters.Count}.");
@@ -276,12 +311,46 @@ public sealed class StandardCriticalTableParser
               value.StartsWith("\u220F", StringComparison.Ordinal) ||
               value.StartsWith("\u03C0", StringComparison.Ordinal) ||
               value.StartsWith("\u222B", StringComparison.Ordinal) ||
-               char.IsDigit(value[0]) ||
+               NumericAffixLineRegex.IsMatch(value) ||
               value.Contains(" - ", StringComparison.Ordinal) ||
               value.Contains("(-", StringComparison.Ordinal) ||
               value.Contains("(+", StringComparison.Ordinal);
    }

+    private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries)
+    {
+        var maxRowIndex = cellEntries.Count == 0 ? -1 : cellEntries.Max(item => item.RowIndex);
+        var columnKeys = cellEntries.Select(item => item.ColumnKey).Distinct(StringComparer.OrdinalIgnoreCase).ToList();
+
+        for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++)
+        {
+            foreach (var columnKey in columnKeys)
+            {
+                var current = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex && item.ColumnKey == columnKey);
+                var next = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex + 1 && item.ColumnKey == columnKey);
+
+                if (current is null || next is null)
+                {
+                    continue;
+                }
+
+                var leadingAffixCount = 0;
+                while (leadingAffixCount < next.Lines.Count && IsAffixLikeLine(next.Lines[leadingAffixCount]))
+                {
+                    leadingAffixCount++;
+                }
+
+                if (leadingAffixCount == 0 || leadingAffixCount == next.Lines.Count)
+                {
+                    continue;
+                }
+
+                current.Lines.AddRange(next.Lines.Take(leadingAffixCount));
+                next.Lines.RemoveRange(0, leadingAffixCount);
+            }
+        }
+    }
+
    private static string CollapseWhitespace(string value) =>
        Regex.Replace(value.Trim(), @"\s+", " ");

@@ -295,4 +364,12 @@ public sealed class StandardCriticalTableParser
    private sealed record ColumnAnchor(string Key, double CenterX);

    private sealed record RowAnchor(string Label, int Top, int SortOrder);
+
+    private sealed class CellEntry(string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
+    {
+        public string RollBandLabel { get; } = rollBandLabel;
+        public int RowIndex { get; } = rowIndex;
+        public string ColumnKey { get; } = columnKey;
+        public List<string> Lines { get; } = lines;
+    }
 }