diff --git a/AGENTS.md b/AGENTS.md index 9ff8510..ac9b457 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -14,4 +14,5 @@ Also see the other related technical documentation in the docs folder. - When asked to begin working on a task, create a detailed implementation plan first, present the plan to the user, and ask for approval before beginning with the actual implementation. - When an task is finished, perform a code review to evaluate if the change is clean and maintainable with high software engineering standards. Iterate on the code and repeat the review process until satisfied. - After the implementation is finished, verify all changed files, and run `python D:\Code\crlf.py $file1 $file2 ...` only for files you recognize, in order to normalize all line endings of all touched files to CRLF. +- If there's documnentation present, always keep it updated. - At the end perform a git commit with a one-liner summary. \ No newline at end of file diff --git a/docs/critical_import_tool.md b/docs/critical_import_tool.md index 5db628f..4e96af7 100644 --- a/docs/critical_import_tool.md +++ b/docs/critical_import_tool.md @@ -32,6 +32,7 @@ The current implementation supports: - `standard` critical tables with columns `A-E` - XML-based extraction using `pdftohtml -xml` - geometry-based parsing for `Slash.pdf` +- row-boundary repair for trailing affix leakage - transactional loading into SQLite The current implementation does not yet support: @@ -149,6 +150,44 @@ This phase fixed the original `Slash / A / 72` corruption. The same lookup now r The important change is not only that the current output is correct, but that the importer now fails fast on structural ambiguity instead of silently loading corrupted rows. +## Phase 2.1: Boundary Hardening After Manual Validation + +After phase 2, a manual validation pass compared: + +- the rendered `Slash.pdf` +- the extracted `source.xml` +- the imported SQLite rows + +That review found a remaining defect around the `51-55` / `56-60` boundary: + +- `51-55` lost several affix lines +- `56-60` gained leading affix lines from the previous row + +The root cause was the original row segmentation rule: + +- rows were assigned strictly by the midpoint between adjacent roll-label `top` values + +That rule was too naive for rows whose affix block sits visually near the next row label. + +### Phase 2.1 fix + +The parser was hardened in two ways: + +1. Leading affix leakage repair + - after the initial row assignment, if a cell in the next row starts with affix-like lines and then continues with prose, those leading affix lines are moved back to the previous row +2. Better affix classification + - generic digit-starting lines are no longer assumed to be affixes + - this prevents prose such as `25% chance your weapon is stuck...` from being misclassified + +### Phase 2.1 validation rules + +The importer now explicitly rejects cells that still look structurally wrong after repair: + +- a cell may not begin with affix-like lines before prose +- a cell may not contain prose after affix lines + +This hardening step is important because it closed a class of row-boundary bugs that simple row/cell counts could not detect. + ## Planned Future Phases The current architecture is intended to support additional phases: @@ -353,6 +392,14 @@ Fragments inside a cell are grouped into lines by close `top` values and then or This produces a stable line list even when PDF text is broken into multiple fragments. +### Boundary Repair + +After the initial midpoint-based row assignment, the parser performs a repair step across adjacent rows in the same column. + +If the next row begins with affix-like lines and then continues with prose, those leading affix lines are treated as leaked trailing affixes from the previous row and moved back. + +This repair exists because some tables place affix lines close enough to the next row label that midpoint-only segmentation is not reliable. + ### Description vs Affix Splitting The parser classifies lines as: @@ -366,6 +413,8 @@ Affix-like lines include: - symbolic lines using the critical glyphs - branch-like affix lines such as `with leg greaves: +2H - ...` +Affix-like classification is intentionally conservative. Numeric prose lines such as `25% chance...` are not treated as affixes unless they match a known affix-like notation pattern. + The current implementation stores: - `RawCellText` @@ -384,6 +433,8 @@ At minimum, a valid `standard` table must satisfy: - roll-band labels are found - each detected row produces content for all five columns - total parsed cell count matches `row_count * 5` +- no cell begins with affix-like lines before prose +- no cell contains prose after affix lines If validation fails: diff --git a/src/RolemasterDb.App/rolemaster.db b/src/RolemasterDb.App/rolemaster.db index 183b54e..e1a6feb 100644 Binary files a/src/RolemasterDb.App/rolemaster.db and b/src/RolemasterDb.App/rolemaster.db differ diff --git a/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs b/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs index 876c9cd..836f06f 100644 --- a/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs +++ b/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs @@ -8,6 +8,7 @@ public sealed class StandardCriticalTableParser { private const int HeaderToBodyMinimumGap = 20; private const int TopGroupingTolerance = 2; + private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled); public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) { @@ -49,8 +50,7 @@ public sealed class StandardCriticalTableParser .Select(anchor => CreateRollBand(anchor.Label, anchor.SortOrder)) .ToList(); - var parsedCells = new List(); - var parsedResults = new List(); + var cellEntries = new List(); for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++) { @@ -80,30 +80,65 @@ public sealed class StandardCriticalTableParser continue; } - var lines = BuildLines(cellFragments); - var rawAffixLines = lines.Where(IsAffixLikeLine).ToList(); - var descriptionLines = lines.Where(line => !IsAffixLikeLine(line)).ToList(); - var rawCellText = string.Join(Environment.NewLine, lines); - var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines)); - var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines); - - parsedCells.Add(new ParsedCriticalCellArtifact( + cellEntries.Add(new CellEntry( rowAnchors[rowIndex].Label, + rowIndex, columnAnchor.Key, - lines, - rawCellText, - descriptionText, - rawAffixText)); - - parsedResults.Add(new ParsedCriticalResult( - columnAnchor.Key, - rowAnchors[rowIndex].Label, - rawCellText, - descriptionText, - rawAffixText)); + BuildLines(cellFragments).ToList())); } } + RepairLeadingAffixLeakage(cellEntries); + + var parsedCells = new List(); + var parsedResults = new List(); + + foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey)) + { + var firstProseIndex = cellEntry.Lines.FindIndex(line => !IsAffixLikeLine(line)); + var firstAffixIndex = cellEntry.Lines.FindIndex(IsAffixLikeLine); + + if (firstProseIndex > 0) + { + validationErrors.Add( + $"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' begins with affix-like lines before prose."); + } + + if (firstAffixIndex >= 0) + { + var proseAfterAffix = cellEntry.Lines + .Skip(firstAffixIndex + 1) + .Any(line => !IsAffixLikeLine(line)); + + if (proseAfterAffix) + { + validationErrors.Add( + $"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' contains prose after affix lines."); + } + } + + var rawAffixLines = cellEntry.Lines.Where(IsAffixLikeLine).ToList(); + var descriptionLines = cellEntry.Lines.Where(line => !IsAffixLikeLine(line)).ToList(); + var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines); + var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines)); + var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines); + + parsedCells.Add(new ParsedCriticalCellArtifact( + cellEntry.RollBandLabel, + cellEntry.ColumnKey, + cellEntry.Lines, + rawCellText, + descriptionText, + rawAffixText)); + + parsedResults.Add(new ParsedCriticalResult( + cellEntry.ColumnKey, + cellEntry.RollBandLabel, + rawCellText, + descriptionText, + rawAffixText)); + } + if (columnCenters.Count != 5) { validationErrors.Add($"Expected 5 standard-table columns but found {columnCenters.Count}."); @@ -276,12 +311,46 @@ public sealed class StandardCriticalTableParser value.StartsWith("\u220F", StringComparison.Ordinal) || value.StartsWith("\u03C0", StringComparison.Ordinal) || value.StartsWith("\u222B", StringComparison.Ordinal) || - char.IsDigit(value[0]) || + NumericAffixLineRegex.IsMatch(value) || value.Contains(" - ", StringComparison.Ordinal) || value.Contains("(-", StringComparison.Ordinal) || value.Contains("(+", StringComparison.Ordinal); } + private static void RepairLeadingAffixLeakage(List cellEntries) + { + var maxRowIndex = cellEntries.Count == 0 ? -1 : cellEntries.Max(item => item.RowIndex); + var columnKeys = cellEntries.Select(item => item.ColumnKey).Distinct(StringComparer.OrdinalIgnoreCase).ToList(); + + for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++) + { + foreach (var columnKey in columnKeys) + { + var current = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex && item.ColumnKey == columnKey); + var next = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex + 1 && item.ColumnKey == columnKey); + + if (current is null || next is null) + { + continue; + } + + var leadingAffixCount = 0; + while (leadingAffixCount < next.Lines.Count && IsAffixLikeLine(next.Lines[leadingAffixCount])) + { + leadingAffixCount++; + } + + if (leadingAffixCount == 0 || leadingAffixCount == next.Lines.Count) + { + continue; + } + + current.Lines.AddRange(next.Lines.Take(leadingAffixCount)); + next.Lines.RemoveRange(0, leadingAffixCount); + } + } + } + private static string CollapseWhitespace(string value) => Regex.Replace(value.Trim(), @"\s+", " "); @@ -295,4 +364,12 @@ public sealed class StandardCriticalTableParser private sealed record ColumnAnchor(string Key, double CenterX); private sealed record RowAnchor(string Label, int Top, int SortOrder); + + private sealed class CellEntry(string rollBandLabel, int rowIndex, string columnKey, List lines) + { + public string RollBandLabel { get; } = rollBandLabel; + public int RowIndex { get; } = rowIndex; + public string ColumnKey { get; } = columnKey; + public List Lines { get; } = lines; + } }