Phase 2.1 import

This commit is contained in:
2026-03-14 01:44:30 +01:00
parent be5c0a9b54
commit 5c4d540246
4 changed files with 151 additions and 22 deletions

View File

@@ -8,6 +8,7 @@ public sealed class StandardCriticalTableParser
{
private const int HeaderToBodyMinimumGap = 20;
private const int TopGroupingTolerance = 2;
/// <summary>
/// Matches a line that starts with one or more digits immediately followed by
/// <c>H</c>, <c>∑</c>, <c>∏</c>, <c>π</c> or <c>∫</c>, or by optional whitespace and a hyphen.
/// Only the start of the line is anchored; anything may follow the match.
/// </summary>
private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[-])", RegexOptions.Compiled);
public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
@@ -49,8 +50,7 @@ public sealed class StandardCriticalTableParser
.Select(anchor => CreateRollBand(anchor.Label, anchor.SortOrder))
.ToList();
var parsedCells = new List<ParsedCriticalCellArtifact>();
var parsedResults = new List<ParsedCriticalResult>();
var cellEntries = new List<CellEntry>();
for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
{
@@ -80,30 +80,65 @@ public sealed class StandardCriticalTableParser
continue;
}
var lines = BuildLines(cellFragments);
var rawAffixLines = lines.Where(IsAffixLikeLine).ToList();
var descriptionLines = lines.Where(line => !IsAffixLikeLine(line)).ToList();
var rawCellText = string.Join(Environment.NewLine, lines);
var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines));
var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
parsedCells.Add(new ParsedCriticalCellArtifact(
cellEntries.Add(new CellEntry(
rowAnchors[rowIndex].Label,
rowIndex,
columnAnchor.Key,
lines,
rawCellText,
descriptionText,
rawAffixText));
parsedResults.Add(new ParsedCriticalResult(
columnAnchor.Key,
rowAnchors[rowIndex].Label,
rawCellText,
descriptionText,
rawAffixText));
BuildLines(cellFragments).ToList()));
}
}
RepairLeadingAffixLeakage(cellEntries);
var parsedCells = new List<ParsedCriticalCellArtifact>();
var parsedResults = new List<ParsedCriticalResult>();
foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey))
{
var firstProseIndex = cellEntry.Lines.FindIndex(line => !IsAffixLikeLine(line));
var firstAffixIndex = cellEntry.Lines.FindIndex(IsAffixLikeLine);
if (firstProseIndex > 0)
{
validationErrors.Add(
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' begins with affix-like lines before prose.");
}
if (firstAffixIndex >= 0)
{
var proseAfterAffix = cellEntry.Lines
.Skip(firstAffixIndex + 1)
.Any(line => !IsAffixLikeLine(line));
if (proseAfterAffix)
{
validationErrors.Add(
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' contains prose after affix lines.");
}
}
var rawAffixLines = cellEntry.Lines.Where(IsAffixLikeLine).ToList();
var descriptionLines = cellEntry.Lines.Where(line => !IsAffixLikeLine(line)).ToList();
var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines);
var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines));
var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
parsedCells.Add(new ParsedCriticalCellArtifact(
cellEntry.RollBandLabel,
cellEntry.ColumnKey,
cellEntry.Lines,
rawCellText,
descriptionText,
rawAffixText));
parsedResults.Add(new ParsedCriticalResult(
cellEntry.ColumnKey,
cellEntry.RollBandLabel,
rawCellText,
descriptionText,
rawAffixText));
}
if (columnCenters.Count != 5)
{
validationErrors.Add($"Expected 5 standard-table columns but found {columnCenters.Count}.");
@@ -276,12 +311,46 @@ public sealed class StandardCriticalTableParser
value.StartsWith("\u220F", StringComparison.Ordinal) ||
value.StartsWith("\u03C0", StringComparison.Ordinal) ||
value.StartsWith("\u222B", StringComparison.Ordinal) ||
char.IsDigit(value[0]) ||
NumericAffixLineRegex.IsMatch(value) ||
value.Contains(" - ", StringComparison.Ordinal) ||
value.Contains("(-", StringComparison.Ordinal) ||
value.Contains("(+", StringComparison.Ordinal);
}
/// <summary>
/// Moves affix-like lines found at the top of a cell to the end of the cell directly
/// above it (same column key, previous row index).
/// </summary>
/// <remarks>
/// Rows are visited in ascending order. A cell is left untouched when it has no leading
/// affix-like lines, or when every one of its lines is affix-like. The <c>Lines</c> lists
/// of the affected entries are mutated in place.
/// </remarks>
/// <param name="cellEntries">All cell entries of the table being parsed.</param>
/// <exception cref="InvalidOperationException">
/// Two entries share the same row index and column key.
/// </exception>
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries)
{
    if (cellEntries.Count == 0)
    {
        return;
    }

    var maxRowIndex = cellEntries.Max(item => item.RowIndex);
    if (maxRowIndex < 1)
    {
        // There is no row pair to compare, so nothing can leak.
        return;
    }

    // Index the entries once instead of scanning the whole list twice per row/column pair.
    // Tuple equality compares the column key ordinally, which is the same comparison the
    // key list below uses, so no key is skipped because of a casing difference.
    var entriesByPosition = new Dictionary<(int RowIndex, string ColumnKey), CellEntry>(cellEntries.Count);
    foreach (var entry in cellEntries)
    {
        if (!entriesByPosition.TryAdd((entry.RowIndex, entry.ColumnKey), entry))
        {
            throw new InvalidOperationException(
                $"Duplicate cell entry for row {entry.RowIndex}, column '{entry.ColumnKey}'.");
        }
    }

    var columnKeys = cellEntries
        .Select(item => item.ColumnKey)
        .Distinct(StringComparer.Ordinal)
        .ToList();

    for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++)
    {
        foreach (var columnKey in columnKeys)
        {
            if (!entriesByPosition.TryGetValue((rowIndex, columnKey), out var current) ||
                !entriesByPosition.TryGetValue((rowIndex + 1, columnKey), out var next))
            {
                continue;
            }

            // Index of the first non-affix line equals the number of leading affix lines.
            // -1 means the cell is empty or entirely affix-like; 0 means nothing leads it.
            var leadingAffixCount = next.Lines.FindIndex(line => !IsAffixLikeLine(line));
            if (leadingAffixCount <= 0)
            {
                continue;
            }

            current.Lines.AddRange(next.Lines.GetRange(0, leadingAffixCount));
            next.Lines.RemoveRange(0, leadingAffixCount);
        }
    }
}
/// <summary>
/// Trims <paramref name="value"/> and replaces every run of whitespace inside it
/// with a single space.
/// </summary>
/// <param name="value">Text to normalise.</param>
/// <returns>The trimmed text with whitespace runs collapsed to one space each.</returns>
private static string CollapseWhitespace(string value)
{
    var trimmed = value.Trim();
    return Regex.Replace(trimmed, @"\s+", " ");
}
@@ -295,4 +364,12 @@ public sealed class StandardCriticalTableParser
/// <summary>Identifies one table column by key and X position.</summary>
/// <param name="Key">Column key; the parser stores it as the column key of each parsed cell.</param>
/// <param name="CenterX">NOTE(review): presumably the horizontal centre of the column in source coordinates — confirm against the code that builds the anchors.</param>
private sealed record ColumnAnchor(string Key, double CenterX);
/// <summary>Identifies one table row by label, vertical position and ordering.</summary>
/// <param name="Label">Row label; passed to <c>CreateRollBand</c> and stored as the roll-band label of each parsed cell.</param>
/// <param name="Top">NOTE(review): presumably the top coordinate of the row in source coordinates — confirm against the code that builds the anchors.</param>
/// <param name="SortOrder">Ordering value passed to <c>CreateRollBand</c> together with the label.</param>
private sealed record RowAnchor(string Label, int Top, int SortOrder);
/// <summary>
/// Working representation of one table cell: where it sits (roll band, row index,
/// column key) and its text lines. The <see cref="Lines"/> list is the instance passed
/// to the constructor and can be modified through the property.
/// </summary>
private sealed class CellEntry
{
    /// <summary>Creates an entry that stores the given values as-is.</summary>
    public CellEntry(string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
    {
        RollBandLabel = rollBandLabel;
        RowIndex = rowIndex;
        ColumnKey = columnKey;
        Lines = lines;
    }

    /// <summary>Label of the roll band (row) the cell belongs to.</summary>
    public string RollBandLabel { get; }

    /// <summary>Index of the cell's row.</summary>
    public int RowIndex { get; }

    /// <summary>Key of the cell's column.</summary>
    public string ColumnKey { get; }

    /// <summary>Text lines of the cell.</summary>
    public List<string> Lines { get; }
}
}