Implement phase 5 critical branch extraction

This commit is contained in:
2026-03-14 10:21:26 +01:00
parent b2f61c3d73
commit 60c5d886a4
20 changed files with 589 additions and 399 deletions

View File

@@ -63,7 +63,6 @@ The current implementation supports:
The current implementation does not yet support:
- OCR/image-based PDFs such as `Void.pdf`
- normalized `critical_branch` population
- normalized `critical_effect` population
- automatic confidence scoring beyond validation errors
@@ -210,10 +209,6 @@ The importer now explicitly rejects cells that still look structurally wrong aft
This keeps the phase-2.1 safety goal in place while allowing broader standard-table layouts that render a single affix block either before or after the prose block.
## Planned Future Phases
The current architecture is intended to support additional phases:
### Phase 3: Broader Table Coverage
Phase 3 expands the manifest and validates the shared `standard` parser across a broader set of `A-E` tables.
@@ -494,11 +489,12 @@ Affix-like classification is intentionally conservative. Numeric prose lines suc
The current implementation stores:
- `RawCellText`
- `DescriptionText`
- `RawAffixText`
- base `RawCellText`
- base `DescriptionText`
- base `RawAffixText`
- parsed conditional branches with condition text, branch prose, and branch affix text
It does not yet normalize branches or effects into separate tables.
It does not yet normalize effects into separate tables.
## Validation Rules

View File

@@ -14,6 +14,7 @@ public sealed class RolemasterDbContext(DbContextOptions<RolemasterDbContext> op
public DbSet<CriticalColumn> CriticalColumns => Set<CriticalColumn>();
public DbSet<CriticalRollBand> CriticalRollBands => Set<CriticalRollBand>();
public DbSet<CriticalResult> CriticalResults => Set<CriticalResult>();
public DbSet<CriticalBranch> CriticalBranches => Set<CriticalBranch>();
protected override void OnModelCreating(ModelBuilder modelBuilder)
{
@@ -78,5 +79,13 @@ public sealed class RolemasterDbContext(DbContextOptions<RolemasterDbContext> op
entity.HasIndex(item => new { item.CriticalTableId, item.CriticalGroupId, item.CriticalColumnId, item.CriticalRollBandId }).IsUnique();
entity.Property(item => item.ParseStatus).HasMaxLength(32);
});
modelBuilder.Entity<CriticalBranch>(entity =>
{
entity.HasIndex(item => item.CriticalResultId);
entity.HasIndex(item => new { item.CriticalResultId, item.SortOrder });
entity.Property(item => item.BranchKind).HasMaxLength(32);
entity.Property(item => item.ConditionKey).HasMaxLength(128);
});
}
}

View File

@@ -11,6 +11,7 @@ public static class RolemasterDbInitializer
await using var dbContext = await dbFactory.CreateDbContextAsync(cancellationToken);
await dbContext.Database.EnsureCreatedAsync(cancellationToken);
await RolemasterDbSchemaUpgrader.EnsureLatestAsync(dbContext, cancellationToken);
if (await dbContext.AttackTables.AnyAsync(cancellationToken))
{

View File

@@ -0,0 +1,43 @@
using Microsoft.EntityFrameworkCore;
namespace RolemasterDb.App.Data;
/// <summary>
/// Brings an existing database up to the schema expected by the current model
/// by running additive SQL statements.
/// </summary>
public static class RolemasterDbSchemaUpgrader
{
    // Executed in declaration order. Every statement uses IF NOT EXISTS, so running the
    // upgrader against a database that already has these objects is a no-op.
    private static readonly string[] UpgradeStatements =
    [
        """
        CREATE TABLE IF NOT EXISTS "CriticalBranches" (
        "Id" INTEGER NOT NULL CONSTRAINT "PK_CriticalBranches" PRIMARY KEY AUTOINCREMENT,
        "CriticalResultId" INTEGER NOT NULL,
        "BranchKind" TEXT NOT NULL,
        "ConditionKey" TEXT NULL,
        "ConditionText" TEXT NOT NULL,
        "ConditionJson" TEXT NOT NULL,
        "RawText" TEXT NOT NULL,
        "DescriptionText" TEXT NOT NULL,
        "RawAffixText" TEXT NULL,
        "ParsedJson" TEXT NOT NULL,
        "SortOrder" INTEGER NOT NULL,
        CONSTRAINT "FK_CriticalBranches_CriticalResults_CriticalResultId"
        FOREIGN KEY ("CriticalResultId") REFERENCES "CriticalResults" ("Id") ON DELETE CASCADE
        );
        """,
        """
        CREATE INDEX IF NOT EXISTS "IX_CriticalBranches_CriticalResultId"
        ON "CriticalBranches" ("CriticalResultId");
        """,
        """
        CREATE INDEX IF NOT EXISTS "IX_CriticalBranches_CriticalResultId_SortOrder"
        ON "CriticalBranches" ("CriticalResultId", "SortOrder");
        """,
    ];

    /// <summary>
    /// Creates the <c>CriticalBranches</c> table and its two indexes when they are missing.
    /// </summary>
    /// <param name="dbContext">Context whose underlying database receives the statements.</param>
    /// <param name="cancellationToken">Token forwarded to every SQL command.</param>
    public static async Task EnsureLatestAsync(RolemasterDbContext dbContext, CancellationToken cancellationToken = default)
    {
        foreach (var statement in UpgradeStatements)
        {
            await dbContext.Database.ExecuteSqlRawAsync(statement, cancellationToken);
        }
    }
}

View File

@@ -0,0 +1,17 @@
namespace RolemasterDb.App.Domain;
/// <summary>
/// Persisted branch of a <see cref="CriticalResult"/>: the condition text of the branch
/// together with the prose and affix text that belong to it.
/// </summary>
public sealed class CriticalBranch
{
/// <summary>Row identifier.</summary>
public int Id { get; set; }
/// <summary>Identifier of the <see cref="CriticalResult"/> this branch belongs to.</summary>
public int CriticalResultId { get; set; }
/// <summary>Kind of branch; defaults to <c>"conditional"</c>.</summary>
public string BranchKind { get; set; } = "conditional";
/// <summary>Optional machine-friendly key for the condition; <c>null</c> when none is available.</summary>
public string? ConditionKey { get; set; }
/// <summary>Condition text of the branch.</summary>
public string ConditionText { get; set; } = string.Empty;
/// <summary>JSON payload describing the condition; defaults to an empty object.</summary>
public string ConditionJson { get; set; } = "{}";
/// <summary>Raw text of the branch.</summary>
public string RawText { get; set; } = string.Empty;
/// <summary>Prose part of the branch; empty by default.</summary>
public string DescriptionText { get; set; } = string.Empty;
/// <summary>Affix part of the branch, or <c>null</c> when the branch has none.</summary>
public string? RawAffixText { get; set; }
/// <summary>JSON payload for parsed branch content; defaults to an empty object.</summary>
public string ParsedJson { get; set; } = "{}";
/// <summary>Position of the branch among the branches of the same result.</summary>
public int SortOrder { get; set; }
/// <summary>Navigation to the owning result.</summary>
public CriticalResult CriticalResult { get; set; } = null!;
}

View File

@@ -16,4 +16,5 @@ public sealed class CriticalResult
public CriticalGroup? CriticalGroup { get; set; }
public CriticalColumn CriticalColumn { get; set; } = null!;
public CriticalRollBand CriticalRollBand { get; set; } = null!;
public List<CriticalBranch> Branches { get; set; } = [];
}

Binary file not shown.

View File

@@ -1,3 +1,6 @@
using Microsoft.EntityFrameworkCore;
using RolemasterDb.App.Data;
using RolemasterDb.ImportTool.Parsing;
namespace RolemasterDb.ImportTool.Tests;
@@ -249,6 +252,83 @@ public sealed class StandardCriticalTableParserIntegrationTests
Assert.Contains("Blast goes in through foe's eye", superSlaying.DescriptionText, StringComparison.OrdinalIgnoreCase);
}
// Slash 36-45/B: the base cell keeps only the prose, and each greaves condition
// becomes its own branch carrying just an affix.
[Fact]
public async Task Slash_branch_cells_split_base_text_from_conditional_affix_branches()
{
    var slashEntry = LoadManifest().Tables.Single(table => string.Equals(table.Slug, "slash", StringComparison.Ordinal));
    var parsed = await LoadParseResultAsync(slashEntry);

    var cell = parsed.Table.Results.Single(candidate =>
        candidate.GroupKey is null &&
        string.Equals(candidate.RollBandLabel, "36-45", StringComparison.Ordinal) &&
        string.Equals(candidate.ColumnKey, "B", StringComparison.Ordinal));

    Assert.Equal("Strike foe in shin. If he doesn't have greaves, you slash open foe's shin.", cell.DescriptionText);
    Assert.Null(cell.RawAffixText);
    Assert.DoesNotContain("with leg greaves:", cell.RawCellText, StringComparison.OrdinalIgnoreCase);
    Assert.Equal(2, cell.Branches.Count);

    var greavesBranch = cell.Branches.Single(branch =>
        string.Equals(branch.ConditionText, "with leg greaves", StringComparison.OrdinalIgnoreCase));
    var noGreavesBranch = cell.Branches.Single(branch =>
        string.Equals(branch.ConditionText, "w/o leg greaves", StringComparison.OrdinalIgnoreCase));

    Assert.Equal("with_leg_greaves", greavesBranch.ConditionKey);
    Assert.Equal("+2H π", greavesBranch.RawAffixText);
    Assert.Equal(string.Empty, greavesBranch.DescriptionText);
    Assert.Equal("without_leg_greaves", noGreavesBranch.ConditionKey);
    Assert.Equal("+2H ∫", noGreavesBranch.RawAffixText);
}
// Impact 86-90/D: one branch carries only an affix, the other only prose,
// and neither leaks into the base cell text.
[Fact]
public async Task Impact_branch_cells_keep_prose_branch_text_separate_from_affix_branch_text()
{
    var impactEntry = LoadManifest().Tables.Single(table => string.Equals(table.Slug, "impact", StringComparison.Ordinal));
    var parsed = await LoadParseResultAsync(impactEntry);

    var cell = parsed.Table.Results.Single(candidate =>
        candidate.GroupKey is null &&
        string.Equals(candidate.RollBandLabel, "86-90", StringComparison.Ordinal) &&
        string.Equals(candidate.ColumnKey, "D", StringComparison.Ordinal));

    const string expectedDescription =
        "Onslaught to foe's midsection. Organs are damaged and foe throws up blood. Foe's abdomen is seriously damaged. He falls and should not be moved.";
    Assert.Equal(expectedDescription, cell.DescriptionText);
    Assert.Null(cell.RawAffixText);
    Assert.Equal(2, cell.Branches.Count);

    var armoredBranch = cell.Branches.Single(branch =>
        string.Equals(branch.ConditionText, "with abdominal armor", StringComparison.OrdinalIgnoreCase));
    var unarmoredBranch = cell.Branches.Single(branch =>
        string.Equals(branch.ConditionText, "w/o abdominal armor", StringComparison.OrdinalIgnoreCase));

    Assert.Equal("12∑", armoredBranch.RawAffixText);
    Assert.Equal(string.Empty, armoredBranch.DescriptionText);
    Assert.Null(unarmoredBranch.RawAffixText);
    Assert.Equal("dies in 6 rounds", unarmoredBranch.DescriptionText);
}
// Loads the parsed slash table into a copy of the checked-in database and verifies
// that the branch rows for 36-45/B can be read back through EF.
[Fact]
public async Task Loader_upgrades_existing_sqlite_and_persists_branch_rows()
{
    var slashEntry = LoadManifest().Tables.Single(table => string.Equals(table.Slug, "slash", StringComparison.Ordinal));
    var parsed = await LoadParseResultAsync(slashEntry);
    var databasePath = CreateTemporaryDatabaseCopy();

    await new CriticalImportLoader(databasePath).LoadAsync(parsed.Table);

    await using var dbContext = CreateDbContext(databasePath);
    var storedResult = await dbContext.CriticalResults
        .Include(row => row.CriticalTable)
        .Include(row => row.CriticalColumn)
        .Include(row => row.CriticalRollBand)
        .Include(row => row.Branches)
        .Where(row =>
            row.CriticalTable.Slug == "slash" &&
            row.CriticalColumn.ColumnKey == "B" &&
            row.CriticalRollBand.Label == "36-45")
        .SingleAsync();

    Assert.DoesNotContain("with leg greaves:", storedResult.RawCellText, StringComparison.OrdinalIgnoreCase);
    Assert.Equal(2, storedResult.Branches.Count);
    Assert.Contains(storedResult.Branches, branch => branch.ConditionKey == "with_leg_greaves" && branch.RawAffixText == "+2H π");
    Assert.Contains(storedResult.Branches, branch => branch.ConditionKey == "without_leg_greaves" && branch.RawAffixText == "+2H ∫");
}
private static async Task<CriticalTableParseResult> LoadParseResultAsync(CriticalImportManifestEntry entry)
{
var xmlPath = Path.Combine(GetArtifactCacheRoot(), $"{entry.Slug}.xml");
@@ -278,6 +358,22 @@ public sealed class StandardCriticalTableParserIntegrationTests
return cacheRoot;
}
// Opens a context over the SQLite file at the given path.
private static RolemasterDbContext CreateDbContext(string databasePath)
{
    var builder = new DbContextOptionsBuilder<RolemasterDbContext>();
    builder.UseSqlite($"Data Source={databasePath}");
    return new RolemasterDbContext(builder.Options);
}
// Copies the checked-in rolemaster.db into the artifact cache under a unique name and
// returns the path of the copy. This helper does not delete the copy afterwards.
private static string CreateTemporaryDatabaseCopy()
{
    var copyFileName = $"rolemaster-{Guid.NewGuid():N}.db";
    var copyPath = Path.Combine(GetArtifactCacheRoot(), copyFileName);
    var seedPath = Path.Combine(GetRepositoryRoot(), "src", "RolemasterDb.App", "rolemaster.db");

    File.Copy(seedPath, copyPath, overwrite: true);
    return copyPath;
}
private static string GetRepositoryRoot()
{
var probe = new DirectoryInfo(AppContext.BaseDirectory);

View File

@@ -12,10 +12,12 @@ public sealed class CriticalImportLoader(string databasePath)
{
await using var dbContext = CreateDbContext();
await dbContext.Database.EnsureCreatedAsync(cancellationToken);
await RolemasterDbSchemaUpgrader.EnsureLatestAsync(dbContext, cancellationToken);
var removedTableCount = await dbContext.CriticalTables.CountAsync(cancellationToken);
await using var transaction = await dbContext.Database.BeginTransactionAsync(cancellationToken);
await dbContext.CriticalBranches.ExecuteDeleteAsync(cancellationToken);
await dbContext.CriticalResults.ExecuteDeleteAsync(cancellationToken);
await dbContext.CriticalGroups.ExecuteDeleteAsync(cancellationToken);
await dbContext.CriticalColumns.ExecuteDeleteAsync(cancellationToken);
@@ -30,6 +32,7 @@ public sealed class CriticalImportLoader(string databasePath)
{
await using var dbContext = CreateDbContext();
await dbContext.Database.EnsureCreatedAsync(cancellationToken);
await RolemasterDbSchemaUpgrader.EnsureLatestAsync(dbContext, cancellationToken);
await using var transaction = await dbContext.Database.BeginTransactionAsync(cancellationToken);
await DeleteTableAsync(dbContext, table.Slug, cancellationToken);
@@ -86,7 +89,21 @@ public sealed class CriticalImportLoader(string databasePath)
DescriptionText = item.DescriptionText,
RawAffixText = item.RawAffixText,
ParsedJson = "{}",
ParseStatus = "raw"
ParseStatus = "raw",
Branches = item.Branches
.Select(branch => new CriticalBranch
{
BranchKind = branch.BranchKind,
ConditionKey = branch.ConditionKey,
ConditionText = branch.ConditionText,
ConditionJson = "{}",
RawText = branch.RawText,
DescriptionText = branch.DescriptionText,
RawAffixText = branch.RawAffixText,
ParsedJson = "{}",
SortOrder = branch.SortOrder
})
.ToList()
})
.ToList();
@@ -121,6 +138,10 @@ public sealed class CriticalImportLoader(string databasePath)
return;
}
await dbContext.CriticalBranches
.Where(item => item.CriticalResult.CriticalTableId == tableId.Value)
.ExecuteDeleteAsync(cancellationToken);
await dbContext.CriticalResults
.Where(item => item.CriticalTableId == tableId.Value)
.ExecuteDeleteAsync(cancellationToken);

View File

@@ -0,0 +1,10 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Lines collected for one table cell, identified by optional group, roll band, row index and column.
/// </summary>
internal sealed class ColumnarCellEntry
{
    /// <summary>Creates an entry; <paramref name="lines"/> is stored as-is, not copied.</summary>
    public ColumnarCellEntry(string? groupKey, string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
    {
        GroupKey = groupKey;
        RollBandLabel = rollBandLabel;
        RowIndex = rowIndex;
        ColumnKey = columnKey;
        Lines = lines;
    }

    /// <summary>Group key of the cell, or <c>null</c> when the cell has no group.</summary>
    public string? GroupKey { get; }

    /// <summary>Label of the roll band (row) the cell belongs to.</summary>
    public string RollBandLabel { get; }

    /// <summary>Index of the row the cell belongs to.</summary>
    public int RowIndex { get; }

    /// <summary>Key of the column the cell belongs to.</summary>
    public string ColumnKey { get; }

    /// <summary>Mutable list of text lines assigned to the cell.</summary>
    public List<string> Lines { get; }
}

View File

@@ -0,0 +1,17 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Result of parsing one cell: the base lines and their derived text sections,
/// the parsed branches, and any validation errors found on the way.
/// </summary>
internal sealed class CriticalCellParseContent
{
    /// <summary>Stores the supplied values unchanged.</summary>
    public CriticalCellParseContent(
        IReadOnlyList<string> baseLines,
        string rawCellText,
        string descriptionText,
        string? rawAffixText,
        IReadOnlyList<ParsedCriticalBranch> branches,
        IReadOnlyList<string> validationErrors)
    {
        BaseLines = baseLines;
        RawCellText = rawCellText;
        DescriptionText = descriptionText;
        RawAffixText = rawAffixText;
        Branches = branches;
        ValidationErrors = validationErrors;
    }

    /// <summary>Cell lines that precede the first branch.</summary>
    public IReadOnlyList<string> BaseLines { get; }

    /// <summary>Raw text of the base content.</summary>
    public string RawCellText { get; }

    /// <summary>Prose text of the base content.</summary>
    public string DescriptionText { get; }

    /// <summary>Affix text of the base content, or <c>null</c> when there is none.</summary>
    public string? RawAffixText { get; }

    /// <summary>Branches parsed from the cell.</summary>
    public IReadOnlyList<ParsedCriticalBranch> Branches { get; }

    /// <summary>Validation messages produced while parsing the cell.</summary>
    public IReadOnlyList<string> ValidationErrors { get; }
}

View File

@@ -0,0 +1,114 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Splits the lines of a single table cell into base content and conditional branches.
/// </summary>
internal static class CriticalCellTextParser
{
    /// <summary>
    /// Parses the lines of one cell. Everything before the first branch start line is base
    /// content; each branch runs from its start line up to the next start line or the end.
    /// </summary>
    /// <param name="lines">Lines of the cell, in order.</param>
    /// <param name="affixLegendSymbols">Legend symbols passed through to affix classification.</param>
    /// <returns>Base text sections, parsed branches and validation errors for the cell.</returns>
    internal static CriticalCellParseContent Parse(IReadOnlyList<string> lines, ISet<string> affixLegendSymbols)
    {
        var branchStarts = FindBranchStartIndexes(lines);
        var baseLines = branchStarts.Count > 0
            ? lines.Take(branchStarts[0]).ToList()
            : lines.ToList();

        var errors = new List<string>();
        errors.AddRange(ValidateSegmentCount(baseLines, affixLegendSymbols, "Base content"));

        var branches = new List<ParsedCriticalBranch>();
        for (var position = 0; position < branchStarts.Count; position++)
        {
            var from = branchStarts[position];
            var to = position + 1 < branchStarts.Count
                ? branchStarts[position + 1]
                : lines.Count;
            var branchLines = lines.Skip(from).Take(to - from).ToList();

            // Sort order is 1-based and follows the order of appearance in the cell.
            branches.Add(ParseBranch(branchLines, position + 1, affixLegendSymbols, errors));
        }

        var (rawCellText, descriptionText, rawAffixText) = BuildTextSections(baseLines, affixLegendSymbols);
        return new CriticalCellParseContent(baseLines, rawCellText, descriptionText, rawAffixText, branches, errors);
    }

    // Parses one branch: the text before the first ':' of the first line is the condition,
    // the remainder of that line plus all following lines form the payload.
    private static ParsedCriticalBranch ParseBranch(
        IReadOnlyList<string> branchLines,
        int sortOrder,
        ISet<string> affixLegendSymbols,
        List<string> validationErrors)
    {
        var headerLine = branchLines[0];
        var colonIndex = headerLine.IndexOf(':', StringComparison.Ordinal);
        var conditionText = CriticalTableParserSupport.CollapseWhitespace(headerLine[..colonIndex]);

        // Payload candidates in order: header remainder first, then the continuation lines.
        // Blank entries are dropped after whitespace normalization.
        var payloadLines = branchLines
            .Skip(1)
            .Prepend(headerLine[(colonIndex + 1)..])
            .Select(line => CriticalTableParserSupport.CollapseWhitespace(line))
            .Where(line => !string.IsNullOrWhiteSpace(line))
            .ToList();

        validationErrors.AddRange(ValidateSegmentCount(payloadLines, affixLegendSymbols, $"Branch '{conditionText}'"));

        var (_, descriptionText, rawAffixText) = BuildTextSections(payloadLines, affixLegendSymbols);
        var rawText = string.Join(Environment.NewLine, branchLines);

        return new ParsedCriticalBranch(
            "conditional",
            CriticalTableParserSupport.NormalizeConditionKey(conditionText),
            conditionText,
            rawText,
            descriptionText,
            rawAffixText,
            sortOrder);
    }

    // Returns the indexes of all lines recognized as the start of a conditional branch.
    private static List<int> FindBranchStartIndexes(IReadOnlyList<string> lines) =>
        Enumerable.Range(0, lines.Count)
            .Where(index => CriticalTableParserSupport.IsConditionalBranchStartLine(lines[index]))
            .ToList();

    // Reports an error when the lines switch between prose and affix more than once
    // (more than two line-type segments). An empty line list is always valid.
    private static IReadOnlyList<string> ValidateSegmentCount(
        IReadOnlyList<string> lines,
        ISet<string> affixLegendSymbols,
        string scope)
    {
        if (lines.Count > 0 &&
            CriticalTableParserSupport.CountLineTypeSegments(lines, affixLegendSymbols) > 2)
        {
            return [$"{scope} interleaves prose and affix lines."];
        }

        return [];
    }

    // Produces the raw text (all lines), the prose text (non-affix lines joined with spaces and
    // whitespace-collapsed) and the affix text (affix lines joined by newline, or null when none).
    private static (string RawText, string DescriptionText, string? RawAffixText) BuildTextSections(
        IReadOnlyList<string> lines,
        ISet<string> affixLegendSymbols)
    {
        var affixLines = new List<string>();
        var proseLines = new List<string>();
        foreach (var line in lines)
        {
            var bucket = CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)
                ? affixLines
                : proseLines;
            bucket.Add(line);
        }

        var rawText = string.Join(Environment.NewLine, lines);
        var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', proseLines));
        var rawAffixText = affixLines.Count > 0
            ? string.Join(Environment.NewLine, affixLines)
            : null;

        return (rawText, descriptionText, rawAffixText);
    }
}

View File

@@ -169,15 +169,9 @@ internal static class CriticalTableParserSupport
return true;
}
if (value.StartsWith("with ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("w/o ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("without ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("if ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("while ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("until ", StringComparison.OrdinalIgnoreCase) ||
value.StartsWith("unless ", StringComparison.OrdinalIgnoreCase))
if (IsConditionalBranchStartLine(value))
{
return value.Contains(':', StringComparison.Ordinal);
return true;
}
if (affixLegendSymbols.Count > 0 &&
@@ -242,6 +236,23 @@ internal static class CriticalTableParserSupport
internal static string CollapseWhitespace(string value) =>
Regex.Replace(value.Trim(), @"\s+", " ");
/// <summary>
/// Tells whether a line opens a conditional branch: after trimming it must contain a ':'
/// and begin (case-insensitively) with one of the condition keywords followed by a space.
/// </summary>
/// <param name="value">Line to inspect.</param>
/// <returns><c>true</c> when the line looks like a branch start; otherwise <c>false</c>.</returns>
internal static bool IsConditionalBranchStartLine(string value)
{
    var candidate = value.Trim();
    if (candidate.IndexOf(':') < 0)
    {
        return false;
    }

    string[] keywordPrefixes = ["with ", "w/o ", "without ", "if ", "while ", "until ", "unless "];
    foreach (var prefix in keywordPrefixes)
    {
        if (candidate.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
internal static string NormalizeText(string value) =>
value
.Replace('\u00a0', ' ')
@@ -250,6 +261,25 @@ internal static class CriticalTableParserSupport
.Replace('', '\'')
.Trim();
/// <summary>
/// Builds a lowercase, underscore-separated key from a branch condition text, for example
/// <c>"w/o leg greaves"</c> becomes <c>"without_leg_greaves"</c>.
/// </summary>
/// <param name="conditionText">Condition text as it appears in the cell.</param>
/// <returns>
/// The normalized key, or <c>null</c> when the text contains no ASCII letters or digits.
/// </returns>
internal static string? NormalizeConditionKey(string conditionText)
{
    var lowered = conditionText.ToLowerInvariant();

    // Expand the "w/o" abbreviation only when it stands alone as a token. A plain substring
    // replace would also rewrite longer tokens such as "throw/out" into "throwithoutut".
    var expanded = Regex.Replace(lowered, @"(?<![a-z0-9])w/o(?![a-z0-9])", "without");

    // Every run of other characters (including whitespace, so no separate whitespace
    // collapsing is needed) becomes a single underscore.
    var key = Regex.Replace(expanded, @"[^a-z0-9]+", "_").Trim('_');

    return key.Length == 0 ? null : key;
}
/// <summary>
/// Finds the smallest <c>Top</c> among the fragments that mark the key/legend area: text equal
/// to "Key:" or containing "must parry" or "attacker gets" (all case-insensitive).
/// </summary>
/// <param name="fragments">Fragments to search.</param>
/// <returns>The smallest matching top, or <see cref="int.MaxValue"/> when nothing matches.</returns>
internal static int FindKeyTop(IReadOnlyList<XmlTextFragment> fragments)
{
    static bool MarksKeyArea(XmlTextFragment fragment) =>
        string.Equals(fragment.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
        fragment.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) ||
        fragment.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase);

    var keyTop = int.MaxValue;
    foreach (var fragment in fragments)
    {
        if (MarksKeyArea(fragment) && fragment.Top < keyTop)
        {
            keyTop = fragment.Top;
        }
    }

    return keyTop;
}
internal static HashSet<string> DetectAffixLegendSymbols(IReadOnlyList<XmlTextFragment> fragments, int keyTop)
{
if (keyTop == int.MaxValue)
@@ -347,6 +377,138 @@ internal static class CriticalTableParserSupport
return groups;
}
/// <summary>
/// Turns row-label fragments into row anchors ordered by <c>Top</c>, each with a normalized
/// roll-band label and a 1-based sort order.
/// </summary>
/// <param name="rowLabelFragments">Fragments holding the row labels.</param>
/// <returns>A new list of anchors, top to bottom.</returns>
internal static List<RowAnchor> CreateRowAnchors(IReadOnlyList<XmlTextFragment> rowLabelFragments)
{
    var anchors = new List<RowAnchor>();
    foreach (var fragment in rowLabelFragments.OrderBy(item => item.Top))
    {
        var sortOrder = anchors.Count + 1;
        anchors.Add(new RowAnchor(NormalizeRollBandLabel(fragment.Text), fragment.Top, sortOrder));
    }

    return anchors;
}
/// <summary>
/// Selects the fragments that belong to the table body and passes them through
/// <c>SplitBoundaryCrossingAffixFragments</c>.
/// </summary>
/// <remarks>
/// A fragment is kept when it lies at or below <paramref name="bodyStartTop"/> and above
/// <paramref name="keyTop"/> minus the grouping tolerance, and is not a footer page number,
/// a potential row label, the fragment of a row anchor, or one of
/// <paramref name="excludedFragments"/>.
/// </remarks>
/// <returns>The body fragments after the boundary-crossing split.</returns>
internal static List<XmlTextFragment> BuildBodyFragments(
    IReadOnlyList<XmlTextFragment> fragments,
    int bodyStartTop,
    int keyTop,
    int leftCutoff,
    IReadOnlyList<RowAnchor> rowAnchors,
    IReadOnlyCollection<XmlTextFragment> excludedFragments,
    IReadOnlyList<(string Key, double CenterX)> columnCenters,
    ISet<string> affixLegendSymbols)
{
    bool IsRowAnchorFragment(XmlTextFragment fragment) =>
        rowAnchors.Any(anchor =>
            anchor.Top == fragment.Top &&
            string.Equals(anchor.Label, NormalizeRollBandLabel(fragment.Text), StringComparison.OrdinalIgnoreCase));

    bool BelongsToBody(XmlTextFragment fragment)
    {
        if (fragment.Top < bodyStartTop || fragment.Top >= keyTop - TopGroupingTolerance)
        {
            return false;
        }

        if (IsFooterPageNumberFragment(fragment, keyTop) || IsPotentialRowLabelFragment(fragment, leftCutoff))
        {
            return false;
        }

        return !IsRowAnchorFragment(fragment) && !excludedFragments.Contains(fragment);
    }

    var candidates = fragments.Where(BelongsToBody).ToList();
    return SplitBoundaryCrossingAffixFragments(candidates, columnCenters, affixLegendSymbols);
}
/// <summary>
/// Moves affix-like lines at the start of a cell to the end of the cell directly above it
/// (same group and column, previous row index).
/// </summary>
/// <remarks>
/// Rows are processed top to bottom. A cell is left untouched when it has no leading affix
/// lines or when all of its lines are affix-like. Entries are indexed once by
/// (row, group, column) instead of scanning the whole list for every position.
/// </remarks>
/// <param name="cellEntries">Cell entries whose <c>Lines</c> lists are modified in place.</param>
/// <param name="affixLegendSymbols">Legend symbols passed through to affix classification.</param>
/// <exception cref="InvalidOperationException">
/// More than one entry exists for an inspected (row, group, column) position.
/// </exception>
internal static void RepairLeadingAffixLeakage(List<ColumnarCellEntry> cellEntries, ISet<string> affixLegendSymbols)
{
    if (cellEntries.Count == 0)
    {
        return;
    }

    var maxRowIndex = cellEntries.Max(item => item.RowIndex);
    var axes = cellEntries
        .Select(item => (item.GroupKey, item.ColumnKey))
        .Distinct()
        .ToList();

    // Tuple keys compare strings ordinally, matching the ordinal comparison used for group and
    // column keys. A lookup (rather than a dictionary) keeps duplicates detectable below.
    var entriesByPosition = cellEntries.ToLookup(item => (item.RowIndex, item.GroupKey, item.ColumnKey));

    for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++)
    {
        foreach (var (groupKey, columnKey) in axes)
        {
            // SingleOrDefault yields null for a missing cell and throws on duplicates.
            var current = entriesByPosition[(rowIndex, groupKey, columnKey)].SingleOrDefault();
            var next = entriesByPosition[(rowIndex + 1, groupKey, columnKey)].SingleOrDefault();
            if (current is null || next is null)
            {
                continue;
            }

            var leadingAffixCount = 0;
            while (leadingAffixCount < next.Lines.Count && IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols))
            {
                leadingAffixCount++;
            }

            // Nothing to move, or the cell consists solely of affix lines: leave it alone.
            if (leadingAffixCount == 0 || leadingAffixCount == next.Lines.Count)
            {
                continue;
            }

            current.Lines.AddRange(next.Lines.Take(leadingAffixCount));
            next.Lines.RemoveRange(0, leadingAffixCount);
        }
    }
}
/// <summary>
/// Computes the top coordinate at which the row of <paramref name="next"/> begins.
/// </summary>
/// <remarks>
/// Among the body lines between the two anchors, the last place where an affix-like line is
/// directly followed by a non-affix line is used as the split, at the midpoint of the two
/// lines plus one. Without such a place, the midpoint of the two anchors plus one is used.
/// </remarks>
/// <returns>The boundary top coordinate.</returns>
internal static int ResolveRowBoundaryTop(
    RowAnchor current,
    RowAnchor next,
    IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines)
{
    static int BoundaryBetween(int upperTop, int lowerTop) =>
        (int)Math.Floor((upperTop + lowerTop) / 2.0) + 1;

    var between = bodyLines
        .Where(line => current.Top <= line.Top && line.Top < next.Top)
        .OrderBy(line => line.Top)
        .ToArray();

    // Walk adjacent pairs from the bottom up so the lowest affix-to-prose transition wins.
    for (var lower = between.Length - 1; lower > 0; lower--)
    {
        var upper = lower - 1;
        if (!between[upper].IsAffixLike || between[lower].IsAffixLike)
        {
            continue;
        }

        return BoundaryBetween(between[upper].Top, between[lower].Top);
    }

    return BoundaryBetween(current.Top, next.Top);
}
/// <summary>
/// Parses every cell entry and appends one cell artifact and one result per entry to the
/// supplied lists, in the order of <paramref name="cellEntries"/>.
/// </summary>
/// <param name="cellEntries">Cell entries to parse.</param>
/// <param name="affixLegendSymbols">Legend symbols passed through to the cell parser.</param>
/// <param name="parsedCells">Receives the per-cell artifacts.</param>
/// <param name="parsedResults">Receives the per-cell results.</param>
/// <param name="validationErrors">Receives the cell validation errors, prefixed with the cell identifier.</param>
internal static void BuildParsedArtifacts(
    IReadOnlyList<ColumnarCellEntry> cellEntries,
    ISet<string> affixLegendSymbols,
    List<ParsedCriticalCellArtifact> parsedCells,
    List<ParsedCriticalResult> parsedResults,
    List<string> validationErrors)
{
    foreach (var entry in cellEntries)
    {
        var parsed = CriticalCellTextParser.Parse(entry.Lines, affixLegendSymbols);

        var cellIdentifier = BuildCellIdentifier(entry);
        foreach (var error in parsed.ValidationErrors)
        {
            validationErrors.Add($"Cell '{cellIdentifier}': {error}");
        }

        // The artifact gets its own copy of the lines, detached from the mutable entry.
        var artifact = new ParsedCriticalCellArtifact(
            entry.GroupKey,
            entry.RollBandLabel,
            entry.ColumnKey,
            new List<string>(entry.Lines),
            parsed.BaseLines,
            parsed.RawCellText,
            parsed.DescriptionText,
            parsed.RawAffixText,
            parsed.Branches);
        parsedCells.Add(artifact);

        var result = new ParsedCriticalResult(
            entry.GroupKey,
            entry.ColumnKey,
            entry.RollBandLabel,
            parsed.RawCellText,
            parsed.DescriptionText,
            parsed.RawAffixText,
            parsed.Branches);
        parsedResults.Add(result);
    }
}
// Formats "rollBand/column" for ungrouped cells and "rollBand/group/column" for grouped ones.
private static string BuildCellIdentifier(ColumnarCellEntry cellEntry)
{
    if (cellEntry.GroupKey is null)
    {
        return $"{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}";
    }

    return $"{cellEntry.RollBandLabel}/{cellEntry.GroupKey}/{cellEntry.ColumnKey}";
}
// True when the trimmed text is two or three digits, optional whitespace, then a trailing
// '-' (for example "36-").
private static bool LooksLikeSplitRollBandStart(string value)
{
    var trimmed = value.Trim();
    return Regex.IsMatch(trimmed, @"^\d{2,3}\s*-$");
}

View File

@@ -36,13 +36,7 @@ public sealed class GroupedVariantCriticalTableParser
groupHeaders.Max(item => item.Top),
columnHeaders.Max(item => item.Top))
+ CriticalTableParserSupport.HeaderToBodyMinimumGap;
var keyTop = fragments
.Where(item =>
string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) ||
item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
.Select(item => (int?)item.Top)
.Min() ?? int.MaxValue;
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop);
var leftCutoff = columnHeaders.Min(item => item.Left) - 10;
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
@@ -50,11 +44,7 @@ public sealed class GroupedVariantCriticalTableParser
leftCutoff,
bodyStartTop,
keyTop);
var rowAnchors = rowLabelFragments
.OrderBy(item => item.Top)
.Select((item, index) => new RowAnchor(CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), item.Top, index + 1))
.ToList();
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
if (rowAnchors.Count == 0)
{
@@ -65,34 +55,33 @@ public sealed class GroupedVariantCriticalTableParser
.Select(item => (item.CompositeKey, item.CenterX))
.ToList();
var bodyFragments = fragments
.Where(item =>
item.Top >= bodyStartTop &&
item.Top < keyTop - CriticalTableParserSupport.TopGroupingTolerance &&
!CriticalTableParserSupport.IsFooterPageNumberFragment(item, keyTop) &&
!CriticalTableParserSupport.IsPotentialRowLabelFragment(item, leftCutoff) &&
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) &&
!groupHeaders.Contains(item) &&
!columnHeaders.Contains(item))
.ToList();
bodyFragments = CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
var excludedFragments = groupHeaders.Concat(columnHeaders).ToList();
var bodyFragments = CriticalTableParserSupport.BuildBodyFragments(
fragments,
bodyStartTop,
keyTop,
leftCutoff,
rowAnchors,
excludedFragments,
columnCenters,
affixLegendSymbols);
var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
var parsedRollBands = rowAnchors
.Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
.ToList();
var cellEntries = new List<CellEntry>();
var cellEntries = new List<ColumnarCellEntry>();
for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
{
var rowStart = rowIndex == 0
? bodyStartTop
: ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines);
: CriticalTableParserSupport.ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines);
var rowEnd = rowIndex == rowAnchors.Count - 1
? keyTop - 1
: ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines);
: CriticalTableParserSupport.ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines);
var rowFragments = bodyFragments
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
@@ -112,7 +101,7 @@ public sealed class GroupedVariantCriticalTableParser
continue;
}
cellEntries.Add(new CellEntry(
cellEntries.Add(new ColumnarCellEntry(
anchor.GroupKey,
rowAnchors[rowIndex].Label,
rowIndex,
@@ -121,45 +110,11 @@ public sealed class GroupedVariantCriticalTableParser
}
}
RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);
CriticalTableParserSupport.RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);
var parsedCells = new List<ParsedCriticalCellArtifact>();
var parsedResults = new List<ParsedCriticalResult>();
foreach (var cellEntry in cellEntries
.OrderBy(item => item.RowIndex)
.ThenBy(item => item.GroupKey, StringComparer.Ordinal)
.ThenBy(item => item.ColumnKey, StringComparer.Ordinal))
{
var segmentCount = CriticalTableParserSupport.CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols);
if (segmentCount > 2)
{
validationErrors.Add($"Cell '{cellEntry.RollBandLabel}/{cellEntry.GroupKey}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
}
var rawAffixLines = cellEntry.Lines.Where(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var descriptionLines = cellEntry.Lines.Where(line => !CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines);
var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', descriptionLines));
var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
parsedCells.Add(new ParsedCriticalCellArtifact(
cellEntry.GroupKey,
cellEntry.RollBandLabel,
cellEntry.ColumnKey,
cellEntry.Lines,
rawCellText,
descriptionText,
rawAffixText));
parsedResults.Add(new ParsedCriticalResult(
cellEntry.GroupKey,
cellEntry.ColumnKey,
cellEntry.RollBandLabel,
rawCellText,
descriptionText,
rawAffixText));
}
CriticalTableParserSupport.BuildParsedArtifacts(cellEntries, affixLegendSymbols, parsedCells, parsedResults, validationErrors);
var expectedCellCount = rowAnchors.Count * ExpectedGroups.Length * ExpectedColumns.Length;
if (parsedCells.Count != expectedCellCount)
@@ -235,72 +190,4 @@ public sealed class GroupedVariantCriticalTableParser
throw new InvalidOperationException("Could not find the grouped-variant column header row in the XML artifact.");
}
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries, ISet<string> affixLegendSymbols)
{
var maxRowIndex = cellEntries.Count == 0 ? -1 : cellEntries.Max(item => item.RowIndex);
var axes = cellEntries
.Select(item => (item.GroupKey, item.ColumnKey))
.Distinct()
.ToList();
for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++)
{
foreach (var (groupKey, columnKey) in axes)
{
var current = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex && item.GroupKey == groupKey && item.ColumnKey == columnKey);
var next = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex + 1 && item.GroupKey == groupKey && item.ColumnKey == columnKey);
if (current is null || next is null)
{
continue;
}
var leadingAffixCount = 0;
while (leadingAffixCount < next.Lines.Count && CriticalTableParserSupport.IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols))
{
leadingAffixCount++;
}
if (leadingAffixCount == 0 || leadingAffixCount == next.Lines.Count)
{
continue;
}
current.Lines.AddRange(next.Lines.Take(leadingAffixCount));
next.Lines.RemoveRange(0, leadingAffixCount);
}
}
}
private static int ResolveRowBoundaryTop(
RowAnchor current,
RowAnchor next,
IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines)
{
var linesBetweenLabels = bodyLines
.Where(item => item.Top >= current.Top && item.Top < next.Top)
.OrderBy(item => item.Top)
.ToList();
for (var index = linesBetweenLabels.Count - 2; index >= 0; index--)
{
if (linesBetweenLabels[index].IsAffixLike && !linesBetweenLabels[index + 1].IsAffixLike)
{
return (int)Math.Floor((linesBetweenLabels[index].Top + linesBetweenLabels[index + 1].Top) / 2.0) + 1;
}
}
return (int)Math.Floor((current.Top + next.Top) / 2.0) + 1;
}
private sealed record RowAnchor(string Label, int Top, int SortOrder);
/// <summary>
/// Working record for one table cell: where it sits (group, roll band, row index, column) and
/// the text lines collected for it. <see cref="Lines"/> exposes the list passed to the
/// constructor, so callers can add to or remove from it in place.
/// </summary>
private sealed class CellEntry(string groupKey, string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
{
    /// <summary>Key of the group the cell belongs to.</summary>
    public string GroupKey => groupKey;

    /// <summary>Label of the roll band (row) the cell belongs to.</summary>
    public string RollBandLabel => rollBandLabel;

    /// <summary>Index of the cell's row.</summary>
    public int RowIndex => rowIndex;

    /// <summary>Key of the column the cell belongs to.</summary>
    public string ColumnKey => columnKey;

    /// <summary>The cell's text lines; the same list instance supplied at construction.</summary>
    public List<string> Lines => lines;
}
}

View File

@@ -0,0 +1,19 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// One parsed branch of a critical-table cell: the condition text together with the branch's
/// raw text, prose description and affix text, plus its position among the cell's branches.
/// </summary>
// NOTE(review): the accepted values of BranchKind and the format of ConditionKey are not visible
// in this file - confirm against the code that creates branches.
public sealed class ParsedCriticalBranch(
    string branchKind,
    string? conditionKey,
    string conditionText,
    string rawText,
    string descriptionText,
    string? rawAffixText,
    int sortOrder)
{
    /// <summary>Kind of branch.</summary>
    public string BranchKind => branchKind;

    /// <summary>Optional key for the condition; may be <c>null</c>.</summary>
    public string? ConditionKey => conditionKey;

    /// <summary>Condition text of the branch.</summary>
    public string ConditionText => conditionText;

    /// <summary>Raw text of the branch.</summary>
    public string RawText => rawText;

    /// <summary>Prose description of the branch.</summary>
    public string DescriptionText => descriptionText;

    /// <summary>Affix text of the branch; may be <c>null</c>.</summary>
    public string? RawAffixText => rawAffixText;

    /// <summary>Sort order of the branch.</summary>
    public int SortOrder => sortOrder;
}

View File

@@ -5,15 +5,19 @@ public sealed class ParsedCriticalCellArtifact(
string rollBandLabel,
string columnKey,
IReadOnlyList<string> lines,
IReadOnlyList<string> baseLines,
string rawCellText,
string descriptionText,
string? rawAffixText)
string? rawAffixText,
IReadOnlyList<ParsedCriticalBranch> branches)
{
public string? GroupKey { get; } = groupKey;
public string RollBandLabel { get; } = rollBandLabel;
public string ColumnKey { get; } = columnKey;
public IReadOnlyList<string> Lines { get; } = lines;
public IReadOnlyList<string> BaseLines { get; } = baseLines;
public string RawCellText { get; } = rawCellText;
public string DescriptionText { get; } = descriptionText;
public string? RawAffixText { get; } = rawAffixText;
public IReadOnlyList<ParsedCriticalBranch> Branches { get; } = branches;
}

View File

@@ -6,7 +6,8 @@ public sealed class ParsedCriticalResult(
string rollBandLabel,
string rawCellText,
string descriptionText,
string? rawAffixText)
string? rawAffixText,
IReadOnlyList<ParsedCriticalBranch> branches)
{
public string? GroupKey { get; } = groupKey;
public string ColumnKey { get; } = columnKey;
@@ -14,4 +15,5 @@ public sealed class ParsedCriticalResult(
public string RawCellText { get; } = rawCellText;
public string DescriptionText { get; } = descriptionText;
public string? RawAffixText { get; } = rawAffixText;
public IReadOnlyList<ParsedCriticalBranch> Branches { get; } = branches;
}

View File

@@ -0,0 +1,3 @@
namespace RolemasterDb.ImportTool.Parsing;
/// <summary>
/// Anchor for one table row, shared by the table parsers: a label, the Top coordinate used when
/// resolving row boundaries, and a sort order.
/// </summary>
// NOTE(review): Label is presumably the normalized roll-band label and SortOrder its 1-based
// position among the anchors ordered by Top - confirm against the code that creates the anchors.
internal sealed record RowAnchor(string Label, int Top, int SortOrder);

View File

@@ -15,13 +15,7 @@ public sealed class StandardCriticalTableParser
.ToList();
var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap;
var keyTop = fragments
.Where(item =>
string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) ||
item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
.Select(item => (int?)item.Top)
.Min() ?? int.MaxValue;
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop);
var leftCutoff = headerFragments.Min(item => item.Left) - 10;
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
@@ -29,44 +23,39 @@ public sealed class StandardCriticalTableParser
leftCutoff,
bodyStartTop,
keyTop);
var rowAnchors = rowLabelFragments
.OrderBy(item => item.Top)
.Select((item, index) => new RowAnchor(CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), item.Top, index + 1))
.ToList();
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
if (rowAnchors.Count == 0)
{
validationErrors.Add("No roll-band labels were found in the XML artifact.");
}
var bodyFragments = fragments
.Where(item =>
item.Top >= bodyStartTop &&
item.Top < keyTop - CriticalTableParserSupport.TopGroupingTolerance &&
!CriticalTableParserSupport.IsFooterPageNumberFragment(item, keyTop) &&
!CriticalTableParserSupport.IsPotentialRowLabelFragment(item, leftCutoff) &&
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) &&
!headerFragments.Contains(item))
.ToList();
bodyFragments = CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
var bodyFragments = CriticalTableParserSupport.BuildBodyFragments(
fragments,
bodyStartTop,
keyTop,
leftCutoff,
rowAnchors,
headerFragments,
columnCenters,
affixLegendSymbols);
var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
var parsedRollBands = rowAnchors
.Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
.ToList();
var cellEntries = new List<CellEntry>();
var cellEntries = new List<ColumnarCellEntry>();
for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
{
var rowStart = rowIndex == 0
? bodyStartTop
: ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines);
: CriticalTableParserSupport.ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines);
var rowEnd = rowIndex == rowAnchors.Count - 1
? keyTop - 1
: ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines);
: CriticalTableParserSupport.ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines);
var rowFragments = bodyFragments
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
@@ -86,7 +75,8 @@ public sealed class StandardCriticalTableParser
continue;
}
cellEntries.Add(new CellEntry(
cellEntries.Add(new ColumnarCellEntry(
null,
rowAnchors[rowIndex].Label,
rowIndex,
columnAnchor.Key,
@@ -94,44 +84,11 @@ public sealed class StandardCriticalTableParser
}
}
RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);
CriticalTableParserSupport.RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);
var parsedCells = new List<ParsedCriticalCellArtifact>();
var parsedResults = new List<ParsedCriticalResult>();
foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey))
{
var segmentCount = CriticalTableParserSupport.CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols);
if (segmentCount > 2)
{
validationErrors.Add(
$"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
}
var rawAffixLines = cellEntry.Lines.Where(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var descriptionLines = cellEntry.Lines.Where(line => !CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines);
var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', descriptionLines));
var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
parsedCells.Add(new ParsedCriticalCellArtifact(
null,
cellEntry.RollBandLabel,
cellEntry.ColumnKey,
cellEntry.Lines,
rawCellText,
descriptionText,
rawAffixText));
parsedResults.Add(new ParsedCriticalResult(
null,
cellEntry.ColumnKey,
cellEntry.RollBandLabel,
rawCellText,
descriptionText,
rawAffixText));
}
CriticalTableParserSupport.BuildParsedArtifacts(cellEntries, affixLegendSymbols, parsedCells, parsedResults, validationErrors);
if (columnCenters.Count != 5)
{
@@ -185,68 +142,4 @@ public sealed class StandardCriticalTableParser
throw new InvalidOperationException("Could not find the standard-table A-E header row in the XML artifact.");
}
/// <summary>
/// Moves affix-like lines that open a cell to the end of the cell directly above it, where
/// "above" means the same column key at the previous row index.
/// </summary>
/// <remarks>
/// A cell is left untouched when none of its leading lines are affix-like, or when every one of
/// its lines is affix-like.
/// </remarks>
/// <param name="cellEntries">Cells to repair; their <c>Lines</c> lists are modified in place.</param>
/// <param name="affixLegendSymbols">Legend symbols handed to the affix-line classifier.</param>
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries, ISet<string> affixLegendSymbols)
{
    var maxRowIndex = cellEntries.Count == 0 ? -1 : cellEntries.Max(item => item.RowIndex);

    // The lookups below compare column keys with ordinal equality, so the keys must be
    // de-duplicated the same way. A case-insensitive Distinct would drop keys that differ only
    // by case and silently leave their cells unrepaired.
    var columnKeys = cellEntries.Select(item => item.ColumnKey).Distinct(StringComparer.Ordinal).ToList();

    for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++)
    {
        foreach (var columnKey in columnKeys)
        {
            var current = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex && item.ColumnKey == columnKey);
            var next = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex + 1 && item.ColumnKey == columnKey);
            if (current is null || next is null)
            {
                continue;
            }

            // Count the run of affix-like lines at the top of the lower cell.
            var leadingAffixCount = 0;
            while (leadingAffixCount < next.Lines.Count && CriticalTableParserSupport.IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols))
            {
                leadingAffixCount++;
            }

            // Nothing leaked, or the whole cell is affix-like and is left where it is.
            if (leadingAffixCount == 0 || leadingAffixCount == next.Lines.Count)
            {
                continue;
            }

            current.Lines.AddRange(next.Lines.Take(leadingAffixCount));
            next.Lines.RemoveRange(0, leadingAffixCount);
        }
    }
}
/// <summary>
/// Computes the Top coordinate that separates the rows of two consecutive anchors.
/// </summary>
/// <remarks>
/// Only body lines whose Top lies in <c>[current.Top, next.Top)</c> are considered. If an
/// affix-like line is immediately followed by a non-affix line, the boundary is placed between
/// that pair (the lowest such pair wins). Otherwise the boundary falls between the two anchors.
/// </remarks>
/// <param name="current">Anchor of the upper row.</param>
/// <param name="next">Anchor of the lower row.</param>
/// <param name="bodyLines">Body lines as (Top, IsAffixLike) pairs.</param>
/// <returns>One more than the floored midpoint of the chosen pair of Top values.</returns>
private static int ResolveRowBoundaryTop(
    RowAnchor current,
    RowAnchor next,
    IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines)
{
    var candidates = bodyLines
        .Where(line => current.Top <= line.Top && line.Top < next.Top)
        .OrderBy(line => line.Top)
        .ToArray();

    // Walk upwards from the bottom so the affix-to-prose transition nearest the next label wins.
    for (var lower = candidates.Length - 1; lower >= 1; lower--)
    {
        var upper = lower - 1;
        if (candidates[upper].IsAffixLike && !candidates[lower].IsAffixLike)
        {
            return SplitPoint(candidates[upper].Top, candidates[lower].Top);
        }
    }

    return SplitPoint(current.Top, next.Top);

    static int SplitPoint(int upperTop, int lowerTop) => (int)Math.Floor((upperTop + lowerTop) / 2.0) + 1;
}
/// <summary>
/// Anchor for one table row, built from a row-label fragment.
/// </summary>
/// <param name="Label">The fragment's text after roll-band label normalization.</param>
/// <param name="Top">The fragment's Top coordinate.</param>
/// <param name="SortOrder">1-based position of the anchor when the fragments are ordered by Top.</param>
private sealed record RowAnchor(string Label, int Top, int SortOrder);
/// <summary>
/// Working record for one table cell: where it sits (roll band, row index, column) and the text
/// lines collected for it. <see cref="Lines"/> exposes the list passed to the constructor, so
/// callers can add to or remove from it in place.
/// </summary>
private sealed class CellEntry(string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
{
    /// <summary>Label of the roll band (row) the cell belongs to.</summary>
    public string RollBandLabel => rollBandLabel;

    /// <summary>Index of the cell's row.</summary>
    public int RowIndex => rowIndex;

    /// <summary>Key of the column the cell belongs to.</summary>
    public string ColumnKey => columnKey;

    /// <summary>The cell's text lines; the same list instance supplied at construction.</summary>
    public List<string> Lines => lines;
}
}

View File

@@ -28,13 +28,7 @@ public sealed class VariantColumnCriticalTableParser
.ToList();
var bodyStartTop = headerFragments.Max(item => item.Top) + CriticalTableParserSupport.HeaderToBodyMinimumGap;
var keyTop = fragments
.Where(item =>
string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) ||
item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) ||
item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase))
.Select(item => (int?)item.Top)
.Min() ?? int.MaxValue;
var keyTop = CriticalTableParserSupport.FindKeyTop(fragments);
var affixLegendSymbols = CriticalTableParserSupport.DetectAffixLegendSymbols(fragments, keyTop);
var leftCutoff = headerFragments.Min(item => item.Left) - 10;
var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(
@@ -42,11 +36,7 @@ public sealed class VariantColumnCriticalTableParser
leftCutoff,
bodyStartTop,
keyTop);
var rowAnchors = rowLabelFragments
.OrderBy(item => item.Top)
.Select((item, index) => new RowAnchor(CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), item.Top, index + 1))
.ToList();
var rowAnchors = CriticalTableParserSupport.CreateRowAnchors(rowLabelFragments);
if (rowAnchors.Count == 0)
{
@@ -57,33 +47,32 @@ public sealed class VariantColumnCriticalTableParser
.Select(item => (item.Key, item.CenterX))
.ToList();
var bodyFragments = fragments
.Where(item =>
item.Top >= bodyStartTop &&
item.Top < keyTop - CriticalTableParserSupport.TopGroupingTolerance &&
!CriticalTableParserSupport.IsFooterPageNumberFragment(item, keyTop) &&
!CriticalTableParserSupport.IsPotentialRowLabelFragment(item, leftCutoff) &&
!rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, CriticalTableParserSupport.NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) &&
!headerFragments.Contains(item))
.ToList();
bodyFragments = CriticalTableParserSupport.SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols);
var bodyFragments = CriticalTableParserSupport.BuildBodyFragments(
fragments,
bodyStartTop,
keyTop,
leftCutoff,
rowAnchors,
headerFragments,
columnCenters,
affixLegendSymbols);
var bodyLines = CriticalTableParserSupport.BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols);
var parsedRollBands = rowAnchors
.Select(anchor => CriticalTableParserSupport.CreateRollBand(anchor.Label, anchor.SortOrder))
.ToList();
var cellEntries = new List<CellEntry>();
var cellEntries = new List<ColumnarCellEntry>();
for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
{
var rowStart = rowIndex == 0
? bodyStartTop
: ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines);
: CriticalTableParserSupport.ResolveRowBoundaryTop(rowAnchors[rowIndex - 1], rowAnchors[rowIndex], bodyLines);
var rowEnd = rowIndex == rowAnchors.Count - 1
? keyTop - 1
: ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines);
: CriticalTableParserSupport.ResolveRowBoundaryTop(rowAnchors[rowIndex], rowAnchors[rowIndex + 1], bodyLines);
var rowFragments = bodyFragments
.Where(item => item.Top >= rowStart && item.Top < rowEnd)
@@ -103,7 +92,8 @@ public sealed class VariantColumnCriticalTableParser
continue;
}
cellEntries.Add(new CellEntry(
cellEntries.Add(new ColumnarCellEntry(
null,
rowAnchors[rowIndex].Label,
rowIndex,
columnAnchor.Key,
@@ -111,42 +101,11 @@ public sealed class VariantColumnCriticalTableParser
}
}
RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);
CriticalTableParserSupport.RepairLeadingAffixLeakage(cellEntries, affixLegendSymbols);
var parsedCells = new List<ParsedCriticalCellArtifact>();
var parsedResults = new List<ParsedCriticalResult>();
foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey, StringComparer.Ordinal))
{
var segmentCount = CriticalTableParserSupport.CountLineTypeSegments(cellEntry.Lines, affixLegendSymbols);
if (segmentCount > 2)
{
validationErrors.Add($"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' interleaves prose and affix lines.");
}
var rawAffixLines = cellEntry.Lines.Where(line => CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var descriptionLines = cellEntry.Lines.Where(line => !CriticalTableParserSupport.IsAffixLikeLine(line, affixLegendSymbols)).ToList();
var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines);
var descriptionText = CriticalTableParserSupport.CollapseWhitespace(string.Join(' ', descriptionLines));
var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);
parsedCells.Add(new ParsedCriticalCellArtifact(
null,
cellEntry.RollBandLabel,
cellEntry.ColumnKey,
cellEntry.Lines,
rawCellText,
descriptionText,
rawAffixText));
parsedResults.Add(new ParsedCriticalResult(
null,
cellEntry.ColumnKey,
cellEntry.RollBandLabel,
rawCellText,
descriptionText,
rawAffixText));
}
CriticalTableParserSupport.BuildParsedArtifacts(cellEntries, affixLegendSymbols, parsedCells, parsedResults, validationErrors);
if (columnAnchors.Count != ExpectedColumns.Length)
{
@@ -208,69 +167,5 @@ public sealed class VariantColumnCriticalTableParser
ExpectedColumns.SingleOrDefault(item => string.Equals(item.Label, value.Trim(), StringComparison.OrdinalIgnoreCase))
?? throw new InvalidOperationException($"Unsupported variant column label '{value}'.");
/// <summary>
/// Moves affix-like lines that open a cell to the end of the cell directly above it, where
/// "above" means the same column key at the previous row index.
/// </summary>
/// <remarks>
/// A cell is left untouched when none of its leading lines are affix-like, or when every one of
/// its lines is affix-like.
/// </remarks>
/// <param name="cellEntries">Cells to repair; their <c>Lines</c> lists are modified in place.</param>
/// <param name="affixLegendSymbols">Legend symbols handed to the affix-line classifier.</param>
private static void RepairLeadingAffixLeakage(List<CellEntry> cellEntries, ISet<string> affixLegendSymbols)
{
    var maxRowIndex = cellEntries.Count == 0 ? -1 : cellEntries.Max(item => item.RowIndex);

    // The lookups below compare column keys with ordinal equality, so the keys must be
    // de-duplicated the same way. A case-insensitive Distinct would drop keys that differ only
    // by case and silently leave their cells unrepaired.
    var columnKeys = cellEntries.Select(item => item.ColumnKey).Distinct(StringComparer.Ordinal).ToList();

    for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++)
    {
        foreach (var columnKey in columnKeys)
        {
            var current = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex && item.ColumnKey == columnKey);
            var next = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex + 1 && item.ColumnKey == columnKey);
            if (current is null || next is null)
            {
                continue;
            }

            // Count the run of affix-like lines at the top of the lower cell.
            var leadingAffixCount = 0;
            while (leadingAffixCount < next.Lines.Count && CriticalTableParserSupport.IsAffixLikeLine(next.Lines[leadingAffixCount], affixLegendSymbols))
            {
                leadingAffixCount++;
            }

            // Nothing leaked, or the whole cell is affix-like and is left where it is.
            if (leadingAffixCount == 0 || leadingAffixCount == next.Lines.Count)
            {
                continue;
            }

            current.Lines.AddRange(next.Lines.Take(leadingAffixCount));
            next.Lines.RemoveRange(0, leadingAffixCount);
        }
    }
}
/// <summary>
/// Computes the Top coordinate that separates the rows of two consecutive anchors.
/// </summary>
/// <remarks>
/// Only body lines whose Top lies in <c>[current.Top, next.Top)</c> are considered. If an
/// affix-like line is immediately followed by a non-affix line, the boundary is placed between
/// that pair (the lowest such pair wins). Otherwise the boundary falls between the two anchors.
/// </remarks>
/// <param name="current">Anchor of the upper row.</param>
/// <param name="next">Anchor of the lower row.</param>
/// <param name="bodyLines">Body lines as (Top, IsAffixLike) pairs.</param>
/// <returns>One more than the floored midpoint of the chosen pair of Top values.</returns>
private static int ResolveRowBoundaryTop(
    RowAnchor current,
    RowAnchor next,
    IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines)
{
    var candidates = bodyLines
        .Where(line => current.Top <= line.Top && line.Top < next.Top)
        .OrderBy(line => line.Top)
        .ToArray();

    // Walk upwards from the bottom so the affix-to-prose transition nearest the next label wins.
    for (var lower = candidates.Length - 1; lower >= 1; lower--)
    {
        var upper = lower - 1;
        if (candidates[upper].IsAffixLike && !candidates[lower].IsAffixLike)
        {
            return SplitPoint(candidates[upper].Top, candidates[lower].Top);
        }
    }

    return SplitPoint(current.Top, next.Top);

    static int SplitPoint(int upperTop, int lowerTop) => (int)Math.Floor((upperTop + lowerTop) / 2.0) + 1;
}
/// <summary>
/// Pairs a column key with the column's label text.
/// </summary>
// NOTE(review): presumably the element type of ExpectedColumns, whose Label is matched
// case-insensitively against trimmed header text - confirm against that declaration.
private sealed record ColumnDefinition(string Key, string Label);
/// <summary>
/// Anchor for one table row, built from a row-label fragment.
/// </summary>
/// <param name="Label">The fragment's text after roll-band label normalization.</param>
/// <param name="Top">The fragment's Top coordinate.</param>
/// <param name="SortOrder">1-based position of the anchor when the fragments are ordered by Top.</param>
private sealed record RowAnchor(string Label, int Top, int SortOrder);
/// <summary>
/// Working record for one table cell: where it sits (roll band, row index, column) and the text
/// lines collected for it. <see cref="Lines"/> exposes the list passed to the constructor, so
/// callers can add to or remove from it in place.
/// </summary>
private sealed class CellEntry(string rollBandLabel, int rowIndex, string columnKey, List<string> lines)
{
    /// <summary>Label of the roll band (row) the cell belongs to.</summary>
    public string RollBandLabel => rollBandLabel;

    /// <summary>Index of the cell's row.</summary>
    public int RowIndex => rowIndex;

    /// <summary>Key of the column the cell belongs to.</summary>
    public string ColumnKey => columnKey;

    /// <summary>The cell's text lines; the same list instance supplied at construction.</summary>
    public List<string> Lines => lines;
}
}