diff --git a/sources/Void OCR CleanScan v2.pdf b/sources/Void OCR CleanScan v2.pdf new file mode 100644 index 0000000..0c88552 Binary files /dev/null and b/sources/Void OCR CleanScan v2.pdf differ diff --git a/sources/critical-import-manifest.json b/sources/critical-import-manifest.json index 1882795..6f4ddb9 100644 --- a/sources/critical-import-manifest.json +++ b/sources/critical-import-manifest.json @@ -174,7 +174,7 @@ "family": "standard", "extractionMethod": "ocr", "axisTemplateSlug": "mana-standard-19", - "pdfPath": "sources/Void.pdf", + "pdfPath": "sources/Void OCR CleanScan v2.pdf", "enabled": true } ] diff --git a/src/RolemasterDb.App/rolemaster.db b/src/RolemasterDb.App/rolemaster.db index 3c23734..23f1301 100644 Binary files a/src/RolemasterDb.App/rolemaster.db and b/src/RolemasterDb.App/rolemaster.db differ diff --git a/src/RolemasterDb.ImportTool.Tests/CriticalTableParserSupportTests.cs b/src/RolemasterDb.ImportTool.Tests/CriticalTableParserSupportTests.cs new file mode 100644 index 0000000..24f836e --- /dev/null +++ b/src/RolemasterDb.ImportTool.Tests/CriticalTableParserSupportTests.cs @@ -0,0 +1,32 @@ +using RolemasterDb.ImportTool.Parsing; + +namespace RolemasterDb.ImportTool.Tests; + +public sealed class CriticalTableParserSupportTests +{ + [Theory] + [InlineData("7-70", "67-70")] + [InlineData("6-10", "06-10")] + [InlineData("1-95", "91-95")] + public void NormalizeRollBandLabel_repairs_known_ocr_missing_leading_digit_cases(string damagedLabel, string expectedLabel) + { + Assert.True(CriticalTableParserSupport.IsRollBandLabel(damagedLabel)); + Assert.Equal(expectedLabel, CriticalTableParserSupport.NormalizeRollBandLabel(damagedLabel)); + } + + [Fact] + public void FindRowLabelFragments_keeps_repaired_ocr_row_labels_in_sequence() + { + List fragments = + [ + new PositionedTextFragment(1, 100, 10, 20, 10, "61-65"), + new PositionedTextFragment(1, 120, 10, 20, 10, "7-70"), + new PositionedTextFragment(1, 140, 10, 20, 10, "71-75") + ]; + + var rowLabelFragments = CriticalTableParserSupport.FindRowLabelFragments(fragments, leftCutoff: 100, bodyStartTop: 90, keyTop: 200); + var labels = rowLabelFragments.Select(item => CriticalTableParserSupport.NormalizeRollBandLabel(item.Text)).ToList(); + + Assert.Equal(["61-65", "67-70", "71-75"], labels); + } +} \ No newline at end of file diff --git a/src/RolemasterDb.ImportTool/Parsing/CriticalTableParserSupport.cs b/src/RolemasterDb.ImportTool/Parsing/CriticalTableParserSupport.cs index 32e296e..f995c6b 100644 --- a/src/RolemasterDb.ImportTool/Parsing/CriticalTableParserSupport.cs +++ b/src/RolemasterDb.ImportTool/Parsing/CriticalTableParserSupport.cs @@ -1,7 +1,6 @@ using System.Text.RegularExpressions; using System.Xml; using System.Xml.Linq; - using RolemasterDb.App.Domain; using SharedParsing = RolemasterDb.CriticalParsing; @@ -21,34 +20,41 @@ internal static class CriticalTableParserSupport private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled); private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-|–)\d+\)$", RegexOptions.Compiled); private static readonly Regex BoundaryBonusLineRegex = new(@"^(?:all allies|all foe's allies|all foes|all opponents)\b", RegexOptions.IgnoreCase | RegexOptions.Compiled); + private static readonly Regex RollBandLabelRegex = new(@"^\d{2,3}(?:-\d{2,3})?$|^\d{2,3}\+$", RegexOptions.Compiled); + + // Left-edge OCR occasionally drops the first digit of the lower bound on standard-table row labels. + private static readonly IReadOnlyDictionary OcrDamagedStandardRollBandLabels = new Dictionary(StringComparer.OrdinalIgnoreCase) + { + ["1-05"] = "01-05", + ["6-10"] = "06-10", + ["1-15"] = "11-15", + ["6-20"] = "16-20", + ["1-35"] = "21-35", + ["6-45"] = "36-45", + ["6-50"] = "46-50", + ["1-55"] = "51-55", + ["6-60"] = "56-60", + ["1-65"] = "61-65", + ["7-70"] = "67-70", + ["1-75"] = "71-75", + ["6-80"] = "76-80", + ["1-85"] = "81-85", + ["6-90"] = "86-90", + ["1-95"] = "91-95" + }; internal static List LoadFragments(string xmlContent) { using var stringReader = new StringReader(xmlContent); - using var xmlReader = XmlReader.Create( - stringReader, - new XmlReaderSettings - { - DtdProcessing = DtdProcessing.Ignore - }); + using var xmlReader = XmlReader.Create(stringReader, new XmlReaderSettings { DtdProcessing = DtdProcessing.Ignore }); var document = XDocument.Load(xmlReader); - var fragments = document.Descendants("page") - .SelectMany(page => - { - var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1"); - return page.Elements("text") - .Select(item => new PositionedTextFragment( - pageNumber, - int.Parse(item.Attribute("top")?.Value ?? throw new InvalidOperationException("Missing text top attribute.")), - int.Parse(item.Attribute("left")?.Value ?? throw new InvalidOperationException("Missing text left attribute.")), - int.Parse(item.Attribute("width")?.Value ?? throw new InvalidOperationException("Missing text width attribute.")), - int.Parse(item.Attribute("height")?.Value ?? throw new InvalidOperationException("Missing text height attribute.")), - NormalizeText(string.Concat(item.DescendantNodes().OfType().Select(node => node.Value))))) - .Where(item => !string.IsNullOrWhiteSpace(item.Text)); - }) - .ToList(); + var fragments = document.Descendants("page").SelectMany(page => + { + var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1"); + return page.Elements("text").Select(item => new PositionedTextFragment(pageNumber, int.Parse(item.Attribute("top")?.Value ?? throw new InvalidOperationException("Missing text top attribute.")), int.Parse(item.Attribute("left")?.Value ?? throw new InvalidOperationException("Missing text left attribute.")), int.Parse(item.Attribute("width")?.Value ?? throw new InvalidOperationException("Missing text width attribute.")), int.Parse(item.Attribute("height")?.Value ?? throw new InvalidOperationException("Missing text height attribute.")), NormalizeText(string.Concat(item.DescendantNodes().OfType().Select(node => node.Value))))).Where(item => !string.IsNullOrWhiteSpace(item.Text)); + }).ToList(); return RemoveRedundantContainedFragments(fragments); } @@ -56,38 +62,16 @@ internal static class CriticalTableParserSupport internal static List LoadPageGeometries(string xmlContent) { using var stringReader = new StringReader(xmlContent); - using var xmlReader = XmlReader.Create( - stringReader, - new XmlReaderSettings - { - DtdProcessing = DtdProcessing.Ignore - }); + using var xmlReader = XmlReader.Create(stringReader, new XmlReaderSettings { DtdProcessing = DtdProcessing.Ignore }); var document = XDocument.Load(xmlReader); - return document.Descendants("page") - .Select(page => new ParsedPdfPageGeometry( - int.Parse(page.Attribute("number")?.Value ?? "1"), - int.Parse(page.Attribute("width")?.Value ?? throw new InvalidOperationException("Missing page width attribute.")), - int.Parse(page.Attribute("height")?.Value ?? throw new InvalidOperationException("Missing page height attribute.")))) - .ToList(); + return document.Descendants("page").Select(page => new ParsedPdfPageGeometry(int.Parse(page.Attribute("number")?.Value ?? "1"), int.Parse(page.Attribute("width")?.Value ?? throw new InvalidOperationException("Missing page width attribute.")), int.Parse(page.Attribute("height")?.Value ?? throw new InvalidOperationException("Missing page height attribute.")))).ToList(); } - internal static List FindRowLabelFragments( - IReadOnlyList fragments, - int leftCutoff, - int bodyStartTop, - int keyTop) + internal static List FindRowLabelFragments(IReadOnlyList fragments, int leftCutoff, int bodyStartTop, int keyTop) { - var candidates = fragments - .Where(item => - item.Left < leftCutoff && - item.Top >= bodyStartTop && - item.Top < keyTop - FooterLabelExclusionGap && - (IsRollBandLabel(item.Text) || LooksLikeSplitRollBandStart(item.Text))) - .OrderBy(item => item.Top) - .ThenBy(item => item.Left) - .ToList(); + var candidates = fragments.Where(item => item.Left < leftCutoff && item.Top >= bodyStartTop && item.Top < keyTop - FooterLabelExclusionGap && (IsRollBandLabel(item.Text) || LooksLikeSplitRollBandStart(item.Text))).OrderBy(item => item.Top).ThenBy(item => item.Left).ToList(); var merged = new List(); @@ -112,9 +96,7 @@ internal static class CriticalTableParserSupport foreach (var candidate in merged) { var previous = deduped.LastOrDefault(); - if (previous is not null && - string.Equals(NormalizeRollBandLabel(previous.Text), NormalizeRollBandLabel(candidate.Text), StringComparison.OrdinalIgnoreCase) && - Math.Abs(previous.Top - candidate.Top) <= RowLabelDuplicateTolerance) + if (previous is not null && string.Equals(NormalizeRollBandLabel(previous.Text), NormalizeRollBandLabel(candidate.Text), StringComparison.OrdinalIgnoreCase) && Math.Abs(previous.Top - candidate.Top) <= RowLabelDuplicateTolerance) { continue; } @@ -126,14 +108,13 @@ internal static class CriticalTableParserSupport } internal static bool IsRollBandLabel(string value) => - Regex.IsMatch(value.Trim(), @"^\d{2,3}(?:\s*-\s*\d{2,3})?$|^\d{2,3}\+$"); + TryNormalizeRollBandLabel(value, out _); internal static bool IsPotentialRowLabelFragment(PositionedTextFragment fragment, int leftCutoff) => - fragment.Left < leftCutoff && - (IsRollBandLabel(fragment.Text) || LooksLikeSplitRollBandStart(fragment.Text)); + fragment.Left < leftCutoff && (IsRollBandLabel(fragment.Text) || LooksLikeSplitRollBandStart(fragment.Text)); internal static string NormalizeRollBandLabel(string label) => - Regex.Replace(CollapseWhitespace(label), @"\s*-\s*", "-"); + TryNormalizeRollBandLabel(label, out var normalized) ? normalized : Regex.Replace(CollapseWhitespace(label), @"\s*-\s*", "-"); internal static ParsedCriticalRollBand CreateRollBand(string label, int sortOrder) { @@ -144,9 +125,7 @@ internal static class CriticalTableParserSupport } var parts = normalizedLabel.Split('-', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries); - return parts.Length == 1 - ? new ParsedCriticalRollBand(normalizedLabel, int.Parse(parts[0]), int.Parse(parts[0]), sortOrder) - : new ParsedCriticalRollBand(normalizedLabel, int.Parse(parts[0]), int.Parse(parts[1]), sortOrder); + return parts.Length == 1 ? new ParsedCriticalRollBand(normalizedLabel, int.Parse(parts[0]), int.Parse(parts[0]), sortOrder) : new ParsedCriticalRollBand(normalizedLabel, int.Parse(parts[0]), int.Parse(parts[1]), sortOrder); } internal static string ResolveColumn(double centerX, IReadOnlyList<(string Key, double CenterX)> columns) @@ -178,12 +157,7 @@ internal static class CriticalTableParserSupport lines[^1].Add(fragment); } - return lines - .Select(line => new ColumnarCellLine( - CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text))), - line.OrderBy(item => item.Left).ToList())) - .Where(item => !string.IsNullOrWhiteSpace(item.Text)) - .ToList(); + return lines.Select(line => new ColumnarCellLine(CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text))), line.OrderBy(item => item.Left).ToList())).Where(item => !string.IsNullOrWhiteSpace(item.Text)).ToList(); } internal static bool IsAffixLikeLine(string line, IReadOnlySet affixLegendSymbols) @@ -204,8 +178,7 @@ internal static class CriticalTableParserSupport return true; } - if (affixLegendSymbols.Count > 0 && - affixLegendSymbols.Any(symbol => value.Contains(symbol, StringComparison.Ordinal))) + if (affixLegendSymbols.Count > 0 && affixLegendSymbols.Any(symbol => value.Contains(symbol, StringComparison.Ordinal))) { if (value.Any(char.IsDigit)) { @@ -218,13 +191,7 @@ internal static class CriticalTableParserSupport remainder = remainder.Replace(symbol, string.Empty, StringComparison.Ordinal); } - remainder = remainder - .Replace("+", string.Empty, StringComparison.Ordinal) - .Replace("-", string.Empty, StringComparison.Ordinal) - .Replace("–", string.Empty, StringComparison.Ordinal) - .Replace("(", string.Empty, StringComparison.Ordinal) - .Replace(")", string.Empty, StringComparison.Ordinal) - .Replace("/", string.Empty, StringComparison.Ordinal); + remainder = remainder.Replace("+", string.Empty, StringComparison.Ordinal).Replace("-", string.Empty, StringComparison.Ordinal).Replace("–", string.Empty, StringComparison.Ordinal).Replace("(", string.Empty, StringComparison.Ordinal).Replace(")", string.Empty, StringComparison.Ordinal).Replace("/", string.Empty, StringComparison.Ordinal); if (string.IsNullOrWhiteSpace(remainder)) { @@ -232,15 +199,7 @@ internal static class CriticalTableParserSupport } } - return value.StartsWith("+", StringComparison.Ordinal) || - value.StartsWith("\u2211", StringComparison.Ordinal) || - value.StartsWith("\u220F", StringComparison.Ordinal) || - value.StartsWith("\u03C0", StringComparison.Ordinal) || - value.StartsWith("\u222B", StringComparison.Ordinal) || - StandaloneModifierAffixLineRegex.IsMatch(value) || - NumericAffixLineRegex.IsMatch(value) || - value.Contains(" - ", StringComparison.Ordinal) || - value.Contains(" – ", StringComparison.Ordinal); + return value.StartsWith("+", StringComparison.Ordinal) || value.StartsWith("\u2211", StringComparison.Ordinal) || value.StartsWith("\u220F", StringComparison.Ordinal) || value.StartsWith("\u03C0", StringComparison.Ordinal) || value.StartsWith("\u222B", StringComparison.Ordinal) || StandaloneModifierAffixLineRegex.IsMatch(value) || NumericAffixLineRegex.IsMatch(value) || value.Contains(" - ", StringComparison.Ordinal) || value.Contains(" – ", StringComparison.Ordinal); } internal static int CountLineTypeSegments(IReadOnlyList lines, IReadOnlySet affixLegendSymbols) @@ -274,23 +233,11 @@ internal static class CriticalTableParserSupport return false; } - return normalized.StartsWith("with ", StringComparison.OrdinalIgnoreCase) || - normalized.StartsWith("w/ ", StringComparison.OrdinalIgnoreCase) || - normalized.StartsWith("w/o ", StringComparison.OrdinalIgnoreCase) || - normalized.StartsWith("without ", StringComparison.OrdinalIgnoreCase) || - normalized.StartsWith("if ", StringComparison.OrdinalIgnoreCase) || - normalized.StartsWith("while ", StringComparison.OrdinalIgnoreCase) || - normalized.StartsWith("until ", StringComparison.OrdinalIgnoreCase) || - normalized.StartsWith("unless ", StringComparison.OrdinalIgnoreCase); + return normalized.StartsWith("with ", StringComparison.OrdinalIgnoreCase) || normalized.StartsWith("w/ ", StringComparison.OrdinalIgnoreCase) || normalized.StartsWith("w/o ", StringComparison.OrdinalIgnoreCase) || normalized.StartsWith("without ", StringComparison.OrdinalIgnoreCase) || normalized.StartsWith("if ", StringComparison.OrdinalIgnoreCase) || normalized.StartsWith("while ", StringComparison.OrdinalIgnoreCase) || normalized.StartsWith("until ", StringComparison.OrdinalIgnoreCase) || normalized.StartsWith("unless ", StringComparison.OrdinalIgnoreCase); } internal static string NormalizeText(string value) => - value - .Replace('\u00a0', ' ') - .Replace('\r', ' ') - .Replace('\n', ' ') - .Replace('’', '\'') - .Trim(); + value.Replace('\u00a0', ' ').Replace('\r', ' ').Replace('\n', ' ').Replace('’', '\'').Trim(); private static List RemoveRedundantContainedFragments(IReadOnlyList fragments) { @@ -298,10 +245,7 @@ internal static class CriticalTableParserSupport foreach (var group in fragments.GroupBy(item => (item.PageNumber, item.Top, item.Height))) { - var ordered = group - .OrderByDescending(item => item.Width) - .ThenBy(item => item.Left) - .ToList(); + var ordered = group.OrderByDescending(item => item.Width).ThenBy(item => item.Left).ToList(); for (var index = 0; index < ordered.Count; index++) { @@ -314,9 +258,7 @@ internal static class CriticalTableParserSupport for (var candidateIndex = index + 1; candidateIndex < ordered.Count; candidateIndex++) { var candidate = ordered[candidateIndex]; - if (candidate.Width > container.Width || - !container.Text.Contains(candidate.Text, StringComparison.Ordinal) || - !IsHorizontallyContained(candidate, container)) + if (candidate.Width > container.Width || !container.Text.Contains(candidate.Text, StringComparison.Ordinal) || !IsHorizontallyContained(candidate, container)) { continue; } @@ -326,9 +268,7 @@ internal static class CriticalTableParserSupport } } - return fragments - .Where(item => !redundant.Contains(item)) - .ToList(); + return fragments.Where(item => !redundant.Contains(item)).ToList(); } private static bool IsHorizontallyContained(PositionedTextFragment candidate, PositionedTextFragment container) @@ -338,29 +278,19 @@ internal static class CriticalTableParserSupport var candidateRight = candidate.Left + candidate.Width; var containerRight = container.Left + container.Width; - return candidate.Left >= container.Left - containmentTolerance && - candidateRight <= containerRight + containmentTolerance; + return candidate.Left >= container.Left - containmentTolerance && candidateRight <= containerRight + containmentTolerance; } internal static string? NormalizeConditionKey(string conditionText) { - var normalized = CollapseWhitespace(conditionText) - .ToLowerInvariant() - .Replace("w/o", "without", StringComparison.Ordinal) - .Replace("w/", "with", StringComparison.Ordinal); + var normalized = CollapseWhitespace(conditionText).ToLowerInvariant().Replace("w/o", "without", StringComparison.Ordinal).Replace("w/", "with", StringComparison.Ordinal); normalized = Regex.Replace(normalized, @"[^a-z0-9]+", "_"); normalized = normalized.Trim('_'); return normalized.Length == 0 ? null : normalized; } internal static int FindKeyTop(IReadOnlyList fragments) => - fragments - .Where(item => - string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) || - item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) || - item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase)) - .Select(item => (int?)item.Top) - .Min() ?? int.MaxValue; + fragments.Where(item => string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase) || item.Text.Contains("must parry", StringComparison.OrdinalIgnoreCase) || item.Text.Contains("attacker gets", StringComparison.OrdinalIgnoreCase)).Select(item => (int?)item.Top).Min() ?? int.MaxValue; internal static AffixLegend ParseAffixLegend(IReadOnlyList fragments, int keyTop) { @@ -369,13 +299,7 @@ internal static class CriticalTableParserSupport return AffixLegend.Empty; } - var footerLines = GroupByTop(fragments - .Where(item => item.Top >= keyTop - TopGroupingTolerance) - .OrderBy(item => item.Top) - .ThenBy(item => item.Left) - .ToList()) - .Select(line => CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text)))) - .ToList(); + var footerLines = GroupByTop(fragments.Where(item => item.Top >= keyTop - TopGroupingTolerance).OrderBy(item => item.Top).ThenBy(item => item.Left).ToList()).Select(line => CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text)))).ToList(); var footerText = string.Join(' ', footerLines); var symbolEffects = new Dictionary(StringComparer.Ordinal); @@ -389,22 +313,10 @@ internal static class CriticalTableParserSupport AddLegendMatch(symbolEffects, footerText, CriticalEffectCodes.BleedPerRound, @"bleed\s*=\s*(\S)"); AddLegendMatch(symbolEffects, footerText, CriticalEffectCodes.BleedPerRound, @"(\S)\s*=\s*bleed"); - return new AffixLegend( - symbolEffects, - footerText.Contains("powerpoint modification", StringComparison.OrdinalIgnoreCase) - ? ["P"] - : [], - supportsFoePenalty: footerText.Contains("foe has", StringComparison.OrdinalIgnoreCase) && - footerText.Contains("penalty", StringComparison.OrdinalIgnoreCase), - supportsAttackerBonus: footerText.Contains("attacker gets", StringComparison.OrdinalIgnoreCase) && - footerText.Contains("next round", StringComparison.OrdinalIgnoreCase), - supportsPowerPointModifier: footerText.Contains("powerpoint modification", StringComparison.OrdinalIgnoreCase)); + return new AffixLegend(symbolEffects, footerText.Contains("powerpoint modification", StringComparison.OrdinalIgnoreCase) ? ["P"] : [], supportsFoePenalty: footerText.Contains("foe has", StringComparison.OrdinalIgnoreCase) && footerText.Contains("penalty", StringComparison.OrdinalIgnoreCase), supportsAttackerBonus: footerText.Contains("attacker gets", StringComparison.OrdinalIgnoreCase) && footerText.Contains("next round", StringComparison.OrdinalIgnoreCase), supportsPowerPointModifier: footerText.Contains("powerpoint modification", StringComparison.OrdinalIgnoreCase)); } - internal static List SplitBoundaryCrossingFragments( - IReadOnlyList bodyFragments, - IReadOnlyList<(string Key, double CenterX)> columnCenters, - IReadOnlySet affixLegendSymbols) + internal static List SplitBoundaryCrossingFragments(IReadOnlyList bodyFragments, IReadOnlyList<(string Key, double CenterX)> columnCenters, IReadOnlySet affixLegendSymbols) { var splitFragments = new List(bodyFragments.Count); @@ -416,23 +328,15 @@ internal static class CriticalTableParserSupport return splitFragments; } - internal static List<(int Top, bool IsAffixLike)> BuildBodyLines( - IReadOnlyList bodyFragments, - IReadOnlyList<(string Key, double CenterX)> columnCenters, - IReadOnlySet affixLegendSymbols) + internal static List<(int Top, bool IsAffixLike)> BuildBodyLines(IReadOnlyList bodyFragments, IReadOnlyList<(string Key, double CenterX)> columnCenters, IReadOnlySet affixLegendSymbols) { var bodyLines = new List<(int Top, bool IsAffixLike)>(); foreach (var lineFragments in GroupByTop(bodyFragments.OrderBy(item => item.Top).ThenBy(item => item.Left).ToList())) { - var columnTexts = lineFragments - .GroupBy(item => ResolveColumn(item.CenterX, columnCenters), StringComparer.OrdinalIgnoreCase) - .Select(group => CollapseWhitespace(string.Join(' ', group.OrderBy(item => item.Left).Select(item => item.Text)))) - .Where(item => !string.IsNullOrWhiteSpace(item)) - .ToList(); + var columnTexts = lineFragments.GroupBy(item => ResolveColumn(item.CenterX, columnCenters), StringComparer.OrdinalIgnoreCase).Select(group => CollapseWhitespace(string.Join(' ', group.OrderBy(item => item.Left).Select(item => item.Text)))).Where(item => !string.IsNullOrWhiteSpace(item)).ToList(); - var isAffixLike = columnTexts.Count > 0 && - columnTexts.All(text => IsAffixLikeLine(text, affixLegendSymbols) || IsBoundaryBonusLine(text)); + var isAffixLike = columnTexts.Count > 0 && columnTexts.All(text => IsAffixLikeLine(text, affixLegendSymbols) || IsBoundaryBonusLine(text)); bodyLines.Add((lineFragments[0].Top, isAffixLike)); } @@ -447,8 +351,7 @@ internal static class CriticalTableParserSupport return false; } - return fragment.Top >= keyTop - FooterPageNumberExclusionGap && - Regex.IsMatch(fragment.Text, @"^\d{2,3}$"); + return fragment.Top >= keyTop - FooterPageNumberExclusionGap && Regex.IsMatch(fragment.Text, @"^\d{2,3}$"); } internal static IEnumerable> GroupByTop(IReadOnlyList fragments) @@ -470,10 +373,7 @@ internal static class CriticalTableParserSupport } internal static List CreateRowAnchors(IReadOnlyList rowLabelFragments) => - rowLabelFragments - .OrderBy(item => item.Top) - .Select((item, index) => new RowAnchor(NormalizeRollBandLabel(item.Text), item.Top, index + 1)) - .ToList(); + rowLabelFragments.OrderBy(item => item.Top).Select((item, index) => new RowAnchor(NormalizeRollBandLabel(item.Text), item.Top, index + 1)).ToList(); internal static int ResolveBodyStartTop(int headerTop, IReadOnlyList rowAnchors) { @@ -482,32 +382,12 @@ internal static class CriticalTableParserSupport return headerTop + HeaderToBodyMinimumGap; } - return Math.Min( - headerTop + HeaderToBodyMinimumGap, - Math.Max( - headerTop + HeaderToRowLabelMinimumGap, - rowAnchors[0].Top - HeaderToRowLabelMinimumGap - TopGroupingTolerance)); + return Math.Min(headerTop + HeaderToBodyMinimumGap, Math.Max(headerTop + HeaderToRowLabelMinimumGap, rowAnchors[0].Top - HeaderToRowLabelMinimumGap - TopGroupingTolerance)); } - internal static List BuildBodyFragments( - IReadOnlyList fragments, - int bodyStartTop, - int keyTop, - int leftCutoff, - IReadOnlyList rowAnchors, - IReadOnlyCollection excludedFragments, - IReadOnlyList<(string Key, double CenterX)> columnCenters, - IReadOnlySet affixLegendSymbols) + internal static List BuildBodyFragments(IReadOnlyList fragments, int bodyStartTop, int keyTop, int leftCutoff, IReadOnlyList rowAnchors, IReadOnlyCollection excludedFragments, IReadOnlyList<(string Key, double CenterX)> columnCenters, IReadOnlySet affixLegendSymbols) { - var bodyFragments = fragments - .Where(item => - item.Top >= bodyStartTop && - item.Top < keyTop - TopGroupingTolerance && - !IsFooterPageNumberFragment(item, keyTop) && - !IsPotentialRowLabelFragment(item, leftCutoff) && - !rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) && - !excludedFragments.Contains(item)) - .ToList(); + var bodyFragments = fragments.Where(item => item.Top >= bodyStartTop && item.Top < keyTop - TopGroupingTolerance && !IsFooterPageNumberFragment(item, keyTop) && !IsPotentialRowLabelFragment(item, leftCutoff) && !rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, NormalizeRollBandLabel(item.Text), StringComparison.OrdinalIgnoreCase)) && !excludedFragments.Contains(item)).ToList(); return SplitBoundaryCrossingFragments(bodyFragments, columnCenters, affixLegendSymbols); } @@ -515,23 +395,14 @@ internal static class CriticalTableParserSupport internal static void RepairLeadingAffixLeakage(List cellEntries, IReadOnlySet affixLegendSymbols) { var maxRowIndex = cellEntries.Count == 0 ? -1 : cellEntries.Max(item => item.RowIndex); - var axes = cellEntries - .Select(item => (item.GroupKey, item.ColumnKey)) - .Distinct() - .ToList(); + var axes = cellEntries.Select(item => (item.GroupKey, item.ColumnKey)).Distinct().ToList(); for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++) { foreach (var (groupKey, columnKey) in axes) { - var current = cellEntries.SingleOrDefault(item => - item.RowIndex == rowIndex && - string.Equals(item.GroupKey, groupKey, StringComparison.Ordinal) && - string.Equals(item.ColumnKey, columnKey, StringComparison.Ordinal)); - var next = cellEntries.SingleOrDefault(item => - item.RowIndex == rowIndex + 1 && - string.Equals(item.GroupKey, groupKey, StringComparison.Ordinal) && - string.Equals(item.ColumnKey, columnKey, StringComparison.Ordinal)); + var current = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex && string.Equals(item.GroupKey, groupKey, StringComparison.Ordinal) && string.Equals(item.ColumnKey, columnKey, StringComparison.Ordinal)); + var next = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex + 1 && string.Equals(item.GroupKey, groupKey, StringComparison.Ordinal) && string.Equals(item.ColumnKey, columnKey, StringComparison.Ordinal)); if (current is null || next is null) { continue; @@ -554,15 +425,9 @@ internal static class CriticalTableParserSupport } } - internal static int ResolveRowBoundaryTop( - RowAnchor current, - RowAnchor next, - IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines) + internal static int ResolveRowBoundaryTop(RowAnchor current, RowAnchor next, IReadOnlyList<(int Top, bool IsAffixLike)> bodyLines) { - var linesBetweenLabels = bodyLines - .Where(item => item.Top >= current.Top && item.Top < next.Top) - .OrderBy(item => item.Top) - .ToList(); + var linesBetweenLabels = bodyLines.Where(item => item.Top >= current.Top && item.Top < next.Top).OrderBy(item => item.Top).ToList(); for (var index = linesBetweenLabels.Count - 2; index >= 0; index--) { @@ -575,14 +440,7 @@ internal static class CriticalTableParserSupport return (int)Math.Floor((current.Top + next.Top) / 2.0) + 1; } - internal static void BuildParsedArtifacts( - IReadOnlyList cellEntries, - AffixLegend affixLegend, - List parsedCells, - List parsedResults, - List validationErrors, - List? validationWarnings = null, - bool downgradeCellContentValidationToWarnings = false) + internal static void BuildParsedArtifacts(IReadOnlyList cellEntries, AffixLegend affixLegend, List parsedCells, List parsedResults, List validationErrors, List? validationWarnings = null, bool downgradeCellContentValidationToWarnings = false) { var sharedLegend = ToSharedAffixLegend(affixLegend); @@ -591,8 +449,7 @@ internal static class CriticalTableParserSupport var lineTexts = cellEntry.Lines.Select(line => line.Text).ToList(); var content = SharedParsing.CriticalCellTextParser.Parse(lineTexts, sharedLegend); var sourceBounds = BuildSourceBounds(cellEntry.Lines.SelectMany(line => line.Fragments).ToList()); - var contentIssues = content.ValidationErrors.Select(error => - $"Cell '{BuildCellIdentifier(cellEntry)}': {error}"); + var contentIssues = content.ValidationErrors.Select(error => $"Cell '{BuildCellIdentifier(cellEntry)}': {error}"); if (downgradeCellContentValidationToWarnings) { validationWarnings?.AddRange(contentIssues); @@ -605,29 +462,9 @@ internal static class CriticalTableParserSupport var effects = content.Effects.Select(ToImportToolEffect).ToList(); var branches = content.Branches.Select(ToImportToolBranch).ToList(); - parsedCells.Add(new ParsedCriticalCellArtifact( - cellEntry.GroupKey, - cellEntry.RollBandLabel, - cellEntry.ColumnKey, - lineTexts, - content.BaseLines, - content.RawCellText, - content.DescriptionText, - content.RawAffixText, - effects, - branches, - sourceBounds)); + parsedCells.Add(new ParsedCriticalCellArtifact(cellEntry.GroupKey, cellEntry.RollBandLabel, cellEntry.ColumnKey, lineTexts, content.BaseLines, content.RawCellText, content.DescriptionText, content.RawAffixText, effects, branches, sourceBounds)); - parsedResults.Add(new ParsedCriticalResult( - cellEntry.GroupKey, - cellEntry.ColumnKey, - cellEntry.RollBandLabel, - content.RawCellText, - content.DescriptionText, - content.RawAffixText, - effects, - branches, - sourceBounds)); + parsedResults.Add(new ParsedCriticalResult(cellEntry.GroupKey, cellEntry.ColumnKey, cellEntry.RollBandLabel, content.RawCellText, content.DescriptionText, content.RawAffixText, effects, branches, sourceBounds)); } } @@ -649,55 +486,42 @@ internal static class CriticalTableParserSupport var right = fragments.Max(fragment => fragment.Left + fragment.Width); var bottom = fragments.Max(fragment => fragment.Top + fragment.Height); - return new ParsedCriticalSourceRect( - pageNumber, - left, - top, - Math.Max(1, right - left), - Math.Max(1, bottom - top)); + return new ParsedCriticalSourceRect(pageNumber, left, top, Math.Max(1, right - left), Math.Max(1, bottom - top)); } private static SharedParsing.AffixLegend ToSharedAffixLegend(AffixLegend affixLegend) => - new( - affixLegend.SymbolEffects, - affixLegend.ClassificationSymbols.Except(affixLegend.EffectSymbols).ToList(), - affixLegend.SupportsFoePenalty, - affixLegend.SupportsAttackerBonus, - affixLegend.SupportsPowerPointModifier); + new(affixLegend.SymbolEffects, affixLegend.ClassificationSymbols.Except(affixLegend.EffectSymbols).ToList(), affixLegend.SupportsFoePenalty, affixLegend.SupportsAttackerBonus, affixLegend.SupportsPowerPointModifier); private static ParsedCriticalEffect ToImportToolEffect(SharedParsing.ParsedCriticalEffect effect) => - new( - effect.EffectCode, - effect.Target, - effect.ValueInteger, - effect.ValueExpression, - effect.DurationRounds, - effect.PerRound, - effect.Modifier, - effect.BodyPart, - effect.IsPermanent, - effect.SourceType, - effect.SourceText); + new(effect.EffectCode, effect.Target, effect.ValueInteger, effect.ValueExpression, effect.DurationRounds, effect.PerRound, effect.Modifier, effect.BodyPart, effect.IsPermanent, effect.SourceType, effect.SourceText); private static ParsedCriticalBranch ToImportToolBranch(SharedParsing.ParsedCriticalBranch branch) => - new( - branch.BranchKind, - branch.ConditionKey, - branch.ConditionText, - branch.RawText, - branch.DescriptionText, - branch.RawAffixText, - branch.Effects.Select(ToImportToolEffect).ToList(), - branch.SortOrder); + new(branch.BranchKind, branch.ConditionKey, branch.ConditionText, branch.RawText, branch.DescriptionText, branch.RawAffixText, branch.Effects.Select(ToImportToolEffect).ToList(), branch.SortOrder); private static string BuildCellIdentifier(ColumnarCellEntry cellEntry) => - cellEntry.GroupKey is null - ? $"{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}" - : $"{cellEntry.RollBandLabel}/{cellEntry.GroupKey}/{cellEntry.ColumnKey}"; + cellEntry.GroupKey is null ? $"{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}" : $"{cellEntry.RollBandLabel}/{cellEntry.GroupKey}/{cellEntry.ColumnKey}"; private static bool LooksLikeSplitRollBandStart(string value) => Regex.IsMatch(value.Trim(), @"^\d{2,3}\s*-$"); + private static bool TryNormalizeRollBandLabel(string label, out string normalized) + { + normalized = Regex.Replace(CollapseWhitespace(label), @"\s*-\s*", "-"); + + if (RollBandLabelRegex.IsMatch(normalized)) + { + return true; + } + + if (OcrDamagedStandardRollBandLabels.TryGetValue(normalized, out var repaired)) + { + normalized = repaired; + return true; + } + + return false; + } + private static bool TryMergeSplitRollBand(IReadOnlyList candidates, int index, out PositionedTextFragment mergedCandidate) { var current = candidates[index]; @@ -708,11 +532,7 @@ internal static class CriticalTableParserSupport } var next = candidates[index + 1]; - if (current.PageNumber != next.PageNumber || - !Regex.IsMatch(next.Text.Trim(), @"^\d{2,3}$") || - next.Top <= current.Top || - next.Top - current.Top > RowLabelDuplicateTolerance + 5 || - Math.Abs(next.Left - current.Left) > 20) + if (current.PageNumber != next.PageNumber || !Regex.IsMatch(next.Text.Trim(), @"^\d{2,3}$") || next.Top <= current.Top || next.Top - current.Top > RowLabelDuplicateTolerance + 5 || Math.Abs(next.Left - current.Left) > 20) { mergedCandidate = null!; return false; @@ -722,28 +542,18 @@ internal static class CriticalTableParserSupport var mergedLabel = $"{startDigits}-{next.Text.Trim()}"; var right = Math.Max(current.Left + current.Width, next.Left + next.Width); - mergedCandidate = new PositionedTextFragment( - current.PageNumber, - current.Top, - Math.Min(current.Left, next.Left), - right - Math.Min(current.Left, next.Left), - Math.Max(current.Height, next.Height), - mergedLabel); + mergedCandidate = new PositionedTextFragment(current.PageNumber, current.Top, Math.Min(current.Left, next.Left), right - Math.Min(current.Left, next.Left), Math.Max(current.Height, next.Height), mergedLabel); return true; } - private static IReadOnlyList SplitBoundaryCrossingFragment( - PositionedTextFragment fragment, - IReadOnlyList<(string Key, double CenterX)> columnCenters, - IReadOnlySet affixLegendSymbols) + private static IReadOnlyList SplitBoundaryCrossingFragment(PositionedTextFragment fragment, IReadOnlyList<(string Key, double CenterX)> columnCenters, IReadOnlySet affixLegendSymbols) { if (!CrossesColumnBoundary(fragment, columnCenters)) { return [fragment]; } - if (IsAffixLikeLine(fragment.Text, affixLegendSymbols) && - fragment.Text.Contains(" ", StringComparison.Ordinal)) + if (IsAffixLikeLine(fragment.Text, affixLegendSymbols) && fragment.Text.Contains(" ", StringComparison.Ordinal)) { return BuildSplitFragmentsFromMatches(fragment, MultiFragmentSplitRegex.Matches(fragment.Text), columnCenters); } @@ -756,10 +566,7 @@ internal static class CriticalTableParserSupport return [fragment]; } - private static IReadOnlyList BuildSplitFragmentsFromMatches( - PositionedTextFragment fragment, - MatchCollection matches, - IReadOnlyList<(string Key, double CenterX)> columnCenters) + private static IReadOnlyList BuildSplitFragmentsFromMatches(PositionedTextFragment fragment, MatchCollection matches, IReadOnlyList<(string Key, double CenterX)> columnCenters) { if (matches.Count < 2) { @@ -780,13 +587,7 @@ internal static class CriticalTableParserSupport var segmentLeft = fragment.Left + (int)Math.Round(characterWidth * match.Index); var segmentWidth = Math.Max(1, (int)Math.Round(characterWidth * match.Length)); - splitFragments.Add(new PositionedTextFragment( - fragment.PageNumber, - fragment.Top, - segmentLeft, - segmentWidth, - fragment.Height, - segmentText)); + splitFragments.Add(new PositionedTextFragment(fragment.PageNumber, fragment.Top, segmentLeft, segmentWidth, fragment.Height, segmentText)); } if (splitFragments.Count < 2) @@ -795,20 +596,12 @@ internal static class CriticalTableParserSupport } var originalColumn = ResolveColumn(fragment.CenterX, columnCenters); - var distinctColumns = splitFragments - .Select(item => ResolveColumn(item.CenterX, columnCenters)) - .Distinct(StringComparer.OrdinalIgnoreCase) - .ToList(); + var distinctColumns = splitFragments.Select(item => ResolveColumn(item.CenterX, columnCenters)).Distinct(StringComparer.OrdinalIgnoreCase).ToList(); - return distinctColumns.Count > 1 || distinctColumns.Any(item => !string.Equals(item, originalColumn, StringComparison.OrdinalIgnoreCase)) - ? splitFragments - : [fragment]; + return distinctColumns.Count > 1 || distinctColumns.Any(item => !string.Equals(item, originalColumn, StringComparison.OrdinalIgnoreCase)) ? splitFragments : [fragment]; } - private static bool TrySplitProseFragmentAtBoundaries( - PositionedTextFragment fragment, - IReadOnlyList<(string Key, double CenterX)> columnCenters, - out IReadOnlyList splitFragments) + private static bool TrySplitProseFragmentAtBoundaries(PositionedTextFragment fragment, IReadOnlyList<(string Key, double CenterX)> columnCenters, out IReadOnlyList splitFragments) { splitFragments = null!; @@ -848,9 +641,7 @@ internal static class CriticalTableParserSupport return true; } - private static List FindBoundarySplitIndexes( - PositionedTextFragment fragment, - IReadOnlyList<(string Key, double CenterX)> columnCenters) + private static List FindBoundarySplitIndexes(PositionedTextFragment fragment, IReadOnlyList<(string Key, double CenterX)> columnCenters) { var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1); var fragmentRight = fragment.Left + fragment.Width; @@ -917,11 +708,7 @@ internal static class CriticalTableParserSupport return bestIndex; } - private static PositionedTextFragment? CreateFragmentSegment( - PositionedTextFragment fragment, - int startIndex, - int length, - double characterWidth) + private static PositionedTextFragment? CreateFragmentSegment(PositionedTextFragment fragment, int startIndex, int length, double characterWidth) { if (length <= 0) { @@ -950,18 +737,10 @@ internal static class CriticalTableParserSupport var actualLength = trimmedEnd - trimmedStart + 1; var segmentText = CollapseWhitespace(fragment.Text.Substring(actualStart, actualLength)); - return new PositionedTextFragment( - fragment.PageNumber, - fragment.Top, - fragment.Left + (int)Math.Round(characterWidth * actualStart), - Math.Max(1, (int)Math.Round(characterWidth * actualLength)), - fragment.Height, - segmentText); + return new PositionedTextFragment(fragment.PageNumber, fragment.Top, fragment.Left + (int)Math.Round(characterWidth * actualStart), Math.Max(1, (int)Math.Round(characterWidth * actualLength)), fragment.Height, segmentText); } - private static bool CrossesColumnBoundary( - PositionedTextFragment fragment, - IReadOnlyList<(string Key, double CenterX)> columnCenters) + private static bool CrossesColumnBoundary(PositionedTextFragment fragment, IReadOnlyList<(string Key, double CenterX)> columnCenters) { var fragmentRight = fragment.Left + fragment.Width; @@ -980,11 +759,7 @@ internal static class CriticalTableParserSupport private static bool IsBoundaryBonusLine(string text) => BoundaryBonusLineRegex.IsMatch(text.Trim()); - private static void AddLegendMatch( - IDictionary symbolEffects, - string value, - string effectCode, - string pattern) + private static void AddLegendMatch(IDictionary symbolEffects, string value, string effectCode, string pattern) { foreach (Match match in Regex.Matches(value, pattern, RegexOptions.IgnoreCase)) { @@ -998,4 +773,4 @@ internal static class CriticalTableParserSupport } } } -} +} \ No newline at end of file