@@ -1,208 +1,206 @@
using System.Text.RegularExpressions ;
using System.Xml ;
using System.Xml.Linq ;
namespace RolemasterDb.ImportTool.Parsing ;
public sealed class StandardCriticalTableParser
{
/// <summary>Matches a standalone column letter A through E, ignoring case.</summary>
private static readonly Regex ColumnRegex = new Regex(
    @"\b([A-E])\b",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>
/// Matches text consisting solely of a roll-band label: two or three digits, optionally
/// followed by a dash and a second two-or-three digit number, or followed by a plus sign.
/// </summary>
private static readonly Regex RollBandRegex = new Regex(
    @"^\s*(?<label>\d{2,3}(?:-\d{2,3})?|\d{2,3}\+)\s*$",
    RegexOptions.Compiled);

/// <summary>
/// Matches a line that begins with a roll-band label and captures any remainder that
/// starts with whitespace in the <c>rest</c> group.
/// </summary>
private static readonly Regex RollBandLineRegex = new Regex(
    @"^\s*(?<label>\d{2,3}(?:-\d{2,3})?|\d{2,3}\+)(?<rest>\s+.*)?$",
    RegexOptions.Compiled);

/// <summary>Offset added to the largest header <c>Top</c> to obtain the first <c>Top</c> treated as table body.</summary>
private const int HeaderToBodyMinimumGap = 20;

/// <summary>Largest difference in <c>Top</c> at which two fragments are still grouped into the same text line.</summary>
private const int TopGroupingTolerance = 2;
/// <summary>
/// Parses a standard (A-E column) critical table from the positioned-text XML extracted for
/// <paramref name="entry"/>.
/// </summary>
/// <param name="entry">Manifest entry whose slug, display name, family and PDF path are copied onto the parsed table.</param>
/// <param name="xmlContent">XML document text containing <c>page</c>/<c>text</c> elements with position attributes.</param>
/// <returns>
/// The parsed table together with the loaded fragments, the per-cell artifacts and a validation
/// report. Structural problems (no row labels, empty cells, unexpected column or cell counts) are
/// recorded in the report rather than thrown.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Propagated from the fragment helpers when the A-E header row cannot be found or a text element
/// lacks a position attribute.
/// </exception>
public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent)
{
    var fragments = LoadFragments(xmlContent);
    var headerFragments = FindHeaderFragments(fragments);
    var rowLabelFragments = FindRowLabelFragments(fragments, headerFragments);
    var validationErrors = new List<string>();

    // One anchor per header letter, ordered left to right; CenterX drives column assignment.
    var columnCenters = headerFragments
        .OrderBy(item => item.Left)
        .Select(item => new ColumnAnchor(item.Text.ToUpperInvariant(), item.CenterX))
        .ToList();

    // One anchor per roll-band label, ordered top to bottom; SortOrder is 1-based.
    var rowAnchors = rowLabelFragments
        .OrderBy(item => item.Top)
        .Select((item, index) => new RowAnchor(item.Text, item.Top, index + 1))
        .ToList();

    if (rowAnchors.Count == 0)
    {
        validationErrors.Add("No roll-band labels were found in the XML artifact.");
    }

    var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;

    // The first "Key:" fragment ends the table body; without one the body is unbounded below.
    var keyTop = fragments
        .Where(item => string.Equals(item.Text, "Key:", StringComparison.OrdinalIgnoreCase))
        .Select(item => (int?)item.Top)
        .Min() ?? int.MaxValue;

    // Body = everything between the header and the key, minus the row labels and header letters themselves.
    var bodyFragments = fragments
        .Where(item =>
            item.Top >= bodyStartTop &&
            item.Top < keyTop - 1 &&
            !rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) &&
            !headerFragments.Contains(item))
        .ToList();

    var parsedRollBands = rowAnchors
        .Select(anchor => CreateRollBand(anchor.Label, anchor.SortOrder))
        .ToList();

    var parsedCells = new List<ParsedCriticalCellArtifact>();
    var parsedResults = new List<ParsedCriticalResult>();

    for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++)
    {
        // A row spans from the midpoint with the previous label to the midpoint with the next one;
        // the first row starts at the body start and the last row ends just above the key.
        var rowStart = rowIndex == 0
            ? bodyStartTop
            : (int)Math.Floor((rowAnchors[rowIndex - 1].Top + rowAnchors[rowIndex].Top) / 2.0);

        var rowEnd = rowIndex == rowAnchors.Count - 1
            ? keyTop - 1
            : (int)Math.Floor((rowAnchors[rowIndex].Top + rowAnchors[rowIndex + 1].Top) / 2.0);

        var rowFragments = bodyFragments
            .Where(item => item.Top >= rowStart && item.Top < rowEnd)
            .ToList();

        foreach (var columnAnchor in columnCenters)
        {
            var cellFragments = rowFragments
                .Where(item => ResolveColumn(item.CenterX, columnCenters) == columnAnchor.Key)
                .OrderBy(item => item.Top)
                .ThenBy(item => item.Left)
                .ToList();

            if (cellFragments.Count == 0)
            {
                validationErrors.Add($"Missing content for roll band '{rowAnchors[rowIndex].Label}', column '{columnAnchor.Key}'.");
                continue;
            }

            // Affix-like lines are kept separately from the free-text description.
            var lines = BuildLines(cellFragments);
            var rawAffixLines = lines.Where(IsAffixLikeLine).ToList();
            var descriptionLines = lines.Where(line => !IsAffixLikeLine(line)).ToList();
            var rawCellText = string.Join(Environment.NewLine, lines);
            var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines));
            var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines);

            parsedCells.Add(new ParsedCriticalCellArtifact(
                rowAnchors[rowIndex].Label,
                columnAnchor.Key,
                lines,
                rawCellText,
                descriptionText,
                rawAffixText));

            parsedResults.Add(new ParsedCriticalResult(
                columnAnchor.Key,
                rowAnchors[rowIndex].Label,
                rawCellText,
                descriptionText,
                rawAffixText));
        }
    }

    if (columnCenters.Count != 5)
    {
        validationErrors.Add($"Expected 5 standard-table columns but found {columnCenters.Count}.");
    }

    if (parsedCells.Count != rowAnchors.Count * columnCenters.Count)
    {
        validationErrors.Add(
            $"Expected {rowAnchors.Count * columnCenters.Count} parsed cells but produced {parsedCells.Count}.");
    }

    var validationReport = new ImportValidationReport(
        validationErrors.Count == 0,
        validationErrors,
        rowAnchors.Count,
        parsedCells.Count);

    var table = new ParsedCriticalTable(
        entry.Slug,
        entry.DisplayName,
        entry.Family,
        Path.GetFileName(entry.PdfPath),
        "Imported from PDF XML extraction.",
        columnCenters.Select((item, index) => new ParsedCriticalColumn(item.Key, item.Key, "severity", index + 1)).ToList(),
        parsedRollBands,
        parsedResults);

    return new StandardCriticalTableParseResult(table, fragments, parsedCells, validationReport);
}
/// <summary>
/// Loads every non-blank <c>text</c> element under each <c>page</c> element of the XML as a positioned fragment.
/// </summary>
/// <param name="xmlContent">XML document text to read.</param>
/// <returns>
/// Fragments carrying page number, top, left, width, height and normalized text. A page without a
/// <c>number</c> attribute is treated as page 1.
/// </returns>
/// <exception cref="InvalidOperationException">A <c>text</c> element lacks a top, left, width or height attribute.</exception>
private static List<XmlTextFragment> LoadFragments(string xmlContent)
{
    using var stringReader = new StringReader(xmlContent);
    using var xmlReader = XmlReader.Create(
        stringReader,
        new XmlReaderSettings
        {
            // Skip any DOCTYPE declaration instead of processing it.
            DtdProcessing = DtdProcessing.Ignore
        });

    var document = XDocument.Load(xmlReader);

    return document.Descendants("page")
        .SelectMany(page =>
        {
            var pageNumber = int.Parse(page.Attribute("number")?.Value ?? "1");

            return page.Elements("text")
                .Select(item => new XmlTextFragment(
                    pageNumber,
                    int.Parse(item.Attribute("top")?.Value ?? throw new InvalidOperationException("Missing text top attribute.")),
                    int.Parse(item.Attribute("left")?.Value ?? throw new InvalidOperationException("Missing text left attribute.")),
                    int.Parse(item.Attribute("width")?.Value ?? throw new InvalidOperationException("Missing text width attribute.")),
                    int.Parse(item.Attribute("height")?.Value ?? throw new InvalidOperationException("Missing text height attribute.")),
                    // Concatenate all nested text nodes so text inside child elements of <text> is included.
                    NormalizeText(string.Concat(item.DescendantNodes().OfType<XText>().Select(node => node.Value)))))
                .Where(item => !string.IsNullOrWhiteSpace(item.Text));
        })
        .ToList();
}
/// <summary>
/// Finds the topmost group of single-letter fragments sharing one <c>Top</c> value whose letters,
/// read left to right and upper-cased, are exactly A, B, C, D, E.
/// </summary>
/// <param name="fragments">All fragments loaded from the XML artifact.</param>
/// <returns>The five header fragments ordered by <c>Left</c>.</returns>
/// <exception cref="InvalidOperationException">No such header row exists.</exception>
private static List<XmlTextFragment> FindHeaderFragments(IReadOnlyList<XmlTextFragment> fragments)
{
    // Candidates are one-character alphabetic fragments, grouped by exact Top and scanned top-down.
    var groupedByTop = fragments
        .Where(item => item.Text.Length == 1 && char.IsLetter(item.Text[0]))
        .GroupBy(item => item.Top)
        .OrderBy(group => group.Key);

    foreach (var group in groupedByTop)
    {
        var ordered = group.OrderBy(item => item.Left).ToList();
        var labels = ordered.Select(item => item.Text.ToUpperInvariant()).ToList();

        if (labels.SequenceEqual(["A", "B", "C", "D", "E"]))
        {
            return ordered;
        }
    }

    throw new InvalidOperationException("Could not find the standard-table A-E header row in the XML artifact.");
}
/// <summary>
/// Selects the fragments that act as roll-band row labels: text shaped like a roll band, located
/// more than 10 units left of the leftmost header fragment and at or below the body start.
/// </summary>
/// <param name="fragments">All fragments loaded from the XML artifact.</param>
/// <param name="headerFragments">The header fragments; must be non-empty because their minimum <c>Left</c> and maximum <c>Top</c> are taken.</param>
/// <returns>The matching fragments ordered by <c>Top</c>.</returns>
private static List<XmlTextFragment> FindRowLabelFragments(
    IReadOnlyList<XmlTextFragment> fragments,
    IReadOnlyList<XmlTextFragment> headerFragments)
{
    var leftCutoff = headerFragments.Min(item => item.Left) - 10;
    var bodyStartTop = headerFragments.Max(item => item.Top) + HeaderToBodyMinimumGap;

    return fragments
        .Where(item =>
            item.Left < leftCutoff &&
            item.Top >= bodyStartTop &&
            IsRollBandLabel(item.Text))
        .OrderBy(item => item.Top)
        .ToList();
}
/// <summary>
/// Returns whether <paramref name="value"/>, after trimming, is a roll-band label: two or three
/// digits, optionally followed by a dash and another two or three digits (e.g. <c>01-05</c>), or
/// two or three digits followed by a plus sign (e.g. <c>100+</c>).
/// </summary>
/// <param name="value">Fragment text to test.</param>
private static bool IsRollBandLabel(string value) =>
    Regex.IsMatch(value.Trim(), @"^\d{2,3}(?:-\d{2,3})?$|^\d{2,3}\+$");
private static ParsedCriticalRollBand CreateRollBand ( string label , int sortOrder )
{
@@ -217,35 +215,39 @@ public sealed class StandardCriticalTableParser
: new ParsedCriticalRollBand ( label , int . Parse ( parts [ 0 ] ) , int . Parse ( parts [ 1 ] ) , sortOrder ) ;
}
/// <summary>
/// Maps a horizontal centre position to a column key, using the midpoints between adjacent
/// column centres as boundaries.
/// </summary>
/// <param name="centerX">Horizontal centre of the fragment being placed.</param>
/// <param name="columns">Column anchors ordered left to right; must contain at least one entry.</param>
/// <returns>
/// The key of the first column whose right-hand midpoint boundary lies beyond
/// <paramref name="centerX"/>, or the last column's key when no boundary does.
/// </returns>
private static string ResolveColumn(double centerX, IReadOnlyList<ColumnAnchor> columns)
{
    for (var index = 0; index < columns.Count - 1; index++)
    {
        var boundary = (columns[index].CenterX + columns[index + 1].CenterX) / 2.0;

        if (centerX < boundary)
        {
            return columns[index].Key;
        }
    }

    // Anything at or right of the final boundary belongs to the last column.
    return columns[^1].Key;
}
/// <summary>
/// Groups fragments into text lines by vertical position and renders each line as a single string.
/// </summary>
/// <param name="fragments">Fragments belonging to one cell, in any order.</param>
/// <returns>
/// One whitespace-collapsed string per line, top to bottom, with each line's fragments joined
/// left to right by a single space. Blank lines are omitted.
/// </returns>
private static IReadOnlyList<string> BuildLines(IReadOnlyList<XmlTextFragment> fragments)
{
    var lines = new List<List<XmlTextFragment>>();

    foreach (var fragment in fragments.OrderBy(item => item.Top).ThenBy(item => item.Left))
    {
        // A fragment starts a new line when its Top differs from the current line's first
        // fragment by more than the grouping tolerance.
        if (lines.Count == 0 || Math.Abs(lines[^1][0].Top - fragment.Top) > TopGroupingTolerance)
        {
            lines.Add([fragment]);
            continue;
        }

        lines[^1].Add(fragment);
    }

    return lines
        .Select(line => CollapseWhitespace(string.Join(' ', line.OrderBy(item => item.Left).Select(item => item.Text))))
        .Where(item => !string.IsNullOrWhiteSpace(item))
        .ToList();
}
private static bool IsAffixLikeLine ( string line )
@@ -256,7 +258,7 @@ public sealed class StandardCriticalTableParser
return false ;
}
if ( value = = "— " )
if ( value = = "-" | | value = = "\u2014 " )
{
return true ;
}
@@ -270,16 +272,27 @@ public sealed class StandardCriticalTableParser
}
return value . StartsWith ( "+" , StringComparison . Ordinal ) | |
value . StartsWith ( '∑' ) | |
value . StartsWith ( '∏' ) | |
value . StartsWith ( 'π' ) | |
value . StartsWith ( '∫' ) | |
value . StartsWith ( "\u2211" , StringComparison . Ordinal ) | |
value . StartsWith ( "\u220F" , StringComparison . Ordinal ) | |
value . StartsWith ( "\u03C0" , StringComparison . Ordinal ) | |
value . StartsWith ( "\u222B" , StringComparison . Ordinal ) | |
char . IsDigit ( value [ 0 ] ) | |
value . Contains ( " – " , StringComparison . Ordinal ) | |
value . Contains ( " - " , StringComparison . Ordinal ) | |
value . Contains ( "(-" , StringComparison . Ordinal ) | |
value . Contains ( "(+" , StringComparison . Ordinal ) ;
}
/// <summary>
/// Trims <paramref name="value"/> and replaces every run of whitespace inside it with a single space.
/// </summary>
/// <param name="value">Text to normalize; must not be null.</param>
private static string CollapseWhitespace(string value) =>
    Regex.Replace(value.Trim(), @"\s+", " ");
/// <summary>
/// Replaces each non-breaking space, carriage return and line feed in <paramref name="value"/>
/// with an ordinary space, then trims the result. Interior runs of spaces are left as they are.
/// </summary>
/// <param name="value">Raw text taken from an XML text element; must not be null.</param>
private static string NormalizeText(string value) =>
    value
        .Replace('\u00a0', ' ')
        .Replace('\r', ' ')
        .Replace('\n', ' ')
        .Trim();
/// <summary>A table column identified by its key, together with the horizontal centre used to assign fragments to it.</summary>
/// <param name="Key">Column key (the upper-cased header letter).</param>
/// <param name="CenterX">Horizontal centre of the column's header fragment.</param>
private sealed record ColumnAnchor(string Key, double CenterX);

/// <summary>A table row identified by its roll-band label, together with its vertical position and ordering.</summary>
/// <param name="Label">Roll-band label text.</param>
/// <param name="Top">Vertical position of the label fragment.</param>
/// <param name="SortOrder">1-based position of the row when ordered top to bottom.</param>
private sealed record RowAnchor(string Label, int Top, int SortOrder);
}