From a391a1421ac119c70ebb31ce48ec54684291a02e Mon Sep 17 00:00:00 2001 From: Frank Tovar Date: Sat, 14 Mar 2026 02:53:33 +0100 Subject: [PATCH] Fix mana affix column leakage --- docs/critical_import_tool.md | 1 + src/RolemasterDb.App/rolemaster.db | Bin 630784 -> 634880 bytes ...dardCriticalTableParserIntegrationTests.cs | 64 ++++++++++++ .../Parsing/StandardCriticalTableParser.cs | 97 ++++++++++++++++++ 4 files changed, 162 insertions(+) diff --git a/docs/critical_import_tool.md b/docs/critical_import_tool.md index cf0dbc8..5e481c9 100644 --- a/docs/critical_import_tool.md +++ b/docs/critical_import_tool.md @@ -240,6 +240,7 @@ Current phase-3 notes: - header detection now tolerates minor `top` misalignment across the `A-E` header glyphs - row boundaries can snap to the last affix-to-prose transition between adjacent roll labels when midpoint slicing would leak into the next row - affix symbols are learned from the footer legend before body parsing, so symbol-only affix fragments are classified correctly +- affix fragments that cross a column boundary in the XML can be split on hard internal spacing before column assignment, which is required for `Mana.pdf` - footer page numbers are filtered out before body parsing - validation allows a single contiguous affix block either before or after prose diff --git a/src/RolemasterDb.App/rolemaster.db b/src/RolemasterDb.App/rolemaster.db index 89928957628153f21dd048e2ede04344bb16e309..f9fa5328fc6b765397e1127c2f5b6f1df2d77c43 100644 GIT binary patch delta 14519 zcmc(mdz?+x|NqZAXYalC+Gn40hM6;%3qun!h8YYa6lO$jA>>XOr5d@7Tn5LP8R=r? zoMU5m@hKlkDoT;aOv?nfCL2$-yo7qLC4HAo*mOl^Ln+5+V|RL zYB_{{MHA!0wWx~(0;wdmTeqp>o*Xyj(W061`d7yX-J_rawYaMey5hreHNpA$HNjGR zI8alXmseBji4Xg0O1pNgDRtKl`$Q>AU7z^)pKa6!eVgGlu9pmb`F!K0<=MLb3IBNi zL;ex|!M@Gw_bxxJE8w1L%r>^KH8Q6}9sxDYmJubI0#kv$CKLrB2)T2-~gnHm^PM~SlsR-E;sGIDXPrguZf~Qbdh>N)0b*GaR z>K)dWSIA2(N$P?jUlMD`m+m*uXuHXF&p%wgXMLsSq;qniHDkMndFYzJYk}_q2d$?+ z@ucc^i&~@#)D?&q@Y@(HzZLl1<$p+-WvhpjX~Ne;ppkX*OHXP)swrfXq`2U=%(b3M zljbqWr?{Fe(3(k>)=a+Kje2v*a8GBKSvbX0Y3-jR7if}yysMwpu~^v>{hbAwpHY^}vRN}WoDF1s*jzT9zFqs7>O30- z%1Rq!l~>JBl3FiRpu8gV%Qj}I-2^%dbg)sNyl7*r@`5NFC~%)ZPa6fw5*uTc#Wsr7 ze!@3OV5p4(C2C`=60uRNj^=2t4ij>ajRNI48)KDcZ9J*oCQx6Xp0($Syphx`R;Stp z#p)9Rj|vo+t&-Gsp{Hy$OAQOm7MLzDMPP!!c!9AtW~xsLOcp2+;(~f~PjyY=2YEHXA0XbrKiYF~w?$z#M@Y=I_aBvi06gc~O@6 zSeCNY%+FF9n~kzOeb;Bj#YxsvIm+)!sEj~rh$oP_va_-Z3gY9+Tu@NQE@Lyiz|K7Day{w$!XnAjVqm2Q`DwyW=>X%Ry3}(sHUiO+mV@Va?&%E zxYDULWv#L^TjvCCljBOK)D&g6&1{twmmtNJPIij2GqZEN0Z&}%q?)3vwwc*kists0 zPPB^{VX0PSCndLQZ42IM+Jknb9cf#dMVrxdc*D0_yXPs5iMeJf8D>^bA?qxqRH^N@ zuKuAaZr#pc^a{^oGfkzL)YKZVK-mZxY@HSNoBA#>Ut6e*GvoeHf3vi5<#Hp)*O0<~ zvFJKw0!i{Dd3>JXo@Jizr25imX}R>L^nve6A8WLY+w@OiMi9xta(ukIs?myOM+m(HYWcm(Ps6rzB>0l3ZOx@qD@zL>nQ> zXbc4IP)ZkihQki~-c$19>lTqHt;A%T_yn0v3(YzU{Cb6NFQNy{cb*gF z2L0A7#_h%Qgqg8H#4Mpdn}xW&l>TO}-JrBEm#m^qEcg4$A=zBCO-VA(Z&t>e^WUMx z=E~RTA#3dxWhSv!uBAuG#_fufEQm+ zy};d-+^qTQTKYf#nrU(Js3lA3{nq@EG?7@3&ZVaPa!S=!kAAePz3JI#RrIG#?M?Ty z*5(1Ux4lX3w6uY=EwL7zQF^+qV;XHLx2oeVbU~XkE6~8d+y8)Xi|=;5$oq@8iB_rg zVQ(>)T0|?A?eaA_SIYJ5@=$l7(J*d=wJ44*C1%eonyZ&girnRM*^ODr1na;gy4cs) z2dTp6`qdSX{hI!@_XlkfYeO$9>*c#7-Te`H!}Tj!@3P=PG1E$;trEi>joJ1g-0pBr zxZZ8G;!|DaZkJhi1Z`ygGMqjWOfZUtza$VI+tT*O$W2okNz=?%N6?>w-Hlo3&nu~u zY5Sw(ruE`T__e_n#!U3x5uVt|_C>&VauiJ=i6xVb8R(o)l96XSE5I4)OvjL5XJb10 zI+PH;V>z7k#jdoWmEM*5NwB@~BnIS^P^{Gr^JpGTwjRu*J`&6|reQ$q5{A{fVFtUw z5nj!w3JFdyiZI~Ol1!}I4eP^hbcF4jDtzHb!pZ$b)oz%X-RZGl8)FLY)-pT|Yj(qY zy9aGxb?iYU5}a;K#(gJ;yJO9+o4f9%4a}+c(oexZ$?>V1 zyt%eMt#1ylPxl6g86D9-DBLaHK6C}_x~u_BH3v1IZ^aiG9lZRpoH&+y%gOmAM`)uZ zpVPdA2F9Jbcx3B6(q7c+oS8;wBq76SkDl8nV|~t5;%XCxlZlg82+J`f+LTAlR3hXK?G#cz+v_gMQxCz$k zta)95i?mCj<-yiQHu^Ke4X|ow%||F*5v*^(lf+-^I$x`pdyi@SW?{HQm)3}wCPOTn4O1oTf0XZEuF z<*@&%K5%UZ_JNnBzEOzBuNNMU4`8*`vakJycQqczeLI8+_eDvy+3!BOF0mxdcntST z4v)loRa>L8Xn+L!8;|0Cy}|+RtAPD}$cBshN;Z8pIL;W4{?Xy&dqth9&26pdZRX5Y z^xZ^{@d*0og!A|d93$8A%y}Mn9dmGNc%jmahnhF~*^90Gk4Nt5@g0` z^vU7LLEBeBu9^)x(kgI`LYF6eH(uMTW=1DkRrh{lq;Q4%gqvj8d&S6=dnS%E|Jjem zn|WbGO+SwwCb# z`n=&z_uI9OlB(C&rBf1&HwL527j8dPT%sy#$L-Wjg47sS9g(qWf*W{M}l zR6CP0Yh9Q$BIfdWa2K}DBeSjCv9zf&1n&3{5}|wG^W|ya=fHWu&ww+4yMdE{p905| z3c3q82DlSA1h@m(5BLeN2k>KHXHrf-veKHf6CQJ40rhz%(q?9EvE0YHIDrP->2a>Z zF8?XtK3^|=xp%#`ge_1%pr0$B$$O=3o)_FJAiEqkn@)l^I6TN0Abw`}zHmG~v6s!~ zn!$_Gvl(3pO>}?sKN8O5Hzfj>VPXbko|Fvwad4{95B(Fu*?j*B@Sn?|Kf)Ef&$)tQ z@d{ryCuP!P(~}8zz-#nHzZ~wwZ%&L{GJP%RRcO5X;87=sbFfyI%<`6S)|{4fPhxnE z(HjG%hdc0q2<)3u3#$ET9Ib?E_d9x zRfl>A`11aTFAAR&ch!T-bf_*pwQ(LLX;fb6y5`>IGWUk%I@XcpG>3%f@ANzRCH;u5 zr>{|yK0}M?1Uj1br+GAoHlg*XSGlhIrTn0LqwG{Hy~Udd2W zm4HIz^YYK~A^9`;19@%WT;OEjU|@G(b6`#2Wf+kGuam-)Y}*cz8edZSbvfZ@cxc zrLUmRdzdfkV$dt9JI&gAM7_%mgE_=y#vN0G=C{Yx=~nS-8kAeXdzWYj@zEz|D9R3X zyWRFERi%3M7PUvcX!bjzHt~GR^35Cj)jIKI zSvlk{E)w~O5V79+oeoY9`8tx9J-t1?9A67x6JMIIp0Bpg>yz~B`X&7@{gnQLepLTP zuhe(yTXjocr@x`E&|lIQ>*e}WdPtw4Pu3sP$Lb^WL3&@kyWUxEueZ`O^mIK{Pt;)` zS0~=9-t*qm-k-h4y@$N}yq|eL_I}{q;9cuo<$cBbe8?N~KI<*@&h}38KH+`DJH|WI zJHXou?q5f58*g)Ox;Mp};MKj7R;^ut`*>12rtOEm^bT!{wn1B?t<;uji?xMXnKoN1 z(h9W#ZG<*Z>#gN$9kn)Eb1hv<(GoOWlUOyoz)rK1>=@h6Dq%!*3){ffu+U1jj4fsh zSs9znidZ2lU?X5SwKvOW9a$UJoTalAmcVo-snzNQ^|X3YJ*MtgE7cw97IlNVMqQ~c zQx~fX)iQOqTBH`L1?mWOpxRr_S39b0)aGisnxZDCx+>9XdV!v%C+RV|pH|WxbPL@; z*M#Uwx{NNS3uzghO^awDEubUlK-!z;!#GnL+MK4-6q-PFDk;^<1?9AIQaPsVS1Oeq z$`)mVvIYihm%+H|LZwWZtrRJRN`W#$8L0GD@|BKC8>P9DuB0dlimpgd5643-XMq8$SdV#@?v?RTqe(!i{wJNKpr6vlzYqha!0w1++0qVQ{)6$mnErM zx*(mFPD;n5{Zgg0L)s#3kk&{mrDf7$X`xgm&6bL!La9I+Aq|vzOZieqsg2ZJN|#ck z1WA`9PqpWQ=d|af=a^@|r_!^-v&FL^T>it5+@JOg^!M_}(meV_5|_Vo76@eT4620|wSo1pm^ALtyY z>pufSHp`)n7~{|N$N5hAw)_6!o964|DF_@6tP4CH7!$}1#6hF7-T%CQl7EOl3)+tp zzD>R*z6ri=?6^HQK|Qv0Mc5V23idr`x$wGT>@BXN>`f6DVsBvJY7Bgxb0J%W{3mi5 z(nKyomLs1=mLlgOXCjM`6OoT`h8D7MSTF`T961=-AK9DpIrbWIIr1gsQe+Id5V-(Z zf}De#VKd}$}=JY+HQN#tZ? zA@ULAL&%XxK3xXKzaMp9muWkek4Omg6d7=RpdqF8RYMrWh@_gH}Xzoj?Iv(jAda# z2C^}-Auew4ZDJ|XuA>=O7$V2gky@Seb0fz<-93h)I2 zb5*_wV6Mt1`sb>AGJmeh=j-RF&)AW3)G~n($DQgdA*TsU5_nwTVSxt)MhHA0Fu>eb zPi@!tXcFc3-mWhk{S#_XP&{j6h$PIm?rak8r4MR zi)f;}LKE+Gxkk7@rny8M6PYo>L?WtS;=M0t)gq2v!#I&SD_9l$1YT6mE(0Ma#n>f` z5m`0DL=vlDkTamgE@$UBW9%%(iL4u8BFR-Sky6Up9~dW^i6|3UIl@E|tY9MLlrxcX zVhrAJ`hFcWZ4)KNiE7m3W=}}F>VXSi7Xsr zBI!k$NHGz%3F9n`6InXO?tPyZM48Az5w;!!-@`zW#$!w*%_tK&D#G5zxV0F!W@DPV zXt}v*ySmj}v|VisqaPa;tNCsX@-^7j6z8O>W}m;9=wjZuTEU zuH$C!exw>VyZ0j3a5L{7auqkbb|qJEvr`vRg`2q@$z|MZ-Hu$+dWkc)Xhkk!ae5}X zfSbYF$a&n1OCaZP(;FaXaZ}OA8731LmP?h%UsmUh&8~B9&1Jo}Pi;a%`T7lg zlRiant~^F}>u&EB?~~pv?Kc=O7^T%^UrW2>d}V~Mt$s#-Q!mi#E9+Fh_k#Cr?|5$` z7-Cqe_0u%AL)s)iudMJj)qm8N>4Wt+<&ruOCi9njhj|mUz1q_-fm_WsLf`mV<&-Z; z->+BbJ>=uGqx#|y?_n5`>J6h$+qGF*8}=t#E!~j!(H1^k-=)vf?^M!hiF(5OIjqpS zORLh}gRzUo>;!vJ9-t)BB36WhUn#EV$VoQUQO{b+*XqCAK|Zg}s<7N1RldnCoBQ{x z+2De?dR8;q$YulWdsI6orrRB zk?k}O@`!t!h%cGyL{vb;E;Hq@+Uy~9t`qT};ZwVA7uly6pX^#dzF;-KHre$QD3`j! zm4d^+ut*V9O-=qY>}vmvi)^>K`mmbPjA>3h3>Sl{iHCt%r(Q5Wz&bgD5#5$ih#P{s0vW$9aRqMoTFmo zIy>vAD5x`zf;nRLm!m33=sNq;kuU+w{%}-`RI}5Lf(c&syQ3nYesdH|;<8^IRSxQu zqhNxT{o7G68_RxiRD@g$v7a3Y)2-~Jqsl@3gUL-tL`kx`IeI|^ph*jJ8%`7^f9QRSe%bQDaGvAvFpg8ITyFagFY9R<@} zY>%U0hKqgf3yHZ8`!zYwK6IS0AcSpk6f6K?A2=#T&ausoih}x=qhL`7+vKPUP?n?0 zL2YzYjGSff`;-u$da>WHvuuOojDTA2s0vW;IjS7gyN-gD6>OcOU^NAM$59bbZ#$|2 z)LKWCgIWVB&z||Q=ez%6Z#m8=s5c!I0riHXDnPAvR5_^EtpVSu@3^VHoXm#BhgQ?c z;nCdyd<)+@9g~FK++j_784PbNf5Dzj6C3x2L%MH@ClV`!lyE zx&4XTAG!U3+Y?|zZuUJF$GQEE+hg2*%k5Eak8pdK+e6$Qvw+poF(iranM ze#z}#ZolBRlG{DD4YAL;_>9{sZZC6tiQ9|Z{`am>=zqUN|J5rL;=k|zdx`#bg?97! zPr2R2?M`lYaQg|jA9MQ=x7)eh#_d*aKjd}`w;#wM`_bFX#lN`S#I42cMsDBdb_2KT zxqXk@ce!21?K|AQ&Fxxl*Kqq5w{LR$2DhuZeVyA?vd6mmv${%ww$4-Ae0iZ#Xu%TN zRqmdeYzJr(w;pcY+!AhGwq-ZDy}|8uZmYSy#_d&ZuWUS{k}Rs%B^j_zf9Y77Yp>OR z*jgSY&l7w3MH=rT*lnZzscpB4NZt*zTSC;l8)i3tTtl;m!%N^p0m*xy_Q!#(L;r`r-Tvtz_kZWN!_9;?m;YjDZIK6%yhjR6B=kt3j|>~U zOA6#&QXub=0(qAdT0!0=1@bN_katOeyh{q?T~hmdj-LR&1ChK-3gx^@3f#%d;hPru z3GXA@pT=0gyQH9bm(>0S=jHI3j^tfZyX8gAyQI)+@-8WmcS(V~OA6#&QXub=LbJ$w zq(I&y1+E{%4+O0zl6Og=<>OsaAn%d_d6yK(yQIK1JRX`v%p+h1l24d`P4eIO2YMXZ72YD5F1zCl>%xQl)yIhth zi;*$S6I@eV{yn~Zu!6U@rmGj}r^*s}g4D#5;%?w3cv-V!ncDtfx%kI@r*W)A7x|;@6bNn3*QoEC4<#}+ZfG>?`|{~-Y)I97rrgb zG>rz$WQ{h8zr)B7-rMSNFMMj4_i40VDA{Nx9PvKxfUk~>);-`~@ti3SPER)O_?s8L zLNZ$QfPd(1`Zq6pm1Jc1fPd(1@;5JhrQG6$^?I<{lvs@y!3XZrb#5te(*jeseExO^l+^w2E%`XgO$4>^tCHtd{HoT zjIEC!pv_3Ii!m2{ol1gt@P&bH*T$hVi@>VsIp~^MQtMIM1)Xd2>M&a0dT1E!L;fG$ CAL_CI delta 12257 zcmZvi33L?2_W!46y1S~nXL2B?J?JMnFt#)Pc>>e!j z@95x5^WvN{W3!P%@_JiH@%P=A{dmsfSx+s>RbsR1gTsc^2j$qz`ry!^^+8u`MtyL| zkousrVLA_z_=Am}x!w!U_SCVeIM%InZw%388B#y%+zsg$_sY86D-)MSo@7!^WHhsj zBLkRS6iH$>JJOojtVly%nyXhRa{y|d9=dLzpfPoEtm zHDL$#q?@)KD}C4r$&!~HR5s_ZEY6{soI^4=2d7&JD#@4Ut z(yv}O`h&%WDlUgp&9#2>OJ94vyYUPbn}S?Ur<9JmSl1%bQE%g>j$-L8e32>h-Kyk} zoX+@csR-p*M>k08yG!wwZ$^TTAO)#Dy+<&wI~! z|FFv5bhU7I^qz4TkExUG$~>k{Sv{;Pp=_GWC_V2FOqd+C3%ayP7c$m z6Rt^g<4;OcRokqYWULkbSsAXl($NBX{-%=PPe)7aI;W#78*|moHs+`=@nD$4MgVp@uS#Ar`U5ZA|2(-PB?h7Nao zf{2A0*tJMa9Wq3Z395Hek`q%?`VZD(f=YcTB{eZMsrS7kCMe$tB`2n&^i0DsL05gK zS88HPQnwx`Cg`jW^-P_S*ej)7q8byF>SH~U6MLq_wN+w*j`~n?YGRM17@r&y@ZSj~ zCnl%pURO+@s6LdGnwXrVYEC0iXorYI;;nQ2)!sv7KfhSQ9VwUkO!b?yD(!M+9dO|U1xz5%ul>~XNiz`hRlDA?D)9s&C**jlg_ zSpQ+bD_{?SJqY#y*!^Jlf!zys57^yccY%Ew>`t&dz-|Y-4eVC1TflAx`x4kqU|;kD zHiF#%_64xdgMAL{da&!jJ_~j&SPFIx*wtWHfn5o<2J8y3%fUv#R)bvzwo38ajb8~) z1=w=1WnfFehQWrwmVga{4S+Sl7K7ynQw0AD!4`nc2b%{r7wl57OB7dMw)-4#7K2>` zHXCdf*i5h)VAH`a1iQc*SE?RyI$aJc=6k%{soGIoP7hbAPg~kD_4hWaH&c?GlP$DQ zT_D{nUzfj>H_Ef*maflT&$%9PHFbXMTj8#Udk!LJ0J~Yl4uNiy1jl9J8ukoXI zGP||+r^W$wQrG$^N`_>jK-4Klg;B?kmf8-D98eqGRUm}cL~}~eil`GrRZ!r<0d-hx ze1;T8>+8jIsRS*j@l{GcH9lPmqDngNpbGy3C`|WO^0{ zUxc#g?n7#~1h#`hv@p7ZLbO0QjTTA;XukEIL+TewY<#8^L2uWmWlGiPE&9(RYPz2Z z%h2(tP=$^}ol5jd)Tu!GqE0z_IqH<5tx=~Gy%=@E=((s9LNw}>pc>op+cU&W))dR4 zQUH}j9TNqkPBAKqI)0QJb&AlUs8fj2>zxdC*k__nHF`7ZEJMfKW5pyjL#jfrMx{!0 zDC$(8y-}we?Tk8QXiL;7MH{WHN7Z7<&(3WGovv51q+;ZcI)0QFb&Am9s8fhC>Ya3U zf=<*sS*+Krj5I zdMArHuSK0obU5l%p#4#&9PM&X5N(<+RicCSQa0A&67~3-9a69ZLAhXGneHz5F z$Dm^RMH-64cB%FYp?zjQCb0fN&E9BztX5ql1SL0?37Jet>VxuqvDJkhzA29>*-8de z>iVFduXS~SP@0)uTS;fihQ6pZ&FzaaeABD*g*@3z?9SwD$xm355nOzbg|{e-yiZjoJsyC#ohKOru-TTK5S zF)+4zK5vnL^>+=mP&zbl)HvA9t*vK*J6%7eT_MNucC>;$#m-Td$kSY7odK!FQ9}nO zDNk7|W6)}ezSaPF=)$FFoNq)VMl|nWvq2g=26l&`?=MA`k3_s;H;Or8Ff@FSKA(&J z@HL4TLWwgc+e(%cq}}sTb9y=t{mVBm;t`viV|KJGS0qSh6{0L(Y(x{g^O_TQMFQ4$g~%l_iTtmWB?YM8kDAiP zesnD^5EsEhZ(vTf_n0XKtb}BwNWP~dNYwIivpcU$fcEWyKJm4Ws6uIFqE@U>I@@UH zo@ljiW<(LnRI>%IiAnqPVuR(Eo@lpkTtpV~{bnp5m?k~k3mx+nL|j5yVh-l@FzKxn z)RJyXLGSsdMw~)^*u0N-NRwVqWeu|}6}{ZS!>%b_hvo39o}2Cu^v|__X>s@lISk+$8jlZ($@sZ1X9zCqKSYdhFkF6MAwoI_Vo5X(i+l zrZg4~vXpL^f?oHvj%qk>bpUB8|CbK48Y$T9y=|4`iYQYGk6#zWI@OAwOv*+6R%z z+srgmcx6iHH;d7?;?#|D7&nbJo`;?^PQ7EitDih`IuFlTE(gMdQ zI(Vbno{mmJ$7zS*Xs&O3q`TPOXw%E%!+48<<{K$kyr9Hi*yxIygAk0 zXQot6J)>EJZXbny@HL2Z5wgdeU~f5tmE1lWNmA^fNM|vs`{j>-DwWgM$DsYJK|6^u zEzB81nUXD)QGFb1wHL>tpV@VyqmUmDwCAl>X6+b( zAp_aXys@%fUf@cWt~>H*@(tHynm-6Nwx+a4(7hLQe@GetWH=FLpxBtWKTm5s((?C-j~Hk_dIGqMvtW<)7({REc%-zCwR0kZ5l| zQ>wDUUF==YkM!Y^`J~xw0+ZMTmHyt9wNTGUZ&9ebxtLd~ismMwRu)QRuX0&$P7@_F z%@Mp(l{BFn`i?27ca$`{6J$x1bXj+HxlQSg-iZq|iKK`viyK2G(-Q|X{mnt>N8hMO zFH!tHvq>^ksgj-^jE?x0MtTY*+e{qI6gKmtWGFjn)(~`>T_bvk?KUwVv3p83>-9~B zv5vHVDBJGfNU|v1&y;vas-W)sSn12dkbPF!e9bY;VVw7b+za)+T6g>d+O1Ai2Fa@P zuvFm~#x9+0tYPC(SBchcAcMN$erZ8o!$)N$mbHR7kmj22U|5L#n2u_mY;@!98RwyBzk54B-i8 zirJ(sOJMVVdOHEV%x;l`MNzLgho8-5bWj`Ct^d{AY7hqx;&i+tI*w=r-2$-ck?w%K!OlFw$Z#!`Di5VX<`1>OBW_QvL8| zKZ4#7JS+IN;2FWwz-n|#@TA~df^Q0*04_ss2-XQ67d$5TI#8ZUBbS3xdxBXR&8z5m+y{4j4ku3a%BTf@=g<153~UMR#TCJM2=zV7!I;L0|Mh}=uPpj2Nlpbv61n`slX`U5z>`(WggR}+PzQZXuyI)0STE~xJM4|CQZ+YnVeLT|i8+Jrk>t&XW*O7x-E z)WG3=s5YsCUZaQfJbi&aTYpd=uJ_UJ(Oc;ab*%kIyRLnyUD8f#N3`9v_A0X9-A|US zpT9*%WctQS(!07QOR-M-q(5njf5(^beq4>8!6W_JGA*q6wWV67 z_Ov#KoM2y$3&>QGN@DR1{4U;!OYvMhSevdrtc}w~XoIvqS`YFXSxXj^iKIJmNvmJNp&ipDTkHi%G20D*U$-ewaG!_+2EAbFVtGKMxCz?R}+*!l}pMV zrA&DYt87poVb_#}Xe4T*{#U)M?qlBq=c)tL#>!91Ic2L-tV~lTDqR$({E2)(u8<#- z`^pVn-@8t@)_HHRueuw(OTFX0oxBdkKT2sM|0!RTcgg{In%qOyTwl44xN5xDymj8S z-VEQSPru0xW`78N|Tq8d%5An-QT|c?bx;A-l zdM|jldyBnOyh&bcTs5v3`;0R7mbteP!E@7d!L!vnwhS-|| zyJ98yM_>?N2bSW00mJy8zz~}h^StkYLHrM30AB-|`0v1C{GI*j)$hmO+APA~h_tUo z+E*g&OGv|4dDh=V))yk{bCLBK&-#>SeIl|x7FkzB)<-<+GSB*uMg2}k0ACVmABePz zJnem+b^)RRd|qU|C$iq=Hh?`!Rg;F2XP`WmzyUWg9ypvfzunAHyf{B5Xl4 zfDempy&|&UgFS10_+(y$VN%LA29r{@v3(*9KIr@L9)6TCC1qJKBV}1IBV|XqlefhV zo(1z!mId=smId=smIbQ;{CG3Zg4w8_rNMNRrNMNRrNL?eKiMm~PZMjoDL zBNsnuqYyt~qX0i{BOlMTk%#Bl$ixp;<+LOfj@_9HxJ z8qb-^bEfc|$umd5jRU${72(6ClE7n}qk_O=oTHq;z+pE}K5nWAA2*d` zBb2Kk8-V46kDM}Mj~sTLt0H{pR1$cMV;fv268YdMBYf~wkkum2hf5jZ!=;MAV;j4B zuK>M*EEjovyp$0>UaANmFO{T9R8qtRjnm zl_VQjL9&45BvZr}x+I}25XyXD8F>a+MVTVXtRjy9E6FsVzk*DKKjmZ!u#8LwR*^}- zO7bwUf;RC1GiPd+2>kvejKY$j{)RlE*A#Gc5` zpa8m;y&~DCKCQM>zF?1Y6BTz4`4nNJiOk0raTOkp9qfT@5$dL1W6yrm)kew%r9v4d z-;!&|OfsHm_$^#y;S0EffyTwXbPWtU`6r%*CPsvbv=@L$kYvw4CJ|P3ys#%BA z>1^f*GH(EF`k|0MCVlUcJ|=zW?0V@6Nx4h9LVD4yOhWbsKO#vqVK0iOzkZ0>3vm{? zOuEu;cBJ2B+zEF2A?a{e@efIRy7V$mrX%)1{VuWBvBzxxVeWrGno;>9QK^e0{;r}I zNh3O{UV5K2yi0nYG@#2r!d;r*iIqqfi06(MWS*N|W*NqvSc!C=V7lC{Yx^s>_1(gI z#6@3VLjO|fT_W8n5oVs_ju(>N#kcQxCCs~Z#|tv=FS`8-ZXVriNIHlAr2F=ucy?OO z;hXe2lV!UIH9p2e=#)*UBYk@h>O$9VLhXpt+0+q`&fsg-?k#A5Q)VyohSMLnp*VVW z6MC7l7dioJ;^%lTTSn-Rb}K*G3!y_6TW$WKlXd9jPV1*{*}PJ=7h_qQ|A`yA=%uha z>hKLLCD|qJS8D(7yJ7m&L5qWlV px3iVQc*k?Wy+$9Sx!FU`X?3--K string.Equals(item.Slug, "mana", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + var row71A = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "A", StringComparison.Ordinal)); + var row71B = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "B", StringComparison.Ordinal)); + + Assert.DoesNotContain("+10H -", row71A.RawAffixText, StringComparison.Ordinal); + Assert.Contains("+10H -", row71B.RawAffixText, StringComparison.Ordinal); + } + + [Fact] + public async Task Mana_affix_boundaries_keep_71_75_d_and_e_separate() + { + var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + var row71D = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "D", StringComparison.Ordinal)); + var row71E = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "71-75", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "E", StringComparison.Ordinal)); + + Assert.DoesNotContain("+16H - 6", row71D.RawAffixText, StringComparison.Ordinal); + Assert.Contains("+16H - 6", row71E.RawAffixText, StringComparison.Ordinal); + } + + [Fact] + public async Task Mana_affix_boundaries_keep_91_95_b_and_c_separate() + { + var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + var row91B = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "91-95", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "B", StringComparison.Ordinal)); + var row91C = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "91-95", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "C", StringComparison.Ordinal)); + + Assert.DoesNotContain("+19H - 9", row91B.RawAffixText, StringComparison.Ordinal); + Assert.Contains("+19H - 9", row91C.RawAffixText, StringComparison.Ordinal); + } + + [Fact] + public async Task Mana_affix_boundaries_keep_86_90_b_and_c_separate() + { + var entry = LoadManifest().Tables.Single(item => string.Equals(item.Slug, "mana", StringComparison.Ordinal)); + var parseResult = await LoadParseResultAsync(entry); + var row86B = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "86-90", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "B", StringComparison.Ordinal)); + var row86C = parseResult.Table.Results.Single(item => + string.Equals(item.RollBandLabel, "86-90", StringComparison.Ordinal) && + string.Equals(item.ColumnKey, "C", StringComparison.Ordinal)); + + Assert.DoesNotContain("+16H - 8", row86B.RawAffixText, StringComparison.Ordinal); + Assert.Contains("+16H - 8", row86C.RawAffixText, StringComparison.Ordinal); + } + private static async Task LoadParseResultAsync(CriticalImportManifestEntry entry) { var xmlPath = Path.Combine(GetArtifactCacheRoot(), $"{entry.Slug}.xml"); diff --git a/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs b/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs index ea572cb..15db26e 100644 --- a/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs +++ b/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs @@ -11,6 +11,7 @@ public sealed class StandardCriticalTableParser private const int FooterPageNumberExclusionGap = 80; private const int RowLabelDuplicateTolerance = 15; private const int TopGroupingTolerance = 2; + private static readonly Regex MultiFragmentSplitRegex = new(@"\S(?:.*?\S)?(?=(?:\s{2,}|$))", RegexOptions.Compiled); private static readonly Regex NumericAffixLineRegex = new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled); private static readonly Regex StandaloneModifierAffixLineRegex = new(@"^(?:\d+)?\((?:\+|-)\d+\)$", RegexOptions.Compiled); @@ -55,6 +56,7 @@ public sealed class StandardCriticalTableParser !rowAnchors.Any(anchor => anchor.Top == item.Top && string.Equals(anchor.Label, item.Text, StringComparison.OrdinalIgnoreCase)) && !headerFragments.Contains(item)) .ToList(); + bodyFragments = SplitBoundaryCrossingAffixFragments(bodyFragments, columnCenters, affixLegendSymbols); var bodyLines = BuildBodyLines(bodyFragments, columnCenters, affixLegendSymbols); var parsedRollBands = rowAnchors @@ -460,6 +462,101 @@ public sealed class StandardCriticalTableParser return symbols; } + private static List SplitBoundaryCrossingAffixFragments( + IReadOnlyList bodyFragments, + IReadOnlyList columnCenters, + ISet affixLegendSymbols) + { + var splitFragments = new List(bodyFragments.Count); + + foreach (var fragment in bodyFragments) + { + splitFragments.AddRange(SplitBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols)); + } + + return splitFragments; + } + + private static IReadOnlyList SplitBoundaryCrossingAffixFragment( + XmlTextFragment fragment, + IReadOnlyList columnCenters, + ISet affixLegendSymbols) + { + if (!LooksLikeBoundaryCrossingAffixFragment(fragment, columnCenters, affixLegendSymbols)) + { + return [fragment]; + } + + var matches = MultiFragmentSplitRegex.Matches(fragment.Text); + if (matches.Count < 2) + { + return [fragment]; + } + + var characterWidth = fragment.Width / (double)Math.Max(fragment.Text.Length, 1); + var splitFragments = new List(matches.Count); + + foreach (Match match in matches) + { + var segmentText = CollapseWhitespace(match.Value); + if (segmentText.Length == 0) + { + continue; + } + + var segmentLeft = fragment.Left + (int)Math.Round(characterWidth * match.Index); + var segmentWidth = Math.Max(1, (int)Math.Round(characterWidth * match.Length)); + + splitFragments.Add(new XmlTextFragment( + fragment.PageNumber, + fragment.Top, + segmentLeft, + segmentWidth, + fragment.Height, + segmentText)); + } + + if (splitFragments.Count < 2) + { + return [fragment]; + } + + var originalColumn = ResolveColumn(fragment.CenterX, columnCenters); + var distinctColumns = splitFragments + .Select(item => ResolveColumn(item.CenterX, columnCenters)) + .Distinct(StringComparer.OrdinalIgnoreCase) + .ToList(); + + return distinctColumns.Count > 1 || distinctColumns.Any(item => !string.Equals(item, originalColumn, StringComparison.OrdinalIgnoreCase)) + ? splitFragments + : [fragment]; + } + + private static bool LooksLikeBoundaryCrossingAffixFragment( + XmlTextFragment fragment, + IReadOnlyList columnCenters, + ISet affixLegendSymbols) + { + if (!IsAffixLikeLine(fragment.Text, affixLegendSymbols) || + !fragment.Text.Contains(" ", StringComparison.Ordinal)) + { + return false; + } + + var fragmentRight = fragment.Left + fragment.Width; + + for (var index = 0; index < columnCenters.Count - 1; index++) + { + var boundary = (columnCenters[index].CenterX + columnCenters[index + 1].CenterX) / 2.0; + if (fragment.Left < boundary && fragmentRight > boundary) + { + return true; + } + } + + return false; + } + private static void AddLegendMatch(HashSet symbols, string value, string pattern) { foreach (Match match in Regex.Matches(value, pattern, RegexOptions.IgnoreCase))