From 5c4d5402460baad8e6150ffcb593c6b577eb0e1c Mon Sep 17 00:00:00 2001 From: Frank Tovar Date: Sat, 14 Mar 2026 01:44:30 +0100 Subject: [PATCH] Phase 2.1 import --- AGENTS.md | 1 + docs/critical_import_tool.md | 51 ++++++++ src/RolemasterDb.App/rolemaster.db | Bin 159744 -> 159744 bytes .../Parsing/StandardCriticalTableParser.cs | 121 ++++++++++++++---- 4 files changed, 151 insertions(+), 22 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 9ff8510..ac9b457 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -14,4 +14,5 @@ Also see the other related technical documentation in the docs folder. - When asked to begin working on a task, create a detailed implementation plan first, present the plan to the user, and ask for approval before beginning with the actual implementation. - When an task is finished, perform a code review to evaluate if the change is clean and maintainable with high software engineering standards. Iterate on the code and repeat the review process until satisfied. - After the implementation is finished, verify all changed files, and run `python D:\Code\crlf.py $file1 $file2 ...` only for files you recognize, in order to normalize all line endings of all touched files to CRLF. +- If there's documentation present, always keep it updated. - At the end perform a git commit with a one-liner summary. \ No newline at end of file diff --git a/docs/critical_import_tool.md b/docs/critical_import_tool.md index 5db628f..4e96af7 100644 --- a/docs/critical_import_tool.md +++ b/docs/critical_import_tool.md @@ -32,6 +32,7 @@ The current implementation supports: - `standard` critical tables with columns `A-E` - XML-based extraction using `pdftohtml -xml` - geometry-based parsing for `Slash.pdf` +- row-boundary repair for trailing affix leakage - transactional loading into SQLite The current implementation does not yet support: @@ -149,6 +150,44 @@ This phase fixed the original `Slash / A / 72` corruption. 
The same lookup now r The important change is not only that the current output is correct, but that the importer now fails fast on structural ambiguity instead of silently loading corrupted rows. +## Phase 2.1: Boundary Hardening After Manual Validation + +After phase 2, a manual validation pass compared: + +- the rendered `Slash.pdf` +- the extracted `source.xml` +- the imported SQLite rows + +That review found a remaining defect around the `51-55` / `56-60` boundary: + +- `51-55` lost several affix lines +- `56-60` gained leading affix lines from the previous row + +The root cause was the original row segmentation rule: + +- rows were assigned strictly by the midpoint between adjacent roll-label `top` values + +That rule was too naive for rows whose affix block sits visually near the next row label. + +### Phase 2.1 fix + +The parser was hardened in two ways: + +1. Leading affix leakage repair + - after the initial row assignment, if a cell in the next row starts with affix-like lines and then continues with prose, those leading affix lines are moved back to the previous row +2. Better affix classification + - generic digit-starting lines are no longer assumed to be affixes + - this prevents prose such as `25% chance your weapon is stuck...` from being misclassified + +### Phase 2.1 validation rules + +The importer now explicitly rejects cells that still look structurally wrong after repair: + +- a cell may not begin with affix-like lines before prose +- a cell may not contain prose after affix lines + +This hardening step is important because it closed a class of row-boundary bugs that simple row/cell counts could not detect. + ## Planned Future Phases The current architecture is intended to support additional phases: @@ -353,6 +392,14 @@ Fragments inside a cell are grouped into lines by close `top` values and then or This produces a stable line list even when PDF text is broken into multiple fragments. 
+### Boundary Repair + +After the initial midpoint-based row assignment, the parser performs a repair step across adjacent rows in the same column. + +If the next row begins with affix-like lines and then continues with prose, those leading affix lines are treated as leaked trailing affixes from the previous row and moved back. + +This repair exists because some tables place affix lines close enough to the next row label that midpoint-only segmentation is not reliable. + ### Description vs Affix Splitting The parser classifies lines as: @@ -366,6 +413,8 @@ Affix-like lines include: - symbolic lines using the critical glyphs - branch-like affix lines such as `with leg greaves: +2H - ...` +Affix-like classification is intentionally conservative. Numeric prose lines such as `25% chance...` are not treated as affixes unless they match a known affix-like notation pattern. + The current implementation stores: - `RawCellText` @@ -384,6 +433,8 @@ At minimum, a valid `standard` table must satisfy: - roll-band labels are found - each detected row produces content for all five columns - total parsed cell count matches `row_count * 5` +- no cell begins with affix-like lines before prose +- no cell contains prose after affix lines If validation fails: diff --git a/src/RolemasterDb.App/rolemaster.db b/src/RolemasterDb.App/rolemaster.db index 183b54ec72aa2ef6c83414385abf942024f0d3cf..e1a6feb7d1b41871ad7ff8deaf22960c6e6e4f19 100644 GIT binary patch delta 8801 zcmeI1X?PUHy2rcUth032WM(2D0tsOW2WO^xl1T{rjxi!2Ms@-MA_pOWB6_$u6h6(4 zBo0{dpe!Q$zDRHdcTf~u!Sw_@B8ti)D%a!n+_$Q$c+P$9m;3F0IeDJ^fBk>os_yFU z>6+?y<~DfdHh5~9>b}|=dg;DBWBwV{>r!K4x%$=A{jsikJoQYhQ1|ZpGIpc|V$*=j zEk?z&G@X!Uy0=fCdnepKVaBBU9?S^YD%iWX3Z{oF73|eZ1=B(?73|ql1tA!9gC6g` zU#bIv!qpnNKt3l&$pNyS)Q~yk&twScO^QiAG0lIN=gg1HSIk}J>V46<)6Aqv8)Ns7 zuIpagvc``O1!yA-G~$7>l1N$Em~p`Xz3y7bj+7-vROJL{t_pR)R@NQE0|6So7P2E9 z5`!v9fQD44v?S8Ox@DjlpuuY)J5riV^s6ueG)KisN+YFKufBeR1~8PvM^RGJv!^dW 
z$+aeSq$JU$hc`e?6-tyuN~{i@JppQ53)zuGqNp@GK>aFIToOrG(RfyX`c$ZBbV;N* z(JGP|pk5V=+mWI~Zb3$XdQ`|RiNviyI6XkKuZ8T0o$!&g0L@Y%t0ZDunO-nyCJ%XZ zyd%FY&k4j@li$b{@*_D%&KT+b$-ZxWoxMvumpz@bcVzvRH9m7s#^sFm>B+Pmuu5O2 zO-glK8*4JJ*ohmOuIVr@M(32~guHQWao_3*dF=Og&^mQ!?LWfzJGR?`PN)6bVGCPZ z{Me1qHg#p)p|I&#vb%>)F2JOvEl$1h)F0tnotW%ZP7B&&B8G{R>t6~LI#ItkAT5}g zz<3nno1T6=+{=l`4qa%Y2qxm%qSUqxg{j#amV`SwH_Bv!=C;M8jmaq+Tc^5ioF9%k z?PanZ&ESW!FnMz0n#MurPx4U4BzFjldfrn#E00aDgivbjrpef~-L-`YOab?fkO z#Az*)ZD~`UjA5^BTb~TKb6Uwn0d2$+QA~{3Ru@b<1wL_=W>@oYTw9RpwEa)1==Oei zq?Z5lNH!*qZZAyj-aayTmy_@I8#}}u#DY|_9Ye$UPM+-3loEFU3sT2-v`g*TF)=vQ zY2o%6T<#8GLF$&B%~Sn$wh#7pn!BlP33nLtQxESL`ZCJ2$xDs!I3Z z^HXG3C5D^+YdD6jSMO?-dVH5%TI^i!redw#L(d;FVeI6*d+*aUt&MXpYFlRlYQ9s6 z+S2KTn&-s#?*3v;fwxaQD_tkLx3}%Gm*ui&%w>=0fr0wuX302Q%}mq5hF>`>_=O_| zKXXLk3P&7%;;`W|hXt27VsMco3Kuxy@FRx}KX6#^Jx2`Ab41}gjyQbFVZ%8N3%=oq zCE+Y*6u#z&!{0e<_=>}VGaNBE%@KtLjyU{{!-g+8Eck*W2A^|8;WLgne9B?NCma@> z;)uaXjwpQ05r>aBY)F2{X~72^F?gRN3h!~m;RJ^b$2lx`mm>!6a75u4M;zYfu;DEZ z3yyNc;0Q+)-sFhG8yq&g&SAl895Hy6BMPr1Ipgp$hYg20EI7mwgO@m>@FGVXUf{6d zc@7Jno8@ zd7NKrB*7@O1Xn`xTe)#g@*Bysl3xq1fWJ$AC3!~jv}A)|3jQYfrQ{cq zpG$rw_$YkpG6|o^z$wX-k{=7!!bg%HN`4^uzT|s?HE=@mxa7N%??@gKTn=wbz9pDk z4o79+h~%4skH8y}uS>or`KshAg3I7#$-|O|BwvzzQE(}|Ao;xHbDT+SDLg9!2L+cv zz2pJOXC(Ja?h{-LdnNZs?v~soxl?cv?2z0pxlM8_G6~uu*dhW8VYB2W$&Hd5B%c;s z0P7{6l3XXbR&tHtd{`~{q~sHlt0eKMPU2(ji)iF0vX@LJmU+S4X-+Y3G|n4ajY&qK z{~P~C{{(+4-&ek;d}GO3vVn{z1?CxZ9X_Egj4zERjgdxE|EK=P{X_kYd>{K(`fewm z;AAj_T!(LEE6l-W(0C7@%mIe!f5*SnU*Y%qj`|k*`jKN~3Avf{@!%w|!0c;g7_S)* z8$Au(f7qY&clG_3?*-p%Uq^C?JVd&fznjmSe=*DO;jcIDHxm9U{(b%#e%p7^x63!x z7hxxA_)}fOpXwSt;Z6E8`FjT7Viq55Wo$IRDr2GfMVT0ypOuNCxuQ&5!)IO@8_i{9 zEHsytiJ`fuOjJuUxS(8I!`VO?8_f^OSZKaiCWhv`GEp?&DHGRL!nexUXwE5Pq4`Fc z7@D)vByn-S624ZcxQ6qGGB%p8l(Ep9Q6`4wv@%gN4a&qdoLH2x(R`_lh2{%oVrV|+ zCdsdT6y0Yk71waeQN~8|i82JB@V!Pj-zp$xvB!**rxbsV<&Gn4MqA49iQrJ`uID1-0V zuvr;=uZB&^SZFpXgYV9;K^c5shNqRmw`5qa489e^Q_5IqlIxU3Sn@OJN$5h8MxK$6O=IU)oA(c;QQ1;Tlo$(o)i3 
zu`8iH>Kaky(rP{FdJbTHORJG#UfI%?`_q&BkfNxiJ>sSkyqu*ibB!ouX-i#G#4A|Z z64!{@mA2S5qH3irl7`o-w1ut_l`3t4Gp+s;3@p)scl>y*PHv7Fr!l!XqDp&AZr(9mTPZgO zRcb5b<}CxYl-%rBp*<=$d-c_7CJ9&tAZd5cQ5WilMar&VsY zifBvZW^RGDSZ)Tw+9J8>Bicf_nd#LQcrtVko<@c?|G;7WpIEv$a6-?_E1slb8TC#q zn(oK1`Jgf7|5ClHrT;G8{kwSg|5>~X{_o-)`|O{3yTYJ0KqCqBZexSL!B2c8-oc*x zvqxv$nt6N1==9stRztmhN^hkNsHFwDbdXaiE0YzpJ+JKInu2v70EffWa&DJ3*!C0p zV+I!(D;LAM@CawH>{3Z{DlmbI4c1kMx5ML{+hk@GZCZ{QTzRk?06qv=&LFj~Id0Xt zmg#A@vutOe+htxFt>V>fT#+y<9a^ywLFmWw(qUP6hBH9!b_yMpD@Ibo4yVHztc~9) zGt+68GBK1I_DC3pGm;5Q!+xi~+<6wQxUR5Uz4=mTiX7T;2v(KWw)Lj)yCqa+w)G z4SpGLDa1zSWAA6ZusYn|xk;w`(G2%0Eoar&Lu71k8sFA7WO~yw5C43p_ z?0wiXC3|+(q|Co&G)Xt$Ykj@;AvV!yE2o#d-7Ti`9pd7RC2c&z5jNbx_OyiS!xc_X zdC)$zNfEZeH63e%MLl*V4vli>b#;2U`FUmZUiTI(V@46o1jEbz@Iwo3+OjQ0aEoh4 z=i;ffa=Oc2E$FCz;+&VUbrER8ii@E)?|R6DW<1iCtto~cEZ7WM=FZD@y2);7v^U>W&Y~4gmush-(@C82GInDr-c)ZXwB&~}@YI>7BbcmXDHIFq_MK-so#hBU zbTIFW--2xUpP-QKDus{3Q=Lw7*OO=wPva_<*%6q^GRt6EIOuegy$w3deNL9vFDe7j z!wIKM_KZ3=1?<62usAf$DU~;R@|>As zR7+THXE+_Uof6rvJvCa30WIMVSRD**uHOWcETASf1O`DV7(uDb1%k zrf|(%U)2?I^pNQk$xfa*?#~rV*x_#YNm$nns&2rA4gb^ezm@!NjckMeZR>BygRe=J z_aV)v&C*teVO;G%|1+~7%bTug8T0Drc_EpRgDo?B0gW^^ON`F`&b|Q4>kfTt2b%R0 zx&wHyY3AeglgnX)o;$D5c|cy#0{UP~oOxY+<3GdodT^FJ@mX}DJ6YASXD8#0x@R)H z5xid}r_*x&BO9(f*@1iTQ#0ircriTMnJLr5Xazq*dKd8*;pT>2l*47vAf=f&9X z1MzdRV;~#}PM7_f&}x1)un@qE+wg(kJ_ybSr^%jU=^%bZurR>3+ygNoB51`Dv@53EC7hSY@bxV<`00gOlVSX44_= z_tHvMI~-59`*1iGyj$+C8J)!+3amiXHy;6EJvdP&3_7@*7{JQ<-XkGh58mY_9v<*; zE>B=(f^`}V1M9brf_wEseuo0uAdPf2%Z*O{cD_d563?*g%B&ulWf|4!)oJ~~(2KP} zwX|I>t#T&Fu??eSkQnqzR#ydi?6xY{nM*63@iISvwsU{VUBN!{9LXpht2rw`EYqW53S_K!-@RGwC(GCw&ll%&CX>w$Cd*_S#}hk=?M&>s@4YI<$^4lg z^D}&(&N=tKt=4n9Uo&~lF?r2#CR=q*&#O_LPq%zjtWG6bN;1{^69XlyR3q_)k|Nb{ z^^KC379c%(%kLKKHD)O)AvvmJ-MUEoKzmpoqpGS-Q*bUHDo0sqXkFqnAbVSnQ4!^6I6bF(#U zC7oknn=z1B{n@L&Pnesmdpx0s6S>00(^0X06y!hCtZ?RcxbvLMw=flxN z5yM@yVjZV^8i$Z#S#uh}P|5?f%5-OZkbL zFO~cJ<{E2MHYH;1{>00d0*Q;4I?}hBtL3OIbz=4Y#D=HlC#s(crPr8MvZbm*96=-z 
zd#VDnt3I9$M-tz_?0$2l%rlPC*#r6o5fa1`Ke*S)!an(9bCd5l#W<<7B`QPGLCA zDFnTog3!Y$0Eajk(9J0f2RVh{0H+{yaSA{uCj&Y-h2aiPA!z3mgf>nA*e}k)fZJuC zR@tXT_Gy-V_Hi;`FQ+i<;S_@1oPw~6Qvh~yGGGU%Fl^@(f^D3Fu$5B)ZsTOY7EWQf zl~V|sQe((LXq0_!k$pDHKAU8pjhuqez$pOroD8Vr6oy((A=tnv2zp|)^Z9% z4W|%Pa|*&5P61fW$$+ZVq`AD$D%oeH?6X4lx%rvH>OX6-`%{t>fb#;Gz;goc6L?l& zLg2jurv;u7cv|2+0#6A%De!KAcL_Wp@VLM`1s)SPCD4osIw~+OkP4g>C|f6l9v3(! za8%%kz+r(y0tW>S2#gBs7Z?%PC-8{C!vcF#V*KoTL~ux8x4?q}4+!iM*eS3>;2i?n z1-1#?FYtDOtpZyFHVfP*aIe5U0(T4Cm68<0<#&o;hrsOuw+Y-T@HT;41l}sJNnoSE zTLf+vxJlqffeixd1=b0y6}Um*dV%W@V*qPKP$RHf;2MFe1y%`MC2*y{6#{Q|q8m1A zbqaZ({EqyZ{FJ;-zDvGEULr5zC$X#KN%C3pDZ>y2~p45;P zq=FQa!Wi+BEaD|DlC1w(|AYQp{TKS1`VaKi^jGw+>tE4l^eg(~`lt0z>Sy&+`Y}DG z59vqr1Nwe_kG@sktZ&d)>&x{rJ)ketv-EV`p{v^a+J9=l)ZWy-uYFs4S$k1?PJ3E= zTzg2nUz^rWXjB{4j%Zz4tF}{X($qTkYKvOLuC=JnnfE4Dk2?k*6e!Ot3cORGd=cq4 z`O0%hKg(C1MS3G&nL+xueB~LWSM!xGAibQYd>-i^^OUPd&(BH%P^RF1i_crkEIw!P zeS%FmYjHwy4DPiO(-xl*d=ySwe2>MaEIw)R-GbwAm&GS6K5p@y79SH#Vaj6D;-i9N z6yjEb3Z8^Xi(?i~SUhgAs0uMz6=Jd~#AH>7$*K^ORbfI_g$WQvVL}##30V{-WKkH0 zn5YWlvMP+rsxU6A!nmvoW3noY$*M3WtHPM93S+V=jLNDoDyzb%tO}#DDvZK@?tmk* zD2#w83L_wj!U!~riZKGBDh$i2Ff6OWu&fHhvMLPAsxTz0!jP;AL$WFi$*M3UtHPkH z3WKsL3}RKlr0oWue#sG+_Fd(bKfUF8pSrwwPDnw;fh{~!El~ti% zR)v0975ZgW=$BQYUsi<(uL>$eWKoEKC<+k}MIi$CR>$zQcJ(Ou3I16!Mpol{`I3H8 zZ`3oi?`eOpwP<2941Tj_wXy$q`p>9(_YceXt!xMxqslk z-@U^<$Ms#;edI^v0%;>f`fu=U?$?)T|E0a4nOZG=JNOs(Nq2+W>3Z39oV-fTqCKpe zr(e?_);smE_O|wUZAe?8Def2CG50FhpIu*ajgqgC<76Z8=-<)LqY3!6pK6b3-CC*p zUH4UYpS#TUw(GL1hrN1+ztv~>TYW~YaN=1dzox2?oSvsVjryOs_T(v7P(ROgMy_%h z^<}Poxyn*SGk0msqR$#QUc&G_^WLPVGPdOK?nn82SE(Z*+D>wO~HM3WGML3 z+d&wEgdK!1xYrJX7);wi0E07jU?^w?b`Zwk9ywIN0E00XzY!|l2y`4o8iFtwZQzwhp41 z**buZW@`g&&DLS`Hd}|#;A|a4m$P*Mt>x^F?r9z=#zFs2+h*gL3B)82hcWcZJ>ABI*bNt z>kzuAt%GQ#who}5+S))yH@3*7Yj5jyGD(7m5T*FB4786y$m>L9zwCbaW%QZQ3|LA|EOey=7k<9k($@Pphh zoyn>X_%4_-J#h&?YQZUuY>g~~web19Q9oIrIf=osew{o|o+giy2S@@xs>bkxY8Tl@wvbw~ za;Er-3IuOm^0r;MIP{ySoGG}429CBiWy+E*`xVoYyPK7xmfW^WiCeO1t3oZgu~C_H 
z@EKIchK87>YSvF!s=9jIQY+VtS!!9+icw3JFCDQ|@#0}i6t&3+~Ra5;-;tKgDY12Q{pVj;Ho3#G_%a+|Vtz4S3R(K}CO0d&~VRcY$l0bAqLn!v2}TQkdmTrni{~X5ycK zSW31tO>tsRhmp2_)+bZVl>N!Ol1c#724!%X?#`f%Cb2dm8)!D}D5vEgV5gTtCTLFf z3P3G~g=}jQ6tXi*A@2{4_a;7FD{5TCVvqs1JS_k6+ zQzWUxI8;cAAWq`YS}m(vv9oZrcwUWCn+i{PBsOtu;;sG}Sk z^E%nh;>=?%%t?RqedX5lu5^MNcYTM<+-;53=uUi79HC*>lnpnr zzsrJO`IeZwtg$83C4TG~X1|<^m*6XNA?90Q?zCE$QXg*(h_RQvz2*+fat@A1`SA>~ z^m*{6uhiUb4G2*!pC8H)e+lQXoE*5zn`3UXMrDoi#?a8Y`MA`z9O%keXl~VoU&piQ zLaJhGaA;=Vd`MTlnt7Ww%rP#02pM88EWr2Tp#{)fq~XKoQQ8#})(>if?gOr!&RR!D z+Fi4brS_#9NWPGC0VdQ{O8YcjIn%xxzvU&U zs&4qu=P-w@h7{UR#g8|}8V|t-zQg8_b(RNd0Nu(FjIn(^@TPaAIcOc}@+iK+_?3=5 z)(i95;a)hyjeWox)<8r2w>V?szBYK_oClKqy3(hR9BrMu#N}|Nq+OZSnYtsTIJqxr z0?w*^O5ZdsEsqz*d#wY?r44)2eW}^?p{2F{>yrt8`E@!r z0-tGum%Luth|gqi2{z()dG_|L=*OpTgySKs#uWqfhkPh=-B5?*DFg8?%~lv zJ`#5XSmSp1&?_qp#H(lro*5p&(V3PVP^tP{;<12u1FfLh*n%4cGmr0tYSk-C2GCKu zJ%A0kX~6!x3x4O7l>y>8bdW!@xPLIC?tu)|D=PrRHM&LIPHZUL3(2Zinmio6wBt?H$cWVK>gC))+ z___L;^2#)2CGn)V?J_aC32R8|ybYn{;;zM$%mw0FsZ=(l6&zxRYh_W&s)0r93)Qfi zp=xDEY9T8lPR#joK-@*E@JQ0~53y^tu!wtSa4j0b>ori$)h@QC4hnp~fF%uyLUl@^9*m`a8N?+q#GSk?z(= zr@s?d!wHKSw(@f*s>3Tc(*v`!#Rn?dW!_{>*h@3&GwRWoxfTi#$~i*~R!j7rNV9d;F~(tzA8l{$iZHyLS** z>u&GxxAwI8#eZJ$ABslycenfbm@JR~uLiMn6@2)?CN*l^tl_w06TjaF*>k1f=cicJ z4FzTXE#ivZ(c7*$?KO)1we9}>tv&IgDS6=`o=b9(#li5`GG(`TZ>QMPo1uUm*$f?-y@wC?_O$o( z`|s%O^OyPidZRsU#q4{VVbKE3d42M%ymxI=nbsuDv`JJKoZh$vUdxQ9O=ZZYcoL2B NCfpfi`x@bo{|!pF+L-_V diff --git a/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs b/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs index 876c9cd..836f06f 100644 --- a/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs +++ b/src/RolemasterDb.ImportTool/Parsing/StandardCriticalTableParser.cs @@ -8,6 +8,7 @@ public sealed class StandardCriticalTableParser { private const int HeaderToBodyMinimumGap = 20; private const int TopGroupingTolerance = 2; + private static readonly Regex NumericAffixLineRegex = 
new(@"^\d+(?:H|∑|∏|π|∫|\s*[–-])", RegexOptions.Compiled); public StandardCriticalTableParseResult Parse(CriticalImportManifestEntry entry, string xmlContent) { @@ -49,8 +50,7 @@ public sealed class StandardCriticalTableParser .Select(anchor => CreateRollBand(anchor.Label, anchor.SortOrder)) .ToList(); - var parsedCells = new List(); - var parsedResults = new List(); + var cellEntries = new List(); for (var rowIndex = 0; rowIndex < rowAnchors.Count; rowIndex++) { @@ -80,30 +80,65 @@ public sealed class StandardCriticalTableParser continue; } - var lines = BuildLines(cellFragments); - var rawAffixLines = lines.Where(IsAffixLikeLine).ToList(); - var descriptionLines = lines.Where(line => !IsAffixLikeLine(line)).ToList(); - var rawCellText = string.Join(Environment.NewLine, lines); - var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines)); - var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines); - - parsedCells.Add(new ParsedCriticalCellArtifact( + cellEntries.Add(new CellEntry( rowAnchors[rowIndex].Label, + rowIndex, columnAnchor.Key, - lines, - rawCellText, - descriptionText, - rawAffixText)); - - parsedResults.Add(new ParsedCriticalResult( - columnAnchor.Key, - rowAnchors[rowIndex].Label, - rawCellText, - descriptionText, - rawAffixText)); + BuildLines(cellFragments).ToList())); } } + RepairLeadingAffixLeakage(cellEntries); + + var parsedCells = new List(); + var parsedResults = new List(); + + foreach (var cellEntry in cellEntries.OrderBy(item => item.RowIndex).ThenBy(item => item.ColumnKey)) + { + var firstProseIndex = cellEntry.Lines.FindIndex(line => !IsAffixLikeLine(line)); + var firstAffixIndex = cellEntry.Lines.FindIndex(IsAffixLikeLine); + + if (firstProseIndex > 0) + { + validationErrors.Add( + $"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' begins with affix-like lines before prose."); + } + + if (firstAffixIndex >= 0) + { + var proseAfterAffix = cellEntry.Lines + 
.Skip(firstAffixIndex + 1) + .Any(line => !IsAffixLikeLine(line)); + + if (proseAfterAffix) + { + validationErrors.Add( + $"Cell '{cellEntry.RollBandLabel}/{cellEntry.ColumnKey}' contains prose after affix lines."); + } + } + + var rawAffixLines = cellEntry.Lines.Where(IsAffixLikeLine).ToList(); + var descriptionLines = cellEntry.Lines.Where(line => !IsAffixLikeLine(line)).ToList(); + var rawCellText = string.Join(Environment.NewLine, cellEntry.Lines); + var descriptionText = CollapseWhitespace(string.Join(' ', descriptionLines)); + var rawAffixText = rawAffixLines.Count == 0 ? null : string.Join(Environment.NewLine, rawAffixLines); + + parsedCells.Add(new ParsedCriticalCellArtifact( + cellEntry.RollBandLabel, + cellEntry.ColumnKey, + cellEntry.Lines, + rawCellText, + descriptionText, + rawAffixText)); + + parsedResults.Add(new ParsedCriticalResult( + cellEntry.ColumnKey, + cellEntry.RollBandLabel, + rawCellText, + descriptionText, + rawAffixText)); + } + if (columnCenters.Count != 5) { validationErrors.Add($"Expected 5 standard-table columns but found {columnCenters.Count}."); @@ -276,12 +311,46 @@ public sealed class StandardCriticalTableParser value.StartsWith("\u220F", StringComparison.Ordinal) || value.StartsWith("\u03C0", StringComparison.Ordinal) || value.StartsWith("\u222B", StringComparison.Ordinal) || - char.IsDigit(value[0]) || + NumericAffixLineRegex.IsMatch(value) || value.Contains(" - ", StringComparison.Ordinal) || value.Contains("(-", StringComparison.Ordinal) || value.Contains("(+", StringComparison.Ordinal); } + private static void RepairLeadingAffixLeakage(List cellEntries) + { + var maxRowIndex = cellEntries.Count == 0 ? 
-1 : cellEntries.Max(item => item.RowIndex); + var columnKeys = cellEntries.Select(item => item.ColumnKey).Distinct(StringComparer.OrdinalIgnoreCase).ToList(); + + for (var rowIndex = 0; rowIndex < maxRowIndex; rowIndex++) + { + foreach (var columnKey in columnKeys) + { + var current = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex && item.ColumnKey == columnKey); + var next = cellEntries.SingleOrDefault(item => item.RowIndex == rowIndex + 1 && item.ColumnKey == columnKey); + + if (current is null || next is null) + { + continue; + } + + var leadingAffixCount = 0; + while (leadingAffixCount < next.Lines.Count && IsAffixLikeLine(next.Lines[leadingAffixCount])) + { + leadingAffixCount++; + } + + if (leadingAffixCount == 0 || leadingAffixCount == next.Lines.Count) + { + continue; + } + + current.Lines.AddRange(next.Lines.Take(leadingAffixCount)); + next.Lines.RemoveRange(0, leadingAffixCount); + } + } + } + private static string CollapseWhitespace(string value) => Regex.Replace(value.Trim(), @"\s+", " "); @@ -295,4 +364,12 @@ public sealed class StandardCriticalTableParser private sealed record ColumnAnchor(string Key, double CenterX); private sealed record RowAnchor(string Label, int Top, int SortOrder); + + private sealed class CellEntry(string rollBandLabel, int rowIndex, string columnKey, List lines) + { + public string RollBandLabel { get; } = rollBandLabel; + public int RowIndex { get; } = rowIndex; + public string ColumnKey { get; } = columnKey; + public List Lines { get; } = lines; + } }