/cnk/common/korpus/vertikaly/DIA_v0/vertikala
./store/corp/Starocesi/19stol
# Keep only the first tab-separated column of the vertical file.
cut -f1 vertikala > vertikala.cut
# Feed the one-column vertical to splitdoc-xml (project tool).
# NOTE(review): presumably this writes per-document files named text.*,
# which the later steps glob for — confirm against the tool.
splitdoc-xml text < vertikala.cut
# Rename the text.* files: the first ".00" in each name becomes "."
# (e.g. text.001 -> text.1). The new name is printed for every file.
for ff in text.*; do
  [[ -e "$ff" ]] || continue          # glob matched nothing
  gg=${ff/.00/.}
  printf '%s\n' "$gg"
  # An unchanged name needs no mv (mv would only report identical files).
  [[ "$ff" == "$gg" ]] || mv -- "$ff" "$gg"
done
csts
# Run every text* file through vert_csts.pl (project script) and store the
# output under the same file name in ../csts.
mkdir -p ../csts
for ff in text*; do
  printf '%s\n' "$ff"
  vert_csts.pl < "$ff" > "../csts/$ff"
done
a spravíme ho:
# Patch the converted files in ../csts in place. Each pass prints the file
# name and applies one Perl substitution to the file.
# NOTE(review): "undef $/" executes inside the -p loop, so the first line is
# read on its own and only the remainder of the file is slurped as a single
# record; each substitution thus fires at most once on line 1 and once on the
# rest. The one-liners are kept verbatim — confirm before moving to -0777.
cd ../csts

# Pass 1: remove "<s>" + newline immediately preceding "<doc".
for ff in *; do
  printf '%s\n' "$ff"
  perl -i -pe 'undef $/; s/<s>\n(<doc)/$1/' "$ff"
done

# Pass 2: remove "<doc>" followed by a newline.
for ff in *; do
  printf '%s\n' "$ff"
  perl -i -pe 'undef $/; s/<doc>\n//' "$ff"
done

# Pass 3: remove "</doc>" + newline immediately preceding "</csts>".
for ff in *; do
  printf '%s\n' "$ff"
  perl -i -pe 'undef $/; s:</doc>\n(</csts>):$1:' "$ff"
done
# Two separate runs of project scripts; they had been fused onto one line.
# NOTE(review): the split assumes -M is a flag without an argument, as in the
# second command where it is directly followed by -p45 — confirm.
make-corp.sh -Eucs2 -s csts -t csts-morf -A1 -B1 -p45 -v -M
make-whole-corp-csts.sh -Eucs2 -f -M -p45 -trules -v
V nejnovějším PostMorfo jsou doplněná slovíčka a LEX je zkompilovaný i s přidanými tagy.
# Run parallel-filter.sh (project script) from the csts-rules-... directory
# into vert-corr. The -C filter pipes csts2cnk.pl output through a Perl
# substitution that strips a trailing "-" after a <doc...> tag at end of line
# (the shell turns \$1 into $1 inside the double quotes).
parallel-filter.sh -C "csts2cnk.pl | perl -pe 's/(<doc.*>)-$/\$1/'" -p45 \
  -s csts-rules-frazrl-rulh1-tag-vid-corr -t vert-corr -v

# Append a closing </doc> line to every file in vert-corr.
cd vert-corr
for ff in *; do
  printf '%s\n' '</doc>' >> "$ff"
done
# For each file, print its name and show only the lines where its first
# column differs from the same-named file in ../orig (sdiff -s hides
# identical lines).
# NOTE(review): "l" is not a standard command — presumably a local pager
# alias (e.g. for less); kept unchanged.
for ff in *; do
  printf '%s\n' "$ff"
  sdiff -s <(cut -f1 "$ff") "../orig/$ff"
done | l
# Replace the first column of every file with the lines from ../orig while
# keeping the remaining columns.

# Step 1: save columns 2.. of each file into a temporary <name>.2- file.
for ff in *; do
  cut -f2- "$ff" > "$ff.2-"
done

# Step 2: paste the ../orig lines in front of the saved columns and overwrite
# the original file; on each line, everything from the first tab that is
# followed by "<" to the end of the line is dropped.
for ff in *.2-; do
  paste "../orig/${ff%.2-}" "$ff" | perl -pe 's/\t<.*//' > "${ff%.2-}"
done

# Step 3: remove the temporary files.
rm -- *.2-