Tento skript nahradí Martinův skript. Na vstupu je XML po zarovnání, na výstupu je opět XML.
Výstupní XML: XML
Skript provádí akce popsané v dokumentu „Tagování českých textů v InterCorpu“.
#!/usr/bin/bash
#
# Tag one XML file: convert it to standoff form, run the tagger on the
# extracted text, validate the tagger output, and merge the result back
# into XML.
#
# Usage: <this script> FILE.xml
#
# All intermediate and output files are created next to FILE.xml, using
# its name without the ".xml" suffix as the common root
# (ROOT.json, ROOT.json.bak, ROOT.txt, ROOT.vrt, ROOT.merged.xml,
# ROOT.merged.json, ROOT.merged.del.json).
#
# Exit status:
#   0  pipeline ran to the end (a segmentation mismatch is only reported)
#   1  the input path could not be resolved or entered, or the .vrt
#      checker printed diagnostics
#   2  no input file was given

xmlTools=/cnk/common/tools/xml2standoff
SYN2020_PREFIX=/cnk/common/tools/taggers/cs_2023/syn2020
bindir=${SYN2020_PREFIX}/bin

if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s FILE.xml\n' "${0##*/}" >&2
  exit 2
fi

# Resolve to an absolute path so that the relative file names used after
# the cd below keep pointing at the right file.
fullName=$(readlink -f -- "$1") || {
  printf 'Cannot resolve path: %s\n' "$1" >&2
  exit 1
}
fileName=$(basename -- "$fullName")
filePath=$(dirname -- "$fullName")
rootName=${fileName%.xml}

# Every later step uses names relative to the input directory; continuing
# after a failed cd would read and write files in the wrong place.
cd -- "$filePath" || {
  printf 'Cannot change directory to %s\n' "$filePath" >&2
  exit 1
}

### xml2standoff
# Presumably writes ${rootName}.json and ${rootName}.txt (both are used
# below) -- confirm against xml2standoff.py.
"$xmlTools/xml2standoff.py" "$fileName"
# Replace the first "s" on each line of the standoff JSON with "sent";
# the untouched original is kept as ${rootName}.json.bak and is used for
# the segmentation comparison at the end.
perl -i.bak -pe 's/"s"/"sent"/' "${rootName}.json"

### tagging
"${SYN2020_PREFIX}/../tagger-debug.sh" < "${rootName}.txt" > "${rootName}.vrt"

# Count the lines the checker writes to stderr (its stdout is discarded).
# The braces make the redirection of stderr cover a failed "< file" open
# as well, so a missing .vrt is also counted as an error. Note that this
# is a line count, not an exit code, despite the variable name.
errcode=$( { "${bindir}/check-vert-tag-v2020.pl" -c5 -t -V \
  < "${rootName}.vrt" > /dev/null; } 2>&1 | wc -l)
if (( errcode > 0 )); then
  printf '%s\n' "File ${rootName}.vrt incorrect"
  exit 1
fi

### xml restore
"$xmlTools/vrt2standoff.py" -v cnk2020 "${rootName}.vrt"
# Presumably produces ${rootName}.merged.xml and ${rootName}.merged.json
# (both are used below) -- confirm against standoff2xml.py.
"$xmlTools/standoff2xml.py" "${rootName}.txt"
# Drop every <s>/</s> tag, then rename the "sent" elements introduced
# above back to "s".
perl -i -pe 's:</?s>::g' "${rootName}.merged.xml"
perl -i -pe 's:(</?)sent:$1s:g' "${rootName}.merged.xml"

# Compare the original standoff JSON with the merged one after
# del_words.pl has filtered it; any difference is reported but does not
# change the exit status.
"${bindir}/del_words.pl" < "${rootName}.merged.json" > "${rootName}.merged.del.json"
if ! diff -q "${rootName}.json.bak" "${rootName}.merged.del.json" > /dev/null; then
  printf '%s\n' "Segmentation mismatch in ${rootName}.merged.xml"
fi