this directory contains files specific to running DELPH-IN tools on the UiO
TITAN cluster.  the scheduler used on TITAN is SLURM (plus Maui), so some of
these files should be easy to adapt to another cluster running SLURM; using
SGE instead should not be hard either, but in any case some of the defaults
will need tuning to the specific hardware of each cluster.

following are a few usage examples.  note that all source files must be
compiled, cache files built, and so on, before jobs are submitted, as jobs
are likely to start up in parallel.  to accomplish this for the relevant
(T)ERG configurations, for example, it suffices to complete a few small
interactive jobs (in $LOGONROOT), e.g.

  ./parse --erg mrs
  ./parse --erg+tnt/speech mrs
  make terg

-------------------------------------------------------------------------------
update all the profiles of a forthcoming new ERG release
-------------------------------------------------------------------------------

{
  ti=0; tp=0; tr=0;
  for id in $(cat ~/deepbank); do
    profile=$id/14-02-17/pet.1;
#   profile=$id;
    if [ -f $profile/parse.gz ]; then
      i=$(zcat $profile/item.gz 2> /dev/null | wc -l); ti=$[$ti + $i];
      o=$(zcat $profile/parse.gz 2> /dev/null | wc -l);
      p=$(tsdb -home $profile -query 'select i-id where readings > 0' | wc -l);
      tp=$[$tp + $p];
      r=$(tsdb -home $profile -query 'select i-id where t-active > 0' | wc -l);
      tr=$[$tr + $r];
      if [ $i != $o ]; then
        echo "incomplete: $id ($i vs. $o)";
      else
        if [ $r -eq 0 ]; then echo "suspicious: $id"; fi
        echo "$id $i $p $r";
      fi;
    else
      echo "bogus: $id";
    fi;
  done
  raw=$[$tp * 100 / $ti]; cooked=$[$tr * 100 / $ti];
  echo "$tp parses and $tr results for $ti items ($raw% and $cooked% coverage).";
}

for i in ptb02a ptb02b ptb02c ptb02d; do
  /opt/slurm/bin/sbatch ${LOGONROOT}/uio/titan/parse --reset --seconds \
    --terg/ptb --best 500 --compare readings,p-input \
    --time 240 --memory 2560 --edges 200000 \
    --update --compress --thin $i;
  sleep 0.5;
done

-------------------------------------------------------------------------------
apply a reasonable set of result filters to the full collection
-------------------------------------------------------------------------------

for i in csli mrs fracas \
         hike rondane \
         jh0 jh1 jh2 jh3 jh4 jh5 jhk jhu tg1 tg2 tgk tgu ps psk psu \
         cb trec sc01 sc02 sc03 rtc000 rtc001 \
         ws01 ws02 ws03 ws04 ws05 ws06 ws07 ws08 ws09 ws10 ws11 ws12 ws13; do
  /opt/slurm/bin/sbatch ${LOGONROOT}/uio/titan/redwoods \
    --terg --filter syntax,lnk,cscope,fragmentation $i;
  sleep 0.5;
done

for i in ecoc ecos ecpa ecpr vm6 vm13 vm31 vm32; do
  /opt/slurm/bin/sbatch ${LOGONROOT}/uio/titan/redwoods \
    --terg/speech --filter syntax,lnk,cscope,fragmentation $i;
  sleep 0.5;
done

-------------------------------------------------------------------------------
compile a few summary statistics for (emerging) treebanks
-------------------------------------------------------------------------------

{
  ti=0; tr=0; ta=0;
  for d in $(cat ~/files); do
    if [ -f $d/parse.gz ]; then
      i=$(tsdb -home $d -query 'select i-id where i-length > 0' | wc -l);
      o=$(zcat $d/parse.gz | wc -l);
      r=$(tsdb -home $d -query 'select i-id where readings > 0' | wc -l);
      a=$(tsdb -home $d -query 'select i-id where t-active > 0' | wc -l);
      if [ $i != $o ]; then
        echo "incomplete: $d ($i vs. $o)";
      else
        ti=$[$ti + $i]; tr=$[$tr + $r]; ta=$[$ta + $a];
      fi
      echo "$d $i $r $a: $[$r * 100 / $i]% raw; $[$a * 100 / $i]% cooked";
    else
      echo "missing: $d";
    fi;
  done
  echo -n "$tr ($ta) results of $ti items: $[$tr * 100 / $ti]% raw";
  echo " and $[$ta * 100 / $ti]% cooked coverage.";
}
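
the $[...] integer arithmetic above truncates percentages to whole numbers;
where more precision is wanted, the same tsdb(1) queries can be wrapped into
a small helper, e.g. (a minimal sketch, assuming bash; the function name
`coverage' is made up for this example):

coverage () {
  local d=$1;
  # per-profile counts, using the same queries as in the scripts above;
  # assumes a non-empty profile, i.e. at least one item
  local i=$(tsdb -home $d -query 'select i-id where i-length > 0' | wc -l);
  local r=$(tsdb -home $d -query 'select i-id where readings > 0' | wc -l);
  local a=$(tsdb -home $d -query 'select i-id where t-active > 0' | wc -l);
  # awk(1) supplies the floating-point division that $[...] lacks
  echo "$d $i $r $a" \
  | awk '{printf("%s: %.1f%% raw; %.1f%% cooked\n", $1, 100*$3/$2, 100*$4/$2)}';
}

invoked as, say, `coverage $d', for any profile directory below the current
treebank home.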
$o)"; else ti=$[$ti + $i]; tr=$[$tr + $r]; ta=$[$ta + $a]; fi echo "$d $i $r $a: $[$r * 100 / $i]% raw; $[$a * 100 / $i]%" cooked; else echo "missing: $d"; fi; done echo -n "$tr ($ta) results of $ti items: $[$tr * 100 / $ti]% raw"; echo " and $[$ta * 100 / $ti]% cooked coverage."; } ------------------------------------------------------------------------------- parse the Penn Treebank (assuming the `ptb' add-on SVN component) ------------------------------------------------------------------------------- for i in ptb00 ptb01 ptb02 ptb03 ptb04 ptb05 ptb06 ptb07 ptb09 ptb09 \ ptb10 ptb11 ptb12 ptb13 ptb14 ptb15 ptb16 ptb17 ptb18 ptb19 \ ptb20 ptb21 ptb22 ptb23 ptb24; do /opt/slurm/bin/sbatch ${LOGONROOT}/uio/titan/parse --seconds \ --terg/ptb --best 1 --compress $i; sleep 0.5; done ------------------------------------------------------------------------------- for a complete run on English Wikipedia (assuming some local files) ------------------------------------------------------------------------------- { parse=${LOGONROOT}/uio/wikiwoods/1214/parse.0.job; export=${LOGONROOT}/uio/wikiwoods/1214/export.0.job; /bin/rm -f ${parse} ${export}; for file in ${LOGONROOT}/uio/wikiwoods/1212/gml/?????.gml.gz; do name=$(basename ${file} .gz); name=$(basename ${name} .gml); echo \ sbatch ${LOGONROOT}/uio/titan/parse --seconds \ --erg+tnt/gml/wescience --best 1 \ --time 600 --memory 6144 --edges 400000 --size 8192 \ --target "wikiwoods/${name}" --text --compress ${file} >> ${parse}; echo \ sbatch ${LOGONROOT}/uio/titan/redwoods \ --wikiwoods --target ${LOGONROOT}/uio/wikiwoods/1214/export \ --composite --active all --increment 200000 \ --export input,derivation,tree,mrs,eds,ltriples ${name} >> ${export}; done } cd ${LOGONROOT}/uio/wikiwoods/1214/log; ${LOGONROOT}/uio/titan/trickle \ --start --limit 400 ${LOGONROOT}/uio/wikiwoods/1214/parse.0.job; while true; do ${LOGONROOT}/uio/titan/trickle \ --limit 400 ${LOGONROOT}/uio/wikiwoods/1214/parse.0.job; sleep 60; done ${LOGONROOT}/uio/titan/trickle \ --start --limit 400 ${LOGONROOT}/uio/wikiwoods/1212/export.0.job; while true; do ${LOGONROOT}/uio/titan/trickle \ --limit 400 ${LOGONROOT}/uio/wikiwoods/1212/export.0.job; sleep 60; done ------------------------------------------------------------------------------- to confirm basic consistency of WikiWoods profiles ------------------------------------------------------------------------------- { cat /dev/null > .counts; ti=0; tr=0; for file in $LOGONROOT/uio/wikiwoods/1212/gml/?????.gml.gz; do id=$(basename $file .gml.gz); if [ -f $id/parse.gz ]; then i=$(zcat $id/item.gz 2> /dev/null | wc -l); ti=$[$ti + $i]; o=$(zcat $id/parse.gz 2> /dev/null | wc -l); r=$(zcat $id/result.gz 2> /dev/null | wc -l); tr=$[$tr + $r]; if [ $i != $o ]; then echo "incomplete: $id ($i vs. $o)"; else if [ $r -eq 0 ]; then echo "suspicious: $id"; fi echo "$id $i $r" >> .counts; fi; else echo "bogus: $id"; fi; done echo "$tr results of $ti items: $[$tr * 100 / $ti]% coverage" } > errors 2>&1 { tmp="${LOGONTMP}/.count.${USER}.io.$$"; cat /dev/null > .counts; for file in $LOGONROOT/uio/wikiwoods/1212/gml/?????.gml.gz; do id=$(basename $file .gml.gz); if [ ! -f $LOGONROOT/uio/wikiwoods/1214/export/${id}.gz ]; then echo missing: $id; else zcat $LOGONROOT/uio/wikiwoods/1214/export/${id}.gz \ > ${tmp} 2> /dev/null; if [ $? -ne 0 ]; then echo "invalid: $id"; else p=$(zcat $LOGONROOT/uio/wikiwoods/1212/tsdb/${id}/result.gz | wc -l); e=$(egrep '^\[[0-9]{13}\] ' ${tmp} | wc -l); if [ $p != $e ]; then echo "incomplete: $id ($p vs. 
$e)"; else echo "$id $p" >> .counts; fi fi fi done } > errors 2>&1 ------------------------------------------------------------------------------- while playing with CoNLL 2007 data (for parser stacking) ------------------------------------------------------------------------------- for i in 02 03 04 05 06 07 08 09 \ 10 11 12 13 14 15 16 17 18 19 \ 20 21 22 23 24; do sbatch ${LOGONROOT}/uio/titan/parse --seconds \ --terg/conll --best 1 conll${i}; sleep 0.5; done; ------------------------------------------------------------------------------- parse the GENIA Treebank ------------------------------------------------------------------------------- for i in gtb00 gtb01 gtb02 gtb03 gtb04 gtb05 gtb06 gtb07 gtb08 gtb09 \ gtb10 gtb11 gtb12 gtb13 gtb14 gtb15 gtb16 gtb17 gtb18 \ pgtb00 pgtb01 pgtb02 pgtb03 pgtb04 pgtb05 pgtb06 pgtb07 pgtb08 pgtb09; do sbatch ${LOGONROOT}/uio/titan/parse --seconds \ --terg+genia+tnt/genia --best 1 --compress ${i}; sleep 0.5; done ------------------------------------------------------------------------------- parse the North American News Corpus (NANC) ------------------------------------------------------------------------------- { parse=${LOGONROOT}/nanc.parse.0.job; /bin/rm -f ${parse}; for file in ${HOME}/src/2008T15/data/???/*.gz; do name=$(basename ${file} .gz); echo \ sbatch ${LOGONROOT}/uio/titan/parse --seconds \ --terg+tnt/wsj --best 1 \ --time 600 --memory 4096 --edges 400000 \ --target "nanc/${name}" --text --compress ${file} >> ${parse}; done } ------------------------------------------------------------------------------- blazing (via a pseudo-update) from the GTB annotation ------------------------------------------------------------------------------- for i in 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18; do sbatch ${LOGONROOT}/uio/titan/redwoods \ --terg --default --epilogue ${LOGONROOT}/mu/epilogue.lisp \ --gold erg/1010/gtb${i}/10-12-19/pet.500 --update \ erg/1010/gtb${i}/10-12-19/pet.500; sleep 0.5; done ------------------------------------------------------------------------------- lexical-only profiles (for ubertagger training) ------------------------------------------------------------------------------- for i in ws01 ws02 ws03 ws04 ws05 ws06 ws07 ws08 ws09 ws10 ws11 ws12 ws13; do sbatch ${LOGONROOT}/uio/titan/parse --reset --seconds \ --erg+tnt/wiki --lexical --update --compress ${i}; sleep 0.5; done for i in wsj00a wsj00b wsj00c wsj00d wsj01a wsj01b wsj01c wsj01d \ wsj02a wsj02b wsj02c wsj02d wsj03a wsj03b wsj03c \ wsj04a wsj04b wsj04c wsj04d wsj04e wsj05a wsj05b wsj05c wsj05d wsj05e \ wsj06a wsj06b wsj06c wsj06d wsj07a wsj07b wsj07c wsj07d wsj07e \ wsj08a wsj09a wsj09b wsj09c wsj09d wsj10a wsj10b wsj10c wsj10d \ wsj11a wsj11b wsj11c wsj11d wsj11e wsj12a wsj12b wsj12c wsj12d \ wsj13a wsj13b wsj13c wsj13d wsj13e wsj14a wsj14b wsj14c wsj14d wsj14e \ wsj15a wsj15b wsj15c wsj15d wsj15e \ wsj16a wsj16b wsj16c wsj16d wsj16e wsj16f wsj17a wsj17b wsj17c wsj17d \ wsj18a wsj18b wsj18c wsj18d wsj18e wsj19a wsj19b wsj19c wsj19d \ wsj20a wsj20b wsj20c wsj20d wsj21a wsj21b wsj21c wsj21d; do sbatch ${LOGONROOT}/uio/titan/parse --reset --seconds \ --erg+tnt --lexical --gold gold/deepbank/${i} --update --compress ${i}; sleep 0.5; done ------------------------------------------------------------------------------- conversion of DeepBank to various dependency formats ------------------------------------------------------------------------------- for i in $LOGONROOT/coli/deepbank/tsdb/home/*.1; do i=$(basename $i); $LOGONROOT/redwoods --deepbank 

-------------------------------------------------------------------------------
conversion of DeepBank to various dependency formats
-------------------------------------------------------------------------------

for i in $LOGONROOT/coli/deepbank/tsdb/home/*.1; do
  i=$(basename $i);
  $LOGONROOT/redwoods --deepbank --active resolved \
    --target $HOME/deepbank --export input,derivation,tree,mrs,eds $i;
done
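
the loop above runs serially on one machine; to fan out over the cluster
instead, the sbatch idiom from the earlier sections should carry over,
assuming the uio/titan/redwoods wrapper accepts the same options (a sketch,
untested):

for i in $LOGONROOT/coli/deepbank/tsdb/home/*.1; do
  i=$(basename $i);
  sbatch ${LOGONROOT}/uio/titan/redwoods --deepbank --active resolved \
    --target ${HOME}/deepbank --export input,derivation,tree,mrs,eds $i;
  sleep 0.5;
done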