#!/pkg/bin/perl

# PREPROCESSING FOR SHALLOW PARSING
# Tags the text and prints each tagged sentence on a single line.
# A tagged sentence is a sequence of Word/Tag elements wrapped in
# "S[" ... "]S"; a token tagged Punct-Sent closes the sentence.

# Main driver: runs the tagger, reads its output one token per line
# (two tab-separated fields), and prints every sentence as a single
# line of the form "S[ Word/Tag Word/Tag ...]S".

use strict ;
use warnings ;

# NAME of the tagging command :
my $tag_command = "/pkg/cis639/bin/inxight-tag -l english" ;

# Markers wrapped around each printed sentence, and the tag that marks
# the last token of a sentence.
my $sent_beg = "S[" ;
my $sent_end = "Punct-Sent" ;

# $tag and $word stay package globals because get_element reads them
# directly instead of taking arguments.
our ($tag, $word) ;

# Read the tagger's output through a pipe.  Fail loudly if the tagger
# cannot be started instead of silently producing no output.
open(my $tagger, "-|", $tag_command)
    or die "Cannot run '$tag_command': $!\n" ;

my $sentence = $sent_beg ;

while (my $line = <$tagger>) {
    # chomp (not chop): only strip a trailing newline, so a final line
    # without one does not lose its last character.
    chomp $line ;
    next if $line eq "" ;

    ($tag, $word) = split(/\t+/, $line) ;

    # A line with fewer than two fields leaves a variable undefined;
    # treat it as empty so the comparisons below stay warning-free.
    $tag  = "" unless defined $tag ;
    $word = "" unless defined $word ;

    # Decide this BEFORE calling get_element: the helper shares $tag
    # with us and may modify it, which would defeat the comparison.
    my $ends_sentence = ($tag eq $sent_end) ;

    # " /" is what an entirely empty token produces; do not emit it.
    my $element = get_element() ;
    $sentence .= $element if $element ne " /" ;

    if ($ends_sentence) {
        # Only print sentences that actually collected something.
        print "$sentence]S\n\n" if $sentence ne $sent_beg ;
        $sentence = $sent_beg ;
    }
}

# Closing a pipe reports the command's exit status; surface a failure
# on STDERR without discarding what has already been processed.
close($tagger)
    or warn "Tagger '$tag_command' did not exit cleanly (status $?)\n" ;

# Flush a trailing sentence that was not closed by a Punct-Sent token.
if ($sentence ne $sent_beg) {
    print "$sentence]S\n" ;
}


# SUBROUTINES

# Builds the " Word/Tag" element for the current token.
#
# Takes no arguments: the token is read from the variables $word and
# $tag, which the caller sets before each call.
#
# Returns:
#   ""           when the word contains a double quote or a guillemet
#                (such tokens are dropped from the output);
#   " Word/Tag"  otherwise, with every space inside the word replaced
#                by "_".
#
# Neither $word nor $tag is modified, so the caller can still compare
# $tag against its sentence-end marker after the call.
sub get_element
{
    # if word contains one of the characters " << or >> then skip it
    # NOTE(review): the guillemets are matched as the raw bytes stored in
    # this file; confirm the file encoding matches the tagger's output.
    return "" if $word =~ /[\"\«\»]/ ;

    # Work on a copy so the caller's $word is left as it was.
    (my $joined_word = $word) =~ tr/ /\_/ ;

    return " " . $joined_word . "/" . $tag ;
}



