001    /**
002     * LTAG-spinal API, an interface to the treebank format introduced by Libin Shen.
003     * Copyright (C) 2007  Lucas Champollion
004     *
005     * This program is free software: you can redistribute it and/or modify
006     * it under the terms of the GNU General Public License as published by
007     * the Free Software Foundation, either version 3 of the License, or
008     * (at your option) any later version.
009     * 
010     * This program is distributed in the hope that it will be useful,
011     * but WITHOUT ANY WARRANTY; without even the implied warranty of
012     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013     * GNU General Public License for more details.
014     *
015     * You should have received a copy of the GNU General Public License
016     * along with this program.  If not, see <http://www.gnu.org/licenses/>.
017     *
018     */
019    package edu.upenn.cis.spinal;
020    
021    import java.io.*;
022    import java.util.*;
023    import java.util.regex.Matcher;
024    import java.util.regex.Pattern;
025    import edu.upenn.cis.propbank_shen.*;
026    
027    /**
028     * Represents a "spinal" elementary tree in Libin's LTAG
029     * treebank. A typical elementary tree (taken from p. 73
030     * of his thesis) looks like this:
031     *
032     * <code>
033     * #3 failed
034     * spine: a_( S ( VP VBD^ ) )
035     * att #0, on 0, slot 0, order 0
036     * att #2, on 0, slot 0, order 1
037     * att #6, on 0.0, slot 1, order 0
038     * </code>
039     *
040     * or like this (a structure for predicate coordination):
041     *
042     * <code>
043     * &20
044     * spine: c_( S S S )
045     * crd #3, on 0.0
046     * att #9, on 0, slot 1, order 0
047     * crd #10, on 0.1
048     * </code>
049     *
050     * @author Lucas Champollion
051     * @author Ryan Gabbard
052     */
053    public class ElemTree implements Serializable {
054    
055        /**
056         * Designates a field (tree, slot, etc.) of unknown type.
057         */
058        public static final int UNKNOWN = -1; 
059        
060        /**
061         * Designates an initial tree.
062         */
063        public static final int INITIAL = 0;
064        
065        /**
066         * Designates an auxiliary tree.
067         */
068        public static final int AUXILIARY = 1;
069        
070        /**
071         * Designates a coordination tree -- these trees are special constructs and not anchored in a lexical item.
072         */
073        public static final int COORD = 2;
074        
075        /**
076         * A return value designating a special case when the current tree that has no parent.
077         */
078        public static final int ROOT = 3;
079        
080        /**
081         * Designates a tree combined with its parent by an attachment operation,
082         * indicated by the keyword "att".
083         */
084        public static final int ATTACH = 4;
085    
086        /**
087         * Designates a tree combined with its parent by an adjunction operation,
088         * indicated by the keyword "adj".
089         */
090        public static final int ADJOIN = 5;
091    
092        /**
093         * Designates a tree combined with its parent by an attachment operation,
094         * indicated by the keyword "crd" in the output of the incremental parser
095         * and in the LTAG-spinal treebank.
096         */
097        public static final int CONJUNCT = 6;
098    
099        /**
100         * Designates a tree combined with its parent by a conjunction or an attachment operation,
101         * indicated by the keyword "con" in the output of the incremental parser
102         * and in the LTAG-spinal treebank.
103         * 
104         * In the treebank and in the output of the incremental parser, "crd" 
105         * is used for conjuncts and "att" is used for
106         * connectives. In the output of the bidirectional parser, we don't distinguish 
107         * conjuncts from connectives, in order to reduce an operation in parsing, so "con" is used
108         * to represent both.
109         */
110        public static final int CONJUNCT_OR_CONNECTIVE = 7;
111        
112        /**
113         * Designates a tree that attaches/adjoins on the left of its parent.
114         */
115        public static final int LEFT = 8;
116        
117        /**
118         * Designates a tree that attaches/adjoins on the right of its parent.
119         */
120        public static final int RIGHT = 9;
121        
122        private static final int ATT_TYPE_OFFSET = 2;
123        private static final String newline = System.getProperty("line.separator");
124        private Sentence containingSentence;
125        int number = -1;
126        String terminal = "";
127        private int type = UNKNOWN;
128        private SpinalNode rootOfSpine = null;
129        
130        /**
131         * Indicates that the input only specified the part of speech and not the 
132         * spine, as is the case for the output of Shen's bidirectional parser.
133         */
134        private boolean bidirectionalParserOutput = false;
135        private String pos = null;
136        private ArrayList attachments = new ArrayList();
137        /**
138         * Null iff this tree is the root of the derivation tree.
139         */
140        private ElemTree parent;
141        /**
142         * the attachment by which this <code>ElemTree</code>
143         * attaches to its parent; null iff this is root of derivation tree
144         */
145        private TAGAttachment attachmentToParent = null;
146        /**
147         * the yield of the subtree rooted by this ElemTree
148         */
149        private WordSpan span;
150        
151        /**
152         * Represents to which side this tree attaches to its parent.
153         */
154        private int slot = UNKNOWN;
155        
156        private boolean completed = false;
157        private SpinalNode anchor;
158        private SpinalNode foot;
159        
160        private static final Pattern posPattern = Pattern.compile("\\s*pos: (.*)");
161        private static final Pattern spinePattern = Pattern.compile("\\s*spine: (.*)");
162        private static final Pattern coordPattern = Pattern.compile("\\s*coord\\s*");
163        
164        
165        /**
166         * Creates an <code>ElemTree</code> from a string representation and attaches it
167         * into the provided derivation tree (Sentence). We assume that the nodes are 
168         * numbered in ascending order of linear precedence of their anchors, as 
169         * is the case in the LTAG-spinal treebank.
170         * @param container the <code>Sentence</code> to which this 
171         * <code>ElemTree</code> is to be added
172         * @param representation the string that is to be parsed into an 
173         * <code>ElemTree</code>
174         * @throws edu.upenn.cis.spinal.ElemTreeFormatException if the
175         * string representation is not well-formed
176         */
177        public ElemTree(Sentence container, String representation)
178                throws ElemTreeFormatException {
179            attachments = new ArrayList();
180            containingSentence = container;
181            completed = false;
182            parent = null;
183            span = null;
184    
185            loadFromStringRepresentation(representation);
186        }
187        
188        /**
189         * Returns true if this <code>ElemTree</code> attaches or adjoins from the 
190         * left of its parent.
191         * @return a boolean value
192         * @see #getSlot()
193         */
194        public boolean attachesFromLeft() {
195            return this.getSlot() == LEFT;
196        }
197        
198        /**
199         * Returns true if this <code>ElemTree</code> attaches or adjoins from the 
200         * right of its parent.
201         * @return a boolean value
202         * @see #getSlot()
203         */
204        public boolean attachesFromRight() {
205            return this.getSlot() == RIGHT;
206        }
207    
208        /**
209         * Returns the node representing the lexical anchor of this tree, or null 
210         * if there is no anchor (in the case of a coordination structure).
211         * @return a <code>SpinalNode</code> element representing the lexical anchor
212         */
213        public SpinalNode getAnchor() {
214            
215            Iterator iter = this.getSpine().getAllNodes().iterator();
216            SpinalNode current;
217            while (iter.hasNext()) {
218                current = (SpinalNode) iter.next();
219                if (current.isAnchor()) {
220                    this.anchor = current;
221                    return current;
222                }
223            }
224            this.anchor = null;
225            return null;
226        }
227    
228        /**
229         * Returns the <code>SpinalNode</code> in the parent tree to which this <code>ElemTree</code>
230         * is attached, or null if this <code>ElemTree</code> is the root of the derivation tree.
231         * @return the parent's attachment site
232         */
233        public SpinalNode getAttachmentSite() {
234            checkCompletion();
235            if (this.isRoot()) {
236                return null;
237            }
238            return this.getParent().getSpinalNodeAt(this.getAttachmentToParent().getGornAddress());
239        }
240    
241        TAGAttachment getAttachmentToParent() {
242            
243            return attachmentToParent;
244        }
245        
246        /**
247         * Returns the type of the attachment of this elementary tree to its parent tree, one of 
248         * {@link #ATTACH}, {@link #ADJOIN}, {@link #CONJUNCT} (only used 
249         * in the LTAG-spinal treebank and in the output of the incremental parser),
250         * and {@link #CONJUNCT_OR_CONNECTIVE} (only used in the output of the 
251         * bidirectional parser). If this elementary tree is the root of the sentence,
252         * the special value {@link #ROOT} is returned.
253         * @return the type of attachment of this <code>ElemTree</code>
254         */
255        public int getAttachmentType() {
256            if (this.isRoot()) {
257                return ROOT;
258            }
259            return attachmentToParent.getType();
260        }
261    
262        /**
263         * Internal method, returns <code>TAGAttachment</code> list for this tree.
264         * @return a list of attachments
265         */
266        private List getAttachments() {
267            return attachments;
268        }
269    
270        /**
271         * Returns the <code>ElemTree</code>s that attach to this <code>ElemTree</code>.
272         * @return the children of this node in the derivation tree
273         */
274        public Iterator getChildren() {
275            checkCompletion();
276    
277            return new TAGAttachmentToElemTreeIterator(getAttachments());
278        }
279    
280        /**
281         * Returns the spans of the <code>ElemTree</code>s that attach to this 
282         * <code>ElemTree</code>.
283         * @return an iterator over (@link WordSpan} objects
284         */
285        public Iterator getChildrenSpans() {
286            checkCompletion();
287    
288            return new TAGAttachmentToWordSpanIterator(getAttachments());
289        }
290    
291        /**
292         * Returns a <code>List</code> of the elementary trees dominated by this 
293         * elementary tree, 
294         * including this tree itself.
295         * @return a list over <code>ElemTree</code> objects
296         */
297        public List getDominatedElemTrees() {
298            // TODO check if this, called on root, gives the same results as 
299            // this.getSentence().getElemTrees(this.getSpan());
300            Iterator candidates = this.getSentence().getElemTrees().iterator();
301            List result = new Vector();
302            while (candidates.hasNext()) {
303                ElemTree current = (ElemTree) candidates.next();
304                if (this.dominates(current)) {
305                    result.add(current);
306                }
307            }
308            return result;
309        }
310    
311        /**
312         * Returns a <code>List</code> of the terminals attached to the elementary trees 
313         * dominated by this elementary tree, 
314         * including this tree itself.
315         * @return a list of terminals (<code>String</code> objects)
316         */
317        public List getDominatedTerminals() {
318            Iterator iter = this.getDominatedElemTrees().iterator();
319            List l = new Vector();
320            while (iter.hasNext()) {
321                ElemTree current = (ElemTree) iter.next();
322                l.add(current.getTerminal());
323            }
324            return l;
325        }
326    
327        /**
328         * Returns the number of the file that contains the sentence to which 
329         * this <code>ElemTree</code> belongs, or -1 if there is no such number.
330         * @return an <code>int</code>
331         */
332        public int getFileNumber() {
333            return this.getSentence().getFileNumber();
334        }
335    
336        /**
337         * Returns the node representing the foot node (if any) of this tree, 
338         * or null if there is no foot node. (Only auxiliary trees have foot nodes.)
339         * @return a <code>SpinalNode</code>
340         */
341        public SpinalNode getFoot() {
342            Iterator iter = this.getSpine().getAllNodes().iterator();
343            SpinalNode current;
344            while (iter.hasNext()) {
345                current = (SpinalNode) iter.next();
346                if (current.isFoot()) {
347                    this.foot = current;
348                    return current;
349                }
350            }
351            this.foot = null;
352            return null;
353        }
354    
355        /**
356         * Method used internally to create the Graphviz representation of this tree.
357         */
358        String getGraphvizNodeID() {
359    
360            return "node_" + this.getNumber();
361        }
362    
363        /**
364         * Returns the number of this tree, corresponding to the position in the sentence.
365         * @return the number, or -1 if unknown
366         */
367        public int getNumber() {
368            checkCompletion();
369            return number;
370        }
371    
372    
373    
374        /** 
375         * Returns the location of the predicate-argument structure 
376         * corresponding to this elementary tree in the Propbank, or null
377         * if this tree has no section and file numbers. This will be a string
378         * of the following shape: 
379         * <code>wsj/&lt;section&gt;/wsj_&lt;section&gt;&lt;file&gt;.mrg</code>.
380         * @return a string that indicates the location of this word in the Propbank
381         */
382        public PASLoc getPASLoc() {
383    
384    
385            if (getSectionNumber() == -1 || getFileNumber() == -1) {
386                return null;
387            }
388            // these both return -1 if no number is available.
389    
390            String section = Integer.toString(getSectionNumber());
391            String file = Integer.toString(getFileNumber());
392    
393    
394            if (section.length() == 1) {
395                section = "0" + section;
396            }
397            if (file.length() == 1) {
398                file = "0" + file;
399            }
400    
401            String p = "wsj/" + section + "/wsj_" + section + file + ".mrg";
402    
403            return new PASLoc(p, getSentenceNumber(), getNumber());
404        }
405    
406        /**
407         * Returns the part of speech of this elementary tree, or "NA" in the case of a coordination structure
408         * (where part of speech is not really applicable since Coord nodes aren't lexicalized).
409         * @return the POS tag of the current word
410         */
411        public String getPOS() {
412            if (this.pos != null) return this.pos;
413            
414            SpinalNode theAnchor = this.getAnchor();
415            if (theAnchor == null) {
416                assert this.isCoord();
417                return "NA";
418            } else {
419                return theAnchor.getLabel();
420            }
421        }
422    
423        /**
424         * Returns the <code>ElemTree</code> to which this <code>ElemTree</code> attaches,
425         * or null iff this is the root.
426         * @return the parent of this node in the derivation tree
427         */
428        public ElemTree getParent() {
429            checkCompletion();
430            return parent;
431        }
432    
433    
434        /**
435         * Returns the number of the section that contains the sentence to which 
436         * this <code>ElemTree</code> belongs, or -1 if there is no such number.
437         * @return an <code>int</code>
438         */
439        public int getSectionNumber() {
440            return this.getSentence().getSectionNumber();
441        }
442    
443        /**
444         * Return the <code>Sentence</code> to which this <code>ElemTree</code>
445         * belongs.
446         * @return a <code>Sentence</code> object
447         */
448        public Sentence getSentence() {
449            //checkCompletion();
450            return containingSentence;
451        }
452    
453        /**
454         * Returns the number of the the sentence to which 
455         * this <code>ElemTree</code> belongs, or -1 if there is no such number.
456         * @return an <code>int</code>
457         */
458        public int getSentenceNumber() {
459            return this.getSentence().getSentenceNumber();
460        }
461    
462        /**
463         * Returns whether this elementary tree attaches to the {@link #LEFT} or {@link #RIGHT}
464         * side of its parent. Returns {@link #ROOT} if this tree is the root of the
465         * sentence and {@link #UNKNOWN} if the value is not known.
466         * 
467         * @return one of the given values
468         * @see #attachesFromLeft()
469         * @see #attachesFromRight()
470         */
471        public int getSlot() {
472            if (this.isRoot()) return ROOT;
473            
474            return this.slot;
475        }
476        
477        /**
478         * Returns the part of the sentence that is spanned by the yield of this
479         * elementary tree and its descendents. If the span is discontinuous, the
480         * left- and rightmost <code>ElemTree</code> locations are given.
481         * @return a <code>WordSpan</code> object
482         */
483        public WordSpan getSpan() {
484            checkCompletion();
485            if (span == null) {
486                computeSpan();
487            }
488            return span;
489        }
490    
491        /**
492         * Returns the spinal node at a given Gorn address in this tree.
493         * @param g a {@link GornAddress}
494         * @return a {@link SpinalNode}
495         */
496        public SpinalNode getSpinalNodeAt(GornAddress g) {
497            checkCompletion();
498            Iterator parts = g.iterator();
499            SpinalNode currentNode = this.getSpine(); // current node is root of spine
500            int currentChild = ((Integer) parts.next()).intValue();
501            if (currentChild != 0) {
502                throw new RuntimeException("Bad GornAddress: "+ g.toString());
503            }
504            while (parts.hasNext()) {
505                // TODO raise exception if we're given a bad address
506                currentNode = currentNode.getChild(((Integer) parts.next()).intValue());
507            }
508            return currentNode;
509        }
510    
511        /**
512         * Returns the spine of this tree.
513         * @return a {@link SpinalNode} representing the root of the spine
514         */
515        public SpinalNode getSpine() {
516            checkCompletion();
517            return rootOfSpine;
518        }
519    
520        /**
521         * Returns the yield of the subtree rooted in this elementary tree, with
522         * terminals separated by a single white space. Empty elements are included.
523         * @return a substring corresponding to the current <code>ElemTree</code>
524         * and all its descendents.
525         */
526        public String getSurfaceString() {
527            String result = "";
528            Iterator iter = this.getDominatedTerminals().iterator();
529            while (iter.hasNext()) {
530                if (!result.equals("")) {
531                    result += " ";
532                }
533                result += (String) iter.next();
534            }
535            return result;
536        }
537    
538        /**
539         * Returns the actual terminal string (word in most cases) represented by this <code>ElemTree</code>.
540         * @return the terminal string at the fringe of this elementary tree
541         */
542        public String getTerminal() {
543            checkCompletion();
544            return terminal;
545        }
546    
547        /**
548         * Returns the type of this elementary tree, one of 
549         * {@link #UNKNOWN}, {@link #INITIAL}, {@link #AUXILIARY}, and {@link #COORD}.
550         * @return the type of this <code>ElemTree</code>
551         */
552        public int getType() {
553            checkCompletion();
554            return type;
555        }
556    
557        /**
558         * Returns a string representing the type of this elementary tree, one of
559         * <code>initial</code>, <code>auxiliary</code>, <code>coordination</code>, 
560         * and <code>unknown</code>.
561         * @return the type of this <code>ElemTree</code> in string representation
562         */
563        public String getTypeAsString() {
564            checkCompletion();
565            switch (this.type) {
566                case INITIAL:
567                    return "initial";
568                case AUXILIARY:
569                    return "auxiliary";
570                case COORD:
571                    return "coordination";
572                case UNKNOWN:
573                    return "unknown";
574                default:
575                    assert false;
576                    return "unknown";
577            }
578        }
579    
580        /**
581         * Returns true iff this tree is of type "coordination".
582         * @return a boolean value
583         */
584        public boolean isCoord() {
585            return this.type == COORD;
586        }
587    
588        /**
589         * Returns true iff this tree is of type "auxiliary".
590         * @return a boolean value
591         */
592        public boolean isAuxiliary() {
593            return this.type == AUXILIARY;
594        }
595    
596        /**
597         * Returns true iff this tree is of type "initial".
598         * @return a boolean value
599         */
600        public boolean isInitial() {
601            return this.type == INITIAL;
602        }
603    
604        /**
605         * Returns true iff this tree is of unknown type.
606         * @return a boolean value
607         */
608        public boolean isOfUnknownType() {
609            return this.type == UNKNOWN;
610        }
611        
612    
613        /**
614         * Returns if the anchor (if there is one) is an empty element 
615         * by checking if the terminal has an unescaped asterisk. Returns false
616         * if this is a coordination node, in which case there is no anchor anyway.
617         * @return a boolean value
618         */
619        public boolean isEmptyElement() {
620            if (this.isCoord()) {
621                return false;
622            }
623            String t = this.getTerminal();
624            return (t.indexOf("*") != -1 && t.indexOf("\\*") == -1);
625        }
626    
627        /**
628         * Returns true iff this elementary tree has no parent.
629         * @return a boolean value
630         */
631        public boolean isRoot() {
632            return this.getParent() == null;
633        }
634    
635        /**
636         * Returns true iff this <code>ElemTree</code> has other 
637         * <code>ElemTree</code>s attached to it.
638         * @return a boolean value
639         */
640        public boolean hasChildren() {
641            return this.getChildren().hasNext();
642        }
643    
644        /**
645         * Method used internally.
646         */
647        void computeSpan() {
648            int position = this.getNumber();
649            WordSpan lexicalSpan = new WordSpan(position, position);
650    
651            // Coordination trees need special treatment: Their "lexical span"
652            // refers to a dummy node with no lexical content, so we must ignore
653            // that span.
654    
655            if (this.getType() == COORD) {
656                this.span = WordSpan.merge(this.getChildrenSpans());
657            } else if (this.hasChildren()) {
658                this.span = WordSpan.combine(lexicalSpan, WordSpan.merge(this.getChildrenSpans()));
659            } else { // this has no children
660                this.span = lexicalSpan;
661            }
662        }
663    
664        /**
665         * Method used internally while building up a derivation tree.
666         */
667        void complete() {
668    
669            // tells each child that this is its parent
670            for (Iterator it = getAttachments().iterator(); it.hasNext();) {
671                TAGAttachment attach = (TAGAttachment) it.next();
672    
673                attach.complete();
674            }
675    
676            completed = true;
677        }
678    
679        /**
680         * Method used internally while building up a derivation tree.
681         */
682        private void checkCompletion() {
683            if (!completed) {
684                throw new UncompletedElemTreeException();
685            }
686        }
687    
688        /**
689         * Method used internally and called from the constructor to parse
690         * a string representation into an ElemTree object.
691         */
692        void loadFromStringRepresentation(String rep)
693                throws ElemTreeFormatException {
694            String[] lines = rep.split("\\n");
695    
696            // first line has format:
697            // #4 will
698            // or:
699            // &25
700            // The latter case is coordination.
701            // Actually we have already stripped off the first character (& or #).
702    
703            String[] firstLinePieces = lines[0].split("\\s+");
704    
705            if (firstLinePieces.length < 1) {
706                throw new ElemTreeFormatException("First line empty.");
707            }
708    
709    
710            if (firstLinePieces.length == 1) { // coordination doesn't come with a word
711                this.type=COORD;
712            }
713            
714            this.number = Integer.parseInt(firstLinePieces[0]);
715    
716            terminal = "";
717    
718            for (int i = 1; i < firstLinePieces.length; i++) {
719                terminal += firstLinePieces[i];
720            }
721    
722            // second line has format:
723            // spine: a_( XP NN^ )        // spinePattern
724            // or
725            // coord                      // coordPattern
726            // or (for bidirectional parser output)
727            // pos: NN                    // posPattern
728    
729            String secondLine = lines[1];
730            
731            Matcher coordMatcher = coordPattern.matcher(secondLine);
732            Matcher posMatcher = posPattern.matcher(secondLine);
733            Matcher spineMatcher = spinePattern.matcher(secondLine);
734            
735            if (coordMatcher.matches()) {
736                rootOfSpine = null;
737                type = COORD;
738            } else if (posMatcher.matches()) { 
739                rootOfSpine = null;
740                pos = posMatcher.group(1);
741                bidirectionalParserOutput = true;
742            } else if (spineMatcher.matches())  {
743                try {
744                    secondLine = spineMatcher.group(1);
745                } catch (StringIndexOutOfBoundsException e) {
746                    System.out.println(secondLine);
747                    throw e;
748                }
749    
750                char typeChar = secondLine.charAt(0);
751    
752                switch (typeChar) {
753                    case 'a':
754                        type = INITIAL;
755                        break;
756                    case 'b':
757                        type = AUXILIARY;
758                        break;
759                    case 'c':
760                        type = COORD;
761                        break;
762                }
763    
764                // parse the spine
765                rootOfSpine = new SpinalNode(secondLine.substring(ATT_TYPE_OFFSET), this);
766            } // end treatment of spine
767            
768    
769            // read attachmentToParent lines
770            for (int i = 2; i < lines.length; i++) {
771                getAttachments().add(new TAGAttachment(this, lines[i]));
772            }
773        }
774    
775        /**
776         * Converts this tree to a canonical string representation of the kind used in Libin 
777         * Shen's thesis.
778         * @return a string representation
779         */
780        public String toString() {
781    
782            checkCompletion();
783    
784            StringBuffer s = new StringBuffer();
785            // "#20 remained" (hash, number, terminal: this is the normal case)
786            // "&20" (ampersand, number, no terminal: the format for coordination)
787            String nodeno = Integer.toString(this.getNumber());
788    
789            if (this.getType() == COORD) {
790                s.append("&");
791                s.append(nodeno);
792            } else {
793                s.append("#");
794                s.append(nodeno);
795                s.append(" ");
796                s.append(this.getTerminal());
797            }
798            s.append(newline);
799            
800            if (this.isBidirectionalParserOutput()) {
801                // no line describing the spine, instead just a line like:
802                //  pos: DT
803                // This is typical of the output of Shen's bidirectional parser.
804                s.append(" pos: ");
805                s.append(this.getPOS());
806                s.append(newline);
807            } else if (this.getSpine() != null) {
808                // Format of the line describing the spine:
809                //  spine: b_( VP VB^ VP* )
810                s.append(" spine: ");
811                switch (this.getType()) {
812                    case INITIAL:
813                        s.append("a");
814                        break;
815                    case AUXILIARY:
816                        s.append("b");
817                        break;
818                    case COORD:
819                        s.append("c");
820                        break;
821                }
822    
823                s.append("_");
824                s.append(this.getSpine().toString());
825                s.append(newline);
826            } else if (this.getSpine() == null && this.getType() == COORD) {
827                // Mimic the output of Shen's incremental parser
828                s.append(" coord");
829                s.append(newline);
830            }
831    
832            Iterator theAttachments = this.getAttachments().iterator();
833    
834            while (theAttachments.hasNext()) {
835                TAGAttachment current = (TAGAttachment) theAttachments.next();
836                s.append(current.toString());
837                s.append(newline);
838            }
839            return s.toString();
840        }
841    
842         
843        // TODO maybe move this code to SpinalNode?
844        private void writeSpineToGraphviz(BufferedWriter b, SpinalNode current,
845                boolean isRoot, int start, int end, boolean includeSpans, boolean beanPoleStyle)
846                throws IOException {
847    
848            checkCompletion();
849            
850            int n = current.getElemTree().getNumber();
851    
852            if (start != -1 && end != -1) {
853                if (n < start || n > end) throw new IllegalArgumentException();
854            }
855    
856            // create a graphviz node for the current spinal node
857            String currentNode = current.getGraphvizNodeID();
858            b.write(currentNode + "[label=\"" + current.getLabel());
859            
860            if (includeSpans) {
861                String thespan = this.getSpan().toString();
862                if (!isRoot) {
863                    thespan = "";
864                }
865                b.write(" " + thespan);
866            }
867            
868            if (!current.getType().equals(SpinalNode.ANCHOR)) {
869                b.write(current.getType());
870            }
871            b.write("\" ");
872    
873            // uncomment the next line to make the root of the spine stand out
874            //if (isRoot) b.write("style=bold");
875    
876            b.write("];");
877            b.newLine();
878            
879            // create edges from the anchor SpinalNode to the actual
880            // lexical anchor, which is represented by the
881            // graphviz node ID of the ElemTree itself (i.e. this)
882    
883            if (current.getType().equals(SpinalNode.ANCHOR)) {
884                b.write(currentNode 
885                        + " -> " 
886                        + this.getGraphvizNodeID() 
887                        + "[style=bold arrowhead=none];");
888                b.newLine();
889            }
890    
891            // create edges from the current node to its children (if any)
892    
893            // children are never drawn emphasized (only the root of the
894            // present spine is)
895    
896            SpinalNode[] children = current.getChildren();
897    
898            for (int i = 0; i < children.length; i++) {
899                writeSpineToGraphviz(b, children[i], false, start, end, includeSpans, beanPoleStyle);
900    
901                // add edges to children
902                b.write(currentNode 
903                        + " -> " 
904                        + children[i].getGraphvizNodeID() 
905                        + "[style=bold arrowhead=none];");
906                b.newLine();
907            }
908    
909        }
910    
911        /**
912         * Method used internally to convert this ElemTree and its subtrees 
913         * into Graphviz format.
914         * Should only be called by Sentence class --
915         * doesn't generate complete graphviz output by itself.
916         */
917        void writeGraphvizTo(BufferedWriter b, int start, int end, boolean includeSpans, boolean beanPoleStyle, boolean showSpines)
918                throws IOException {
919    
920            // If we are in bidirectional parser output, we never show the spine because
921            // it is not known, but we include the part of speech in the terminal node instead 
922            // because otherwise it would not show up.
923            if (this.isBidirectionalParserOutput()) {
924                showSpines = false;
925            }
926            
927            // We skip any nodes that are not within start and end.
928            int n = this.getNumber();
929            if ((start != -1 && end != -1) && (n < start || n > end)) {
930                return;
931            }
932    
933            if (showSpines) {
934                
935                // Make sure that foot nodes appear on the side of the tree on which they should.
936                if (this.isAuxiliary()) {
937                    String anchorID = this.getAnchor().getGraphvizNodeID();
938                    String footID = this.getFoot().getGraphvizNodeID();
939                    b.write("subgraph { rank = same; edge [style=invis] ");
940                    if (this.attachesFromLeft()) {
941                        b.write(anchorID + " -> " + footID);
942                    } else if (this.attachesFromRight()) {
943                        b.write(footID + " -> " + anchorID);
944                    }
945                    b.write("; }");
946                    b.newLine();
947                }
948                
949               // Write out the spine of this elementary tree.
950               this.writeSpineToGraphviz(b, this.getSpine(), true, start, end, includeSpans, beanPoleStyle);
951            }
952            
953            // Write out the node that represents the current terminal.
954            // This node has a different shape (a rounded box).
955            b.write(this.getGraphvizNodeID() 
956                    + "[label=\" #" 
957                    + Integer.toString(this.getNumber()));
958            // If we are in bidirectional parser output, we never show the spine because
959            // it is not known, but we include the part of speech in the terminal node instead 
960            // because otherwise it would not show up.       
961            if (this.isBidirectionalParserOutput()) {
962                b.write("\\n" 
963                    + this.getPOS());
964            }
965            b.write("\\n" 
966                    + this.getTerminal() 
967                    + "\" style=rounded shape=box];");
968            b.newLine();
969    
970            // Create edges to attached ElemTrees
971            // and recursively graphviz them.
972            Iterator theAttachments = this.getAttachments().iterator();
973            while (theAttachments.hasNext()) {
974                TAGAttachment current = (TAGAttachment) theAttachments.next();
975    
976                // skip any attached ElemTrees that are not in the span
977                int position = current.getElemTree().getNumber();
978                if ((start != -1 && end != -1) && (position < start || position > end)) {
979                    continue;
980                }
981    
982                // create links between ElemTrees
983    
984                String constraint;
985                if (beanPoleStyle) {
986                    constraint = "false";
987                } else {
988                    constraint = "true";
989                }
990                
991                String source = null;
992                String target = null;
993                
994                if (showSpines) {
995                    source = current.getAttachmentSiteOnParent().getGraphvizNodeID();
996                    target = current.getChild().getSpine().getGraphvizNodeID(); // root of child's spine
997                } else {
998                    source = current.getParent().getGraphvizNodeID();
999                    target = current.getChild().getGraphvizNodeID();
1000                }
1001                
1002                String style;
1003                if (showSpines) {
1004                    style = "dotted";
1005                } else {
1006                    style = "solid";
1007                }
1008                b.write(source
1009                        + " -> " 
1010                        + target 
1011                        + "[style=" + style
1012                        + " constraint=" +constraint + "];");
1013                b.newLine();
1014    
1015                // recursive call
1016                current.getElemTree().writeGraphvizTo(b, start, end, includeSpans, beanPoleStyle, showSpines);
1017                b.newLine();
1018            }
1019    
1020        }
1021    
1022        /**
1023         * Returns true iff this <code>ElemTree</code> dominates the other tree 
1024         * in the sentence (this includes
1025         * the case in which <code>this.equals(other)</code>).
1026         * @param other the other tree
1027         * @return a boolean value
1028         */
1029        public boolean dominates(ElemTree other) {
1030    
1031            if (this.equals(other)) {
1032                return true;
1033            }
1034            if (this.isParentOf(other)) {
1035                return true;
1036            }
1037            return (!other.isRoot()) && this.dominates(other.getParent());
1038        }
1039    
1040        /**
1041         * Returns true iff this <code>ElemTree</code> is the direct parent of 
1042         * the other tree in the derivation tree.
1043         * @param other the other tree
1044         * @return a boolean value
1045         */
1046        public boolean isParentOf(ElemTree other) {
1047            Iterator iter = this.getChildren();
1048            boolean result = false;
1049            while (iter.hasNext()) {
1050                result = ((ElemTree) iter.next()).equals(other);
1051            }
1052            return result;
1053        }
1054    
1055        /**
1056         * Returns true if this <code>ElemTree</code> has been read in from the 
1057         * format used in the output of Shen's bidirectional parser. If this is the case,
1058         * no information about the spine is present. 
1059         * @return a boolean value
1060         */
1061        public boolean isBidirectionalParserOutput() {
1062            return this.bidirectionalParserOutput;
1063        }
1064    
1065        /**
1066         * Internal class representing the attachment of another 
1067         * <code>ElemTree</code> to this one.
1068         */
1069        private static class TAGAttachment implements Serializable {
1070    
1071            private int type = -1;
1072            private int nodeNumber = -1;
1073            private GornAddress gornAddress;
1074            private int slot = ElemTree.UNKNOWN;
1075            private int order = -1;
1076            private ElemTree child;
1077            private ElemTree parent;
1078            private boolean completed;
1079            
1080            /**
1081             * true indicates a format as in the LTAG-spinal treebank or in 
1082             * the Shen incremental parser output, where slots and numbers are
1083             * specified. False indicates a format as in the Shen bidirectional parser
1084             * output, where only the node id of the child is specified.
1085             */
1086            private boolean locationKnown = true;
1087    
1088            private void checkCompletion() {
1089                if (!completed) {
1090                    throw new UncompletedElemTreeException();
1091                }
1092            }
1093    
1094            /**
1095             * Note: The parent is the current tree (this). The child is the tree
1096             * designated by the attachmentToParent.
1097             */
1098            public TAGAttachment(ElemTree parent, String representation)
1099                    throws ElemTreeFormatException {
1100                this.parent = parent;
1101                completed = false;
1102    
1103                loadFromStringRepresentation(representation);
1104            }
1105    
1106    
1107            
1108            public void complete() {
1109                child = getParent().containingSentence.getElemTree(this.getNodeNumber());
1110                child.parent=(getParent());
1111                child.attachmentToParent = this;
1112                child.slot=this.slot;
1113                
1114                // In the LTAG-spinal treebank format, and in the output format of Shen's 
1115                // incremental parser, the information about attachment
1116                // and adjunction
1117                // is recorded redundantly in the spine line (via letter "a" vs. "b") and
1118                // in the attachment line (via "att" vs. "adj"). 
1119                
1120                // By contrast, in the output format of Shen's bidirectional parser, 
1121                // spine lines are not included and so
1122                // we propagate att/adj information from the attachment to the 
1123                // elementary tree beneath it. (We know
1124                // which nodes are coordination nodes because they are marked with an
1125                // ampersand (&).)
1126     
1127                if (child.isBidirectionalParserOutput()) {
1128                    if (this.getType() == ElemTree.ATTACH) {
1129                        child.type = ElemTree.INITIAL;
1130                    } else if (this.getType() == ElemTree.ADJOIN) {
1131                        child.type = ElemTree.AUXILIARY;
1132                    }
1133                }
1134                
1135                completed = true;
1136                }
1137    
1138            void loadFromStringRepresentation(String rep)
1139                    throws ElemTreeFormatException {
1140                
1141                
1142                // Format is:
1143                // att #24, on 0, slot 1, order 2
1144                // or
1145                // crd #6, on 0.0
1146                // or (in the output of Libin's bidirectional parser)
1147                // att #24
1148                
1149                rep = rep.trim();
1150                String[] parts = rep.split("\\s+");
1151    
1152                if ((parts.length != 8) && (parts.length != 4) && (parts.length != 2)) {
1153                    throw new ElemTreeFormatException(
1154                            "Malformed attachment representation: " + rep);
1155                }
1156    
1157                // remove trailing commas
1158                for (int i = 0; i < parts.length; i++) {
1159                    if ((parts[i].length() > 0) &&
1160                            parts[i].charAt(parts[i].length() - 1) == ',') {
1161                        parts[i] = parts[i].substring(0, parts[i].length() - 1);
1162                    }
1163                }
1164    
1165                String typeString = parts[0];
1166    
1167                if (typeString.equals("att")) {
1168                    type = ATTACH;
1169                } else if (typeString.equals("adj")) {
1170                    type = ADJOIN;
1171                } else if (typeString.equals("crd")) {
1172                    type = CONJUNCT;
1173                } else if (typeString.equals("con")) {
1174                    type = CONJUNCT_OR_CONNECTIVE;
1175                    
1176                } else {
1177                    throw new ElemTreeFormatException("Unknown attachment type " + typeString 
1178                            + " at " + parent.getSentence().prettyPrintLocation() + ".");
1179                }
1180    
1181                try {
1182                    this.nodeNumber = Integer.parseInt(parts[1].substring(1));
1183                } catch (NumberFormatException e) {
1184                    String partsMessage = "";
1185    
1186                    for (int i = 0; i < parts.length; i++) {
1187                        partsMessage += i + "=" + parts[i] + "\n";
1188                    }
1189    
1190                    throw new ElemTreeFormatException("Invalid node number " +
1191                            parts[1].substring(1) +
1192                            ". Attachment representation was " + rep + ". Parts array was " + partsMessage);
1193                }
1194    
1195                if (parts.length == 2) { // bidirectional parser output
1196                    locationKnown = false;
1197                    return;
1198                }
1199                
1200                this.gornAddress = new GornAddress(parts[3]);
1201    
1202                if (parts.length == 8) { // LTAG-spinal and incremental parser output
1203                    int theSlot = Integer.parseInt(parts[5]);
1204                    if (theSlot == 0) {
1205                        this.slot = ElemTree.LEFT;
1206                    } else if (theSlot == 1) {
1207                        this.slot = ElemTree.RIGHT;
1208                    }
1209                    this.order = Integer.parseInt(parts[7]);
1210                }
1211            }
1212    
1213            /**
1214             * Returns a string representation of this <code>ElemTree</code> in LTAG-spinal format.
1215             *
1216             * @return a string representing this elementary tree
1217             */
1218            public String toString() {
1219                checkCompletion();
1220                StringBuffer s = new StringBuffer(20);
1221                switch (this.getType()) {
1222                    case ATTACH:
1223                        s.append(" att");
1224                        break;
1225                    case ADJOIN:
1226                        s.append(" adj");
1227                        break;
1228                    case CONJUNCT:
1229                        s.append(" crd");
1230                        break;
1231                    case CONJUNCT_OR_CONNECTIVE:
1232                        s.append(" con");
1233                        break;
1234                }
1235                s.append(" #");
1236                s.append(Integer.toString(this.getNodeNumber()));
1237                if (locationKnown) {
1238                    s.append(", on ");
1239                    s.append(this.getGornAddress().toString());
1240                    if (this.getSlot() != -1 && this.getOrder() != -1) {
1241                        assert this.getSlot() != -1;
1242                        assert this.getOrder() != -1;
1243                        s.append(", slot ");
1244                        s.append(Integer.toString(this.getSlot()));
1245                        s.append(", order ");
1246                        s.append(Integer.toString(this.getOrder()));
1247                    }
1248                }
1249                return s.toString();
1250            }
1251    
1252    
1253            /**
1254             * Returns the ElemTree closer to the root of the sentence
1255             * (the one to which is attached).
1256             */
1257            public ElemTree getParent() {
1258                return parent;
1259            }
1260    
1261            /**
1262             * Returns the ElemTree closer to the fringe of the sentence
1263             * (the one which attaches to something).
1264             */
1265            public ElemTree getChild() {
1266                return child;
1267            }
1268    
1269            /**
1270             * Returns the precise place at which the current attachment
1271             * is linked to its parent (a node in the spine of the parent designated
1272             * by the GornAddress of this attachment). Return null if unknown
1273             * (as in the output of Shen's bidirectional parser).
1274             */
1275            public SpinalNode getAttachmentSiteOnParent() {
1276                if (!locationKnown) return null;
1277                return this.getParent().getSpinalNodeAt(this.getGornAddress());
1278            }
1279            
1280    
1281            /**
1282             * A synonym of getChild().
1283             */
1284            public ElemTree getElemTree() {
1285                return getChild();
1286            }
1287    
1288            public int getType() {
1289                return type;
1290            }
1291    
1292            public int getNodeNumber() {
1293                // always known, even if !locationKnown
1294                return nodeNumber;
1295            }
1296    
1297            public int getSlot() {
1298                if (!locationKnown) return -1;
1299                return slot;
1300            }
1301    
1302            public int getOrder() {
1303                if (!locationKnown) return -1;
1304                return order;
1305            }
1306    
1307            public GornAddress getGornAddress() {
1308                if (!locationKnown) return null;
1309                return gornAddress;
1310            }
1311        } // end of class TAGAttachment
1312    
1313        private static class TAGAttachmentToElemTreeIterator implements ListIterator {
1314    
1315            ListIterator wrappedIterator;
1316    
1317            public TAGAttachmentToElemTreeIterator(List attachments) {
1318                wrappedIterator = attachments.listIterator();
1319            }
1320    
1321            public void add(Object o) {
1322                throw new UnsupportedOperationException("add not supported.");
1323            }
1324    
1325            public void remove() {
1326                throw new UnsupportedOperationException("remove not supported.");
1327            }
1328    
1329            public void set(Object o) {
1330                throw new UnsupportedOperationException("set not supported");
1331            }
1332    
1333            public boolean hasPrevious() {
1334                return wrappedIterator.hasPrevious();
1335            }
1336    
1337            public boolean hasNext() {
1338                return wrappedIterator.hasNext();
1339            }
1340    
1341            public int nextIndex() {
1342                return wrappedIterator.nextIndex();
1343            }
1344    
1345            public int previousIndex() {
1346                return wrappedIterator.previousIndex();
1347            }
1348    
1349            //returns ElemTree
1350            public Object next() {
1351                TAGAttachment nextOne = (TAGAttachment) wrappedIterator.next();
1352    
1353                return nextOne.getChild();
1354            }
1355    
1356            //returns ElemTree
1357            public Object previous() {
1358                TAGAttachment previousOne = (TAGAttachment) wrappedIterator.previous();
1359    
1360                return previousOne.getChild();
1361            }
1362        }
1363    
1364        static class TAGAttachmentToWordSpanIterator implements ListIterator {
1365    
1366            ListIterator wrappedIterator;
1367    
1368            public TAGAttachmentToWordSpanIterator(List attachments) {
1369                wrappedIterator = attachments.listIterator();
1370            }
1371    
1372            public void add(Object o) {
1373                throw new UnsupportedOperationException("add not supported.");
1374            }
1375    
1376            public void remove() {
1377                throw new UnsupportedOperationException("remove not supported.");
1378            }
1379    
1380            public void set(Object o) {
1381                throw new UnsupportedOperationException("set not supported");
1382            }
1383    
1384            public boolean hasPrevious() {
1385                return wrappedIterator.hasPrevious();
1386            }
1387    
1388            public boolean hasNext() {
1389                return wrappedIterator.hasNext();
1390            }
1391    
1392            public int nextIndex() {
1393                return wrappedIterator.nextIndex();
1394            }
1395    
1396            public int previousIndex() {
1397                return wrappedIterator.previousIndex();
1398            }
1399    
1400            //returns WordSpan
1401            public Object next() {
1402                TAGAttachment nextOne = (TAGAttachment) wrappedIterator.next();
1403    
1404                return nextOne.getChild().getSpan();
1405            }
1406    
1407            //returns WordSpan
1408            public Object previous() {
1409                TAGAttachment previousOne = (TAGAttachment) wrappedIterator.previous();
1410    
1411                return previousOne.getChild().getSpan();
1412            }
1413        }
1414    }