001 /**
002 * LTAG-spinal API, an interface to the treebank format introduced by Libin Shen.
003 * Copyright (C) 2007 Lucas Champollion
004 *
005 * This program is free software: you can redistribute it and/or modify
006 * it under the terms of the GNU General Public License as published by
007 * the Free Software Foundation, either version 3 of the License, or
008 * (at your option) any later version.
009 *
010 * This program is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 * GNU General Public License for more details.
014 *
015 * You should have received a copy of the GNU General Public License
016 * along with this program. If not, see <http://www.gnu.org/licenses/>.
017 *
018 */
019 package edu.upenn.cis.spinal;
020
021 import java.io.*;
022 import java.util.*;
023 import java.util.regex.Matcher;
024 import java.util.regex.Pattern;
025 import edu.upenn.cis.propbank_shen.*;
026
027 /**
028 * Represents a "spinal" elementary tree in Libin's LTAG
029 * treebank. A typical elementary tree (taken from p. 73
030 * of his thesis) looks like this:
031 *
032 * <code>
033 * #3 failed
034 * spine: a_( S ( VP VBD^ ) )
035 * att #0, on 0, slot 0, order 0
036 * att #2, on 0, slot 0, order 1
037 * att #6, on 0.0, slot 1, order 0
038 * </code>
039 *
040 * or like this (a structure for predicate coordination):
041 *
042 * <code>
043 * &20
044 * spine: c_( S S S )
045 * crd #3, on 0.0
046 * att #9, on 0, slot 1, order 0
047 * crd #10, on 0.1
048 * </code>
049 *
050 * @author Lucas Champollion
051 * @author Ryan Gabbard
052 */
053 public class ElemTree implements Serializable {
054
055 /**
056 * Designates a field (tree, slot, etc.) of unknown type.
057 */
058 public static final int UNKNOWN = -1;
059
060 /**
061 * Designates an initial tree.
062 */
063 public static final int INITIAL = 0;
064
065 /**
066 * Designates an auxiliary tree.
067 */
068 public static final int AUXILIARY = 1;
069
070 /**
071 * Designates a coordination tree -- these trees are special constructs and not anchored in a lexical item.
072 */
073 public static final int COORD = 2;
074
075 /**
076 * A return value designating a special case when the current tree that has no parent.
077 */
078 public static final int ROOT = 3;
079
080 /**
081 * Designates a tree combined with its parent by an attachment operation,
082 * indicated by the keyword "att".
083 */
084 public static final int ATTACH = 4;
085
086 /**
087 * Designates a tree combined with its parent by an adjunction operation,
088 * indicated by the keyword "adj".
089 */
090 public static final int ADJOIN = 5;
091
092 /**
093 * Designates a tree combined with its parent by an attachment operation,
094 * indicated by the keyword "crd" in the output of the incremental parser
095 * and in the LTAG-spinal treebank.
096 */
097 public static final int CONJUNCT = 6;
098
099 /**
100 * Designates a tree combined with its parent by a conjunction or an attachment operation,
101 * indicated by the keyword "con" in the output of the incremental parser
102 * and in the LTAG-spinal treebank.
103 *
104 * In the treebank and in the output of the incremental parser, "crd"
105 * is used for conjuncts and "att" is used for
106 * connectives. In the output of the bidirectional parser, we don't distinguish
107 * conjuncts from connectives, in order to reduce an operation in parsing, so "con" is used
108 * to represent both.
109 */
110 public static final int CONJUNCT_OR_CONNECTIVE = 7;
111
112 /**
113 * Designates a tree that attaches/adjoins on the left of its parent.
114 */
115 public static final int LEFT = 8;
116
117 /**
118 * Designates a tree that attaches/adjoins on the right of its parent.
119 */
120 public static final int RIGHT = 9;
121
122 private static final int ATT_TYPE_OFFSET = 2;
123 private static final String newline = System.getProperty("line.separator");
124 private Sentence containingSentence;
125 int number = -1;
126 String terminal = "";
127 private int type = UNKNOWN;
128 private SpinalNode rootOfSpine = null;
129
130 /**
131 * Indicates that the input only specified the part of speech and not the
132 * spine, as is the case for the output of Shen's bidirectional parser.
133 */
134 private boolean bidirectionalParserOutput = false;
135 private String pos = null;
136 private ArrayList attachments = new ArrayList();
137 /**
138 * Null iff this tree is the root of the derivation tree.
139 */
140 private ElemTree parent;
141 /**
142 * the attachment by which this <code>ElemTree</code>
143 * attaches to its parent; null iff this is root of derivation tree
144 */
145 private TAGAttachment attachmentToParent = null;
146 /**
147 * the yield of the subtree rooted by this ElemTree
148 */
149 private WordSpan span;
150
151 /**
152 * Represents to which side this tree attaches to its parent.
153 */
154 private int slot = UNKNOWN;
155
156 private boolean completed = false;
157 private SpinalNode anchor;
158 private SpinalNode foot;
159
160 private static final Pattern posPattern = Pattern.compile("\\s*pos: (.*)");
161 private static final Pattern spinePattern = Pattern.compile("\\s*spine: (.*)");
162 private static final Pattern coordPattern = Pattern.compile("\\s*coord\\s*");
163
164
165 /**
166 * Creates an <code>ElemTree</code> from a string representation and attaches it
167 * into the provided derivation tree (Sentence). We assume that the nodes are
168 * numbered in ascending order of linear precedence of their anchors, as
169 * is the case in the LTAG-spinal treebank.
170 * @param container the <code>Sentence</code> to which this
171 * <code>ElemTree</code> is to be added
172 * @param representation the string that is to be parsed into an
173 * <code>ElemTree</code>
174 * @throws edu.upenn.cis.spinal.ElemTreeFormatException if the
175 * string representation is not well-formed
176 */
177 public ElemTree(Sentence container, String representation)
178 throws ElemTreeFormatException {
179 attachments = new ArrayList();
180 containingSentence = container;
181 completed = false;
182 parent = null;
183 span = null;
184
185 loadFromStringRepresentation(representation);
186 }
187
188 /**
189 * Returns true if this <code>ElemTree</code> attaches or adjoins from the
190 * left of its parent.
191 * @return a boolean value
192 * @see #getSlot()
193 */
194 public boolean attachesFromLeft() {
195 return this.getSlot() == LEFT;
196 }
197
198 /**
199 * Returns true if this <code>ElemTree</code> attaches or adjoins from the
200 * right of its parent.
201 * @return a boolean value
202 * @see #getSlot()
203 */
204 public boolean attachesFromRight() {
205 return this.getSlot() == RIGHT;
206 }
207
208 /**
209 * Returns the node representing the lexical anchor of this tree, or null
210 * if there is no anchor (in the case of a coordination structure).
211 * @return a <code>SpinalNode</code> element representing the lexical anchor
212 */
213 public SpinalNode getAnchor() {
214
215 Iterator iter = this.getSpine().getAllNodes().iterator();
216 SpinalNode current;
217 while (iter.hasNext()) {
218 current = (SpinalNode) iter.next();
219 if (current.isAnchor()) {
220 this.anchor = current;
221 return current;
222 }
223 }
224 this.anchor = null;
225 return null;
226 }
227
228 /**
229 * Returns the <code>SpinalNode</code> in the parent tree to which this <code>ElemTree</code>
230 * is attached, or null if this <code>ElemTree</code> is the root of the derivation tree.
231 * @return the parent's attachment site
232 */
233 public SpinalNode getAttachmentSite() {
234 checkCompletion();
235 if (this.isRoot()) {
236 return null;
237 }
238 return this.getParent().getSpinalNodeAt(this.getAttachmentToParent().getGornAddress());
239 }
240
241 TAGAttachment getAttachmentToParent() {
242
243 return attachmentToParent;
244 }
245
246 /**
247 * Returns the type of the attachment of this elementary tree to its parent tree, one of
248 * {@link #ATTACH}, {@link #ADJOIN}, {@link #CONJUNCT} (only used
249 * in the LTAG-spinal treebank and in the output of the incremental parser),
250 * and {@link #CONJUNCT_OR_CONNECTIVE} (only used in the output of the
251 * bidirectional parser). If this elementary tree is the root of the sentence,
252 * the special value {@link #ROOT} is returned.
253 * @return the type of attachment of this <code>ElemTree</code>
254 */
255 public int getAttachmentType() {
256 if (this.isRoot()) {
257 return ROOT;
258 }
259 return attachmentToParent.getType();
260 }
261
262 /**
263 * Internal method, returns <code>TAGAttachment</code> list for this tree.
264 * @return a list of attachments
265 */
266 private List getAttachments() {
267 return attachments;
268 }
269
270 /**
271 * Returns the <code>ElemTree</code>s that attach to this <code>ElemTree</code>.
272 * @return the children of this node in the derivation tree
273 */
274 public Iterator getChildren() {
275 checkCompletion();
276
277 return new TAGAttachmentToElemTreeIterator(getAttachments());
278 }
279
280 /**
281 * Returns the spans of the <code>ElemTree</code>s that attach to this
282 * <code>ElemTree</code>.
283 * @return an iterator over (@link WordSpan} objects
284 */
285 public Iterator getChildrenSpans() {
286 checkCompletion();
287
288 return new TAGAttachmentToWordSpanIterator(getAttachments());
289 }
290
291 /**
292 * Returns a <code>List</code> of the elementary trees dominated by this
293 * elementary tree,
294 * including this tree itself.
295 * @return a list over <code>ElemTree</code> objects
296 */
297 public List getDominatedElemTrees() {
298 // TODO check if this, called on root, gives the same results as
299 // this.getSentence().getElemTrees(this.getSpan());
300 Iterator candidates = this.getSentence().getElemTrees().iterator();
301 List result = new Vector();
302 while (candidates.hasNext()) {
303 ElemTree current = (ElemTree) candidates.next();
304 if (this.dominates(current)) {
305 result.add(current);
306 }
307 }
308 return result;
309 }
310
311 /**
312 * Returns a <code>List</code> of the terminals attached to the elementary trees
313 * dominated by this elementary tree,
314 * including this tree itself.
315 * @return a list of terminals (<code>String</code> objects)
316 */
317 public List getDominatedTerminals() {
318 Iterator iter = this.getDominatedElemTrees().iterator();
319 List l = new Vector();
320 while (iter.hasNext()) {
321 ElemTree current = (ElemTree) iter.next();
322 l.add(current.getTerminal());
323 }
324 return l;
325 }
326
327 /**
328 * Returns the number of the file that contains the sentence to which
329 * this <code>ElemTree</code> belongs, or -1 if there is no such number.
330 * @return an <code>int</code>
331 */
332 public int getFileNumber() {
333 return this.getSentence().getFileNumber();
334 }
335
336 /**
337 * Returns the node representing the foot node (if any) of this tree,
338 * or null if there is no foot node. (Only auxiliary trees have foot nodes.)
339 * @return a <code>SpinalNode</code>
340 */
341 public SpinalNode getFoot() {
342 Iterator iter = this.getSpine().getAllNodes().iterator();
343 SpinalNode current;
344 while (iter.hasNext()) {
345 current = (SpinalNode) iter.next();
346 if (current.isFoot()) {
347 this.foot = current;
348 return current;
349 }
350 }
351 this.foot = null;
352 return null;
353 }
354
355 /**
356 * Method used internally to create the Graphviz representation of this tree.
357 */
358 String getGraphvizNodeID() {
359
360 return "node_" + this.getNumber();
361 }
362
363 /**
364 * Returns the number of this tree, corresponding to the position in the sentence.
365 * @return the number, or -1 if unknown
366 */
367 public int getNumber() {
368 checkCompletion();
369 return number;
370 }
371
372
373
374 /**
375 * Returns the location of the predicate-argument structure
376 * corresponding to this elementary tree in the Propbank, or null
377 * if this tree has no section and file numbers. This will be a string
378 * of the following shape:
379 * <code>wsj/<section>/wsj_<section><file>.mrg</code>.
380 * @return a string that indicates the location of this word in the Propbank
381 */
382 public PASLoc getPASLoc() {
383
384
385 if (getSectionNumber() == -1 || getFileNumber() == -1) {
386 return null;
387 }
388 // these both return -1 if no number is available.
389
390 String section = Integer.toString(getSectionNumber());
391 String file = Integer.toString(getFileNumber());
392
393
394 if (section.length() == 1) {
395 section = "0" + section;
396 }
397 if (file.length() == 1) {
398 file = "0" + file;
399 }
400
401 String p = "wsj/" + section + "/wsj_" + section + file + ".mrg";
402
403 return new PASLoc(p, getSentenceNumber(), getNumber());
404 }
405
406 /**
407 * Returns the part of speech of this elementary tree, or "NA" in the case of a coordination structure
408 * (where part of speech is not really applicable since Coord nodes aren't lexicalized).
409 * @return the POS tag of the current word
410 */
411 public String getPOS() {
412 if (this.pos != null) return this.pos;
413
414 SpinalNode theAnchor = this.getAnchor();
415 if (theAnchor == null) {
416 assert this.isCoord();
417 return "NA";
418 } else {
419 return theAnchor.getLabel();
420 }
421 }
422
423 /**
424 * Returns the <code>ElemTree</code> to which this <code>ElemTree</code> attaches,
425 * or null iff this is the root.
426 * @return the parent of this node in the derivation tree
427 */
428 public ElemTree getParent() {
429 checkCompletion();
430 return parent;
431 }
432
433
434 /**
435 * Returns the number of the section that contains the sentence to which
436 * this <code>ElemTree</code> belongs, or -1 if there is no such number.
437 * @return an <code>int</code>
438 */
439 public int getSectionNumber() {
440 return this.getSentence().getSectionNumber();
441 }
442
443 /**
444 * Return the <code>Sentence</code> to which this <code>ElemTree</code>
445 * belongs.
446 * @return a <code>Sentence</code> object
447 */
448 public Sentence getSentence() {
449 //checkCompletion();
450 return containingSentence;
451 }
452
453 /**
454 * Returns the number of the the sentence to which
455 * this <code>ElemTree</code> belongs, or -1 if there is no such number.
456 * @return an <code>int</code>
457 */
458 public int getSentenceNumber() {
459 return this.getSentence().getSentenceNumber();
460 }
461
462 /**
463 * Returns whether this elementary tree attaches to the {@link #LEFT} or {@link #RIGHT}
464 * side of its parent. Returns {@link #ROOT} if this tree is the root of the
465 * sentence and {@link #UNKNOWN} if the value is not known.
466 *
467 * @return one of the given values
468 * @see #attachesFromLeft()
469 * @see #attachesFromRight()
470 */
471 public int getSlot() {
472 if (this.isRoot()) return ROOT;
473
474 return this.slot;
475 }
476
477 /**
478 * Returns the part of the sentence that is spanned by the yield of this
479 * elementary tree and its descendents. If the span is discontinuous, the
480 * left- and rightmost <code>ElemTree</code> locations are given.
481 * @return a <code>WordSpan</code> object
482 */
483 public WordSpan getSpan() {
484 checkCompletion();
485 if (span == null) {
486 computeSpan();
487 }
488 return span;
489 }
490
491 /**
492 * Returns the spinal node at a given Gorn address in this tree.
493 * @param g a {@link GornAddress}
494 * @return a {@link SpinalNode}
495 */
496 public SpinalNode getSpinalNodeAt(GornAddress g) {
497 checkCompletion();
498 Iterator parts = g.iterator();
499 SpinalNode currentNode = this.getSpine(); // current node is root of spine
500 int currentChild = ((Integer) parts.next()).intValue();
501 if (currentChild != 0) {
502 throw new RuntimeException("Bad GornAddress: "+ g.toString());
503 }
504 while (parts.hasNext()) {
505 // TODO raise exception if we're given a bad address
506 currentNode = currentNode.getChild(((Integer) parts.next()).intValue());
507 }
508 return currentNode;
509 }
510
511 /**
512 * Returns the spine of this tree.
513 * @return a {@link SpinalNode} representing the root of the spine
514 */
515 public SpinalNode getSpine() {
516 checkCompletion();
517 return rootOfSpine;
518 }
519
520 /**
521 * Returns the yield of the subtree rooted in this elementary tree, with
522 * terminals separated by a single white space. Empty elements are included.
523 * @return a substring corresponding to the current <code>ElemTree</code>
524 * and all its descendents.
525 */
526 public String getSurfaceString() {
527 String result = "";
528 Iterator iter = this.getDominatedTerminals().iterator();
529 while (iter.hasNext()) {
530 if (!result.equals("")) {
531 result += " ";
532 }
533 result += (String) iter.next();
534 }
535 return result;
536 }
537
538 /**
539 * Returns the actual terminal string (word in most cases) represented by this <code>ElemTree</code>.
540 * @return the terminal string at the fringe of this elementary tree
541 */
542 public String getTerminal() {
543 checkCompletion();
544 return terminal;
545 }
546
547 /**
548 * Returns the type of this elementary tree, one of
549 * {@link #UNKNOWN}, {@link #INITIAL}, {@link #AUXILIARY}, and {@link #COORD}.
550 * @return the type of this <code>ElemTree</code>
551 */
552 public int getType() {
553 checkCompletion();
554 return type;
555 }
556
557 /**
558 * Returns a string representing the type of this elementary tree, one of
559 * <code>initial</code>, <code>auxiliary</code>, <code>coordination</code>,
560 * and <code>unknown</code>.
561 * @return the type of this <code>ElemTree</code> in string representation
562 */
563 public String getTypeAsString() {
564 checkCompletion();
565 switch (this.type) {
566 case INITIAL:
567 return "initial";
568 case AUXILIARY:
569 return "auxiliary";
570 case COORD:
571 return "coordination";
572 case UNKNOWN:
573 return "unknown";
574 default:
575 assert false;
576 return "unknown";
577 }
578 }
579
580 /**
581 * Returns true iff this tree is of type "coordination".
582 * @return a boolean value
583 */
584 public boolean isCoord() {
585 return this.type == COORD;
586 }
587
588 /**
589 * Returns true iff this tree is of type "auxiliary".
590 * @return a boolean value
591 */
592 public boolean isAuxiliary() {
593 return this.type == AUXILIARY;
594 }
595
596 /**
597 * Returns true iff this tree is of type "initial".
598 * @return a boolean value
599 */
600 public boolean isInitial() {
601 return this.type == INITIAL;
602 }
603
604 /**
605 * Returns true iff this tree is of unknown type.
606 * @return a boolean value
607 */
608 public boolean isOfUnknownType() {
609 return this.type == UNKNOWN;
610 }
611
612
613 /**
614 * Returns if the anchor (if there is one) is an empty element
615 * by checking if the terminal has an unescaped asterisk. Returns false
616 * if this is a coordination node, in which case there is no anchor anyway.
617 * @return a boolean value
618 */
619 public boolean isEmptyElement() {
620 if (this.isCoord()) {
621 return false;
622 }
623 String t = this.getTerminal();
624 return (t.indexOf("*") != -1 && t.indexOf("\\*") == -1);
625 }
626
627 /**
628 * Returns true iff this elementary tree has no parent.
629 * @return a boolean value
630 */
631 public boolean isRoot() {
632 return this.getParent() == null;
633 }
634
635 /**
636 * Returns true iff this <code>ElemTree</code> has other
637 * <code>ElemTree</code>s attached to it.
638 * @return a boolean value
639 */
640 public boolean hasChildren() {
641 return this.getChildren().hasNext();
642 }
643
644 /**
645 * Method used internally.
646 */
647 void computeSpan() {
648 int position = this.getNumber();
649 WordSpan lexicalSpan = new WordSpan(position, position);
650
651 // Coordination trees need special treatment: Their "lexical span"
652 // refers to a dummy node with no lexical content, so we must ignore
653 // that span.
654
655 if (this.getType() == COORD) {
656 this.span = WordSpan.merge(this.getChildrenSpans());
657 } else if (this.hasChildren()) {
658 this.span = WordSpan.combine(lexicalSpan, WordSpan.merge(this.getChildrenSpans()));
659 } else { // this has no children
660 this.span = lexicalSpan;
661 }
662 }
663
664 /**
665 * Method used internally while building up a derivation tree.
666 */
667 void complete() {
668
669 // tells each child that this is its parent
670 for (Iterator it = getAttachments().iterator(); it.hasNext();) {
671 TAGAttachment attach = (TAGAttachment) it.next();
672
673 attach.complete();
674 }
675
676 completed = true;
677 }
678
679 /**
680 * Method used internally while building up a derivation tree.
681 */
682 private void checkCompletion() {
683 if (!completed) {
684 throw new UncompletedElemTreeException();
685 }
686 }
687
688 /**
689 * Method used internally and called from the constructor to parse
690 * a string representation into an ElemTree object.
691 */
692 void loadFromStringRepresentation(String rep)
693 throws ElemTreeFormatException {
694 String[] lines = rep.split("\\n");
695
696 // first line has format:
697 // #4 will
698 // or:
699 // &25
700 // The latter case is coordination.
701 // Actually we have already stripped off the first character (& or #).
702
703 String[] firstLinePieces = lines[0].split("\\s+");
704
705 if (firstLinePieces.length < 1) {
706 throw new ElemTreeFormatException("First line empty.");
707 }
708
709
710 if (firstLinePieces.length == 1) { // coordination doesn't come with a word
711 this.type=COORD;
712 }
713
714 this.number = Integer.parseInt(firstLinePieces[0]);
715
716 terminal = "";
717
718 for (int i = 1; i < firstLinePieces.length; i++) {
719 terminal += firstLinePieces[i];
720 }
721
722 // second line has format:
723 // spine: a_( XP NN^ ) // spinePattern
724 // or
725 // coord // coordPattern
726 // or (for bidirectional parser output)
727 // pos: NN // posPattern
728
729 String secondLine = lines[1];
730
731 Matcher coordMatcher = coordPattern.matcher(secondLine);
732 Matcher posMatcher = posPattern.matcher(secondLine);
733 Matcher spineMatcher = spinePattern.matcher(secondLine);
734
735 if (coordMatcher.matches()) {
736 rootOfSpine = null;
737 type = COORD;
738 } else if (posMatcher.matches()) {
739 rootOfSpine = null;
740 pos = posMatcher.group(1);
741 bidirectionalParserOutput = true;
742 } else if (spineMatcher.matches()) {
743 try {
744 secondLine = spineMatcher.group(1);
745 } catch (StringIndexOutOfBoundsException e) {
746 System.out.println(secondLine);
747 throw e;
748 }
749
750 char typeChar = secondLine.charAt(0);
751
752 switch (typeChar) {
753 case 'a':
754 type = INITIAL;
755 break;
756 case 'b':
757 type = AUXILIARY;
758 break;
759 case 'c':
760 type = COORD;
761 break;
762 }
763
764 // parse the spine
765 rootOfSpine = new SpinalNode(secondLine.substring(ATT_TYPE_OFFSET), this);
766 } // end treatment of spine
767
768
769 // read attachmentToParent lines
770 for (int i = 2; i < lines.length; i++) {
771 getAttachments().add(new TAGAttachment(this, lines[i]));
772 }
773 }
774
775 /**
776 * Converts this tree to a canonical string representation of the kind used in Libin
777 * Shen's thesis.
778 * @return a string representation
779 */
780 public String toString() {
781
782 checkCompletion();
783
784 StringBuffer s = new StringBuffer();
785 // "#20 remained" (hash, number, terminal: this is the normal case)
786 // "&20" (ampersand, number, no terminal: the format for coordination)
787 String nodeno = Integer.toString(this.getNumber());
788
789 if (this.getType() == COORD) {
790 s.append("&");
791 s.append(nodeno);
792 } else {
793 s.append("#");
794 s.append(nodeno);
795 s.append(" ");
796 s.append(this.getTerminal());
797 }
798 s.append(newline);
799
800 if (this.isBidirectionalParserOutput()) {
801 // no line describing the spine, instead just a line like:
802 // pos: DT
803 // This is typical of the output of Shen's bidirectional parser.
804 s.append(" pos: ");
805 s.append(this.getPOS());
806 s.append(newline);
807 } else if (this.getSpine() != null) {
808 // Format of the line describing the spine:
809 // spine: b_( VP VB^ VP* )
810 s.append(" spine: ");
811 switch (this.getType()) {
812 case INITIAL:
813 s.append("a");
814 break;
815 case AUXILIARY:
816 s.append("b");
817 break;
818 case COORD:
819 s.append("c");
820 break;
821 }
822
823 s.append("_");
824 s.append(this.getSpine().toString());
825 s.append(newline);
826 } else if (this.getSpine() == null && this.getType() == COORD) {
827 // Mimic the output of Shen's incremental parser
828 s.append(" coord");
829 s.append(newline);
830 }
831
832 Iterator theAttachments = this.getAttachments().iterator();
833
834 while (theAttachments.hasNext()) {
835 TAGAttachment current = (TAGAttachment) theAttachments.next();
836 s.append(current.toString());
837 s.append(newline);
838 }
839 return s.toString();
840 }
841
842
843 // TODO maybe move this code to SpinalNode?
844 private void writeSpineToGraphviz(BufferedWriter b, SpinalNode current,
845 boolean isRoot, int start, int end, boolean includeSpans, boolean beanPoleStyle)
846 throws IOException {
847
848 checkCompletion();
849
850 int n = current.getElemTree().getNumber();
851
852 if (start != -1 && end != -1) {
853 if (n < start || n > end) throw new IllegalArgumentException();
854 }
855
856 // create a graphviz node for the current spinal node
857 String currentNode = current.getGraphvizNodeID();
858 b.write(currentNode + "[label=\"" + current.getLabel());
859
860 if (includeSpans) {
861 String thespan = this.getSpan().toString();
862 if (!isRoot) {
863 thespan = "";
864 }
865 b.write(" " + thespan);
866 }
867
868 if (!current.getType().equals(SpinalNode.ANCHOR)) {
869 b.write(current.getType());
870 }
871 b.write("\" ");
872
873 // uncomment the next line to make the root of the spine stand out
874 //if (isRoot) b.write("style=bold");
875
876 b.write("];");
877 b.newLine();
878
879 // create edges from the anchor SpinalNode to the actual
880 // lexical anchor, which is represented by the
881 // graphviz node ID of the ElemTree itself (i.e. this)
882
883 if (current.getType().equals(SpinalNode.ANCHOR)) {
884 b.write(currentNode
885 + " -> "
886 + this.getGraphvizNodeID()
887 + "[style=bold arrowhead=none];");
888 b.newLine();
889 }
890
891 // create edges from the current node to its children (if any)
892
893 // children are never drawn emphasized (only the root of the
894 // present spine is)
895
896 SpinalNode[] children = current.getChildren();
897
898 for (int i = 0; i < children.length; i++) {
899 writeSpineToGraphviz(b, children[i], false, start, end, includeSpans, beanPoleStyle);
900
901 // add edges to children
902 b.write(currentNode
903 + " -> "
904 + children[i].getGraphvizNodeID()
905 + "[style=bold arrowhead=none];");
906 b.newLine();
907 }
908
909 }
910
911 /**
912 * Method used internally to convert this ElemTree and its subtrees
913 * into Graphviz format.
914 * Should only be called by Sentence class --
915 * doesn't generate complete graphviz output by itself.
916 */
917 void writeGraphvizTo(BufferedWriter b, int start, int end, boolean includeSpans, boolean beanPoleStyle, boolean showSpines)
918 throws IOException {
919
920 // If we are in bidirectional parser output, we never show the spine because
921 // it is not known, but we include the part of speech in the terminal node instead
922 // because otherwise it would not show up.
923 if (this.isBidirectionalParserOutput()) {
924 showSpines = false;
925 }
926
927 // We skip any nodes that are not within start and end.
928 int n = this.getNumber();
929 if ((start != -1 && end != -1) && (n < start || n > end)) {
930 return;
931 }
932
933 if (showSpines) {
934
935 // Make sure that foot nodes appear on the side of the tree on which they should.
936 if (this.isAuxiliary()) {
937 String anchorID = this.getAnchor().getGraphvizNodeID();
938 String footID = this.getFoot().getGraphvizNodeID();
939 b.write("subgraph { rank = same; edge [style=invis] ");
940 if (this.attachesFromLeft()) {
941 b.write(anchorID + " -> " + footID);
942 } else if (this.attachesFromRight()) {
943 b.write(footID + " -> " + anchorID);
944 }
945 b.write("; }");
946 b.newLine();
947 }
948
949 // Write out the spine of this elementary tree.
950 this.writeSpineToGraphviz(b, this.getSpine(), true, start, end, includeSpans, beanPoleStyle);
951 }
952
953 // Write out the node that represents the current terminal.
954 // This node has a different shape (a rounded box).
955 b.write(this.getGraphvizNodeID()
956 + "[label=\" #"
957 + Integer.toString(this.getNumber()));
958 // If we are in bidirectional parser output, we never show the spine because
959 // it is not known, but we include the part of speech in the terminal node instead
960 // because otherwise it would not show up.
961 if (this.isBidirectionalParserOutput()) {
962 b.write("\\n"
963 + this.getPOS());
964 }
965 b.write("\\n"
966 + this.getTerminal()
967 + "\" style=rounded shape=box];");
968 b.newLine();
969
970 // Create edges to attached ElemTrees
971 // and recursively graphviz them.
972 Iterator theAttachments = this.getAttachments().iterator();
973 while (theAttachments.hasNext()) {
974 TAGAttachment current = (TAGAttachment) theAttachments.next();
975
976 // skip any attached ElemTrees that are not in the span
977 int position = current.getElemTree().getNumber();
978 if ((start != -1 && end != -1) && (position < start || position > end)) {
979 continue;
980 }
981
982 // create links between ElemTrees
983
984 String constraint;
985 if (beanPoleStyle) {
986 constraint = "false";
987 } else {
988 constraint = "true";
989 }
990
991 String source = null;
992 String target = null;
993
994 if (showSpines) {
995 source = current.getAttachmentSiteOnParent().getGraphvizNodeID();
996 target = current.getChild().getSpine().getGraphvizNodeID(); // root of child's spine
997 } else {
998 source = current.getParent().getGraphvizNodeID();
999 target = current.getChild().getGraphvizNodeID();
1000 }
1001
1002 String style;
1003 if (showSpines) {
1004 style = "dotted";
1005 } else {
1006 style = "solid";
1007 }
1008 b.write(source
1009 + " -> "
1010 + target
1011 + "[style=" + style
1012 + " constraint=" +constraint + "];");
1013 b.newLine();
1014
1015 // recursive call
1016 current.getElemTree().writeGraphvizTo(b, start, end, includeSpans, beanPoleStyle, showSpines);
1017 b.newLine();
1018 }
1019
1020 }
1021
1022 /**
1023 * Returns true iff this <code>ElemTree</code> dominates the other tree
1024 * in the sentence (this includes
1025 * the case in which <code>this.equals(other)</code>).
1026 * @param other the other tree
1027 * @return a boolean value
1028 */
1029 public boolean dominates(ElemTree other) {
1030
1031 if (this.equals(other)) {
1032 return true;
1033 }
1034 if (this.isParentOf(other)) {
1035 return true;
1036 }
1037 return (!other.isRoot()) && this.dominates(other.getParent());
1038 }
1039
1040 /**
1041 * Returns true iff this <code>ElemTree</code> is the direct parent of
1042 * the other tree in the derivation tree.
1043 * @param other the other tree
1044 * @return a boolean value
1045 */
1046 public boolean isParentOf(ElemTree other) {
1047 Iterator iter = this.getChildren();
1048 boolean result = false;
1049 while (iter.hasNext()) {
1050 result = ((ElemTree) iter.next()).equals(other);
1051 }
1052 return result;
1053 }
1054
1055 /**
1056 * Returns true if this <code>ElemTree</code> has been read in from the
1057 * format used in the output of Shen's bidirectional parser. If this is the case,
1058 * no information about the spine is present.
1059 * @return a boolean value
1060 */
1061 public boolean isBidirectionalParserOutput() {
1062 return this.bidirectionalParserOutput;
1063 }
1064
1065 /**
1066 * Internal class representing the attachment of another
1067 * <code>ElemTree</code> to this one.
1068 */
1069 private static class TAGAttachment implements Serializable {
1070
1071 private int type = -1;
1072 private int nodeNumber = -1;
1073 private GornAddress gornAddress;
1074 private int slot = ElemTree.UNKNOWN;
1075 private int order = -1;
1076 private ElemTree child;
1077 private ElemTree parent;
1078 private boolean completed;
1079
1080 /**
1081 * true indicates a format as in the LTAG-spinal treebank or in
1082 * the Shen incremental parser output, where slots and numbers are
1083 * specified. False indicates a format as in the Shen bidirectional parser
1084 * output, where only the node id of the child is specified.
1085 */
1086 private boolean locationKnown = true;
1087
1088 private void checkCompletion() {
1089 if (!completed) {
1090 throw new UncompletedElemTreeException();
1091 }
1092 }
1093
1094 /**
1095 * Note: The parent is the current tree (this). The child is the tree
1096 * designated by the attachmentToParent.
1097 */
1098 public TAGAttachment(ElemTree parent, String representation)
1099 throws ElemTreeFormatException {
1100 this.parent = parent;
1101 completed = false;
1102
1103 loadFromStringRepresentation(representation);
1104 }
1105
1106
1107
1108 public void complete() {
1109 child = getParent().containingSentence.getElemTree(this.getNodeNumber());
1110 child.parent=(getParent());
1111 child.attachmentToParent = this;
1112 child.slot=this.slot;
1113
1114 // In the LTAG-spinal treebank format, and in the output format of Shen's
1115 // incremental parser, the information about attachment
1116 // and adjunction
1117 // is recorded redundantly in the spine line (via letter "a" vs. "b") and
1118 // in the attachment line (via "att" vs. "adj").
1119
1120 // By contrast, in the output format of Shen's bidirectional parser,
1121 // spine lines are not included and so
1122 // we propagate att/adj information from the attachment to the
1123 // elementary tree beneath it. (We know
1124 // which nodes are coordination nodes because they are marked with an
1125 // ampersand (&).)
1126
1127 if (child.isBidirectionalParserOutput()) {
1128 if (this.getType() == ElemTree.ATTACH) {
1129 child.type = ElemTree.INITIAL;
1130 } else if (this.getType() == ElemTree.ADJOIN) {
1131 child.type = ElemTree.AUXILIARY;
1132 }
1133 }
1134
1135 completed = true;
1136 }
1137
1138 void loadFromStringRepresentation(String rep)
1139 throws ElemTreeFormatException {
1140
1141
1142 // Format is:
1143 // att #24, on 0, slot 1, order 2
1144 // or
1145 // crd #6, on 0.0
1146 // or (in the output of Libin's bidirectional parser)
1147 // att #24
1148
1149 rep = rep.trim();
1150 String[] parts = rep.split("\\s+");
1151
1152 if ((parts.length != 8) && (parts.length != 4) && (parts.length != 2)) {
1153 throw new ElemTreeFormatException(
1154 "Malformed attachment representation: " + rep);
1155 }
1156
1157 // remove trailing commas
1158 for (int i = 0; i < parts.length; i++) {
1159 if ((parts[i].length() > 0) &&
1160 parts[i].charAt(parts[i].length() - 1) == ',') {
1161 parts[i] = parts[i].substring(0, parts[i].length() - 1);
1162 }
1163 }
1164
1165 String typeString = parts[0];
1166
1167 if (typeString.equals("att")) {
1168 type = ATTACH;
1169 } else if (typeString.equals("adj")) {
1170 type = ADJOIN;
1171 } else if (typeString.equals("crd")) {
1172 type = CONJUNCT;
1173 } else if (typeString.equals("con")) {
1174 type = CONJUNCT_OR_CONNECTIVE;
1175
1176 } else {
1177 throw new ElemTreeFormatException("Unknown attachment type " + typeString
1178 + " at " + parent.getSentence().prettyPrintLocation() + ".");
1179 }
1180
1181 try {
1182 this.nodeNumber = Integer.parseInt(parts[1].substring(1));
1183 } catch (NumberFormatException e) {
1184 String partsMessage = "";
1185
1186 for (int i = 0; i < parts.length; i++) {
1187 partsMessage += i + "=" + parts[i] + "\n";
1188 }
1189
1190 throw new ElemTreeFormatException("Invalid node number " +
1191 parts[1].substring(1) +
1192 ". Attachment representation was " + rep + ". Parts array was " + partsMessage);
1193 }
1194
1195 if (parts.length == 2) { // bidirectional parser output
1196 locationKnown = false;
1197 return;
1198 }
1199
1200 this.gornAddress = new GornAddress(parts[3]);
1201
1202 if (parts.length == 8) { // LTAG-spinal and incremental parser output
1203 int theSlot = Integer.parseInt(parts[5]);
1204 if (theSlot == 0) {
1205 this.slot = ElemTree.LEFT;
1206 } else if (theSlot == 1) {
1207 this.slot = ElemTree.RIGHT;
1208 }
1209 this.order = Integer.parseInt(parts[7]);
1210 }
1211 }
1212
1213 /**
1214 * Returns a string representation of this <code>ElemTree</code> in LTAG-spinal format.
1215 *
1216 * @return a string representing this elementary tree
1217 */
1218 public String toString() {
1219 checkCompletion();
1220 StringBuffer s = new StringBuffer(20);
1221 switch (this.getType()) {
1222 case ATTACH:
1223 s.append(" att");
1224 break;
1225 case ADJOIN:
1226 s.append(" adj");
1227 break;
1228 case CONJUNCT:
1229 s.append(" crd");
1230 break;
1231 case CONJUNCT_OR_CONNECTIVE:
1232 s.append(" con");
1233 break;
1234 }
1235 s.append(" #");
1236 s.append(Integer.toString(this.getNodeNumber()));
1237 if (locationKnown) {
1238 s.append(", on ");
1239 s.append(this.getGornAddress().toString());
1240 if (this.getSlot() != -1 && this.getOrder() != -1) {
1241 assert this.getSlot() != -1;
1242 assert this.getOrder() != -1;
1243 s.append(", slot ");
1244 s.append(Integer.toString(this.getSlot()));
1245 s.append(", order ");
1246 s.append(Integer.toString(this.getOrder()));
1247 }
1248 }
1249 return s.toString();
1250 }
1251
1252
1253 /**
1254 * Returns the ElemTree closer to the root of the sentence
1255 * (the one to which is attached).
1256 */
1257 public ElemTree getParent() {
1258 return parent;
1259 }
1260
1261 /**
1262 * Returns the ElemTree closer to the fringe of the sentence
1263 * (the one which attaches to something).
1264 */
1265 public ElemTree getChild() {
1266 return child;
1267 }
1268
1269 /**
1270 * Returns the precise place at which the current attachment
1271 * is linked to its parent (a node in the spine of the parent designated
1272 * by the GornAddress of this attachment). Return null if unknown
1273 * (as in the output of Shen's bidirectional parser).
1274 */
1275 public SpinalNode getAttachmentSiteOnParent() {
1276 if (!locationKnown) return null;
1277 return this.getParent().getSpinalNodeAt(this.getGornAddress());
1278 }
1279
1280
1281 /**
1282 * A synonym of getChild().
1283 */
1284 public ElemTree getElemTree() {
1285 return getChild();
1286 }
1287
1288 public int getType() {
1289 return type;
1290 }
1291
1292 public int getNodeNumber() {
1293 // always known, even if !locationKnown
1294 return nodeNumber;
1295 }
1296
1297 public int getSlot() {
1298 if (!locationKnown) return -1;
1299 return slot;
1300 }
1301
1302 public int getOrder() {
1303 if (!locationKnown) return -1;
1304 return order;
1305 }
1306
1307 public GornAddress getGornAddress() {
1308 if (!locationKnown) return null;
1309 return gornAddress;
1310 }
1311 } // end of class TAGAttachment
1312
1313 private static class TAGAttachmentToElemTreeIterator implements ListIterator {
1314
1315 ListIterator wrappedIterator;
1316
1317 public TAGAttachmentToElemTreeIterator(List attachments) {
1318 wrappedIterator = attachments.listIterator();
1319 }
1320
1321 public void add(Object o) {
1322 throw new UnsupportedOperationException("add not supported.");
1323 }
1324
1325 public void remove() {
1326 throw new UnsupportedOperationException("remove not supported.");
1327 }
1328
1329 public void set(Object o) {
1330 throw new UnsupportedOperationException("set not supported");
1331 }
1332
1333 public boolean hasPrevious() {
1334 return wrappedIterator.hasPrevious();
1335 }
1336
1337 public boolean hasNext() {
1338 return wrappedIterator.hasNext();
1339 }
1340
1341 public int nextIndex() {
1342 return wrappedIterator.nextIndex();
1343 }
1344
1345 public int previousIndex() {
1346 return wrappedIterator.previousIndex();
1347 }
1348
1349 //returns ElemTree
1350 public Object next() {
1351 TAGAttachment nextOne = (TAGAttachment) wrappedIterator.next();
1352
1353 return nextOne.getChild();
1354 }
1355
1356 //returns ElemTree
1357 public Object previous() {
1358 TAGAttachment previousOne = (TAGAttachment) wrappedIterator.previous();
1359
1360 return previousOne.getChild();
1361 }
1362 }
1363
1364 static class TAGAttachmentToWordSpanIterator implements ListIterator {
1365
1366 ListIterator wrappedIterator;
1367
1368 public TAGAttachmentToWordSpanIterator(List attachments) {
1369 wrappedIterator = attachments.listIterator();
1370 }
1371
1372 public void add(Object o) {
1373 throw new UnsupportedOperationException("add not supported.");
1374 }
1375
1376 public void remove() {
1377 throw new UnsupportedOperationException("remove not supported.");
1378 }
1379
1380 public void set(Object o) {
1381 throw new UnsupportedOperationException("set not supported");
1382 }
1383
1384 public boolean hasPrevious() {
1385 return wrappedIterator.hasPrevious();
1386 }
1387
1388 public boolean hasNext() {
1389 return wrappedIterator.hasNext();
1390 }
1391
1392 public int nextIndex() {
1393 return wrappedIterator.nextIndex();
1394 }
1395
1396 public int previousIndex() {
1397 return wrappedIterator.previousIndex();
1398 }
1399
1400 //returns WordSpan
1401 public Object next() {
1402 TAGAttachment nextOne = (TAGAttachment) wrappedIterator.next();
1403
1404 return nextOne.getChild().getSpan();
1405 }
1406
1407 //returns WordSpan
1408 public Object previous() {
1409 TAGAttachment previousOne = (TAGAttachment) wrappedIterator.previous();
1410
1411 return previousOne.getChild().getSpan();
1412 }
1413 }
1414 }