001 /** 002 * LTAG-spinal API, an interface to the treebank format introduced by Libin Shen. 003 * Copyright (C) 2007 Lucas Champollion 004 * 005 * This program is free software: you can redistribute it and/or modify 006 * it under the terms of the GNU General Public License as published by 007 * the Free Software Foundation, either version 3 of the License, or 008 * (at your option) any later version. 009 * 010 * This program is distributed in the hope that it will be useful, 011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 * GNU General Public License for more details. 014 * 015 * You should have received a copy of the GNU General Public License 016 * along with this program. If not, see <http://www.gnu.org/licenses/>. 017 * 018 */ 019 package edu.upenn.cis.spinal; 020 021 import java.io.*; 022 import java.util.*; 023 import java.util.regex.Matcher; 024 import java.util.regex.Pattern; 025 import edu.upenn.cis.propbank_shen.*; 026 027 /** 028 * Represents a "spinal" elementary tree in Libin's LTAG 029 * treebank. A typical elementary tree (taken from p. 73 030 * of his thesis) looks like this: 031 * 032 * <code> 033 * #3 failed 034 * spine: a_( S ( VP VBD^ ) ) 035 * att #0, on 0, slot 0, order 0 036 * att #2, on 0, slot 0, order 1 037 * att #6, on 0.0, slot 1, order 0 038 * </code> 039 * 040 * or like this (a structure for predicate coordination): 041 * 042 * <code> 043 * &20 044 * spine: c_( S S S ) 045 * crd #3, on 0.0 046 * att #9, on 0, slot 1, order 0 047 * crd #10, on 0.1 048 * </code> 049 * 050 * @author Lucas Champollion 051 * @author Ryan Gabbard 052 */ 053 public class ElemTree implements Serializable { 054 055 /** 056 * Designates a field (tree, slot, etc.) of unknown type. 057 */ 058 public static final int UNKNOWN = -1; 059 060 /** 061 * Designates an initial tree. 062 */ 063 public static final int INITIAL = 0; 064 065 /** 066 * Designates an auxiliary tree. 067 */ 068 public static final int AUXILIARY = 1; 069 070 /** 071 * Designates a coordination tree -- these trees are special constructs and not anchored in a lexical item. 072 */ 073 public static final int COORD = 2; 074 075 /** 076 * A return value designating a special case when the current tree that has no parent. 077 */ 078 public static final int ROOT = 3; 079 080 /** 081 * Designates a tree combined with its parent by an attachment operation, 082 * indicated by the keyword "att". 083 */ 084 public static final int ATTACH = 4; 085 086 /** 087 * Designates a tree combined with its parent by an adjunction operation, 088 * indicated by the keyword "adj". 089 */ 090 public static final int ADJOIN = 5; 091 092 /** 093 * Designates a tree combined with its parent by an attachment operation, 094 * indicated by the keyword "crd" in the output of the incremental parser 095 * and in the LTAG-spinal treebank. 096 */ 097 public static final int CONJUNCT = 6; 098 099 /** 100 * Designates a tree combined with its parent by a conjunction or an attachment operation, 101 * indicated by the keyword "con" in the output of the incremental parser 102 * and in the LTAG-spinal treebank. 103 * 104 * In the treebank and in the output of the incremental parser, "crd" 105 * is used for conjuncts and "att" is used for 106 * connectives. In the output of the bidirectional parser, we don't distinguish 107 * conjuncts from connectives, in order to reduce an operation in parsing, so "con" is used 108 * to represent both. 109 */ 110 public static final int CONJUNCT_OR_CONNECTIVE = 7; 111 112 /** 113 * Designates a tree that attaches/adjoins on the left of its parent. 114 */ 115 public static final int LEFT = 8; 116 117 /** 118 * Designates a tree that attaches/adjoins on the right of its parent. 119 */ 120 public static final int RIGHT = 9; 121 122 private static final int ATT_TYPE_OFFSET = 2; 123 private static final String newline = System.getProperty("line.separator"); 124 private Sentence containingSentence; 125 int number = -1; 126 String terminal = ""; 127 private int type = UNKNOWN; 128 private SpinalNode rootOfSpine = null; 129 130 /** 131 * Indicates that the input only specified the part of speech and not the 132 * spine, as is the case for the output of Shen's bidirectional parser. 133 */ 134 private boolean bidirectionalParserOutput = false; 135 private String pos = null; 136 private ArrayList attachments = new ArrayList(); 137 /** 138 * Null iff this tree is the root of the derivation tree. 139 */ 140 private ElemTree parent; 141 /** 142 * the attachment by which this <code>ElemTree</code> 143 * attaches to its parent; null iff this is root of derivation tree 144 */ 145 private TAGAttachment attachmentToParent = null; 146 /** 147 * the yield of the subtree rooted by this ElemTree 148 */ 149 private WordSpan span; 150 151 /** 152 * Represents to which side this tree attaches to its parent. 153 */ 154 private int slot = UNKNOWN; 155 156 private boolean completed = false; 157 private SpinalNode anchor; 158 private SpinalNode foot; 159 160 private static final Pattern posPattern = Pattern.compile("\\s*pos: (.*)"); 161 private static final Pattern spinePattern = Pattern.compile("\\s*spine: (.*)"); 162 private static final Pattern coordPattern = Pattern.compile("\\s*coord\\s*"); 163 164 165 /** 166 * Creates an <code>ElemTree</code> from a string representation and attaches it 167 * into the provided derivation tree (Sentence). We assume that the nodes are 168 * numbered in ascending order of linear precedence of their anchors, as 169 * is the case in the LTAG-spinal treebank. 170 * @param container the <code>Sentence</code> to which this 171 * <code>ElemTree</code> is to be added 172 * @param representation the string that is to be parsed into an 173 * <code>ElemTree</code> 174 * @throws edu.upenn.cis.spinal.ElemTreeFormatException if the 175 * string representation is not well-formed 176 */ 177 public ElemTree(Sentence container, String representation) 178 throws ElemTreeFormatException { 179 attachments = new ArrayList(); 180 containingSentence = container; 181 completed = false; 182 parent = null; 183 span = null; 184 185 loadFromStringRepresentation(representation); 186 } 187 188 /** 189 * Returns true if this <code>ElemTree</code> attaches or adjoins from the 190 * left of its parent. 191 * @return a boolean value 192 * @see #getSlot() 193 */ 194 public boolean attachesFromLeft() { 195 return this.getSlot() == LEFT; 196 } 197 198 /** 199 * Returns true if this <code>ElemTree</code> attaches or adjoins from the 200 * right of its parent. 201 * @return a boolean value 202 * @see #getSlot() 203 */ 204 public boolean attachesFromRight() { 205 return this.getSlot() == RIGHT; 206 } 207 208 /** 209 * Returns the node representing the lexical anchor of this tree, or null 210 * if there is no anchor (in the case of a coordination structure). 211 * @return a <code>SpinalNode</code> element representing the lexical anchor 212 */ 213 public SpinalNode getAnchor() { 214 215 Iterator iter = this.getSpine().getAllNodes().iterator(); 216 SpinalNode current; 217 while (iter.hasNext()) { 218 current = (SpinalNode) iter.next(); 219 if (current.isAnchor()) { 220 this.anchor = current; 221 return current; 222 } 223 } 224 this.anchor = null; 225 return null; 226 } 227 228 /** 229 * Returns the <code>SpinalNode</code> in the parent tree to which this <code>ElemTree</code> 230 * is attached, or null if this <code>ElemTree</code> is the root of the derivation tree. 231 * @return the parent's attachment site 232 */ 233 public SpinalNode getAttachmentSite() { 234 checkCompletion(); 235 if (this.isRoot()) { 236 return null; 237 } 238 return this.getParent().getSpinalNodeAt(this.getAttachmentToParent().getGornAddress()); 239 } 240 241 TAGAttachment getAttachmentToParent() { 242 243 return attachmentToParent; 244 } 245 246 /** 247 * Returns the type of the attachment of this elementary tree to its parent tree, one of 248 * {@link #ATTACH}, {@link #ADJOIN}, {@link #CONJUNCT} (only used 249 * in the LTAG-spinal treebank and in the output of the incremental parser), 250 * and {@link #CONJUNCT_OR_CONNECTIVE} (only used in the output of the 251 * bidirectional parser). If this elementary tree is the root of the sentence, 252 * the special value {@link #ROOT} is returned. 253 * @return the type of attachment of this <code>ElemTree</code> 254 */ 255 public int getAttachmentType() { 256 if (this.isRoot()) { 257 return ROOT; 258 } 259 return attachmentToParent.getType(); 260 } 261 262 /** 263 * Internal method, returns <code>TAGAttachment</code> list for this tree. 264 * @return a list of attachments 265 */ 266 private List getAttachments() { 267 return attachments; 268 } 269 270 /** 271 * Returns the <code>ElemTree</code>s that attach to this <code>ElemTree</code>. 272 * @return the children of this node in the derivation tree 273 */ 274 public Iterator getChildren() { 275 checkCompletion(); 276 277 return new TAGAttachmentToElemTreeIterator(getAttachments()); 278 } 279 280 /** 281 * Returns the spans of the <code>ElemTree</code>s that attach to this 282 * <code>ElemTree</code>. 283 * @return an iterator over (@link WordSpan} objects 284 */ 285 public Iterator getChildrenSpans() { 286 checkCompletion(); 287 288 return new TAGAttachmentToWordSpanIterator(getAttachments()); 289 } 290 291 /** 292 * Returns a <code>List</code> of the elementary trees dominated by this 293 * elementary tree, 294 * including this tree itself. 295 * @return a list over <code>ElemTree</code> objects 296 */ 297 public List getDominatedElemTrees() { 298 // TODO check if this, called on root, gives the same results as 299 // this.getSentence().getElemTrees(this.getSpan()); 300 Iterator candidates = this.getSentence().getElemTrees().iterator(); 301 List result = new Vector(); 302 while (candidates.hasNext()) { 303 ElemTree current = (ElemTree) candidates.next(); 304 if (this.dominates(current)) { 305 result.add(current); 306 } 307 } 308 return result; 309 } 310 311 /** 312 * Returns a <code>List</code> of the terminals attached to the elementary trees 313 * dominated by this elementary tree, 314 * including this tree itself. 315 * @return a list of terminals (<code>String</code> objects) 316 */ 317 public List getDominatedTerminals() { 318 Iterator iter = this.getDominatedElemTrees().iterator(); 319 List l = new Vector(); 320 while (iter.hasNext()) { 321 ElemTree current = (ElemTree) iter.next(); 322 l.add(current.getTerminal()); 323 } 324 return l; 325 } 326 327 /** 328 * Returns the number of the file that contains the sentence to which 329 * this <code>ElemTree</code> belongs, or -1 if there is no such number. 330 * @return an <code>int</code> 331 */ 332 public int getFileNumber() { 333 return this.getSentence().getFileNumber(); 334 } 335 336 /** 337 * Returns the node representing the foot node (if any) of this tree, 338 * or null if there is no foot node. (Only auxiliary trees have foot nodes.) 339 * @return a <code>SpinalNode</code> 340 */ 341 public SpinalNode getFoot() { 342 Iterator iter = this.getSpine().getAllNodes().iterator(); 343 SpinalNode current; 344 while (iter.hasNext()) { 345 current = (SpinalNode) iter.next(); 346 if (current.isFoot()) { 347 this.foot = current; 348 return current; 349 } 350 } 351 this.foot = null; 352 return null; 353 } 354 355 /** 356 * Method used internally to create the Graphviz representation of this tree. 357 */ 358 String getGraphvizNodeID() { 359 360 return "node_" + this.getNumber(); 361 } 362 363 /** 364 * Returns the number of this tree, corresponding to the position in the sentence. 365 * @return the number, or -1 if unknown 366 */ 367 public int getNumber() { 368 checkCompletion(); 369 return number; 370 } 371 372 373 374 /** 375 * Returns the location of the predicate-argument structure 376 * corresponding to this elementary tree in the Propbank, or null 377 * if this tree has no section and file numbers. This will be a string 378 * of the following shape: 379 * <code>wsj/<section>/wsj_<section><file>.mrg</code>. 380 * @return a string that indicates the location of this word in the Propbank 381 */ 382 public PASLoc getPASLoc() { 383 384 385 if (getSectionNumber() == -1 || getFileNumber() == -1) { 386 return null; 387 } 388 // these both return -1 if no number is available. 389 390 String section = Integer.toString(getSectionNumber()); 391 String file = Integer.toString(getFileNumber()); 392 393 394 if (section.length() == 1) { 395 section = "0" + section; 396 } 397 if (file.length() == 1) { 398 file = "0" + file; 399 } 400 401 String p = "wsj/" + section + "/wsj_" + section + file + ".mrg"; 402 403 return new PASLoc(p, getSentenceNumber(), getNumber()); 404 } 405 406 /** 407 * Returns the part of speech of this elementary tree, or "NA" in the case of a coordination structure 408 * (where part of speech is not really applicable since Coord nodes aren't lexicalized). 409 * @return the POS tag of the current word 410 */ 411 public String getPOS() { 412 if (this.pos != null) return this.pos; 413 414 SpinalNode theAnchor = this.getAnchor(); 415 if (theAnchor == null) { 416 assert this.isCoord(); 417 return "NA"; 418 } else { 419 return theAnchor.getLabel(); 420 } 421 } 422 423 /** 424 * Returns the <code>ElemTree</code> to which this <code>ElemTree</code> attaches, 425 * or null iff this is the root. 426 * @return the parent of this node in the derivation tree 427 */ 428 public ElemTree getParent() { 429 checkCompletion(); 430 return parent; 431 } 432 433 434 /** 435 * Returns the number of the section that contains the sentence to which 436 * this <code>ElemTree</code> belongs, or -1 if there is no such number. 437 * @return an <code>int</code> 438 */ 439 public int getSectionNumber() { 440 return this.getSentence().getSectionNumber(); 441 } 442 443 /** 444 * Return the <code>Sentence</code> to which this <code>ElemTree</code> 445 * belongs. 446 * @return a <code>Sentence</code> object 447 */ 448 public Sentence getSentence() { 449 //checkCompletion(); 450 return containingSentence; 451 } 452 453 /** 454 * Returns the number of the the sentence to which 455 * this <code>ElemTree</code> belongs, or -1 if there is no such number. 456 * @return an <code>int</code> 457 */ 458 public int getSentenceNumber() { 459 return this.getSentence().getSentenceNumber(); 460 } 461 462 /** 463 * Returns whether this elementary tree attaches to the {@link #LEFT} or {@link #RIGHT} 464 * side of its parent. Returns {@link #ROOT} if this tree is the root of the 465 * sentence and {@link #UNKNOWN} if the value is not known. 466 * 467 * @return one of the given values 468 * @see #attachesFromLeft() 469 * @see #attachesFromRight() 470 */ 471 public int getSlot() { 472 if (this.isRoot()) return ROOT; 473 474 return this.slot; 475 } 476 477 /** 478 * Returns the part of the sentence that is spanned by the yield of this 479 * elementary tree and its descendents. If the span is discontinuous, the 480 * left- and rightmost <code>ElemTree</code> locations are given. 481 * @return a <code>WordSpan</code> object 482 */ 483 public WordSpan getSpan() { 484 checkCompletion(); 485 if (span == null) { 486 computeSpan(); 487 } 488 return span; 489 } 490 491 /** 492 * Returns the spinal node at a given Gorn address in this tree. 493 * @param g a {@link GornAddress} 494 * @return a {@link SpinalNode} 495 */ 496 public SpinalNode getSpinalNodeAt(GornAddress g) { 497 checkCompletion(); 498 Iterator parts = g.iterator(); 499 SpinalNode currentNode = this.getSpine(); // current node is root of spine 500 int currentChild = ((Integer) parts.next()).intValue(); 501 if (currentChild != 0) { 502 throw new RuntimeException("Bad GornAddress: "+ g.toString()); 503 } 504 while (parts.hasNext()) { 505 // TODO raise exception if we're given a bad address 506 currentNode = currentNode.getChild(((Integer) parts.next()).intValue()); 507 } 508 return currentNode; 509 } 510 511 /** 512 * Returns the spine of this tree. 513 * @return a {@link SpinalNode} representing the root of the spine 514 */ 515 public SpinalNode getSpine() { 516 checkCompletion(); 517 return rootOfSpine; 518 } 519 520 /** 521 * Returns the yield of the subtree rooted in this elementary tree, with 522 * terminals separated by a single white space. Empty elements are included. 523 * @return a substring corresponding to the current <code>ElemTree</code> 524 * and all its descendents. 525 */ 526 public String getSurfaceString() { 527 String result = ""; 528 Iterator iter = this.getDominatedTerminals().iterator(); 529 while (iter.hasNext()) { 530 if (!result.equals("")) { 531 result += " "; 532 } 533 result += (String) iter.next(); 534 } 535 return result; 536 } 537 538 /** 539 * Returns the actual terminal string (word in most cases) represented by this <code>ElemTree</code>. 540 * @return the terminal string at the fringe of this elementary tree 541 */ 542 public String getTerminal() { 543 checkCompletion(); 544 return terminal; 545 } 546 547 /** 548 * Returns the type of this elementary tree, one of 549 * {@link #UNKNOWN}, {@link #INITIAL}, {@link #AUXILIARY}, and {@link #COORD}. 550 * @return the type of this <code>ElemTree</code> 551 */ 552 public int getType() { 553 checkCompletion(); 554 return type; 555 } 556 557 /** 558 * Returns a string representing the type of this elementary tree, one of 559 * <code>initial</code>, <code>auxiliary</code>, <code>coordination</code>, 560 * and <code>unknown</code>. 561 * @return the type of this <code>ElemTree</code> in string representation 562 */ 563 public String getTypeAsString() { 564 checkCompletion(); 565 switch (this.type) { 566 case INITIAL: 567 return "initial"; 568 case AUXILIARY: 569 return "auxiliary"; 570 case COORD: 571 return "coordination"; 572 case UNKNOWN: 573 return "unknown"; 574 default: 575 assert false; 576 return "unknown"; 577 } 578 } 579 580 /** 581 * Returns true iff this tree is of type "coordination". 582 * @return a boolean value 583 */ 584 public boolean isCoord() { 585 return this.type == COORD; 586 } 587 588 /** 589 * Returns true iff this tree is of type "auxiliary". 590 * @return a boolean value 591 */ 592 public boolean isAuxiliary() { 593 return this.type == AUXILIARY; 594 } 595 596 /** 597 * Returns true iff this tree is of type "initial". 598 * @return a boolean value 599 */ 600 public boolean isInitial() { 601 return this.type == INITIAL; 602 } 603 604 /** 605 * Returns true iff this tree is of unknown type. 606 * @return a boolean value 607 */ 608 public boolean isOfUnknownType() { 609 return this.type == UNKNOWN; 610 } 611 612 613 /** 614 * Returns if the anchor (if there is one) is an empty element 615 * by checking if the terminal has an unescaped asterisk. Returns false 616 * if this is a coordination node, in which case there is no anchor anyway. 617 * @return a boolean value 618 */ 619 public boolean isEmptyElement() { 620 if (this.isCoord()) { 621 return false; 622 } 623 String t = this.getTerminal(); 624 return (t.indexOf("*") != -1 && t.indexOf("\\*") == -1); 625 } 626 627 /** 628 * Returns true iff this elementary tree has no parent. 629 * @return a boolean value 630 */ 631 public boolean isRoot() { 632 return this.getParent() == null; 633 } 634 635 /** 636 * Returns true iff this <code>ElemTree</code> has other 637 * <code>ElemTree</code>s attached to it. 638 * @return a boolean value 639 */ 640 public boolean hasChildren() { 641 return this.getChildren().hasNext(); 642 } 643 644 /** 645 * Method used internally. 646 */ 647 void computeSpan() { 648 int position = this.getNumber(); 649 WordSpan lexicalSpan = new WordSpan(position, position); 650 651 // Coordination trees need special treatment: Their "lexical span" 652 // refers to a dummy node with no lexical content, so we must ignore 653 // that span. 654 655 if (this.getType() == COORD) { 656 this.span = WordSpan.merge(this.getChildrenSpans()); 657 } else if (this.hasChildren()) { 658 this.span = WordSpan.combine(lexicalSpan, WordSpan.merge(this.getChildrenSpans())); 659 } else { // this has no children 660 this.span = lexicalSpan; 661 } 662 } 663 664 /** 665 * Method used internally while building up a derivation tree. 666 */ 667 void complete() { 668 669 // tells each child that this is its parent 670 for (Iterator it = getAttachments().iterator(); it.hasNext();) { 671 TAGAttachment attach = (TAGAttachment) it.next(); 672 673 attach.complete(); 674 } 675 676 completed = true; 677 } 678 679 /** 680 * Method used internally while building up a derivation tree. 681 */ 682 private void checkCompletion() { 683 if (!completed) { 684 throw new UncompletedElemTreeException(); 685 } 686 } 687 688 /** 689 * Method used internally and called from the constructor to parse 690 * a string representation into an ElemTree object. 691 */ 692 void loadFromStringRepresentation(String rep) 693 throws ElemTreeFormatException { 694 String[] lines = rep.split("\\n"); 695 696 // first line has format: 697 // #4 will 698 // or: 699 // &25 700 // The latter case is coordination. 701 // Actually we have already stripped off the first character (& or #). 702 703 String[] firstLinePieces = lines[0].split("\\s+"); 704 705 if (firstLinePieces.length < 1) { 706 throw new ElemTreeFormatException("First line empty."); 707 } 708 709 710 if (firstLinePieces.length == 1) { // coordination doesn't come with a word 711 this.type=COORD; 712 } 713 714 this.number = Integer.parseInt(firstLinePieces[0]); 715 716 terminal = ""; 717 718 for (int i = 1; i < firstLinePieces.length; i++) { 719 terminal += firstLinePieces[i]; 720 } 721 722 // second line has format: 723 // spine: a_( XP NN^ ) // spinePattern 724 // or 725 // coord // coordPattern 726 // or (for bidirectional parser output) 727 // pos: NN // posPattern 728 729 String secondLine = lines[1]; 730 731 Matcher coordMatcher = coordPattern.matcher(secondLine); 732 Matcher posMatcher = posPattern.matcher(secondLine); 733 Matcher spineMatcher = spinePattern.matcher(secondLine); 734 735 if (coordMatcher.matches()) { 736 rootOfSpine = null; 737 type = COORD; 738 } else if (posMatcher.matches()) { 739 rootOfSpine = null; 740 pos = posMatcher.group(1); 741 bidirectionalParserOutput = true; 742 } else if (spineMatcher.matches()) { 743 try { 744 secondLine = spineMatcher.group(1); 745 } catch (StringIndexOutOfBoundsException e) { 746 System.out.println(secondLine); 747 throw e; 748 } 749 750 char typeChar = secondLine.charAt(0); 751 752 switch (typeChar) { 753 case 'a': 754 type = INITIAL; 755 break; 756 case 'b': 757 type = AUXILIARY; 758 break; 759 case 'c': 760 type = COORD; 761 break; 762 } 763 764 // parse the spine 765 rootOfSpine = new SpinalNode(secondLine.substring(ATT_TYPE_OFFSET), this); 766 } // end treatment of spine 767 768 769 // read attachmentToParent lines 770 for (int i = 2; i < lines.length; i++) { 771 getAttachments().add(new TAGAttachment(this, lines[i])); 772 } 773 } 774 775 /** 776 * Converts this tree to a canonical string representation of the kind used in Libin 777 * Shen's thesis. 778 * @return a string representation 779 */ 780 public String toString() { 781 782 checkCompletion(); 783 784 StringBuffer s = new StringBuffer(); 785 // "#20 remained" (hash, number, terminal: this is the normal case) 786 // "&20" (ampersand, number, no terminal: the format for coordination) 787 String nodeno = Integer.toString(this.getNumber()); 788 789 if (this.getType() == COORD) { 790 s.append("&"); 791 s.append(nodeno); 792 } else { 793 s.append("#"); 794 s.append(nodeno); 795 s.append(" "); 796 s.append(this.getTerminal()); 797 } 798 s.append(newline); 799 800 if (this.isBidirectionalParserOutput()) { 801 // no line describing the spine, instead just a line like: 802 // pos: DT 803 // This is typical of the output of Shen's bidirectional parser. 804 s.append(" pos: "); 805 s.append(this.getPOS()); 806 s.append(newline); 807 } else if (this.getSpine() != null) { 808 // Format of the line describing the spine: 809 // spine: b_( VP VB^ VP* ) 810 s.append(" spine: "); 811 switch (this.getType()) { 812 case INITIAL: 813 s.append("a"); 814 break; 815 case AUXILIARY: 816 s.append("b"); 817 break; 818 case COORD: 819 s.append("c"); 820 break; 821 } 822 823 s.append("_"); 824 s.append(this.getSpine().toString()); 825 s.append(newline); 826 } else if (this.getSpine() == null && this.getType() == COORD) { 827 // Mimic the output of Shen's incremental parser 828 s.append(" coord"); 829 s.append(newline); 830 } 831 832 Iterator theAttachments = this.getAttachments().iterator(); 833 834 while (theAttachments.hasNext()) { 835 TAGAttachment current = (TAGAttachment) theAttachments.next(); 836 s.append(current.toString()); 837 s.append(newline); 838 } 839 return s.toString(); 840 } 841 842 843 // TODO maybe move this code to SpinalNode? 844 private void writeSpineToGraphviz(BufferedWriter b, SpinalNode current, 845 boolean isRoot, int start, int end, boolean includeSpans, boolean beanPoleStyle) 846 throws IOException { 847 848 checkCompletion(); 849 850 int n = current.getElemTree().getNumber(); 851 852 if (start != -1 && end != -1) { 853 if (n < start || n > end) throw new IllegalArgumentException(); 854 } 855 856 // create a graphviz node for the current spinal node 857 String currentNode = current.getGraphvizNodeID(); 858 b.write(currentNode + "[label=\"" + current.getLabel()); 859 860 if (includeSpans) { 861 String thespan = this.getSpan().toString(); 862 if (!isRoot) { 863 thespan = ""; 864 } 865 b.write(" " + thespan); 866 } 867 868 if (!current.getType().equals(SpinalNode.ANCHOR)) { 869 b.write(current.getType()); 870 } 871 b.write("\" "); 872 873 // uncomment the next line to make the root of the spine stand out 874 //if (isRoot) b.write("style=bold"); 875 876 b.write("];"); 877 b.newLine(); 878 879 // create edges from the anchor SpinalNode to the actual 880 // lexical anchor, which is represented by the 881 // graphviz node ID of the ElemTree itself (i.e. this) 882 883 if (current.getType().equals(SpinalNode.ANCHOR)) { 884 b.write(currentNode 885 + " -> " 886 + this.getGraphvizNodeID() 887 + "[style=bold arrowhead=none];"); 888 b.newLine(); 889 } 890 891 // create edges from the current node to its children (if any) 892 893 // children are never drawn emphasized (only the root of the 894 // present spine is) 895 896 SpinalNode[] children = current.getChildren(); 897 898 for (int i = 0; i < children.length; i++) { 899 writeSpineToGraphviz(b, children[i], false, start, end, includeSpans, beanPoleStyle); 900 901 // add edges to children 902 b.write(currentNode 903 + " -> " 904 + children[i].getGraphvizNodeID() 905 + "[style=bold arrowhead=none];"); 906 b.newLine(); 907 } 908 909 } 910 911 /** 912 * Method used internally to convert this ElemTree and its subtrees 913 * into Graphviz format. 914 * Should only be called by Sentence class -- 915 * doesn't generate complete graphviz output by itself. 916 */ 917 void writeGraphvizTo(BufferedWriter b, int start, int end, boolean includeSpans, boolean beanPoleStyle, boolean showSpines) 918 throws IOException { 919 920 // If we are in bidirectional parser output, we never show the spine because 921 // it is not known, but we include the part of speech in the terminal node instead 922 // because otherwise it would not show up. 923 if (this.isBidirectionalParserOutput()) { 924 showSpines = false; 925 } 926 927 // We skip any nodes that are not within start and end. 928 int n = this.getNumber(); 929 if ((start != -1 && end != -1) && (n < start || n > end)) { 930 return; 931 } 932 933 if (showSpines) { 934 935 // Make sure that foot nodes appear on the side of the tree on which they should. 936 if (this.isAuxiliary()) { 937 String anchorID = this.getAnchor().getGraphvizNodeID(); 938 String footID = this.getFoot().getGraphvizNodeID(); 939 b.write("subgraph { rank = same; edge [style=invis] "); 940 if (this.attachesFromLeft()) { 941 b.write(anchorID + " -> " + footID); 942 } else if (this.attachesFromRight()) { 943 b.write(footID + " -> " + anchorID); 944 } 945 b.write("; }"); 946 b.newLine(); 947 } 948 949 // Write out the spine of this elementary tree. 950 this.writeSpineToGraphviz(b, this.getSpine(), true, start, end, includeSpans, beanPoleStyle); 951 } 952 953 // Write out the node that represents the current terminal. 954 // This node has a different shape (a rounded box). 955 b.write(this.getGraphvizNodeID() 956 + "[label=\" #" 957 + Integer.toString(this.getNumber())); 958 // If we are in bidirectional parser output, we never show the spine because 959 // it is not known, but we include the part of speech in the terminal node instead 960 // because otherwise it would not show up. 961 if (this.isBidirectionalParserOutput()) { 962 b.write("\\n" 963 + this.getPOS()); 964 } 965 b.write("\\n" 966 + this.getTerminal() 967 + "\" style=rounded shape=box];"); 968 b.newLine(); 969 970 // Create edges to attached ElemTrees 971 // and recursively graphviz them. 972 Iterator theAttachments = this.getAttachments().iterator(); 973 while (theAttachments.hasNext()) { 974 TAGAttachment current = (TAGAttachment) theAttachments.next(); 975 976 // skip any attached ElemTrees that are not in the span 977 int position = current.getElemTree().getNumber(); 978 if ((start != -1 && end != -1) && (position < start || position > end)) { 979 continue; 980 } 981 982 // create links between ElemTrees 983 984 String constraint; 985 if (beanPoleStyle) { 986 constraint = "false"; 987 } else { 988 constraint = "true"; 989 } 990 991 String source = null; 992 String target = null; 993 994 if (showSpines) { 995 source = current.getAttachmentSiteOnParent().getGraphvizNodeID(); 996 target = current.getChild().getSpine().getGraphvizNodeID(); // root of child's spine 997 } else { 998 source = current.getParent().getGraphvizNodeID(); 999 target = current.getChild().getGraphvizNodeID(); 1000 } 1001 1002 String style; 1003 if (showSpines) { 1004 style = "dotted"; 1005 } else { 1006 style = "solid"; 1007 } 1008 b.write(source 1009 + " -> " 1010 + target 1011 + "[style=" + style 1012 + " constraint=" +constraint + "];"); 1013 b.newLine(); 1014 1015 // recursive call 1016 current.getElemTree().writeGraphvizTo(b, start, end, includeSpans, beanPoleStyle, showSpines); 1017 b.newLine(); 1018 } 1019 1020 } 1021 1022 /** 1023 * Returns true iff this <code>ElemTree</code> dominates the other tree 1024 * in the sentence (this includes 1025 * the case in which <code>this.equals(other)</code>). 1026 * @param other the other tree 1027 * @return a boolean value 1028 */ 1029 public boolean dominates(ElemTree other) { 1030 1031 if (this.equals(other)) { 1032 return true; 1033 } 1034 if (this.isParentOf(other)) { 1035 return true; 1036 } 1037 return (!other.isRoot()) && this.dominates(other.getParent()); 1038 } 1039 1040 /** 1041 * Returns true iff this <code>ElemTree</code> is the direct parent of 1042 * the other tree in the derivation tree. 1043 * @param other the other tree 1044 * @return a boolean value 1045 */ 1046 public boolean isParentOf(ElemTree other) { 1047 Iterator iter = this.getChildren(); 1048 boolean result = false; 1049 while (iter.hasNext()) { 1050 result = ((ElemTree) iter.next()).equals(other); 1051 } 1052 return result; 1053 } 1054 1055 /** 1056 * Returns true if this <code>ElemTree</code> has been read in from the 1057 * format used in the output of Shen's bidirectional parser. If this is the case, 1058 * no information about the spine is present. 1059 * @return a boolean value 1060 */ 1061 public boolean isBidirectionalParserOutput() { 1062 return this.bidirectionalParserOutput; 1063 } 1064 1065 /** 1066 * Internal class representing the attachment of another 1067 * <code>ElemTree</code> to this one. 1068 */ 1069 private static class TAGAttachment implements Serializable { 1070 1071 private int type = -1; 1072 private int nodeNumber = -1; 1073 private GornAddress gornAddress; 1074 private int slot = ElemTree.UNKNOWN; 1075 private int order = -1; 1076 private ElemTree child; 1077 private ElemTree parent; 1078 private boolean completed; 1079 1080 /** 1081 * true indicates a format as in the LTAG-spinal treebank or in 1082 * the Shen incremental parser output, where slots and numbers are 1083 * specified. False indicates a format as in the Shen bidirectional parser 1084 * output, where only the node id of the child is specified. 1085 */ 1086 private boolean locationKnown = true; 1087 1088 private void checkCompletion() { 1089 if (!completed) { 1090 throw new UncompletedElemTreeException(); 1091 } 1092 } 1093 1094 /** 1095 * Note: The parent is the current tree (this). The child is the tree 1096 * designated by the attachmentToParent. 1097 */ 1098 public TAGAttachment(ElemTree parent, String representation) 1099 throws ElemTreeFormatException { 1100 this.parent = parent; 1101 completed = false; 1102 1103 loadFromStringRepresentation(representation); 1104 } 1105 1106 1107 1108 public void complete() { 1109 child = getParent().containingSentence.getElemTree(this.getNodeNumber()); 1110 child.parent=(getParent()); 1111 child.attachmentToParent = this; 1112 child.slot=this.slot; 1113 1114 // In the LTAG-spinal treebank format, and in the output format of Shen's 1115 // incremental parser, the information about attachment 1116 // and adjunction 1117 // is recorded redundantly in the spine line (via letter "a" vs. "b") and 1118 // in the attachment line (via "att" vs. "adj"). 1119 1120 // By contrast, in the output format of Shen's bidirectional parser, 1121 // spine lines are not included and so 1122 // we propagate att/adj information from the attachment to the 1123 // elementary tree beneath it. (We know 1124 // which nodes are coordination nodes because they are marked with an 1125 // ampersand (&).) 1126 1127 if (child.isBidirectionalParserOutput()) { 1128 if (this.getType() == ElemTree.ATTACH) { 1129 child.type = ElemTree.INITIAL; 1130 } else if (this.getType() == ElemTree.ADJOIN) { 1131 child.type = ElemTree.AUXILIARY; 1132 } 1133 } 1134 1135 completed = true; 1136 } 1137 1138 void loadFromStringRepresentation(String rep) 1139 throws ElemTreeFormatException { 1140 1141 1142 // Format is: 1143 // att #24, on 0, slot 1, order 2 1144 // or 1145 // crd #6, on 0.0 1146 // or (in the output of Libin's bidirectional parser) 1147 // att #24 1148 1149 rep = rep.trim(); 1150 String[] parts = rep.split("\\s+"); 1151 1152 if ((parts.length != 8) && (parts.length != 4) && (parts.length != 2)) { 1153 throw new ElemTreeFormatException( 1154 "Malformed attachment representation: " + rep); 1155 } 1156 1157 // remove trailing commas 1158 for (int i = 0; i < parts.length; i++) { 1159 if ((parts[i].length() > 0) && 1160 parts[i].charAt(parts[i].length() - 1) == ',') { 1161 parts[i] = parts[i].substring(0, parts[i].length() - 1); 1162 } 1163 } 1164 1165 String typeString = parts[0]; 1166 1167 if (typeString.equals("att")) { 1168 type = ATTACH; 1169 } else if (typeString.equals("adj")) { 1170 type = ADJOIN; 1171 } else if (typeString.equals("crd")) { 1172 type = CONJUNCT; 1173 } else if (typeString.equals("con")) { 1174 type = CONJUNCT_OR_CONNECTIVE; 1175 1176 } else { 1177 throw new ElemTreeFormatException("Unknown attachment type " + typeString 1178 + " at " + parent.getSentence().prettyPrintLocation() + "."); 1179 } 1180 1181 try { 1182 this.nodeNumber = Integer.parseInt(parts[1].substring(1)); 1183 } catch (NumberFormatException e) { 1184 String partsMessage = ""; 1185 1186 for (int i = 0; i < parts.length; i++) { 1187 partsMessage += i + "=" + parts[i] + "\n"; 1188 } 1189 1190 throw new ElemTreeFormatException("Invalid node number " + 1191 parts[1].substring(1) + 1192 ". Attachment representation was " + rep + ". Parts array was " + partsMessage); 1193 } 1194 1195 if (parts.length == 2) { // bidirectional parser output 1196 locationKnown = false; 1197 return; 1198 } 1199 1200 this.gornAddress = new GornAddress(parts[3]); 1201 1202 if (parts.length == 8) { // LTAG-spinal and incremental parser output 1203 int theSlot = Integer.parseInt(parts[5]); 1204 if (theSlot == 0) { 1205 this.slot = ElemTree.LEFT; 1206 } else if (theSlot == 1) { 1207 this.slot = ElemTree.RIGHT; 1208 } 1209 this.order = Integer.parseInt(parts[7]); 1210 } 1211 } 1212 1213 /** 1214 * Returns a string representation of this <code>ElemTree</code> in LTAG-spinal format. 1215 * 1216 * @return a string representing this elementary tree 1217 */ 1218 public String toString() { 1219 checkCompletion(); 1220 StringBuffer s = new StringBuffer(20); 1221 switch (this.getType()) { 1222 case ATTACH: 1223 s.append(" att"); 1224 break; 1225 case ADJOIN: 1226 s.append(" adj"); 1227 break; 1228 case CONJUNCT: 1229 s.append(" crd"); 1230 break; 1231 case CONJUNCT_OR_CONNECTIVE: 1232 s.append(" con"); 1233 break; 1234 } 1235 s.append(" #"); 1236 s.append(Integer.toString(this.getNodeNumber())); 1237 if (locationKnown) { 1238 s.append(", on "); 1239 s.append(this.getGornAddress().toString()); 1240 if (this.getSlot() != -1 && this.getOrder() != -1) { 1241 assert this.getSlot() != -1; 1242 assert this.getOrder() != -1; 1243 s.append(", slot "); 1244 s.append(Integer.toString(this.getSlot())); 1245 s.append(", order "); 1246 s.append(Integer.toString(this.getOrder())); 1247 } 1248 } 1249 return s.toString(); 1250 } 1251 1252 1253 /** 1254 * Returns the ElemTree closer to the root of the sentence 1255 * (the one to which is attached). 1256 */ 1257 public ElemTree getParent() { 1258 return parent; 1259 } 1260 1261 /** 1262 * Returns the ElemTree closer to the fringe of the sentence 1263 * (the one which attaches to something). 1264 */ 1265 public ElemTree getChild() { 1266 return child; 1267 } 1268 1269 /** 1270 * Returns the precise place at which the current attachment 1271 * is linked to its parent (a node in the spine of the parent designated 1272 * by the GornAddress of this attachment). Return null if unknown 1273 * (as in the output of Shen's bidirectional parser). 1274 */ 1275 public SpinalNode getAttachmentSiteOnParent() { 1276 if (!locationKnown) return null; 1277 return this.getParent().getSpinalNodeAt(this.getGornAddress()); 1278 } 1279 1280 1281 /** 1282 * A synonym of getChild(). 1283 */ 1284 public ElemTree getElemTree() { 1285 return getChild(); 1286 } 1287 1288 public int getType() { 1289 return type; 1290 } 1291 1292 public int getNodeNumber() { 1293 // always known, even if !locationKnown 1294 return nodeNumber; 1295 } 1296 1297 public int getSlot() { 1298 if (!locationKnown) return -1; 1299 return slot; 1300 } 1301 1302 public int getOrder() { 1303 if (!locationKnown) return -1; 1304 return order; 1305 } 1306 1307 public GornAddress getGornAddress() { 1308 if (!locationKnown) return null; 1309 return gornAddress; 1310 } 1311 } // end of class TAGAttachment 1312 1313 private static class TAGAttachmentToElemTreeIterator implements ListIterator { 1314 1315 ListIterator wrappedIterator; 1316 1317 public TAGAttachmentToElemTreeIterator(List attachments) { 1318 wrappedIterator = attachments.listIterator(); 1319 } 1320 1321 public void add(Object o) { 1322 throw new UnsupportedOperationException("add not supported."); 1323 } 1324 1325 public void remove() { 1326 throw new UnsupportedOperationException("remove not supported."); 1327 } 1328 1329 public void set(Object o) { 1330 throw new UnsupportedOperationException("set not supported"); 1331 } 1332 1333 public boolean hasPrevious() { 1334 return wrappedIterator.hasPrevious(); 1335 } 1336 1337 public boolean hasNext() { 1338 return wrappedIterator.hasNext(); 1339 } 1340 1341 public int nextIndex() { 1342 return wrappedIterator.nextIndex(); 1343 } 1344 1345 public int previousIndex() { 1346 return wrappedIterator.previousIndex(); 1347 } 1348 1349 //returns ElemTree 1350 public Object next() { 1351 TAGAttachment nextOne = (TAGAttachment) wrappedIterator.next(); 1352 1353 return nextOne.getChild(); 1354 } 1355 1356 //returns ElemTree 1357 public Object previous() { 1358 TAGAttachment previousOne = (TAGAttachment) wrappedIterator.previous(); 1359 1360 return previousOne.getChild(); 1361 } 1362 } 1363 1364 static class TAGAttachmentToWordSpanIterator implements ListIterator { 1365 1366 ListIterator wrappedIterator; 1367 1368 public TAGAttachmentToWordSpanIterator(List attachments) { 1369 wrappedIterator = attachments.listIterator(); 1370 } 1371 1372 public void add(Object o) { 1373 throw new UnsupportedOperationException("add not supported."); 1374 } 1375 1376 public void remove() { 1377 throw new UnsupportedOperationException("remove not supported."); 1378 } 1379 1380 public void set(Object o) { 1381 throw new UnsupportedOperationException("set not supported"); 1382 } 1383 1384 public boolean hasPrevious() { 1385 return wrappedIterator.hasPrevious(); 1386 } 1387 1388 public boolean hasNext() { 1389 return wrappedIterator.hasNext(); 1390 } 1391 1392 public int nextIndex() { 1393 return wrappedIterator.nextIndex(); 1394 } 1395 1396 public int previousIndex() { 1397 return wrappedIterator.previousIndex(); 1398 } 1399 1400 //returns WordSpan 1401 public Object next() { 1402 TAGAttachment nextOne = (TAGAttachment) wrappedIterator.next(); 1403 1404 return nextOne.getChild().getSpan(); 1405 } 1406 1407 //returns WordSpan 1408 public Object previous() { 1409 TAGAttachment previousOne = (TAGAttachment) wrappedIterator.previous(); 1410 1411 return previousOne.getChild().getSpan(); 1412 } 1413 } 1414 }