001 /** 002 * LTAG-spinal API, an interface to the treebank format introduced by Libin Shen. 003 * Copyright (C) 2007 Lucas Champollion 004 * 005 * This program is free software: you can redistribute it and/or modify 006 * it under the terms of the GNU General Public License as published by 007 * the Free Software Foundation, either version 3 of the License, or 008 * (at your option) any later version. 009 * 010 * This program is distributed in the hope that it will be useful, 011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 * GNU General Public License for more details. 014 * 015 * You should have received a copy of the GNU General Public License 016 * along with this program. If not, see <http://www.gnu.org/licenses/>. 017 * 018 */ 019 package edu.upenn.cis.spinal; 020 021 import java.io.*; 022 import java.util.*; 023 import java.util.regex.*; 024 import edu.upenn.cis.propbank_shen.*; 025 026 /** 027 * Represents a sentence (an LTAG-spinal derivation tree) in Libin Shen's 028 * LTAG-spinal treebank. 029 * 030 * A typical sentence is represented in Libin Shen's thesis, page 73. 031 * 032 * @author Lucas Champollion 033 * @author Ryan Gabbard 034 */ 035 public class Sentence implements Serializable { 036 037 038 /** 039 * Some sentences in the LTAG-spinal Treebank only consist of the word "skip", 040 * i.e. they're not really there -- the word indicates that the corresponding 041 * sentence in the Penn Treebank has not been included in the LTAG-spinal 042 * treebank. 043 */ 044 private boolean skip=false; 045 046 /** 047 * The number of the Penn Treebank section from which this 048 * <code>Sentence</code> has been taken, or -1 if not applicable. 049 */ 050 private int sectionNumber = -1; 051 052 /** 053 * The number of the Penn Treebank file from which this 054 * <code>Sentence</code> has been taken, or -1 if not applicable. 055 */ 056 private int fileNumber=-1; 057 058 /** 059 * The number of the Penn Treebank section from which this 060 * <code>Sentence</code> has been taken, or -1 if not applicable. 061 */ 062 private int sentenceNumber=-1; 063 064 /** 065 * The root of the sentence. 066 */ 067 private ElemTree root; 068 069 /** 070 * Contains all the elementary trees of this sentence in order. 071 */ 072 private ArrayList elemTrees; 073 074 /** 075 * A lexicon mapping word spans to the corresponding nodes. 076 */ 077 private HashMap spanTable; 078 079 /** 080 * First call to {@link #computeSpans()} sets this to true. 081 */ 082 private boolean spansComputed=false; 083 084 /** 085 * First call to {@link #computeSpanTable()} sets this to true. 086 */ 087 private boolean spanTableComputed=false; 088 089 090 /** 091 * Pattern used in method (@link loadFromStringRepresentation(String)} 092 */ 093 private Pattern elemTreePattern = 094 Pattern.compile("^[#|&]",Pattern.MULTILINE); 095 096 // to be deleted if RelativeClauseFixer is 097 static int multirootedParses=0; 098 099 /** 100 * Creates a new <code>Sentence</code> object from a string representation following 101 * the format defined in Libin Shen's thesis. 102 * @param representation a <code>String</code> containing a specification of 103 * a sentence in LTAG-spinal format 104 * @throws edu.upenn.cis.spinal.ElemTreeFormatException if an error occurs while parsing the string representation 105 */ 106 public Sentence(String representation) 107 throws ElemTreeFormatException { 108 elemTrees=new ArrayList(); 109 root=null; 110 sentenceNumber=-1; 111 112 loadFromStringRepresentation(representation); 113 } 114 115 /** 116 * Convenience method that calls the constructor, to follow the conventions in the Propbank API. 117 * @param representation a <code>String</code> containing a specification of 118 * a sentence in LTAG-spinal format 119 * @return a new {@link Sentence} constructed from the given <code>String</code> 120 * @throws edu.upenn.cis.spinal.ElemTreeFormatException if an error occurs while parsing the string representation 121 */ 122 public Sentence ofString(String representation) throws ElemTreeFormatException { 123 return new Sentence(representation); 124 } 125 126 /** 127 * Performs the actual parsing of an LTAG-spinal formatted string 128 * into a <code>Sentence</code>. 129 * @param rep a <code>String</code> containing a specification of a sentence in LTAG-spinal format 130 * @throws edu.upenn.cis.spinal.ElemTreeFormatException if the input is not well-formed 131 */ 132 void loadFromStringRepresentation(String rep) 133 throws ElemTreeFormatException { 134 135 String[] lines=rep.split("\\n", 3); 136 // first line looks like: 137 // 0 1 0 138 // (for LTAG spinal treebank) 139 // or like: 140 // 1 141 // (for parser output) 142 143 String[] locations = lines[0].split(" "); 144 145 try { 146 if (locations.length == 1) { // typical of parser output 147 sentenceNumber=Integer.parseInt(locations[0]); 148 } else if (locations.length == 3) { // typical of LTAG treebank 149 sectionNumber=Integer.parseInt(locations[0]); 150 fileNumber=Integer.parseInt(locations[1]); 151 sentenceNumber=Integer.parseInt(locations[2]); 152 } else throw new ElemTreeFormatException 153 ("Invalid sentence number" + locations); 154 } catch (NumberFormatException e) { 155 throw new ElemTreeFormatException("Invalid sentence number " + 156 locations); 157 } 158 159 // second line looks like: 160 // root 6 161 // or like: 162 // skip 163 164 if (lines[1].equals("skip")) { 165 skip=true; 166 return; 167 } 168 169 170 int rootNumber=-1; 171 172 try { 173 String[] rootParts=lines[1].split("\\s+"); 174 175 if (rootParts.length>2) { 176 177 String location; 178 179 if (sectionNumber != -1 && fileNumber != -1) { 180 location = "section " + sectionNumber 181 + ", file " + fileNumber 182 + ", sentence " + sentenceNumber; 183 184 } else { 185 location = "sentence " + sentenceNumber; 186 } 187 188 System.err.println("WARNING: " 189 + location 190 + " has a multirooted parse (\"" 191 + lines[1] + 192 "\"). This is not supported by the API and not " + 193 "conform to LTAG-spinal standards, although " + 194 "Shen's incremental parser sometimes produces this " + 195 "output. " + 196 "Only the first root has been read in."); 197 multirootedParses++; 198 } 199 200 if (rootParts.length<2) { 201 System.err.println("WARNING: Bad root: " 202 + "section " + sectionNumber 203 + " file " + fileNumber 204 + " sentence " + sentenceNumber); 205 } 206 207 rootNumber= 208 Integer.parseInt(rootParts[1]); 209 } catch (NumberFormatException e) { 210 throw new ElemTreeFormatException("Invalid root number: " + 211 lines[1].substring(5)); 212 } 213 214 // now parse the spinal elementary trees in the file 215 String[] elemTreeRepresentations=elemTreePattern.split(lines[2],0); 216 217 for (int i=1; i<elemTreeRepresentations.length; i++) { 218 elemTrees.add(new ElemTree(this, elemTreeRepresentations[i])); 219 } 220 221 root=(ElemTree)elemTrees.get(rootNumber); 222 223 for (Iterator it=elemTrees.iterator(); it.hasNext(); ) { 224 ElemTree node=(ElemTree)it.next(); 225 226 node.complete(); 227 } 228 } 229 230 /** 231 * Reads a string representation of a derivation tree from the specified 232 * <code>BufferedReader</code>. 233 * @return a <code>Sentence</code> element representing the derivation tree, or null 234 * if the input contained nothing or contained only whitespace 235 * @param inp the <code>BufferedReader</code> from which to read 236 * @throws edu.upenn.cis.spinal.ElemTreeFormatException if an error occurs while parsing the string representation 237 * @throws java.io.IOException if an error occurs while reading 238 */ 239 public static Sentence readTree(BufferedReader inp) 240 throws ElemTreeFormatException, IOException { 241 StringBuffer s=new StringBuffer(""); 242 String in=null; 243 244 while (true) { 245 in=inp.readLine(); 246 247 if ((in==null) || in.matches("^\\s*$")) { // null or whitespace 248 break; 249 } else { 250 s.append(in); 251 s.append("\n"); 252 } 253 } 254 255 if (s.length()==0) { 256 return null; 257 } else { 258 return new Sentence(s.toString()); 259 } 260 } 261 262 /** 263 * Prints this sentence to the specified output in LTAG-spinal format. 264 * @param w the {@link java.io.Writer} to which this sentence is to be written 265 * @throws java.io.IOException if an error occurs during writing 266 */ 267 public void writeTo(Writer w) throws IOException { 268 this.writeTo(new BufferedWriter(w)); 269 } 270 271 /** 272 * Prints this sentence to the specified output in LTAG-spinal format. 273 * @param b the {@link java.io.BufferedWriter} to which this sentence is to be written 274 * @throws java.io.IOException if an error occurs during writing 275 */ 276 public void writeTo(BufferedWriter b) throws IOException { 277 b.write(this.getLocation()); 278 b.newLine(); 279 if (this.isSkipped()) { 280 b.write("skip"); 281 b.newLine(); 282 b.newLine(); 283 284 b.flush(); 285 return; 286 } 287 288 b.write("root " + this.getRoot().getNumber()); 289 b.newLine(); 290 291 Iterator theElemTrees = this.elemTreesIterator(); 292 while (theElemTrees.hasNext()) { 293 ElemTree current = (ElemTree) theElemTrees.next(); 294 b.write(current.toString()); 295 296 } 297 b.newLine(); 298 b.flush(); 299 300 } 301 302 303 /** 304 * Writes a visual representation of a subpart of this sentence in Graphviz format 305 * to the specified {@link java.io.BufferedWriter}. 306 * 307 * 308 * @param b the {@link java.io.BufferedWriter} to which the subsentence is to be written 309 * @param start the first word of the sentence to be included in the graphical output 310 * @param end the last word of the sentence to be included in the graphical output 311 * @param includeSpans if true, the word span of the subtree dominated by a node 312 * is appended to that node's representation; otherwise, it only 313 * consists of the node label 314 * @param beanPoleStyle chooses between two very different styles of output -- if true, 315 * the output looks like beanpoles, if false, it looks like tadpoles. See the 316 * illustrations on the <a href="http://www.cis.upenn.edu/%7Extag/spinal">LTAG-spinal website</a>. 317 * @param showSpines if true, shows the internal structure of the elementary trees; 318 * otherwise, shows each elementary tree as one single node 319 * @throws java.io.IOException if an error occurs during writing 320 */ 321 public void writeGraphvizTo(BufferedWriter b, int start, int end, 322 boolean includeSpans, boolean beanPoleStyle,boolean showSpines) throws IOException { 323 b.write("digraph {"); 324 b.newLine(); 325 326 // write out location 327 b.write("node_location[label=\"" 328 + this.prettyPrintLocation() 329 + "\" shape=box];"); 330 b.newLine(); 331 332 333 if (this.isSkipped()) { 334 b.write("skip;"); 335 } else { 336 337 if (showSpines && !this.isBidirectionalParserOutput()) { 338 // Create cluster with all the terminal nodes 339 // in order to line them up horizontally. 340 // We only do this if we actually have spines to show (i.e. 341 // not if we have bidirectional parser output) and if we are 342 // asked to show the spines (i.e. showSpines == true) because otherwise 343 // the sentence becomes unreadable. 344 b.write("subgraph { rank = same; edge [style=invis] "); 345 Iterator trees; 346 if (start==-1 || end==-1) { 347 trees = this.getElemTrees().listIterator(); 348 } else { 349 trees = this.getElemTrees(start,end).listIterator(); 350 } 351 String previousID = null, currentID = null; 352 while (trees.hasNext()) { 353 ElemTree current = (ElemTree) trees.next(); 354 previousID = currentID; 355 currentID = current.getGraphvizNodeID(); 356 if (previousID != null) { 357 b.write(" -> "); 358 } 359 b.write(currentID); 360 } 361 b.write("; }"); 362 b.newLine(); 363 } 364 365 // try to find appropriate subtree 366 ElemTree subTree = null; 367 if ((start >=0) && (end >= start)) { // if we aren't supposed to output the whole tree 368 subTree = this.getSubTree(start,end); 369 // may return null if can't find anything 370 } 371 372 if (subTree == null) { // we didn't find anything, or we're working on the whole tree 373 subTree = this.getRoot(); 374 } 375 376 377 subTree.writeGraphvizTo(b, start, end, includeSpans, beanPoleStyle, showSpines); 378 } 379 380 b.newLine(); 381 b.write("}"); 382 b.flush(); 383 return; 384 385 } 386 387 /** 388 * Writes a visual representation of this sentence in Graphviz format 389 * to the specified {@link java.io.Writer}. 390 * 391 * @param w the {@link java.io.Writer} to which this sentence is to be written 392 * @param includeSpans if true, the word span of the subtree dominated by a node 393 * is appended to that node's representation; otherwise, it only 394 * consists of the node label 395 * @param beanPoleStyle chooses between two very different styles of output -- if true, 396 * the output looks like beanpoles, if false, it looks like tadpoles. See the 397 * illustrations on the <a href="http://www.cis.upenn.edu/%7Extag/spinal">LTAG-spinal website</a>. 398 * @param showSpines if true, shows the internal structure of the elementary trees; 399 * otherwise, shows each elementary tree as one single node 400 * @throws java.io.IOException if an error occurs while writing 401 */ 402 public void writeGraphvizTo(Writer w, boolean includeSpans, boolean beanPoleStyle,boolean showSpines) throws IOException { 403 this.writeGraphvizTo(new BufferedWriter(w), -1, -1, includeSpans, beanPoleStyle, showSpines); 404 } 405 406 /** 407 * Writes a visual representation of a subpart of this sentence in Graphviz format 408 * to the specified {@link java.io.Writer}. 409 * 410 * 411 * @param w the {@link java.io.Writer} to which the subsentence is to be written 412 * @param start the first word of the sentence to be included in the graphical output 413 * @param end the last word of the sentence to be included in the graphical output 414 * @param includeSpans if true, the word span of the subtree dominated by a node 415 * is appended to that node's representation; otherwise, it only 416 * consists of the node label 417 * @param beanPoleStyle chooses between two very different styles of output -- if true, 418 * the output looks like beanpoles, if false, it looks like tadpoles. See the 419 * illustrations on the <a href="http://www.cis.upenn.edu/%7Extag/spinal">LTAG-spinal website</a>. 420 * @param showSpines if true, shows the internal structure of the elementary trees; 421 * otherwise, shows each elementary tree as one single node 422 * @throws java.io.IOException if an error occurs while writing 423 */ 424 public void writeGraphvizTo(Writer w, int start, int end, boolean includeSpans, boolean beanPoleStyle,boolean showSpines) throws IOException { 425 this.writeGraphvizTo(new BufferedWriter(w), start, end, includeSpans, beanPoleStyle, showSpines); 426 } 427 428 /** 429 * Writes a visual representation of this sentence in Graphviz format 430 * to the specified {@link java.io.BufferedWriter}. 431 * 432 * 433 * @param b the {@link java.io.BufferedWriter} to which this sentence is to be written 434 * @param includeSpans if true, the word span of the subtree dominated by a node 435 * is appended to that node's representation; otherwise, it only 436 * consists of the node label 437 * @param beanPoleStyle chooses between two very different styles of output -- if true, 438 * the output looks like beanpoles, if false, it looks like tadpoles. See the 439 * illustrations on the <a href="http://www.cis.upenn.edu/%7Extag/spinal">LTAG-spinal website</a>. 440 * @param showSpines if true, shows the internal structure of the elementary trees; 441 * otherwise, shows each elementary tree as one single node 442 * @throws java.io.IOException if an error occurs while writing 443 */ 444 public void writeGraphvizTo(BufferedWriter b, boolean includeSpans, boolean beanPoleStyle,boolean showSpines) throws IOException { 445 this.writeGraphvizTo(b, -1, -1, includeSpans, beanPoleStyle, showSpines); 446 } 447 448 /** 449 * Returns a string representation of this sentence in LTAG-spinal format. 450 * 451 * @return a string representing this sentence 452 */ 453 public String toString() { 454 455 StringWriter sw = new StringWriter(100); 456 try { 457 this.writeTo(new BufferedWriter(sw)); 458 } catch (IOException ex) { 459 // can't occur with a StringWriter 460 ex.printStackTrace(); 461 } 462 return sw.toString(); 463 } 464 465 466 /** 467 * Returns a visual representation of this sentence in Graphviz format. 468 * @param includeSpans if true, the word span of the subtree dominated by a node 469 * is appended to that node's representation; otherwise, it only 470 * consists of the node label 471 * @param beanPoleStyle chooses between two very different styles of output -- if true, 472 * the output looks like beanpoles, if false, it looks like tadpoles. See the 473 * illustrations on the <a href="http://www.cis.upenn.edu/%7Extag/spinal">LTAG-spinal website</a>. 474 * @param showSpines if true, shows the internal structure of the elementary trees; 475 * otherwise, shows each elementary tree as one single node 476 * @return a <code>String</code> containing Graphviz format 477 */ 478 public String toGraphviz(boolean includeSpans, boolean beanPoleStyle,boolean showSpines) { 479 return this.toGraphviz(-1,-1, includeSpans, beanPoleStyle, showSpines); 480 } 481 482 483 484 /** 485 * Returns a visual representation of a subpart of this sentence in Graphviz format. 486 * 487 * 488 * @param start the first word of the sentence to be included in the graphical output 489 * @param end the last word of the sentence to be included in the graphical output 490 * @param includeSpans if true, the word span of the subtree dominated by a node 491 * is appended to that node's representation; otherwise, it only 492 * consists of the node label 493 * @param beanPoleStyle chooses between two very different styles of output -- if true, 494 * the output looks like beanpoles, if false, it looks like tadpoles. See the 495 * illustrations on the <a href="http://www.cis.upenn.edu/%7Extag/spinal">LTAG-spinal website</a>. 496 * @param showSpines if true, shows the internal structure of the elementary trees; 497 * otherwise, shows each elementary tree as one single node 498 * 499 * @return a <code>String</code> containing Graphviz format 500 */ 501 public String toGraphviz(int start, int end, boolean includeSpans, boolean beanPoleStyle,boolean showSpines) { 502 StringWriter sw = new StringWriter(100); 503 try { 504 this.writeGraphvizTo(new BufferedWriter(sw), start, end, includeSpans, beanPoleStyle, showSpines); 505 } catch (IOException ex) { 506 // can't occur with a StringWriter 507 ex.printStackTrace(); 508 } 509 return sw.toString(); 510 511 } 512 513 /** 514 * Returns a <code>String</code> representing the location of the current sentence 515 * -- i.e. either a triple of section, file, and sentence number as 516 * in the LTAG-spinal treebank (following the Penn Treebank conventions), 517 * or simply a sentence number if the sentence is not from the 518 * LTAG-spinal treebank. 519 * @return three numbers indicating where this sentence is found in 520 * the input 521 */ 522 523 public String getLocation() { 524 String result = ""; 525 if (sectionNumber != -1 && fileNumber != -1) { 526 result += sectionNumber + " " + fileNumber + " "; 527 } 528 return result + sentenceNumber; 529 } 530 531 /** 532 * Returns a human-readable string representing the location of the current sentence. 533 * If the sentence is taken from the LTAG-spinal treebank, the string looks 534 * as follows: 535 * <pre> 536 * Section: X File: Y Sentence: Z 537 * </pre> 538 * Otherwise, 539 * if the sentence only has a sentence number, the string looks like 540 * <pre> 541 * Sentence: Z 542 * </pre> 543 * @return a human-readable string indicating where this sentence is found in 544 * the input 545 */ 546 public String prettyPrintLocation() { 547 String result = ""; 548 if (sectionNumber != -1 && fileNumber != -1) { 549 result += "Section: " + sectionNumber 550 + " File: " + fileNumber + " "; 551 } 552 return result + "Sentence: " + sentenceNumber; 553 } 554 555 /** 556 * Returns the unique <code>ElemTree</code> that is the root of a subtree whose 557 * yield is the specified word span, or null if there is no such 558 * tree. 559 * 560 * @param w the span from the first up to and including the last word 561 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence 562 * is a skipped sentence in the LTAG-spinal treebank 563 * @return an <code>ElemTree</code> or null 564 */ 565 public ElemTree getSubTree(WordSpan w) { 566 return this.getSubTree(w.start(), w.end()); 567 } 568 569 570 571 /** 572 * Returns the unique <code>ElemTree</code> that is the root of a subtree whose 573 * yield is the specified word span, or null if there is no such 574 * tree. 575 * 576 * @param start the first (leftmost) word included in the span 577 * @param end the last (rightmost) word included in the span 578 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence 579 * is a skipped sentence in the LTAG-spinal treebank 580 * @return an <code>ElemTree</code> or null 581 */ 582 public ElemTree getSubTree(int start, int end) { 583 584 if (this.isSkipped()) { 585 throw new SkippedSentenceException(this); 586 } 587 588 if (this.length()-1 < end) { 589 throw new IllegalArgumentException("Attempted to retrieve a subtree from" + 590 "a WordSpan that spans outside of the sentence"); 591 } 592 593 // TODO add warnings if start > end, start < 0, end < 0, etc. here and elsewhere 594 595 List candidates = this.getElemTrees(start, end); 596 Iterator iter = candidates.iterator(); 597 ElemTree current; 598 WordSpan span; 599 while (iter.hasNext()) { 600 current = (ElemTree) iter.next(); 601 span = current.getSpan(); 602 if (start == span.start() && end == span.end()) { 603 return current; 604 } 605 } 606 // if we haven't found anything... 607 return null; 608 } 609 610 /** 611 * Returns true if this <code>Sentence</code> has been read in from the 612 * format used in the output of Shen's bidirectional parser. If this is the case, 613 * no information about the spine is present. This is implemented as a simple 614 * lookup of the corresponding property of the <code>ElemTree</code> at 615 * the root of this sentence. 616 * @return a boolean value 617 * @see ElemTree#isBidirectionalParserOutput() 618 */ 619 public boolean isBidirectionalParserOutput() { 620 return this.getRoot().isBidirectionalParserOutput(); 621 } 622 623 624 /** 625 * Returns true iff the annotation for this sentence only consists of the word "skip", 626 * indicating that it is contained in the Penn Treebank but 627 * not in the LTAG-spinal treebank. 628 * @return true iff this sentence is skipped in the LTAG-spinal treebank 629 */ 630 public boolean isSkipped() { 631 return this.skip; 632 } 633 634 /** 635 * Returns the number of the current sentence in the Penn Treebank file or parser output. 636 * @return the sentence number 637 */ 638 public int getSentenceNumber() { 639 return sentenceNumber; 640 } 641 642 /** 643 * Returns the number of the Penn Treebank section in which the current sentence 644 * occurred, or -1 if the sentence is not a Penn Treebank sentence. 645 * @return the section number, or -1 if there is no such number 646 */ 647 public int getSectionNumber() { 648 return sectionNumber; 649 } 650 651 /** 652 * Returns the number of the Penn Treebank file in which the current sentence 653 * occurred, or -1 if the sentence is not a Penn Treebank sentence. 654 * @return the file number, or -1 if there is no such number 655 */ 656 public int getFileNumber() { 657 return fileNumber; 658 } 659 660 661 /** 662 * Force the spans to be computed recursively on every <code>ElemTree</code>. 663 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence 664 * is a skipped sentence in the LTAG-spinal treebank 665 */ 666 private void computeSpans() { 667 if (this.isSkipped()) { 668 throw new SkippedSentenceException(this); 669 } 670 671 this.getRoot().computeSpan(); 672 spansComputed = true; 673 } 674 675 /** 676 * Returns the <code>ElemTree</code> whose yield is the given word span, 677 * or null if there isn't one. 678 * @param w the <code>WordSpan</code> for which the dominating tree 679 * is to be returned 680 * @return an <code>ElemTree</code> or null 681 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence 682 * is a skipped sentence in the LTAG-spinal treebank 683 */ 684 public ElemTree subTreeForSpan(WordSpan w) { 685 if (this.isSkipped()) { 686 throw new SkippedSentenceException(this); 687 } 688 689 if (!spansComputed) computeSpans(); 690 if (!spanTableComputed) computeSpanTable(); 691 if (spanTable.containsKey(w)) { 692 return (ElemTree) spanTable.get(w); 693 } else { 694 return null; 695 } 696 } 697 698 /** 699 * Computes the span table, a directory whose keys are word spans and whose 700 * values are the corresponding subtrees (if any). 701 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence 702 * is a skipped sentence in the LTAG-spinal treebank 703 */ 704 public void computeSpanTable() { 705 if (this.isSkipped()) { 706 throw new SkippedSentenceException(this); 707 } 708 709 if (!spansComputed) computeSpans(); 710 Iterator iter = this.elemTreesIterator(); 711 spanTable = new HashMap(this.length()); 712 ElemTree current; 713 String bugs="/Users/lingrad2/srl/ltagtb/bugs/"; 714 while (iter.hasNext()) { 715 716 current = (ElemTree) iter.next(); 717 718 719 720 // The following code was used to retrieve some buggy sentences. 721 // "There are at most 28 sentences in the corpus like that. So we can 722 //ignore them for now, or fix them by hand when we have the chance 723 //(which would lead us to the question of whether we should put the 724 //treebank under version control). I'm attaching a zipfile with the 725 //sentences in question. These are .dot files that you can open in 726 //graphviz or convert using "dot -Tjpg -o output.jpg filename.dot". 727 //Actually not all of them have the bug. The common property of the 28 728 //sentences is that in each of them there are at least two subtrees of 729 //the derivation tree with identical yield. See the .info files." 730 // (Mail by Lucas Champollion to Joshi and Libin Nov 29 2006) 731 732 733 734 735 // String lws = this.getLocation().replaceAll(" ", "_"); 736 // // location with underscores 737 // if (spanTable.containsKey(current.getSpan())) { 738 // try { 739 // 740 // BufferedWriter f = new BufferedWriter 741 // (new FileWriter(bugs+lws+".dot")); 742 // this.writeGraphvizTo(f); 743 // f.close(); 744 // } catch (IOException ex) { 745 // ex.printStackTrace(); 746 // } 747 // BufferedWriter f; 748 // try { 749 // f = new BufferedWriter(new FileWriter(bugs+lws + ".info")); 750 // f.write("location: " + this.getLocation()); 751 // f.newLine(); 752 // f.write("span: " + current.getSpan()); 753 // f.newLine(); 754 // f.write("current: " + current); 755 // f.newLine(); 756 // f.write("stored: " + (ElemTree) spanTable.get(current.getSpan())); 757 // f.newLine(); 758 // f.close(); 759 // } catch (IOException ex) { 760 // ex.printStackTrace(); 761 // } 762 // 763 // } 764 spanTable.put(current.getSpan(), current); 765 } 766 spanTableComputed=true; 767 } 768 769 /** 770 * Returns true iff at least one of the elementary trees in this 771 * <code>Sentence</code> is an initial tree. 772 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence 773 * is a skipped sentence in the LTAG-spinal treebank 774 * @return true iff there is at least one attachment operation in the 775 * present tree 776 */ 777 public boolean containsAttachment() { 778 if (this.isSkipped()) { 779 throw new SkippedSentenceException(this); 780 } 781 Iterator iter = this.elemTreesIterator(); 782 783 boolean result = false; 784 785 while (iter.hasNext()) { 786 ElemTree current = (ElemTree) iter.next(); 787 if (current.isInitial()) result = true; 788 } 789 return result; 790 } 791 792 /** 793 * Returns true iff at least one of the elementary trees in this 794 * <code>Sentence</code> is an auxiliary tree. 795 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence 796 * is a skipped sentence in the LTAG-spinal treebank 797 * @return true iff there is at least one adjunction operation in the 798 * present tree 799 */ 800 public boolean containsAdjunction() { 801 if (this.isSkipped()) { 802 throw new SkippedSentenceException(this); 803 } 804 Iterator iter = this.elemTreesIterator(); 805 806 boolean result = false; 807 808 while (iter.hasNext()) { 809 ElemTree current = (ElemTree) iter.next(); 810 if (current.isAuxiliary()) result = true; 811 } 812 return result; 813 } 814 815 /** 816 * Returns true iff at least one of the elementary trees in this 817 * <code>Sentence</code> is a conjunction tree. 818 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence 819 * is a skipped sentence in the LTAG-spinal treebank 820 * @return true iff there is at least one coordination operation in the 821 * present tree 822 */ 823 public boolean containsCoordination() { 824 if (this.isSkipped()) { 825 throw new SkippedSentenceException(this); 826 } 827 Iterator iter = this.elemTreesIterator(); 828 829 boolean result = false; 830 831 while (iter.hasNext()) { 832 ElemTree current = (ElemTree) iter.next(); 833 if (current.isCoord()) result = true; 834 } 835 return result; 836 } 837 838 // public void computeExtendedSpanTable() { 839 // computeSpanTable(); 840 // 841 // // extended span table is in the same style but trees occur 842 // // under multiple entries: a tree may be pruned by removing any 843 // // number of children 844 // throw new RuntimeException("Not implemented yet"); 845 // } 846 847 848 /** 849 * Returns the length of this <code>Sentence</code>, that is, the number of elementary 850 * trees in this derivation tree. 851 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence 852 * is a skipped sentence in the LTAG-spinal treebank 853 * @return the number of elementary trees in this <code>Sentence</code> 854 */ 855 public int length() { 856 if (this.isSkipped()) { 857 throw new SkippedSentenceException(this); 858 } 859 860 return elemTrees.size(); 861 } 862 863 /** 864 * Returns the elementary tree at the root of this <code>Sentence</code>. 865 * 866 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence 867 * is a skipped sentence in the LTAG-spinal treebank 868 * @return the <code>ElemTree</code> in which this derivation tree is rooted 869 */ 870 public ElemTree getRoot() { 871 if (this.isSkipped()) { 872 throw new SkippedSentenceException(this); 873 } 874 875 return root; 876 } 877 878 /** 879 * Iterates over the elementary trees of which this <code>Sentence</code> 880 * consists, in the order in which they are numbered (left to right in the 881 * sentence). 882 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence 883 * is a skipped sentence in the LTAG-spinal treebank 884 * @return a <code>ListIterator</code> 885 */ 886 public ListIterator elemTreesIterator() { 887 if (this.isSkipped()) { 888 throw new SkippedSentenceException(this); 889 } 890 891 return elemTrees.listIterator(); 892 } 893 894 /** 895 * Returns the <code>ElemTree</code> associated with the <code>n</code>th word of 896 * the sentence. 897 * @param n a number between 0 and the length of the sentence 898 * @throws IndexOutOfBoundsException if index is out of range <tt>(index 899 * < 0 || index >= size())</tt>. 900 * @return an <code>ElemTree</code> for the <code>n</code>th word of the sentence 901 */ 902 public ElemTree getElemTree(int n) { 903 if (this.isSkipped()) { 904 throw new SkippedSentenceException(this); 905 } 906 907 return (ElemTree)elemTrees.get(n); 908 } 909 910 /** 911 * Returns a <code>List</code> of <code>ElemTree</code>s for 912 * the given word span. 913 * @return an ordered list containing some of the elementary trees of which 914 * this sentence consists 915 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence 916 * is a skipped sentence in the LTAG-spinal treebank 917 */ 918 public List getElemTrees() { 919 if (this.isSkipped()) { 920 throw new SkippedSentenceException(this); 921 } 922 923 return this.elemTrees; 924 } 925 926 /** 927 * Returns a <code>List</code> of <code>ElemTree</code>s for 928 * the given word span. 929 * @param from the first word to be included in the list 930 * @param to the last word to be included in the list 931 * @return an ordered list containing some of the elementary trees of which 932 * this sentence consists 933 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence 934 * is a skipped sentence in the LTAG-spinal treebank 935 */ 936 public List getElemTrees(int from, int to) { 937 if (this.isSkipped()) { 938 throw new SkippedSentenceException(this); 939 } 940 941 return this.elemTrees.subList(from, to+1); 942 // the "+1" is because our spans are inclusive but subList is exclusive 943 } 944 945 946 947 948 949 950 951 }