001 /**
002 * LTAG-spinal API, an interface to the treebank format introduced by Libin Shen.
003 * Copyright (C) 2007 Lucas Champollion
004 *
005 * This program is free software: you can redistribute it and/or modify
006 * it under the terms of the GNU General Public License as published by
007 * the Free Software Foundation, either version 3 of the License, or
008 * (at your option) any later version.
009 *
010 * This program is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 * GNU General Public License for more details.
014 *
015 * You should have received a copy of the GNU General Public License
016 * along with this program. If not, see <http://www.gnu.org/licenses/>.
017 *
018 */
019 package edu.upenn.cis.spinal;
020
021 import java.io.*;
022 import java.util.*;
023 import java.util.regex.*;
024 import edu.upenn.cis.propbank_shen.*;
025
026 /**
027 * Represents a sentence (an LTAG-spinal derivation tree) in Libin Shen's
028 * LTAG-spinal treebank.
029 *
030 * A typical sentence is represented in Libin Shen's thesis, page 73.
031 *
032 * @author Lucas Champollion
033 * @author Ryan Gabbard
034 */
035 public class Sentence implements Serializable {
036
037
038 /**
039 * Some sentences in the LTAG-spinal Treebank only consist of the word "skip",
040 * i.e. they're not really there -- the word indicates that the corresponding
041 * sentence in the Penn Treebank has not been included in the LTAG-spinal
042 * treebank.
043 */
044 private boolean skip=false;
045
046 /**
047 * The number of the Penn Treebank section from which this
048 * <code>Sentence</code> has been taken, or -1 if not applicable.
049 */
050 private int sectionNumber = -1;
051
052 /**
053 * The number of the Penn Treebank file from which this
054 * <code>Sentence</code> has been taken, or -1 if not applicable.
055 */
056 private int fileNumber=-1;
057
058 /**
059 * The number of the Penn Treebank section from which this
060 * <code>Sentence</code> has been taken, or -1 if not applicable.
061 */
062 private int sentenceNumber=-1;
063
064 /**
065 * The root of the sentence.
066 */
067 private ElemTree root;
068
069 /**
070 * Contains all the elementary trees of this sentence in order.
071 */
072 private ArrayList elemTrees;
073
074 /**
075 * A lexicon mapping word spans to the corresponding nodes.
076 */
077 private HashMap spanTable;
078
079 /**
080 * First call to {@link #computeSpans()} sets this to true.
081 */
082 private boolean spansComputed=false;
083
084 /**
085 * First call to {@link #computeSpanTable()} sets this to true.
086 */
087 private boolean spanTableComputed=false;
088
089
090 /**
091 * Pattern used in method (@link loadFromStringRepresentation(String)}
092 */
093 private Pattern elemTreePattern =
094 Pattern.compile("^[#|&]",Pattern.MULTILINE);
095
096 // to be deleted if RelativeClauseFixer is
097 static int multirootedParses=0;
098
/**
 * Builds a <code>Sentence</code> by parsing a string in the LTAG-spinal
 * format defined in Libin Shen's thesis.
 *
 * @param representation a <code>String</code> containing a specification of
 * a sentence in LTAG-spinal format
 * @throws edu.upenn.cis.spinal.ElemTreeFormatException if an error occurs
 * while parsing the string representation
 */
public Sentence(String representation)
        throws ElemTreeFormatException {
    // start from a clean state before the parser fills in the fields
    this.root = null;
    this.sentenceNumber = -1;
    this.elemTrees = new ArrayList();

    this.loadFromStringRepresentation(representation);
}
114
115 /**
116 * Convenience method that calls the constructor, to follow the conventions in the Propbank API.
117 * @param representation a <code>String</code> containing a specification of
118 * a sentence in LTAG-spinal format
119 * @return a new {@link Sentence} constructed from the given <code>String</code>
120 * @throws edu.upenn.cis.spinal.ElemTreeFormatException if an error occurs while parsing the string representation
121 */
122 public Sentence ofString(String representation) throws ElemTreeFormatException {
123 return new Sentence(representation);
124 }
125
126 /**
127 * Performs the actual parsing of an LTAG-spinal formatted string
128 * into a <code>Sentence</code>.
129 * @param rep a <code>String</code> containing a specification of a sentence in LTAG-spinal format
130 * @throws edu.upenn.cis.spinal.ElemTreeFormatException if the input is not well-formed
131 */
132 void loadFromStringRepresentation(String rep)
133 throws ElemTreeFormatException {
134
135 String[] lines=rep.split("\\n", 3);
136 // first line looks like:
137 // 0 1 0
138 // (for LTAG spinal treebank)
139 // or like:
140 // 1
141 // (for parser output)
142
143 String[] locations = lines[0].split(" ");
144
145 try {
146 if (locations.length == 1) { // typical of parser output
147 sentenceNumber=Integer.parseInt(locations[0]);
148 } else if (locations.length == 3) { // typical of LTAG treebank
149 sectionNumber=Integer.parseInt(locations[0]);
150 fileNumber=Integer.parseInt(locations[1]);
151 sentenceNumber=Integer.parseInt(locations[2]);
152 } else throw new ElemTreeFormatException
153 ("Invalid sentence number" + locations);
154 } catch (NumberFormatException e) {
155 throw new ElemTreeFormatException("Invalid sentence number " +
156 locations);
157 }
158
159 // second line looks like:
160 // root 6
161 // or like:
162 // skip
163
164 if (lines[1].equals("skip")) {
165 skip=true;
166 return;
167 }
168
169
170 int rootNumber=-1;
171
172 try {
173 String[] rootParts=lines[1].split("\\s+");
174
175 if (rootParts.length>2) {
176
177 String location;
178
179 if (sectionNumber != -1 && fileNumber != -1) {
180 location = "section " + sectionNumber
181 + ", file " + fileNumber
182 + ", sentence " + sentenceNumber;
183
184 } else {
185 location = "sentence " + sentenceNumber;
186 }
187
188 System.err.println("WARNING: "
189 + location
190 + " has a multirooted parse (\""
191 + lines[1] +
192 "\"). This is not supported by the API and not " +
193 "conform to LTAG-spinal standards, although " +
194 "Shen's incremental parser sometimes produces this " +
195 "output. " +
196 "Only the first root has been read in.");
197 multirootedParses++;
198 }
199
200 if (rootParts.length<2) {
201 System.err.println("WARNING: Bad root: "
202 + "section " + sectionNumber
203 + " file " + fileNumber
204 + " sentence " + sentenceNumber);
205 }
206
207 rootNumber=
208 Integer.parseInt(rootParts[1]);
209 } catch (NumberFormatException e) {
210 throw new ElemTreeFormatException("Invalid root number: " +
211 lines[1].substring(5));
212 }
213
214 // now parse the spinal elementary trees in the file
215 String[] elemTreeRepresentations=elemTreePattern.split(lines[2],0);
216
217 for (int i=1; i<elemTreeRepresentations.length; i++) {
218 elemTrees.add(new ElemTree(this, elemTreeRepresentations[i]));
219 }
220
221 root=(ElemTree)elemTrees.get(rootNumber);
222
223 for (Iterator it=elemTrees.iterator(); it.hasNext(); ) {
224 ElemTree node=(ElemTree)it.next();
225
226 node.complete();
227 }
228 }
229
230 /**
231 * Reads a string representation of a derivation tree from the specified
232 * <code>BufferedReader</code>.
233 * @return a <code>Sentence</code> element representing the derivation tree, or null
234 * if the input contained nothing or contained only whitespace
235 * @param inp the <code>BufferedReader</code> from which to read
236 * @throws edu.upenn.cis.spinal.ElemTreeFormatException if an error occurs while parsing the string representation
237 * @throws java.io.IOException if an error occurs while reading
238 */
239 public static Sentence readTree(BufferedReader inp)
240 throws ElemTreeFormatException, IOException {
241 StringBuffer s=new StringBuffer("");
242 String in=null;
243
244 while (true) {
245 in=inp.readLine();
246
247 if ((in==null) || in.matches("^\\s*$")) { // null or whitespace
248 break;
249 } else {
250 s.append(in);
251 s.append("\n");
252 }
253 }
254
255 if (s.length()==0) {
256 return null;
257 } else {
258 return new Sentence(s.toString());
259 }
260 }
261
262 /**
263 * Prints this sentence to the specified output in LTAG-spinal format.
264 * @param w the {@link java.io.Writer} to which this sentence is to be written
265 * @throws java.io.IOException if an error occurs during writing
266 */
267 public void writeTo(Writer w) throws IOException {
268 this.writeTo(new BufferedWriter(w));
269 }
270
271 /**
272 * Prints this sentence to the specified output in LTAG-spinal format.
273 * @param b the {@link java.io.BufferedWriter} to which this sentence is to be written
274 * @throws java.io.IOException if an error occurs during writing
275 */
276 public void writeTo(BufferedWriter b) throws IOException {
277 b.write(this.getLocation());
278 b.newLine();
279 if (this.isSkipped()) {
280 b.write("skip");
281 b.newLine();
282 b.newLine();
283
284 b.flush();
285 return;
286 }
287
288 b.write("root " + this.getRoot().getNumber());
289 b.newLine();
290
291 Iterator theElemTrees = this.elemTreesIterator();
292 while (theElemTrees.hasNext()) {
293 ElemTree current = (ElemTree) theElemTrees.next();
294 b.write(current.toString());
295
296 }
297 b.newLine();
298 b.flush();
299
300 }
301
302
303 /**
304 * Writes a visual representation of a subpart of this sentence in Graphviz format
305 * to the specified {@link java.io.BufferedWriter}.
306 *
307 *
308 * @param b the {@link java.io.BufferedWriter} to which the subsentence is to be written
309 * @param start the first word of the sentence to be included in the graphical output
310 * @param end the last word of the sentence to be included in the graphical output
311 * @param includeSpans if true, the word span of the subtree dominated by a node
312 * is appended to that node's representation; otherwise, it only
313 * consists of the node label
314 * @param beanPoleStyle chooses between two very different styles of output -- if true,
315 * the output looks like beanpoles, if false, it looks like tadpoles. See the
316 * illustrations on the <a href="http://www.cis.upenn.edu/%7Extag/spinal">LTAG-spinal website</a>.
317 * @param showSpines if true, shows the internal structure of the elementary trees;
318 * otherwise, shows each elementary tree as one single node
319 * @throws java.io.IOException if an error occurs during writing
320 */
321 public void writeGraphvizTo(BufferedWriter b, int start, int end,
322 boolean includeSpans, boolean beanPoleStyle,boolean showSpines) throws IOException {
323 b.write("digraph {");
324 b.newLine();
325
326 // write out location
327 b.write("node_location[label=\""
328 + this.prettyPrintLocation()
329 + "\" shape=box];");
330 b.newLine();
331
332
333 if (this.isSkipped()) {
334 b.write("skip;");
335 } else {
336
337 if (showSpines && !this.isBidirectionalParserOutput()) {
338 // Create cluster with all the terminal nodes
339 // in order to line them up horizontally.
340 // We only do this if we actually have spines to show (i.e.
341 // not if we have bidirectional parser output) and if we are
342 // asked to show the spines (i.e. showSpines == true) because otherwise
343 // the sentence becomes unreadable.
344 b.write("subgraph { rank = same; edge [style=invis] ");
345 Iterator trees;
346 if (start==-1 || end==-1) {
347 trees = this.getElemTrees().listIterator();
348 } else {
349 trees = this.getElemTrees(start,end).listIterator();
350 }
351 String previousID = null, currentID = null;
352 while (trees.hasNext()) {
353 ElemTree current = (ElemTree) trees.next();
354 previousID = currentID;
355 currentID = current.getGraphvizNodeID();
356 if (previousID != null) {
357 b.write(" -> ");
358 }
359 b.write(currentID);
360 }
361 b.write("; }");
362 b.newLine();
363 }
364
365 // try to find appropriate subtree
366 ElemTree subTree = null;
367 if ((start >=0) && (end >= start)) { // if we aren't supposed to output the whole tree
368 subTree = this.getSubTree(start,end);
369 // may return null if can't find anything
370 }
371
372 if (subTree == null) { // we didn't find anything, or we're working on the whole tree
373 subTree = this.getRoot();
374 }
375
376
377 subTree.writeGraphvizTo(b, start, end, includeSpans, beanPoleStyle, showSpines);
378 }
379
380 b.newLine();
381 b.write("}");
382 b.flush();
383 return;
384
385 }
386
387 /**
388 * Writes a visual representation of this sentence in Graphviz format
389 * to the specified {@link java.io.Writer}.
390 *
391 * @param w the {@link java.io.Writer} to which this sentence is to be written
392 * @param includeSpans if true, the word span of the subtree dominated by a node
393 * is appended to that node's representation; otherwise, it only
394 * consists of the node label
395 * @param beanPoleStyle chooses between two very different styles of output -- if true,
396 * the output looks like beanpoles, if false, it looks like tadpoles. See the
397 * illustrations on the <a href="http://www.cis.upenn.edu/%7Extag/spinal">LTAG-spinal website</a>.
398 * @param showSpines if true, shows the internal structure of the elementary trees;
399 * otherwise, shows each elementary tree as one single node
400 * @throws java.io.IOException if an error occurs while writing
401 */
402 public void writeGraphvizTo(Writer w, boolean includeSpans, boolean beanPoleStyle,boolean showSpines) throws IOException {
403 this.writeGraphvizTo(new BufferedWriter(w), -1, -1, includeSpans, beanPoleStyle, showSpines);
404 }
405
406 /**
407 * Writes a visual representation of a subpart of this sentence in Graphviz format
408 * to the specified {@link java.io.Writer}.
409 *
410 *
411 * @param w the {@link java.io.Writer} to which the subsentence is to be written
412 * @param start the first word of the sentence to be included in the graphical output
413 * @param end the last word of the sentence to be included in the graphical output
414 * @param includeSpans if true, the word span of the subtree dominated by a node
415 * is appended to that node's representation; otherwise, it only
416 * consists of the node label
417 * @param beanPoleStyle chooses between two very different styles of output -- if true,
418 * the output looks like beanpoles, if false, it looks like tadpoles. See the
419 * illustrations on the <a href="http://www.cis.upenn.edu/%7Extag/spinal">LTAG-spinal website</a>.
420 * @param showSpines if true, shows the internal structure of the elementary trees;
421 * otherwise, shows each elementary tree as one single node
422 * @throws java.io.IOException if an error occurs while writing
423 */
424 public void writeGraphvizTo(Writer w, int start, int end, boolean includeSpans, boolean beanPoleStyle,boolean showSpines) throws IOException {
425 this.writeGraphvizTo(new BufferedWriter(w), start, end, includeSpans, beanPoleStyle, showSpines);
426 }
427
428 /**
429 * Writes a visual representation of this sentence in Graphviz format
430 * to the specified {@link java.io.BufferedWriter}.
431 *
432 *
433 * @param b the {@link java.io.BufferedWriter} to which this sentence is to be written
434 * @param includeSpans if true, the word span of the subtree dominated by a node
435 * is appended to that node's representation; otherwise, it only
436 * consists of the node label
437 * @param beanPoleStyle chooses between two very different styles of output -- if true,
438 * the output looks like beanpoles, if false, it looks like tadpoles. See the
439 * illustrations on the <a href="http://www.cis.upenn.edu/%7Extag/spinal">LTAG-spinal website</a>.
440 * @param showSpines if true, shows the internal structure of the elementary trees;
441 * otherwise, shows each elementary tree as one single node
442 * @throws java.io.IOException if an error occurs while writing
443 */
444 public void writeGraphvizTo(BufferedWriter b, boolean includeSpans, boolean beanPoleStyle,boolean showSpines) throws IOException {
445 this.writeGraphvizTo(b, -1, -1, includeSpans, beanPoleStyle, showSpines);
446 }
447
448 /**
449 * Returns a string representation of this sentence in LTAG-spinal format.
450 *
451 * @return a string representing this sentence
452 */
453 public String toString() {
454
455 StringWriter sw = new StringWriter(100);
456 try {
457 this.writeTo(new BufferedWriter(sw));
458 } catch (IOException ex) {
459 // can't occur with a StringWriter
460 ex.printStackTrace();
461 }
462 return sw.toString();
463 }
464
465
466 /**
467 * Returns a visual representation of this sentence in Graphviz format.
468 * @param includeSpans if true, the word span of the subtree dominated by a node
469 * is appended to that node's representation; otherwise, it only
470 * consists of the node label
471 * @param beanPoleStyle chooses between two very different styles of output -- if true,
472 * the output looks like beanpoles, if false, it looks like tadpoles. See the
473 * illustrations on the <a href="http://www.cis.upenn.edu/%7Extag/spinal">LTAG-spinal website</a>.
474 * @param showSpines if true, shows the internal structure of the elementary trees;
475 * otherwise, shows each elementary tree as one single node
476 * @return a <code>String</code> containing Graphviz format
477 */
478 public String toGraphviz(boolean includeSpans, boolean beanPoleStyle,boolean showSpines) {
479 return this.toGraphviz(-1,-1, includeSpans, beanPoleStyle, showSpines);
480 }
481
482
483
484 /**
485 * Returns a visual representation of a subpart of this sentence in Graphviz format.
486 *
487 *
488 * @param start the first word of the sentence to be included in the graphical output
489 * @param end the last word of the sentence to be included in the graphical output
490 * @param includeSpans if true, the word span of the subtree dominated by a node
491 * is appended to that node's representation; otherwise, it only
492 * consists of the node label
493 * @param beanPoleStyle chooses between two very different styles of output -- if true,
494 * the output looks like beanpoles, if false, it looks like tadpoles. See the
495 * illustrations on the <a href="http://www.cis.upenn.edu/%7Extag/spinal">LTAG-spinal website</a>.
496 * @param showSpines if true, shows the internal structure of the elementary trees;
497 * otherwise, shows each elementary tree as one single node
498 *
499 * @return a <code>String</code> containing Graphviz format
500 */
501 public String toGraphviz(int start, int end, boolean includeSpans, boolean beanPoleStyle,boolean showSpines) {
502 StringWriter sw = new StringWriter(100);
503 try {
504 this.writeGraphvizTo(new BufferedWriter(sw), start, end, includeSpans, beanPoleStyle, showSpines);
505 } catch (IOException ex) {
506 // can't occur with a StringWriter
507 ex.printStackTrace();
508 }
509 return sw.toString();
510
511 }
512
513 /**
514 * Returns a <code>String</code> representing the location of the current sentence
515 * -- i.e. either a triple of section, file, and sentence number as
516 * in the LTAG-spinal treebank (following the Penn Treebank conventions),
517 * or simply a sentence number if the sentence is not from the
518 * LTAG-spinal treebank.
519 * @return three numbers indicating where this sentence is found in
520 * the input
521 */
522
523 public String getLocation() {
524 String result = "";
525 if (sectionNumber != -1 && fileNumber != -1) {
526 result += sectionNumber + " " + fileNumber + " ";
527 }
528 return result + sentenceNumber;
529 }
530
531 /**
532 * Returns a human-readable string representing the location of the current sentence.
533 * If the sentence is taken from the LTAG-spinal treebank, the string looks
534 * as follows:
535 * <pre>
536 * Section: X File: Y Sentence: Z
537 * </pre>
538 * Otherwise,
539 * if the sentence only has a sentence number, the string looks like
540 * <pre>
541 * Sentence: Z
542 * </pre>
543 * @return a human-readable string indicating where this sentence is found in
544 * the input
545 */
546 public String prettyPrintLocation() {
547 String result = "";
548 if (sectionNumber != -1 && fileNumber != -1) {
549 result += "Section: " + sectionNumber
550 + " File: " + fileNumber + " ";
551 }
552 return result + "Sentence: " + sentenceNumber;
553 }
554
555 /**
556 * Returns the unique <code>ElemTree</code> that is the root of a subtree whose
557 * yield is the specified word span, or null if there is no such
558 * tree.
559 *
560 * @param w the span from the first up to and including the last word
561 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence
562 * is a skipped sentence in the LTAG-spinal treebank
563 * @return an <code>ElemTree</code> or null
564 */
565 public ElemTree getSubTree(WordSpan w) {
566 return this.getSubTree(w.start(), w.end());
567 }
568
569
570
571 /**
572 * Returns the unique <code>ElemTree</code> that is the root of a subtree whose
573 * yield is the specified word span, or null if there is no such
574 * tree.
575 *
576 * @param start the first (leftmost) word included in the span
577 * @param end the last (rightmost) word included in the span
578 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence
579 * is a skipped sentence in the LTAG-spinal treebank
580 * @return an <code>ElemTree</code> or null
581 */
582 public ElemTree getSubTree(int start, int end) {
583
584 if (this.isSkipped()) {
585 throw new SkippedSentenceException(this);
586 }
587
588 if (this.length()-1 < end) {
589 throw new IllegalArgumentException("Attempted to retrieve a subtree from" +
590 "a WordSpan that spans outside of the sentence");
591 }
592
593 // TODO add warnings if start > end, start < 0, end < 0, etc. here and elsewhere
594
595 List candidates = this.getElemTrees(start, end);
596 Iterator iter = candidates.iterator();
597 ElemTree current;
598 WordSpan span;
599 while (iter.hasNext()) {
600 current = (ElemTree) iter.next();
601 span = current.getSpan();
602 if (start == span.start() && end == span.end()) {
603 return current;
604 }
605 }
606 // if we haven't found anything...
607 return null;
608 }
609
610 /**
611 * Returns true if this <code>Sentence</code> has been read in from the
612 * format used in the output of Shen's bidirectional parser. If this is the case,
613 * no information about the spine is present. This is implemented as a simple
614 * lookup of the corresponding property of the <code>ElemTree</code> at
615 * the root of this sentence.
616 * @return a boolean value
617 * @see ElemTree#isBidirectionalParserOutput()
618 */
619 public boolean isBidirectionalParserOutput() {
620 return this.getRoot().isBidirectionalParserOutput();
621 }
622
623
624 /**
625 * Returns true iff the annotation for this sentence only consists of the word "skip",
626 * indicating that it is contained in the Penn Treebank but
627 * not in the LTAG-spinal treebank.
628 * @return true iff this sentence is skipped in the LTAG-spinal treebank
629 */
630 public boolean isSkipped() {
631 return this.skip;
632 }
633
634 /**
635 * Returns the number of the current sentence in the Penn Treebank file or parser output.
636 * @return the sentence number
637 */
638 public int getSentenceNumber() {
639 return sentenceNumber;
640 }
641
642 /**
643 * Returns the number of the Penn Treebank section in which the current sentence
644 * occurred, or -1 if the sentence is not a Penn Treebank sentence.
645 * @return the section number, or -1 if there is no such number
646 */
647 public int getSectionNumber() {
648 return sectionNumber;
649 }
650
651 /**
652 * Returns the number of the Penn Treebank file in which the current sentence
653 * occurred, or -1 if the sentence is not a Penn Treebank sentence.
654 * @return the file number, or -1 if there is no such number
655 */
656 public int getFileNumber() {
657 return fileNumber;
658 }
659
660
661 /**
662 * Force the spans to be computed recursively on every <code>ElemTree</code>.
663 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence
664 * is a skipped sentence in the LTAG-spinal treebank
665 */
666 private void computeSpans() {
667 if (this.isSkipped()) {
668 throw new SkippedSentenceException(this);
669 }
670
671 this.getRoot().computeSpan();
672 spansComputed = true;
673 }
674
675 /**
676 * Returns the <code>ElemTree</code> whose yield is the given word span,
677 * or null if there isn't one.
678 * @param w the <code>WordSpan</code> for which the dominating tree
679 * is to be returned
680 * @return an <code>ElemTree</code> or null
681 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence
682 * is a skipped sentence in the LTAG-spinal treebank
683 */
684 public ElemTree subTreeForSpan(WordSpan w) {
685 if (this.isSkipped()) {
686 throw new SkippedSentenceException(this);
687 }
688
689 if (!spansComputed) computeSpans();
690 if (!spanTableComputed) computeSpanTable();
691 if (spanTable.containsKey(w)) {
692 return (ElemTree) spanTable.get(w);
693 } else {
694 return null;
695 }
696 }
697
698 /**
699 * Computes the span table, a directory whose keys are word spans and whose
700 * values are the corresponding subtrees (if any).
701 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence
702 * is a skipped sentence in the LTAG-spinal treebank
703 */
704 public void computeSpanTable() {
705 if (this.isSkipped()) {
706 throw new SkippedSentenceException(this);
707 }
708
709 if (!spansComputed) computeSpans();
710 Iterator iter = this.elemTreesIterator();
711 spanTable = new HashMap(this.length());
712 ElemTree current;
713 String bugs="/Users/lingrad2/srl/ltagtb/bugs/";
714 while (iter.hasNext()) {
715
716 current = (ElemTree) iter.next();
717
718
719
720 // The following code was used to retrieve some buggy sentences.
721 // "There are at most 28 sentences in the corpus like that. So we can
722 //ignore them for now, or fix them by hand when we have the chance
723 //(which would lead us to the question of whether we should put the
724 //treebank under version control). I'm attaching a zipfile with the
725 //sentences in question. These are .dot files that you can open in
726 //graphviz or convert using "dot -Tjpg -o output.jpg filename.dot".
727 //Actually not all of them have the bug. The common property of the 28
728 //sentences is that in each of them there are at least two subtrees of
729 //the derivation tree with identical yield. See the .info files."
730 // (Mail by Lucas Champollion to Joshi and Libin Nov 29 2006)
731
732
733
734
735 // String lws = this.getLocation().replaceAll(" ", "_");
736 // // location with underscores
737 // if (spanTable.containsKey(current.getSpan())) {
738 // try {
739 //
740 // BufferedWriter f = new BufferedWriter
741 // (new FileWriter(bugs+lws+".dot"));
742 // this.writeGraphvizTo(f);
743 // f.close();
744 // } catch (IOException ex) {
745 // ex.printStackTrace();
746 // }
747 // BufferedWriter f;
748 // try {
749 // f = new BufferedWriter(new FileWriter(bugs+lws + ".info"));
750 // f.write("location: " + this.getLocation());
751 // f.newLine();
752 // f.write("span: " + current.getSpan());
753 // f.newLine();
754 // f.write("current: " + current);
755 // f.newLine();
756 // f.write("stored: " + (ElemTree) spanTable.get(current.getSpan()));
757 // f.newLine();
758 // f.close();
759 // } catch (IOException ex) {
760 // ex.printStackTrace();
761 // }
762 //
763 // }
764 spanTable.put(current.getSpan(), current);
765 }
766 spanTableComputed=true;
767 }
768
769 /**
770 * Returns true iff at least one of the elementary trees in this
771 * <code>Sentence</code> is an initial tree.
772 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence
773 * is a skipped sentence in the LTAG-spinal treebank
774 * @return true iff there is at least one attachment operation in the
775 * present tree
776 */
777 public boolean containsAttachment() {
778 if (this.isSkipped()) {
779 throw new SkippedSentenceException(this);
780 }
781 Iterator iter = this.elemTreesIterator();
782
783 boolean result = false;
784
785 while (iter.hasNext()) {
786 ElemTree current = (ElemTree) iter.next();
787 if (current.isInitial()) result = true;
788 }
789 return result;
790 }
791
792 /**
793 * Returns true iff at least one of the elementary trees in this
794 * <code>Sentence</code> is an auxiliary tree.
795 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence
796 * is a skipped sentence in the LTAG-spinal treebank
797 * @return true iff there is at least one adjunction operation in the
798 * present tree
799 */
800 public boolean containsAdjunction() {
801 if (this.isSkipped()) {
802 throw new SkippedSentenceException(this);
803 }
804 Iterator iter = this.elemTreesIterator();
805
806 boolean result = false;
807
808 while (iter.hasNext()) {
809 ElemTree current = (ElemTree) iter.next();
810 if (current.isAuxiliary()) result = true;
811 }
812 return result;
813 }
814
815 /**
816 * Returns true iff at least one of the elementary trees in this
817 * <code>Sentence</code> is a conjunction tree.
818 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence
819 * is a skipped sentence in the LTAG-spinal treebank
820 * @return true iff there is at least one coordination operation in the
821 * present tree
822 */
823 public boolean containsCoordination() {
824 if (this.isSkipped()) {
825 throw new SkippedSentenceException(this);
826 }
827 Iterator iter = this.elemTreesIterator();
828
829 boolean result = false;
830
831 while (iter.hasNext()) {
832 ElemTree current = (ElemTree) iter.next();
833 if (current.isCoord()) result = true;
834 }
835 return result;
836 }
837
838 // public void computeExtendedSpanTable() {
839 // computeSpanTable();
840 //
841 // // extended span table is in the same style but trees occur
842 // // under multiple entries: a tree may be pruned by removing any
843 // // number of children
844 // throw new RuntimeException("Not implemented yet");
845 // }
846
847
848 /**
849 * Returns the length of this <code>Sentence</code>, that is, the number of elementary
850 * trees in this derivation tree.
851 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence
852 * is a skipped sentence in the LTAG-spinal treebank
853 * @return the number of elementary trees in this <code>Sentence</code>
854 */
855 public int length() {
856 if (this.isSkipped()) {
857 throw new SkippedSentenceException(this);
858 }
859
860 return elemTrees.size();
861 }
862
863 /**
864 * Returns the elementary tree at the root of this <code>Sentence</code>.
865 *
866 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence
867 * is a skipped sentence in the LTAG-spinal treebank
868 * @return the <code>ElemTree</code> in which this derivation tree is rooted
869 */
870 public ElemTree getRoot() {
871 if (this.isSkipped()) {
872 throw new SkippedSentenceException(this);
873 }
874
875 return root;
876 }
877
878 /**
879 * Iterates over the elementary trees of which this <code>Sentence</code>
880 * consists, in the order in which they are numbered (left to right in the
881 * sentence).
882 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence
883 * is a skipped sentence in the LTAG-spinal treebank
884 * @return a <code>ListIterator</code>
885 */
886 public ListIterator elemTreesIterator() {
887 if (this.isSkipped()) {
888 throw new SkippedSentenceException(this);
889 }
890
891 return elemTrees.listIterator();
892 }
893
894 /**
895 * Returns the <code>ElemTree</code> associated with the <code>n</code>th word of
896 * the sentence.
897 * @param n a number between 0 and the length of the sentence
898 * @throws IndexOutOfBoundsException if index is out of range <tt>(index
899 * < 0 || index >= size())</tt>.
900 * @return an <code>ElemTree</code> for the <code>n</code>th word of the sentence
901 */
902 public ElemTree getElemTree(int n) {
903 if (this.isSkipped()) {
904 throw new SkippedSentenceException(this);
905 }
906
907 return (ElemTree)elemTrees.get(n);
908 }
909
910 /**
911 * Returns a <code>List</code> of <code>ElemTree</code>s for
912 * the given word span.
913 * @return an ordered list containing some of the elementary trees of which
914 * this sentence consists
915 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence
916 * is a skipped sentence in the LTAG-spinal treebank
917 */
918 public List getElemTrees() {
919 if (this.isSkipped()) {
920 throw new SkippedSentenceException(this);
921 }
922
923 return this.elemTrees;
924 }
925
926 /**
927 * Returns a <code>List</code> of <code>ElemTree</code>s for
928 * the given word span.
929 * @param from the first word to be included in the list
930 * @param to the last word to be included in the list
931 * @return an ordered list containing some of the elementary trees of which
932 * this sentence consists
933 * @throws edu.upenn.cis.spinal.SkippedSentenceException if the current sentence
934 * is a skipped sentence in the LTAG-spinal treebank
935 */
936 public List getElemTrees(int from, int to) {
937 if (this.isSkipped()) {
938 throw new SkippedSentenceException(this);
939 }
940
941 return this.elemTrees.subList(from, to+1);
942 // the "+1" is because our spans are inclusive but subList is exclusive
943 }
944
945
946
947
948
949
950
951 }