001 /**
002 * LTAG-spinal API, an interface to the treebank format introduced by Libin Shen.
003 * Copyright (C) 2007 Lucas Champollion
004 *
005 * This program is free software: you can redistribute it and/or modify
006 * it under the terms of the GNU General Public License as published by
007 * the Free Software Foundation, either version 3 of the License, or
008 * (at your option) any later version.
009 *
010 * This program is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 * GNU General Public License for more details.
014 *
015 * You should have received a copy of the GNU General Public License
016 * along with this program. If not, see <http://www.gnu.org/licenses/>.
017 *
018 */
019 package edu.upenn.cis.spinal;
020
021 import java.util.*;
022
023 /**
024 * Represents the spine of a spinal elementary tree.
025 * A typical line in the treebank that contains a spine looks like this:
026 *
027 * <pre>
028 * b_( VP VB^ VP* )
029 * </pre>
030 *
031 * However, the first letter indicates a property of the elementary tree
032 * (whether it is initial, auxiliary, or coordination) and is strictly speaking
033 * not a part of the spine, nor is the underscore folloing it.
034 * So we only consider the following a spine:
035 *
036 * <pre>
037 * ( VP VB^ VP* )
038 * </pre>
039 *
040 * Other examples of spines:
041 *
042 * <pre>
043 * ( S ( VP VBD^ ) )
044 *
045 * ( S ( VP ( XP JJ^ ) ) )
046 *
047 * ( XP NONE^ )
048 *
049 * JJ^
050 *
051 * ( S S S ) # a spine for predicate coordination
052 * </pre>
053 *
054 * Spines may be of three shapes (Libin Shen's thesis, p. 15):
055 * <ul>
056 * <li>A spinal initial tree is composed of a lexical spine from the root
057 * to the anchor, and nothing else.
058 * <li>A spinal auxiliary tree is composed of a lexical spine and a recursive
059 * spine from the root to the foot node. (The common part of a lexical spine
060 * and a recursive spine is called the shared spine of an auxiliary tree.)
061 * <li>It is not clear what exactly are the constraints on the shape of
062 * coordination trees. However these trees are not really linguistically
063 * meaningful and might be removed or normalized in a future version of the treebank.
064 *</ul>
065 * Instances of this class are immutable as seen from the "public" view.
066 *
067 * @author Lucas Champollion
068 */
069 public class SpinalNode {
070
071
072 /**
073 * Represents the character used to show that a node is an anchor node
074 * (the node where the word is attached), i.e. "<code>^</code>".
075 */
076 public static final String ANCHOR = "^";
077
078 /**
079 * Some alternative characters that might be used for the anchor because they
080 * look similar. These are:
081 *
082 * <ul>
083 * <li><code>\u02C6 \\u02C6</code> -- unicode modifier letter circumflex accent
084 * <li><code>\u005E \\u005E</code> -- unicode circumflex accent
085 * <li><code>\u0302 \\u0302</code> -- unicode combining circumflex accent
086 * </ul>
087 */
088 public static final String ANCHORS
089 = "\u02C6" // unicode modifier letter circumflex accent
090 + "\u005E" // unicode circumflex accent
091 + "\u0302" // unicode combining circumflex accent
092 ;
093 /**
094 * Represents the character used to show that a node is a foot node,
095 * in an auxiliary tree, i.e. "<code>*</code>".
096 */
097 public static final String FOOT = "*";
098 /**
099 * Represents the character used to show that a node is neither an
100 * anchor nor a foot node (this character is the empty string in Libin Shen's
101 * treebank).
102 */
103 public static final String STANDARD = "";
104
105 private ElemTree host;
106
107 private SpinalNode parent; // null indicates that we're at the root of the spine
108
109 private String label;
110
111 private String type; // either anchor or foot or standard
112
113 private SpinalNode[] children = new SpinalNode[0];
114
115
116 private SpinalNode(ElemTree host, SpinalNode parent) {
117 this.host = host;
118 this.parent = parent;
119 label=null;
120 type="";
121 }
122
123
124 /**
125 * Creates a new instance of <code>SpinalNode</code> from a string representation.
126 * @param s the string representation from which to create the <code>SpinalNode</code>
127 * @param host the <code>ElemTree</code> to which the spine belongs of which this
128 * <code>SpinalNode</code> is a member, or null if you don't wish to specify it
129 * @throws ElemTreeFormatException if the input string can't be parsed
130 */
131 public SpinalNode(String s, ElemTree host) throws ElemTreeFormatException {
132
133 this(Arrays.asList(s.split(" ")), host); // tokenize
134 }
135
136 /**
137 * Creates an instance of <code>SpinalNode</code> from a tokenized list of strings.
138 * The list is not modified during the construction of the <code>SpinalNode</code>.
139 *
140 * @param list the list of strings
141 * @param host the <code>ElemTree</code> to which the spine belongs of which this
142 * <code>SpinalNode</code> is a member, or null if you don't wish to specify it
143 * @throws ElemTreeFormatException if list is empty
144 * @throws NullPointerException if list is null
145 */
146 protected SpinalNode(List list, ElemTree host) throws ElemTreeFormatException {
147 if (list == null) throw new NullPointerException();
148 if (list.size() == 0) throw new ElemTreeFormatException("Can't create a " +
149 "SpinalNode from an empty string.");
150
151 this.host=host;
152 this.parent=null;
153
154 if (list.size() == 1) { // list is of shape: JJ^
155 parseLabelAndType((String) list.get(0)); // updates this.type and this.label
156 this.children=new SpinalNode[0];
157 } else { // e.g. list is of shape: ( XP NNS^ )
158
159 list = stripSurroundingBrackets(list); // eg. list is now XP NNS^
160
161 parseLabelAndType((String) list.get(0)); // eg. parse XP
162 this.children = parseChildren(list.subList(1,list.size())); // e.g. parse NNS^
163 // tell the children who is the boss!
164 for (int i = 0; i < this.children.length; i++) {
165 this.children[i].parent = this;
166 }
167 }
168
169 }
170
171
172 /**
173 * Updates the instance variables <code>type</code> and <code>label</code> when passed a label string.
174 * @param label the string representing the label
175 */
176 private void parseLabelAndType(String label) {
177 String labelEnd = label.substring(label.length()-1, label.length());
178 if (label.endsWith(ANCHOR) || (ANCHORS.indexOf(labelEnd) != -1)) {
179 this.type=ANCHOR;
180 this.label=label.substring(0,label.length()-1);
181 } else if (label.endsWith(FOOT)) {
182 this.type=FOOT;
183 this.label=label.substring(0,label.length()-1);
184 } else {
185 this.type=STANDARD;
186 this.label=label;
187 }
188
189 }
190
191
192
193
194 /**
195 * Internal method, called recursively while parsing a <code>SpinalNode</code>
196 * representation.
197 * @param list a list containing string representations of the children
198 * @return an array of spinal nodes representing the children of the current node
199 * @throws ElemTreeFormatException if a parse error occurs
200 */
201 private SpinalNode[] parseChildren(List list) throws ElemTreeFormatException {
202
203 // If the list are surrounded by brackets, then the first element
204 // of the list is the mother of all the other elements. This is also
205 // true if there is only one part (it need not be surrounded by
206 // brackets). Otherwise,
207 // all the elements on the list are siblings.
208
209 // Returns null if list is empty
210
211 if (list.isEmpty()) return null;
212
213 if (list.size() == 1) { // list is of shape: JJ^
214 SpinalNode onlyChild = new SpinalNode(this.host, this);
215 onlyChild.parseLabelAndType((String) list.get(0));
216 onlyChild.setChildren(new SpinalNode[0]);
217 return new SpinalNode[] { onlyChild };
218 } else { // collect children
219 ArrayList childrenList = new ArrayList();
220 while (!list.isEmpty()) {
221 childrenList.add(getNodeFrom(list));
222 list = rest(list);
223 }
224
225 return (SpinalNode[]) childrenList.toArray(new SpinalNode[0]);
226 }
227 }
228
229 /**
230 * Returns the first node from a tokenized spine - that is, the first token,
231 * or if the first token is "(", then the content of the first pair of
232 * brackets.
233 * @param tokens the list
234 * @throws ElemTreeFormatException if a parse error occurs
235 * @return the first node from the list
236 */
237 private SpinalNode getNodeFrom(List tokens)
238 throws ElemTreeFormatException {
239 if (tokens.get(0).equals("(")) {
240 List subList = tokens.subList(0, whereIsClosingBracket(tokens)+1);
241 return new SpinalNode(subList, this.host);
242 } else {
243 LinkedList l = new LinkedList();
244 l.add(tokens.get(0));
245 return new SpinalNode(l, this.host);
246 }
247 }
248
249 /**
250 * Removes the first child from the list and returns the rest of the list,
251 * or returns the empty list if the list is initially empty.
252 * (Actually we don't modify the list, we just provide a view on the
253 * appropriate sublist.)
254 * @param tokens the list
255 * @throws ElemTreeFormatException if a parse error occurs
256 * @return the list with the first element removed
257 */
258 private static List rest(List tokens)
259 throws ElemTreeFormatException {
260 if (tokens.isEmpty()) return tokens;
261 if (tokens.get(0).equals("(")) {
262 return tokens.subList
263 (whereIsClosingBracket(tokens)+1, tokens.size());
264 } else { // just skip the first element
265 return tokens.subList(1, tokens.size());
266 }
267
268 }
269
270 /**
271 * Takes a List of strings. If the first and last elements of
272 * the List are ( and ) respectively, they're stripped off and this
273 * is repeated recursively. Otherwise the List is returned unchanged.
274 * (Actually we don't modify the list, we just provide a view on the
275 * appropriate sublist.)
276 *
277 * @param tokens the list
278 * @return the list with one pair of brackets removed
279 */
280 private static List stripSurroundingBrackets(List tokens) {
281 if (tokens.get(0).equals("(")) {
282 if (tokens.get(tokens.size()-1).equals(")")) { // the last element
283 return stripSurroundingBrackets(tokens.subList(1, tokens.size()-1));
284 } else { // unbalanced brackets
285 return tokens;
286 }
287 } else { // nothing to do
288 return tokens;
289 }
290 }
291
292 /**
293 * Takes a List of strings and returns the position (counting
294 * upwards from zero) of
295 * the bracket that matches the List's initial bracket.
296 * If the list doesn't start with a bracket then -1 is returned.
297 * @param tokens the list
298 * @return the position of the closing bracket matching the initial one
299 */
300 private static int whereIsClosingBracket(List tokens) {
301
302 int pos = -1;
303
304 if (!tokens.get(0).equals("(")) {
305 return -1;
306 } else {
307 int stack = 0;
308 Iterator iter = tokens.iterator();
309 String current;
310 while (iter.hasNext()) {
311 current = (String) iter.next();
312 pos++;
313 if (current.equals("(")) {
314 stack++;
315 } else if (current.equals(")")) {
316 stack--;
317 if (stack == 0) {
318 return pos;
319 } else if (stack < 0) {
320 throw new RuntimeException("Bad spine");
321 } else { // stack > 0
322 // do nothing
323 }
324 } // end if (current == ")")
325 } // end iteration over tokens
326 if (stack != 0) throw new RuntimeException("Bad spine");
327 return tokens.size()-1;
328 } // end else
329
330 } // end whereIsClosingBracket
331
332 /**
333 * Gets the label of this spinal node.
334 * @return A string, e.g. <code>XP</code> or <code>NNS</code>.
335 * Characters that denote foot nodes or
336 * anchors are not returned.
337 */
338 public String getLabel() {
339 return label;
340 }
341
342 /**
343 * Gets the type of this <code>SpinalNode</code>. Returns one of "ANCHOR", "FOOT", or
344 * "STANDARD".
345 * @return Either ANCHOR, FOOT, or STANDARD
346 */
347 public String getType() {
348 return type;
349 }
350
351 /**
352 * Returns whether this node is an anchor node.
353 * @return a boolean value
354 */
355 public boolean isAnchor() {
356 return type.equals(ANCHOR);
357 }
358
359 /**
360 * Returns whether this node is a foot node.
361 * @return a boolean value
362 */
363 public boolean isFoot() {
364 return type.equals(FOOT);
365 }
366
367 /**
368 * Returns whether this node is a standard node (neither foot nor anchor).
369 * @return a boolean value
370 */
371 public boolean isStandard() {
372 return type.equals(STANDARD);
373 }
374
375 /**
376 * Returns an array of children, or null if there are no children
377 * to this node. Children are understood as children nodes within the spine of this
378 * elementary tree, not as other elementary trees.
379 * @return the children of this spinal node
380 */
381 public SpinalNode[] getChildren() {
382
383 return children;
384
385 }
386
387 /**
388 * Returns a list of children, or an empty list if there are no children
389 * to this node. Children are understood as children nodes within the spine of this
390 * elementary tree, not as other elementary trees.
391 * @return the children of this spinal node
392 */
393 public List getChildrenList() {
394 return Arrays.asList(this.getChildren());
395 }
396
397
398 /**
399 * Returns a list of all the descendents of this spinal node, including itself.
400 * @return the descendents of this node
401 */
402 public List getAllNodes() {
403 List result = new ArrayList();
404 result.add(this);
405 Iterator theChildren = Arrays.asList(this.getChildren()).iterator();
406 while (theChildren.hasNext()) {
407 SpinalNode current = (SpinalNode) theChildren.next();
408 result.addAll(current.getAllNodes());
409 }
410 return result;
411 }
412
413 /**
414 * Returns the specified child of this spinal node.
415 * @param n a number specifying the child, starting with zero.
416 * @throws ArrayIndexOutOfBoundsException if this node does not have as many
417 * children as the specified index
418 * @return a <code>SpinalNode</code> for the child
419 */
420 public SpinalNode getChild(int n) {
421 return this.getChildren()[n];
422 }
423
424 /**
425 * Returns the {@link GornAddress} of this spinal node, relative to the
426 * root of the spine in which it is contained (rather than the root of the
427 * derivation tree in which it is contained).
428 * @return a Gorn address, such as <code>0</code> for the root of the spine
429 */
430 public GornAddress getLocationInSpine() {
431 String result = "";
432 SpinalNode current = this;
433 while (!current.isRootOfSpine()) {
434 SpinalNode theParent = current.getParent(false);
435 SpinalNode[] siblings = theParent.getChildren();
436 for (int i = 0; i < siblings.length; i++) {
437 if (current.equals(siblings[i])) {
438 // append to the left as we're walking upward the tree
439 if (!result.equals("")) result = GornAddress.SEPARATOR + result;
440 result = String.valueOf(i) + result;
441 break;
442 }
443 }
444 // move up the tree
445 current = theParent;
446 }
447 // current is now the root
448 // finally, prepend 0 for the root
449 if (!result.equals("")) result = GornAddress.SEPARATOR + result;
450 result = "0" + result;
451
452 return new GornAddress(result);
453 }
454
455 private void setChildren(SpinalNode[] children) {
456 if (children==null) throw new IllegalArgumentException("Attempted to set children of spinal node to null." +
457 "Please provide empty SpinalNode[0] array instead.");
458
459 this.children=children;
460 }
461
462 /**
463 * Returns a canonical representation of the subtree rooted in this
464 * spinal node (e.g. <code>(XP NNS^)</code>).
465 * @return a string.
466 */
467 public String toString() {
468 String result = this.getLabel()+this.getType();
469 SpinalNode[] theChildren = this.getChildren();
470
471 if (theChildren.length==0) {
472 return result;
473 } else {
474 for (int i = 0; i < theChildren.length; i++) {
475 result = result + " " + theChildren[i];
476 }
477 return "( " + result + " )";
478 }
479 }
480
481 /**
482 * Returns a unique identifier for this node based on its location in the spine and
483 * the node ID of its elementary tree, for purposes of constructing Graphviz output.
484 * @return a string acting as the node ID for Graphviz
485 */
486 String getGraphvizNodeID() {
487 return
488 "spine_of_"
489 + this.getElemTree().getGraphvizNodeID().substring(5) // get rid of the word "node" in the node ID
490 + "_at_"
491 + this.getLocationInSpine().toString("_"); // Graphviz doesn't like periods in node names
492 //+ hashCode();
493
494 }
495
496
497 /**
498 * Gets the elementary tree to which this spine or spinal node belongs.
499 * @return the host tree
500 */
501 public ElemTree getElemTree() {
502 return host;
503 }
504
505
506 /**
507 * Returns the <code>SpinalNode</code> that is the parent of this node.
508 *
509 * @param acrossElemTrees if true, then if this node is at the root of
510 * its <code>ElemTree</code>, then the attachment site in the parent <code>ElemTree</code> will be
511 * returned; otherwise if this node is at the root of its <code>ElemTree</code>, null will
512 * be returned
513 *
514 * @return the parent of this node in the spine
515 */
516 public SpinalNode getParent(boolean acrossElemTrees) {
517 if (acrossElemTrees && this.isRootOfSpine()) {
518 return this.getElemTree().getAttachmentSite();
519 } else {
520 return parent;
521 }
522 }
523
524 /**
525 * Returns whether this node is at the root of its spine.
526 * @return a boolean value
527 */
528 public boolean isRootOfSpine() {
529 return parent==null;
530 }
531
532
533 }