001 /** 002 * LTAG-spinal API, an interface to the treebank format introduced by Libin Shen. 003 * Copyright (C) 2007 Lucas Champollion 004 * 005 * This program is free software: you can redistribute it and/or modify 006 * it under the terms of the GNU General Public License as published by 007 * the Free Software Foundation, either version 3 of the License, or 008 * (at your option) any later version. 009 * 010 * This program is distributed in the hope that it will be useful, 011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 * GNU General Public License for more details. 014 * 015 * You should have received a copy of the GNU General Public License 016 * along with this program. If not, see <http://www.gnu.org/licenses/>. 017 * 018 */ 019 package edu.upenn.cis.spinal; 020 021 import java.util.*; 022 023 /** 024 * Represents the spine of a spinal elementary tree. 025 * A typical line in the treebank that contains a spine looks like this: 026 * 027 * <pre> 028 * b_( VP VB^ VP* ) 029 * </pre> 030 * 031 * However, the first letter indicates a property of the elementary tree 032 * (whether it is initial, auxiliary, or coordination) and is strictly speaking 033 * not a part of the spine, nor is the underscore folloing it. 034 * So we only consider the following a spine: 035 * 036 * <pre> 037 * ( VP VB^ VP* ) 038 * </pre> 039 * 040 * Other examples of spines: 041 * 042 * <pre> 043 * ( S ( VP VBD^ ) ) 044 * 045 * ( S ( VP ( XP JJ^ ) ) ) 046 * 047 * ( XP NONE^ ) 048 * 049 * JJ^ 050 * 051 * ( S S S ) # a spine for predicate coordination 052 * </pre> 053 * 054 * Spines may be of three shapes (Libin Shen's thesis, p. 15): 055 * <ul> 056 * <li>A spinal initial tree is composed of a lexical spine from the root 057 * to the anchor, and nothing else. 058 * <li>A spinal auxiliary tree is composed of a lexical spine and a recursive 059 * spine from the root to the foot node. (The common part of a lexical spine 060 * and a recursive spine is called the shared spine of an auxiliary tree.) 061 * <li>It is not clear what exactly are the constraints on the shape of 062 * coordination trees. However these trees are not really linguistically 063 * meaningful and might be removed or normalized in a future version of the treebank. 064 *</ul> 065 * Instances of this class are immutable as seen from the "public" view. 066 * 067 * @author Lucas Champollion 068 */ 069 public class SpinalNode { 070 071 072 /** 073 * Represents the character used to show that a node is an anchor node 074 * (the node where the word is attached), i.e. "<code>^</code>". 075 */ 076 public static final String ANCHOR = "^"; 077 078 /** 079 * Some alternative characters that might be used for the anchor because they 080 * look similar. These are: 081 * 082 * <ul> 083 * <li><code>\u02C6 \\u02C6</code> -- unicode modifier letter circumflex accent 084 * <li><code>\u005E \\u005E</code> -- unicode circumflex accent 085 * <li><code>\u0302 \\u0302</code> -- unicode combining circumflex accent 086 * </ul> 087 */ 088 public static final String ANCHORS 089 = "\u02C6" // unicode modifier letter circumflex accent 090 + "\u005E" // unicode circumflex accent 091 + "\u0302" // unicode combining circumflex accent 092 ; 093 /** 094 * Represents the character used to show that a node is a foot node, 095 * in an auxiliary tree, i.e. "<code>*</code>". 096 */ 097 public static final String FOOT = "*"; 098 /** 099 * Represents the character used to show that a node is neither an 100 * anchor nor a foot node (this character is the empty string in Libin Shen's 101 * treebank). 102 */ 103 public static final String STANDARD = ""; 104 105 private ElemTree host; 106 107 private SpinalNode parent; // null indicates that we're at the root of the spine 108 109 private String label; 110 111 private String type; // either anchor or foot or standard 112 113 private SpinalNode[] children = new SpinalNode[0]; 114 115 116 private SpinalNode(ElemTree host, SpinalNode parent) { 117 this.host = host; 118 this.parent = parent; 119 label=null; 120 type=""; 121 } 122 123 124 /** 125 * Creates a new instance of <code>SpinalNode</code> from a string representation. 126 * @param s the string representation from which to create the <code>SpinalNode</code> 127 * @param host the <code>ElemTree</code> to which the spine belongs of which this 128 * <code>SpinalNode</code> is a member, or null if you don't wish to specify it 129 * @throws ElemTreeFormatException if the input string can't be parsed 130 */ 131 public SpinalNode(String s, ElemTree host) throws ElemTreeFormatException { 132 133 this(Arrays.asList(s.split(" ")), host); // tokenize 134 } 135 136 /** 137 * Creates an instance of <code>SpinalNode</code> from a tokenized list of strings. 138 * The list is not modified during the construction of the <code>SpinalNode</code>. 139 * 140 * @param list the list of strings 141 * @param host the <code>ElemTree</code> to which the spine belongs of which this 142 * <code>SpinalNode</code> is a member, or null if you don't wish to specify it 143 * @throws ElemTreeFormatException if list is empty 144 * @throws NullPointerException if list is null 145 */ 146 protected SpinalNode(List list, ElemTree host) throws ElemTreeFormatException { 147 if (list == null) throw new NullPointerException(); 148 if (list.size() == 0) throw new ElemTreeFormatException("Can't create a " + 149 "SpinalNode from an empty string."); 150 151 this.host=host; 152 this.parent=null; 153 154 if (list.size() == 1) { // list is of shape: JJ^ 155 parseLabelAndType((String) list.get(0)); // updates this.type and this.label 156 this.children=new SpinalNode[0]; 157 } else { // e.g. list is of shape: ( XP NNS^ ) 158 159 list = stripSurroundingBrackets(list); // eg. list is now XP NNS^ 160 161 parseLabelAndType((String) list.get(0)); // eg. parse XP 162 this.children = parseChildren(list.subList(1,list.size())); // e.g. parse NNS^ 163 // tell the children who is the boss! 164 for (int i = 0; i < this.children.length; i++) { 165 this.children[i].parent = this; 166 } 167 } 168 169 } 170 171 172 /** 173 * Updates the instance variables <code>type</code> and <code>label</code> when passed a label string. 174 * @param label the string representing the label 175 */ 176 private void parseLabelAndType(String label) { 177 String labelEnd = label.substring(label.length()-1, label.length()); 178 if (label.endsWith(ANCHOR) || (ANCHORS.indexOf(labelEnd) != -1)) { 179 this.type=ANCHOR; 180 this.label=label.substring(0,label.length()-1); 181 } else if (label.endsWith(FOOT)) { 182 this.type=FOOT; 183 this.label=label.substring(0,label.length()-1); 184 } else { 185 this.type=STANDARD; 186 this.label=label; 187 } 188 189 } 190 191 192 193 194 /** 195 * Internal method, called recursively while parsing a <code>SpinalNode</code> 196 * representation. 197 * @param list a list containing string representations of the children 198 * @return an array of spinal nodes representing the children of the current node 199 * @throws ElemTreeFormatException if a parse error occurs 200 */ 201 private SpinalNode[] parseChildren(List list) throws ElemTreeFormatException { 202 203 // If the list are surrounded by brackets, then the first element 204 // of the list is the mother of all the other elements. This is also 205 // true if there is only one part (it need not be surrounded by 206 // brackets). Otherwise, 207 // all the elements on the list are siblings. 208 209 // Returns null if list is empty 210 211 if (list.isEmpty()) return null; 212 213 if (list.size() == 1) { // list is of shape: JJ^ 214 SpinalNode onlyChild = new SpinalNode(this.host, this); 215 onlyChild.parseLabelAndType((String) list.get(0)); 216 onlyChild.setChildren(new SpinalNode[0]); 217 return new SpinalNode[] { onlyChild }; 218 } else { // collect children 219 ArrayList childrenList = new ArrayList(); 220 while (!list.isEmpty()) { 221 childrenList.add(getNodeFrom(list)); 222 list = rest(list); 223 } 224 225 return (SpinalNode[]) childrenList.toArray(new SpinalNode[0]); 226 } 227 } 228 229 /** 230 * Returns the first node from a tokenized spine - that is, the first token, 231 * or if the first token is "(", then the content of the first pair of 232 * brackets. 233 * @param tokens the list 234 * @throws ElemTreeFormatException if a parse error occurs 235 * @return the first node from the list 236 */ 237 private SpinalNode getNodeFrom(List tokens) 238 throws ElemTreeFormatException { 239 if (tokens.get(0).equals("(")) { 240 List subList = tokens.subList(0, whereIsClosingBracket(tokens)+1); 241 return new SpinalNode(subList, this.host); 242 } else { 243 LinkedList l = new LinkedList(); 244 l.add(tokens.get(0)); 245 return new SpinalNode(l, this.host); 246 } 247 } 248 249 /** 250 * Removes the first child from the list and returns the rest of the list, 251 * or returns the empty list if the list is initially empty. 252 * (Actually we don't modify the list, we just provide a view on the 253 * appropriate sublist.) 254 * @param tokens the list 255 * @throws ElemTreeFormatException if a parse error occurs 256 * @return the list with the first element removed 257 */ 258 private static List rest(List tokens) 259 throws ElemTreeFormatException { 260 if (tokens.isEmpty()) return tokens; 261 if (tokens.get(0).equals("(")) { 262 return tokens.subList 263 (whereIsClosingBracket(tokens)+1, tokens.size()); 264 } else { // just skip the first element 265 return tokens.subList(1, tokens.size()); 266 } 267 268 } 269 270 /** 271 * Takes a List of strings. If the first and last elements of 272 * the List are ( and ) respectively, they're stripped off and this 273 * is repeated recursively. Otherwise the List is returned unchanged. 274 * (Actually we don't modify the list, we just provide a view on the 275 * appropriate sublist.) 276 * 277 * @param tokens the list 278 * @return the list with one pair of brackets removed 279 */ 280 private static List stripSurroundingBrackets(List tokens) { 281 if (tokens.get(0).equals("(")) { 282 if (tokens.get(tokens.size()-1).equals(")")) { // the last element 283 return stripSurroundingBrackets(tokens.subList(1, tokens.size()-1)); 284 } else { // unbalanced brackets 285 return tokens; 286 } 287 } else { // nothing to do 288 return tokens; 289 } 290 } 291 292 /** 293 * Takes a List of strings and returns the position (counting 294 * upwards from zero) of 295 * the bracket that matches the List's initial bracket. 296 * If the list doesn't start with a bracket then -1 is returned. 297 * @param tokens the list 298 * @return the position of the closing bracket matching the initial one 299 */ 300 private static int whereIsClosingBracket(List tokens) { 301 302 int pos = -1; 303 304 if (!tokens.get(0).equals("(")) { 305 return -1; 306 } else { 307 int stack = 0; 308 Iterator iter = tokens.iterator(); 309 String current; 310 while (iter.hasNext()) { 311 current = (String) iter.next(); 312 pos++; 313 if (current.equals("(")) { 314 stack++; 315 } else if (current.equals(")")) { 316 stack--; 317 if (stack == 0) { 318 return pos; 319 } else if (stack < 0) { 320 throw new RuntimeException("Bad spine"); 321 } else { // stack > 0 322 // do nothing 323 } 324 } // end if (current == ")") 325 } // end iteration over tokens 326 if (stack != 0) throw new RuntimeException("Bad spine"); 327 return tokens.size()-1; 328 } // end else 329 330 } // end whereIsClosingBracket 331 332 /** 333 * Gets the label of this spinal node. 334 * @return A string, e.g. <code>XP</code> or <code>NNS</code>. 335 * Characters that denote foot nodes or 336 * anchors are not returned. 337 */ 338 public String getLabel() { 339 return label; 340 } 341 342 /** 343 * Gets the type of this <code>SpinalNode</code>. Returns one of "ANCHOR", "FOOT", or 344 * "STANDARD". 345 * @return Either ANCHOR, FOOT, or STANDARD 346 */ 347 public String getType() { 348 return type; 349 } 350 351 /** 352 * Returns whether this node is an anchor node. 353 * @return a boolean value 354 */ 355 public boolean isAnchor() { 356 return type.equals(ANCHOR); 357 } 358 359 /** 360 * Returns whether this node is a foot node. 361 * @return a boolean value 362 */ 363 public boolean isFoot() { 364 return type.equals(FOOT); 365 } 366 367 /** 368 * Returns whether this node is a standard node (neither foot nor anchor). 369 * @return a boolean value 370 */ 371 public boolean isStandard() { 372 return type.equals(STANDARD); 373 } 374 375 /** 376 * Returns an array of children, or null if there are no children 377 * to this node. Children are understood as children nodes within the spine of this 378 * elementary tree, not as other elementary trees. 379 * @return the children of this spinal node 380 */ 381 public SpinalNode[] getChildren() { 382 383 return children; 384 385 } 386 387 /** 388 * Returns a list of children, or an empty list if there are no children 389 * to this node. Children are understood as children nodes within the spine of this 390 * elementary tree, not as other elementary trees. 391 * @return the children of this spinal node 392 */ 393 public List getChildrenList() { 394 return Arrays.asList(this.getChildren()); 395 } 396 397 398 /** 399 * Returns a list of all the descendents of this spinal node, including itself. 400 * @return the descendents of this node 401 */ 402 public List getAllNodes() { 403 List result = new ArrayList(); 404 result.add(this); 405 Iterator theChildren = Arrays.asList(this.getChildren()).iterator(); 406 while (theChildren.hasNext()) { 407 SpinalNode current = (SpinalNode) theChildren.next(); 408 result.addAll(current.getAllNodes()); 409 } 410 return result; 411 } 412 413 /** 414 * Returns the specified child of this spinal node. 415 * @param n a number specifying the child, starting with zero. 416 * @throws ArrayIndexOutOfBoundsException if this node does not have as many 417 * children as the specified index 418 * @return a <code>SpinalNode</code> for the child 419 */ 420 public SpinalNode getChild(int n) { 421 return this.getChildren()[n]; 422 } 423 424 /** 425 * Returns the {@link GornAddress} of this spinal node, relative to the 426 * root of the spine in which it is contained (rather than the root of the 427 * derivation tree in which it is contained). 428 * @return a Gorn address, such as <code>0</code> for the root of the spine 429 */ 430 public GornAddress getLocationInSpine() { 431 String result = ""; 432 SpinalNode current = this; 433 while (!current.isRootOfSpine()) { 434 SpinalNode theParent = current.getParent(false); 435 SpinalNode[] siblings = theParent.getChildren(); 436 for (int i = 0; i < siblings.length; i++) { 437 if (current.equals(siblings[i])) { 438 // append to the left as we're walking upward the tree 439 if (!result.equals("")) result = GornAddress.SEPARATOR + result; 440 result = String.valueOf(i) + result; 441 break; 442 } 443 } 444 // move up the tree 445 current = theParent; 446 } 447 // current is now the root 448 // finally, prepend 0 for the root 449 if (!result.equals("")) result = GornAddress.SEPARATOR + result; 450 result = "0" + result; 451 452 return new GornAddress(result); 453 } 454 455 private void setChildren(SpinalNode[] children) { 456 if (children==null) throw new IllegalArgumentException("Attempted to set children of spinal node to null." + 457 "Please provide empty SpinalNode[0] array instead."); 458 459 this.children=children; 460 } 461 462 /** 463 * Returns a canonical representation of the subtree rooted in this 464 * spinal node (e.g. <code>(XP NNS^)</code>). 465 * @return a string. 466 */ 467 public String toString() { 468 String result = this.getLabel()+this.getType(); 469 SpinalNode[] theChildren = this.getChildren(); 470 471 if (theChildren.length==0) { 472 return result; 473 } else { 474 for (int i = 0; i < theChildren.length; i++) { 475 result = result + " " + theChildren[i]; 476 } 477 return "( " + result + " )"; 478 } 479 } 480 481 /** 482 * Returns a unique identifier for this node based on its location in the spine and 483 * the node ID of its elementary tree, for purposes of constructing Graphviz output. 484 * @return a string acting as the node ID for Graphviz 485 */ 486 String getGraphvizNodeID() { 487 return 488 "spine_of_" 489 + this.getElemTree().getGraphvizNodeID().substring(5) // get rid of the word "node" in the node ID 490 + "_at_" 491 + this.getLocationInSpine().toString("_"); // Graphviz doesn't like periods in node names 492 //+ hashCode(); 493 494 } 495 496 497 /** 498 * Gets the elementary tree to which this spine or spinal node belongs. 499 * @return the host tree 500 */ 501 public ElemTree getElemTree() { 502 return host; 503 } 504 505 506 /** 507 * Returns the <code>SpinalNode</code> that is the parent of this node. 508 * 509 * @param acrossElemTrees if true, then if this node is at the root of 510 * its <code>ElemTree</code>, then the attachment site in the parent <code>ElemTree</code> will be 511 * returned; otherwise if this node is at the root of its <code>ElemTree</code>, null will 512 * be returned 513 * 514 * @return the parent of this node in the spine 515 */ 516 public SpinalNode getParent(boolean acrossElemTrees) { 517 if (acrossElemTrees && this.isRootOfSpine()) { 518 return this.getElemTree().getAttachmentSite(); 519 } else { 520 return parent; 521 } 522 } 523 524 /** 525 * Returns whether this node is at the root of its spine. 526 * @return a boolean value 527 */ 528 public boolean isRootOfSpine() { 529 return parent==null; 530 } 531 532 533 }