001    /**
002     * LTAG-spinal API, an interface to the treebank format introduced by Libin Shen.
003     * Copyright (C) 2007  Lucas Champollion
004     *
005     * This program is free software: you can redistribute it and/or modify
006     * it under the terms of the GNU General Public License as published by
007     * the Free Software Foundation, either version 3 of the License, or
008     * (at your option) any later version.
009     * 
010     * This program is distributed in the hope that it will be useful,
011     * but WITHOUT ANY WARRANTY; without even the implied warranty of
012     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013     * GNU General Public License for more details.
014     *
015     * You should have received a copy of the GNU General Public License
016     * along with this program.  If not, see <http://www.gnu.org/licenses/>.
017     *
018     */
019    package edu.upenn.cis.spinal;
020    
021    import java.util.*;
022    
023    /**
024     * Represents the spine of a spinal elementary tree.
025     * A typical line in the treebank that contains a spine looks like this:
026     * 
027     * <pre>
028     * b_( VP VB^ VP* )
029     * </pre>
030     *
031     * However, the first letter indicates a property of the elementary tree
032     * (whether it is initial, auxiliary, or coordination) and is strictly speaking
033     * not a part of the spine, nor is the underscore folloing it. 
034     * So we only consider the following a spine:
035     *
036     * <pre>
037     * ( VP VB^ VP* )
038     * </pre>
039     *
040     * Other examples of spines:
041     *
042     * <pre>
043     * ( S ( VP VBD^ ) )
044     * 
045     * ( S ( VP ( XP JJ^ ) ) )
046     * 
047     * ( XP NONE^ )
048     * 
049     * JJ^                       
050     * 
051     * ( S S S )                 # a spine for predicate coordination
052     * </pre>
053     *
054     * Spines may be of three shapes (Libin Shen's thesis, p. 15):
055     * <ul>
056     *   <li>A spinal initial tree is composed of a lexical spine from the root 
057     * to the anchor, and nothing else. 
058     *   <li>A spinal auxiliary tree is composed of a lexical spine and a recursive
059     * spine from the root to the foot node. (The common part of a lexical spine 
060     * and a recursive spine is called the shared spine of an auxiliary tree.)
061     *   <li>It is not clear what exactly are the constraints on the shape of 
062     * coordination trees. However these trees are not really linguistically
063     * meaningful and might be removed or normalized in a future version of the treebank.
064     *</ul>
065     * Instances of this class are immutable as seen from the "public" view.
066     *
067     * @author Lucas Champollion
068     */
069    public class SpinalNode {
070            
071            
072        /**
073         * Represents the character used to show that a node is an anchor node
074         * (the node where the word is attached), i.e. "<code>^</code>".
075         */
076            public static final String ANCHOR = "^";
077            
078        /**
079         * Some alternative characters that might be used for the anchor because they
080         * look similar. These are:
081         * 
082         * <ul>
083         * <li><code>\u02C6 \\u02C6</code> -- unicode modifier letter circumflex accent
084         * <li><code>\u005E \\u005E</code> -- unicode circumflex accent
085         * <li><code>\u0302 \\u0302</code> -- unicode combining circumflex accent
086         * </ul>
087         */
088            public static final String ANCHORS 
089                    = "\u02C6" // unicode modifier letter circumflex accent
090                    + "\u005E" // unicode circumflex accent
091                    + "\u0302" // unicode combining circumflex accent
092                    ;
093        /**
094         * Represents the character used to show that a node is a foot node,
095         * in an auxiliary tree, i.e. "<code>*</code>".
096         */
097            public static final String FOOT = "*";
098        /**
099         * Represents the character used to show that a node is neither an 
100         * anchor nor a foot node (this character is the empty string in Libin Shen's
101         * treebank).
102         */
103            public static final String STANDARD = "";
104            
105            private ElemTree host;
106            
107            private SpinalNode parent; // null indicates that we're at the root of the spine
108            
109            private String label;
110                   
111            private String type; // either anchor or foot or standard
112            
113            private SpinalNode[] children = new SpinalNode[0];
114    
115            
116            private SpinalNode(ElemTree host, SpinalNode parent) {
117                this.host = host;
118                this.parent = parent;
119                label=null;
120                type="";
121            }
122            
123        
124        /**
125         * Creates a new instance of <code>SpinalNode</code> from a string representation.
126         * @param s the string representation from which to create the <code>SpinalNode</code>
127         * @param host the <code>ElemTree</code> to which the spine belongs of which this 
128         * <code>SpinalNode</code> is a member, or null if you don't wish to specify it
129         * @throws ElemTreeFormatException if the input string can't be parsed
130         */
131        public SpinalNode(String s, ElemTree host) throws ElemTreeFormatException {
132            
133            this(Arrays.asList(s.split(" ")), host); // tokenize
134        }
135        
136        /**
137         * Creates an instance of <code>SpinalNode</code> from a tokenized list of strings.
138         * The list is not modified during the construction of the <code>SpinalNode</code>.
139         * 
140         * @param list the list of strings
141         * @param host the <code>ElemTree</code> to which the spine belongs of which this 
142         * <code>SpinalNode</code> is a member, or null if you don't wish to specify it
143         * @throws ElemTreeFormatException if list is empty
144         * @throws NullPointerException if list is null
145         */
146        protected SpinalNode(List list, ElemTree host) throws ElemTreeFormatException {
147            if (list == null) throw new NullPointerException();
148            if (list.size() == 0) throw new ElemTreeFormatException("Can't create a " +
149                    "SpinalNode from an empty string.");
150            
151            this.host=host;
152            this.parent=null;
153            
154            if (list.size() == 1) { // list is of shape: JJ^
155               parseLabelAndType((String) list.get(0)); // updates this.type and this.label
156               this.children=new SpinalNode[0];
157            } else { // e.g. list is of shape:  ( XP NNS^ )
158                
159                list = stripSurroundingBrackets(list); // eg. list is now XP NNS^
160                
161                parseLabelAndType((String) list.get(0)); // eg. parse XP
162                this.children = parseChildren(list.subList(1,list.size())); //  e.g. parse NNS^
163                // tell the children who is the boss!
164                for (int i = 0; i < this.children.length; i++) {
165                    this.children[i].parent = this;
166                }
167            }
168          
169        }
170        
171        
172        /**
173         * Updates the instance variables <code>type</code> and <code>label</code> when passed a label string.
174         * @param label the string representing the label
175         */
176        private void parseLabelAndType(String label) {
177            String labelEnd = label.substring(label.length()-1, label.length());
178            if (label.endsWith(ANCHOR) || (ANCHORS.indexOf(labelEnd) != -1)) {
179                this.type=ANCHOR;
180                this.label=label.substring(0,label.length()-1);
181            } else if (label.endsWith(FOOT)) {
182                this.type=FOOT;
183                this.label=label.substring(0,label.length()-1);
184            } else {
185                this.type=STANDARD;
186                this.label=label;
187            }
188            
189        }
190        
191     
192        
193        
194        /**
195         * Internal method, called recursively while parsing a <code>SpinalNode</code>
196         * representation.
197         * @param list a list containing string representations of the children
198         * @return an array of spinal nodes representing the children of the current node
199         * @throws ElemTreeFormatException if a parse error occurs
200         */
201        private SpinalNode[] parseChildren(List list) throws ElemTreeFormatException {
202            
203            // If the list are surrounded by brackets, then the first element
204            // of the list is the mother of all the other elements. This is also
205            // true if there is only one part (it need not be surrounded by
206            // brackets). Otherwise,
207            // all the elements on the list are siblings.
208            
209            // Returns null if list is empty
210            
211            if (list.isEmpty()) return null;
212            
213            if (list.size() == 1) { // list is of shape: JJ^
214               SpinalNode onlyChild = new SpinalNode(this.host, this);
215               onlyChild.parseLabelAndType((String) list.get(0));
216               onlyChild.setChildren(new SpinalNode[0]);
217               return new SpinalNode[] { onlyChild };
218            } else { // collect children
219                ArrayList childrenList = new ArrayList();
220                while (!list.isEmpty()) {
221                    childrenList.add(getNodeFrom(list));
222                    list = rest(list);
223                } 
224                
225                return (SpinalNode[]) childrenList.toArray(new SpinalNode[0]);
226            }
227        }
228     
229        /**
230         * Returns the first node from a tokenized spine - that is, the first token,
231         * or if the first token is "(", then the content of the first pair of 
232         * brackets.
233         * @param tokens the list
234         * @throws ElemTreeFormatException if a parse error occurs
235         * @return the first node from the list
236         */
237        private SpinalNode getNodeFrom(List tokens) 
238        throws ElemTreeFormatException {
239            if (tokens.get(0).equals("(")) {
240                List subList = tokens.subList(0, whereIsClosingBracket(tokens)+1);
241                return new SpinalNode(subList, this.host);
242            } else {
243                LinkedList l = new LinkedList();
244                l.add(tokens.get(0));
245                return new SpinalNode(l, this.host);
246            }
247        }
248        
249        /**
250         * Removes the first child from the list and returns the rest of the list,
251         * or returns the empty list if the list is initially empty.
252         * (Actually we don't modify the list, we just provide a view on the 
253         * appropriate sublist.)
254         * @param tokens the list
255         * @throws ElemTreeFormatException if a parse error occurs
256         * @return the list with the first element removed
257         */
258        private static List rest(List tokens)
259        throws ElemTreeFormatException {
260            if (tokens.isEmpty()) return tokens;
261            if (tokens.get(0).equals("(")) {
262                return tokens.subList
263                        (whereIsClosingBracket(tokens)+1, tokens.size());
264            } else { // just skip the first element
265                return tokens.subList(1, tokens.size());
266            }
267            
268        }
269        
270        /**
271         * Takes a List of strings. If the first and last elements of
272         * the List are ( and ) respectively, they're stripped off and this
273         * is repeated recursively. Otherwise the List is returned unchanged.
274         * (Actually we don't modify the list, we just provide a view on the 
275         * appropriate sublist.)
276         *
277         * @param tokens the list
278         * @return the list with one pair of brackets removed
279         */
280        private static List stripSurroundingBrackets(List tokens) {
281            if (tokens.get(0).equals("(")) {
282                if (tokens.get(tokens.size()-1).equals(")")) { // the last element
283                    return stripSurroundingBrackets(tokens.subList(1, tokens.size()-1));
284                } else { // unbalanced brackets
285                    return tokens;
286                }   
287            } else { // nothing to do
288                return tokens;
289            }
290        }
291        
292        /**
293         * Takes a List of strings and returns the position (counting
294         * upwards from zero) of 
295         * the bracket that matches the List's initial bracket.
296         * If the list doesn't start with a bracket then -1 is returned.
297         * @param tokens the list
298         * @return the position of the closing bracket matching the initial one
299         */
300         private static int whereIsClosingBracket(List tokens) {
301            
302            int pos = -1;
303            
304            if (!tokens.get(0).equals("(")) {
305                return -1;
306            } else {
307                int stack = 0;
308                Iterator iter = tokens.iterator();
309                String current;
310                while (iter.hasNext()) {
311                    current = (String) iter.next();
312                    pos++;
313                    if (current.equals("(")) {
314                        stack++;
315                    } else if (current.equals(")")) {
316                        stack--;
317                        if (stack == 0) {
318                            return pos;
319                        } else if (stack < 0) {
320                            throw new RuntimeException("Bad spine");
321                        } else { // stack > 0
322                            // do nothing
323                        }
324                    } // end if (current == ")")
325                } // end iteration over tokens
326                if (stack != 0) throw new RuntimeException("Bad spine");
327                return tokens.size()-1;
328            } // end else
329            
330        } // end whereIsClosingBracket
331    
332        /**
333         * Gets the label of this spinal node.
334         * @return A string, e.g. <code>XP</code> or <code>NNS</code>. 
335         * Characters that denote foot nodes or 
336         * anchors are not returned.
337         */
338        public String getLabel() {
339            return label;
340        }
341    
342        /**
343         * Gets the type of this <code>SpinalNode</code>. Returns one of "ANCHOR", "FOOT", or 
344         * "STANDARD".
345         * @return Either ANCHOR, FOOT, or STANDARD
346         */
347        public String getType() {
348            return type;
349        }
350        
351        /**
352         * Returns whether this node is an anchor node.
353         * @return a boolean value
354         */
355        public boolean isAnchor() {
356            return type.equals(ANCHOR);
357        }
358    
359        /**
360         * Returns whether this node is a foot node.
361         * @return a boolean value
362         */
363        public boolean isFoot() {
364            return type.equals(FOOT);
365        }
366    
367        /**
368         * Returns whether this node is a standard node (neither foot nor anchor).
369         * @return a boolean value
370         */
371        public boolean isStandard() {
372            return type.equals(STANDARD);
373        }
374    
375        /**
376         * Returns an array of children, or null if there are no children 
377         * to this node. Children are understood as children nodes within the spine of this
378         * elementary tree, not as other elementary trees.
379         * @return the children of this spinal node
380         */
381        public SpinalNode[] getChildren() {
382            
383                return children;
384            
385        }
386        
387        /**
388         * Returns a list of children, or an empty list if there are no children 
389         * to this node. Children are understood as children nodes within the spine of this
390         * elementary tree, not as other elementary trees.
391         * @return the children of this spinal node
392         */    
393        public List getChildrenList() {
394            return Arrays.asList(this.getChildren());
395        }
396        
397        
398        /**
399         * Returns a list of all the descendents of this spinal node, including itself.
400         * @return the descendents of this node
401         */
402        public List getAllNodes() {
403            List result = new ArrayList();
404            result.add(this);
405            Iterator theChildren = Arrays.asList(this.getChildren()).iterator();
406            while (theChildren.hasNext()) {
407                SpinalNode current = (SpinalNode) theChildren.next();
408                result.addAll(current.getAllNodes());
409            }
410            return result;
411        }
412        
413        /**
414         * Returns the specified child of this spinal node. 
415         * @param n a number specifying the child, starting with zero.
416         * @throws ArrayIndexOutOfBoundsException if this node does not have as many
417         * children as the specified index
418         * @return a <code>SpinalNode</code> for the child
419         */
420        public SpinalNode getChild(int n) {
421            return this.getChildren()[n];
422        }
423        
424        /**
425         * Returns the {@link GornAddress} of this spinal node, relative to the
426         * root of the spine in which it is contained (rather than the root of the
427         * derivation tree in which it is contained).
428         * @return a Gorn address, such as <code>0</code> for the root of the spine
429         */
430        public GornAddress getLocationInSpine() {
431            String result = "";
432            SpinalNode current = this;
433            while (!current.isRootOfSpine()) {
434                SpinalNode theParent = current.getParent(false);
435                SpinalNode[] siblings = theParent.getChildren();
436                for (int i = 0; i < siblings.length; i++) {
437                    if (current.equals(siblings[i])) {
438                        // append to the left as we're walking upward the tree
439                        if (!result.equals("")) result = GornAddress.SEPARATOR + result;
440                        result = String.valueOf(i) + result;
441                        break;
442                    }
443                }
444                // move up the tree
445                current = theParent;
446            }
447            // current is now the root
448            // finally, prepend 0 for the root
449            if (!result.equals("")) result = GornAddress.SEPARATOR + result;
450            result = "0" + result;
451            
452            return new GornAddress(result);
453        }
454        
455        private void setChildren(SpinalNode[] children) {
456            if (children==null) throw new IllegalArgumentException("Attempted to set children of spinal node to null." +
457                    "Please provide empty SpinalNode[0] array instead.");
458            
459            this.children=children;
460        }
461        
462        /**
463         * Returns a canonical representation of the subtree rooted in this
464         * spinal node (e.g. <code>(XP NNS^)</code>).
465         * @return a string.
466         */
467        public String toString() {
468            String result = this.getLabel()+this.getType();
469            SpinalNode[] theChildren = this.getChildren();
470            
471            if (theChildren.length==0) {
472                return result;
473            } else {
474                for (int i = 0; i < theChildren.length; i++) {
475                    result = result + " " + theChildren[i];
476                }
477                return "( " + result + " )";
478            }
479        }
480    
481        /**
482         * Returns a unique identifier for this node based on its location in the spine and 
483         * the node ID of its elementary tree, for purposes of constructing Graphviz output.
484         * @return a string acting as the node ID for Graphviz
485         */
486        String getGraphvizNodeID() {
487                     return 
488                     "spine_of_"
489                     + this.getElemTree().getGraphvizNodeID().substring(5) // get rid of the word "node" in the node ID
490                     + "_at_"
491                     + this.getLocationInSpine().toString("_"); // Graphviz doesn't like periods in node names
492                     //+ hashCode();
493                     
494        }
495        
496    
497        /**
498         * Gets the elementary tree to which this spine or spinal node belongs.
499         * @return the host tree
500         */
501        public ElemTree getElemTree() {
502            return host;
503        }
504    
505    
506        /**
507         * Returns the <code>SpinalNode</code> that is the parent of this node.
508         * 
509         * @param acrossElemTrees if true, then if this node is at the root of
510         * its <code>ElemTree</code>, then the attachment site in the parent <code>ElemTree</code> will be
511         * returned; otherwise if this node is at the root of its <code>ElemTree</code>, null will
512         * be returned 
513         * 
514         * @return the parent of this node in the spine
515         */
516        public SpinalNode getParent(boolean acrossElemTrees) {
517            if (acrossElemTrees && this.isRootOfSpine()) {
518                return this.getElemTree().getAttachmentSite();
519            } else {
520                return parent;
521            }
522        }
523        
524        /**
525         * Returns whether this node is at the root of its spine.
526         * @return a boolean value
527         */
528        public boolean isRootOfSpine() {
529            return parent==null;
530        }
531        
532        
533    }