001    package edu.upenn.cis.propbank_shen;
002    
003    import java.util.*;
004    //import edu.upenn.cis.treebank.TBNode;
005    //import edu.upenn.cis.treebank.LabelMatcher;
006    
007    import edu.upenn.cis.spinal.*;
008    
009    
010    /**
011     *   This class represents a location of an argument in the text.
012     *   <p>
013     *   Basically, locations just represent spans in the text, but there
014     *   are some complications.  For example, sometimes arguments are associated
015     *   with more than one span, such as the "utterance" argument
016     *   of the verb "say" in the sentence "'I'm going home', John said, 'before 10PM'."
017     *   <p>
018     *   Here, 
019     *   "I'm going home ... before 10PM" is not a continuous span,
020     *   so the <code>ArgLoc</code> will consist of two spans.
021     *   <p>
022     *   The second situation is for resolving empty constituents and sentence
023     *   local anaphora.  We consider the sentence "John is going to swim".  The
024     *   syntactic representation of this sentence adds a pseudo subject to the
025     *   infinitive "to swim", with a pointer to "John".  However,
026     *   since WH-movement was not explicitly marked in the Penn treebank, we have marked
027     *   this situation as well in the form of declaring that a set of nodes 
028     *   have equivalent semantic content.   For example, in the sentence 
029     *    <center>
030     *       "The man who swam the mile died"
031     *   </center>
032     *    Our treebank would declare that "who" is the subject of "swam", but not
033     *    that "who" refered to the same thing as "the man".  So we mark all the nodes,
034     *    the node containing "who", the node containing "the man", and the associated
035     *    empty constituent as nodes which denote the same thing.
036     *    <p>
037     * 
038     *    Thus in short, we may view the data structure for a location in the text
039     *    associated with an argument in a predicate argument structure as exactly one
040     *    of the following three cases:
041     *    <ol>
042     *        <li> a singleton span
043     *        <li> a set of spans arranged together
044     *        <li>a set of spans referring to the same semantic entity.
045     *    </ol>
046     * @author Scott Cotton
047     * @see WordSpan
048     */
049    public class ArgLoc implements Comparable {
050    
051        public static final int SINGLE=0;
052        public static final int CONCAT=1 << 0;
053        public static final int EQUIVA=1 << 1;
054    
055        protected int loc_type;
056        protected List locs;
057        protected WordSpan ta;
058    
059        /** construct an argument location from a basic argument location */
060        public ArgLoc(WordSpan ta) {
061            loc_type=SINGLE;
062            locs = null;
063            this.ta = ta;
064        }
065        
066        /** 
067            construct an argument location from a loc_type and a list
068            of other ArgLoc objects. List must contain at least two
069            elements, and ltype must be CONCAT or EQUIVA.
070        */
071        public ArgLoc(int ltype, List alocs) {
072            if (alocs.size() < 2) 
073                throw new IllegalArgumentException("Illegal call to ArgLoc constructor: "+
074                                                   "length of list muse be >= 2");
075            Iterator iter = alocs.iterator();
076            while (iter.hasNext()) { 
077                if (!(iter.next() instanceof ArgLoc))
078                    throw new IllegalArgumentException("Illegal call to ArgLoc constructor: "+
079                                                       "list must only contain ArgLoc objects");
080            }
081            if (ltype!=CONCAT && ltype!=EQUIVA)
082                throw new IllegalArgumentException("Illegal call to ArgLoc constructor: "+
083                                                   "ltype must be one of EQUIV or CONCAT");
084            loc_type = ltype;
085            //Collections.sort(alocs);
086            locs = alocs;
087            ta = null;
088        }
089    
090        /** return true if this location consists of a single word span. */
091        public boolean isSingle() { return loc_type == SINGLE;  }
092    
093        /** return true if this location consists of several word spans. */
094        public boolean isConcat() { return loc_type == CONCAT; }
095    
096        /** return true if this location consists of a trace chain */
097        public boolean isTraceChain() { return loc_type == EQUIVA; }
098    
099        
100        
101        /**
102         * Returns the unique <code>WordSpan</code> if this location consists of just one span.
103         * Otherwise, returns null.
104         */
105        public WordSpan getWordSpan() { return ta; }
106    
107        /**
108         * Returns a list of other ArgLoc objects if this location consists of multiple nodes.
109         * Otherwise, return null.
110         */
111        public List getLocList() { return locs; }
112    
113        /** 
114         * return a canoncial string for a location type: the empty string
115         * for SINGLE, a comma for CONCAT, an asterisk for EQUIVA. These strings
116         * are used as separators in the representation of word spans.
117         */
118        public String locTypeToString() {
119            switch (loc_type) {
120            case SINGLE: return "";
121            case CONCAT: return ",";
122            case EQUIVA: return "*";
123            }
124            return "compiler needs this return statement because it is stupid.";
125        }
126    
127        /** create a loc_type from a canoncial string or throw CorruptDataException if
128            this is an invalid string */
129        public static int locTypeOfstring(String s) throws CorruptDataException {
130            if (s == ",") { return ArgLoc.CONCAT; }
131            if (s == "*") { return ArgLoc.EQUIVA; }
132            throw new CorruptDataException("invalid string for location type of an argument: " + s);
133        }
134    
135        /**
136           Compares argument locations.
137           We compare by the earliest node anywhere in the (possible nested) structure of the ArgLoc.
138        */
139        public int compareTo(Object o) {
140            if (!(o instanceof ArgLoc)) {
141                throw new IllegalArgumentException("invalid call to ArgLoc.compareTo(), object not" +
142                                                   " an ArgLoc instance.");
143            }
144            ArgLoc al = (ArgLoc) o;
145            
146            List alAddresses = al.getAllWordSpans();
147            List thisAddresses = this.getAllWordSpans();
148            
149            Collections.sort(alAddresses);
150            Collections.sort(thisAddresses);
151            WordSpan thisFirst = (WordSpan) thisAddresses.get(0);
152            WordSpan alFirst = (WordSpan) alAddresses.get(0);
153            int firstComparison =  thisFirst.compareTo(alFirst);
154            if (firstComparison != 0) return firstComparison;
155            if (this.equals(al))
156                return 0;
157            else
158                return this.toString().compareTo(al.toString());
159        }
160        
161        /**
162         * returns a flat list of all WordSpans contained somewhere 
163         * (possibly deeply nested) in this ArgLoc
164         */
165        public List getAllWordSpans() {
166            List res = new LinkedList();
167            if (isSingle()) 
168                res.add(ta);
169            else 
170                for (int i=0; i < locs.size(); i++) 
171                    res.addAll(((ArgLoc)locs.get(i)).getAllWordSpans());
172            
173            return res;
174        }
175    
176        /**
177           create a string representation of the argument location.
178         */
179        public String toString() {
180            if (isSingle()) return ta.toString();  //base case
181            
182            String lts = locTypeToString();
183            ArgLoc fst = (ArgLoc) locs.get(0);
184            String result = fst.toString();
185            for (int i=1; i < locs.size(); i++) {
186                ArgLoc loc = (ArgLoc) locs.get(i);
187                result += lts + loc.toString();    //recursive call to elements in list
188            }
189            return result;
190        }
191        
192        /**
193           convert a string to an ArgLoc if possible, otherwise throw
194           CorruptDataException.
195    
196           @param s  the string to be converted.
197         */
198        public static ArgLoc ofString(String s) throws CorruptDataException {
199            //Note from Ben:  
200            // Since we are checking for the presence of '*' first, this means that
201            // a string consisting of both '*' and ',' (e.g. 4:5*2:3,1:0) will be considered
202            // a trace chain (one of whose elements is a split arg).  In other words,
203            // the ',' operator has precedence over the '*' operator.
204    
205            int idx;
206            idx = s.indexOf('*');
207            if (idx != -1) {
208                StringTokenizer tok = new StringTokenizer(s, "*");
209                LinkedList l = new LinkedList();
210                while (tok.hasMoreElements()) {
211                    ArgLoc loc = ofString(tok.nextToken());
212                    l.add(loc);
213                }
214                return new ArgLoc(ArgLoc.EQUIVA, l);
215            }
216            idx = s.indexOf(',');
217            if (idx != -1) {
218                StringTokenizer tok = new StringTokenizer(s, ",");
219                LinkedList l = new  LinkedList();
220                while(tok.hasMoreElements()) {
221                    ArgLoc loc = ofString(tok.nextToken());
222                    l.add(loc);
223                }
224                return new ArgLoc(ArgLoc.CONCAT, l);
225            }
226            WordSpan ta = WordSpan.ofString(s);
227            return new ArgLoc(ta);
228        }
229    
230    
231        public boolean equals (Object o) {
232            if (!(o instanceof ArgLoc))
233                return false;
234            ArgLoc al = (ArgLoc)o;
235    
236            return al.loc_type == this.loc_type 
237                && ((al.locs == null && this.locs == null) || al.locs.equals(this.locs)) 
238                && ((al.ta == null && ta == null) || al.ta.equals(ta));
239        }
240        
241    //     public int hashCode () {
242    //      if (ta == null)
243    //          return 19 * loc_type + locs.hashCode();
244    //      else
245    //          return 19 * loc_type + ta.hashCode();
246    //     }
247        
248    //    //auxiliary method used in 'getUniqueSurfaceConstituent()'
249    //    private Integer getRank(TBNode root) {
250    //      if (isTraceChain()) return null;
251    //      else if (isConcat()) {
252    //          TreeSet ranks = new TreeSet();
253    //          Iterator iter = getLocList().iterator();
254    //          while (iter.hasNext()) 
255    //              ranks.add(((ArgLoc)iter.next()).getRank(root));
256    //          if (ranks.size() != 1) return null;   //not all constituents have same rank - undefined!
257    //          else return (Integer) ranks.first();
258    //      }
259    //      else {
260    //          WordSpan treeAddress = getWordSpan();
261    //          assert treeAddress != null;
262    //          TBNode tbnode = treeAddress.addressedNode(root);
263    //          if (tbnode.isEmpty()) return new Integer(0);       //It's an empty constituent - rank 0
264    //          String label = tbnode.getLabel().getPrimary();
265    //          if (label.startsWith("W") ||
266    //              (label.equals("IN") &&
267    //               tbnode.leftSibling()==null &&
268    //               tbnode.rightSibling()==null &&
269    //               tbnode.getParent().getLabel().getPrimary().startsWith("WH")))
270    //              return new Integer(1);                        //It's a WH constituent - rank 1
271    //          else return new Integer(2);                       //anything else - rank 2
272    //      }
273    //    }
274    
275    
276    //    // possibly null if none can be found
277    //    // TODO compare with WordSpan.getSubTree() - are these the same semantics?
278    //    
279    //    public ElemTree getDominatingNode(Sentence s) {
280    //        
281    //        if (this.isTraceChain()) {
282    //            return this.getUniqueSurfaceConstituent(s);
283    //        } else if (this.isConcat()) {
284    //            // we know that there are no deeply nested WordSpans 
285    //            // (see propbank README.txt case 3)
286    //            return WordSpan.merge(this.getAllWordSpans().iterator())
287    //            .getSubTree(s); // possibly null
288    //        } else {
289    //            assert this.isSingle();
290    //            return this.getWordSpan().getSubTree(s);
291    //        }
292    //    }
293    //    
294    //    /**
295    //     * 
296    //     * @param s
297    //     * @return
298    //     */
299    //    public ElemTree getUniqueSurfaceConstituent(Sentence s) {
300    //        if (this.isSingle()) throw new RuntimeException("Implement this case");
301    //        if (this.isConcat()) throw new RuntimeException("Implement this case");
302    //        assert this.isTraceChain();
303    //        
304    //        Iterator chain = this.getAllWordSpans().iterator();
305    //        
306    //        ElemTree e = null;
307    //        while (chain.hasNext()) {
308    //            e = ((WordSpan) chain.next()).
309    //                    getSubTree(s);
310    //            if (e == null) // no dominating node could be found
311    //                continue;
312    //            if (e.isEmptyElement()) continue;
313    //        }
314    //        if (e == null) {
315    //            throw new IllegalStateException("trace chain without a surface constituent"
316    //                    + this.toString());
317    //        }
318    //        return e;
319    //
320    //    }
321        
322       
323        
324    //    /**
325    //     * Attempts to find and return a unique surface constituent ArgLoc if this is a trace chain.
326    //     * If this is not a trace chain, or if a unique surface constituent ArgLoc cannot be found,
327    //     * returns null.
328    //     *
329    //     * @deprecated -- only here so code compiles
330    //     */
331    //    public ArgLoc getUniqueSurfaceConstituent(TBNode verbnode) {
332    //      TBNode root = verbnode.getRoot();
333    //      Vector[] rankArray = new Vector[3];
334    //      Iterator locIterator = getLocList().iterator();
335    //      ArgLoc constituent;
336    //      Vector rankVector;
337    //
338    //      if (!isTraceChain()) return null;
339    //      for (int i=0; i<rankArray.length; i++) {
340    //          rankArray[i] = new Vector();
341    //      }
342    //      while (locIterator.hasNext()) {
343    //          constituent = (ArgLoc) locIterator.next();      
344    //          Integer rank = constituent.getRank(root);
345    //          if (rank == null) return null;         //undefined rank!
346    //          rankArray[rank.intValue()].add(constituent);
347    //      }
348    //      for (int i=rankArray.length-1; i>=0; i--) {
349    //          rankVector = rankArray[i];
350    //          if (rankVector.size() == 0) return null;  //no unique surface element!
351    //          else return (ArgLoc) rankVector.get(0);
352    //      }
353    //      assert false: "shouldn't get here!";
354    //      return null;
355    //    }
356        
357    //    /**
358    //     * given the node associated with the verb in a sentence, 
359    //     * make this ArgLoc object contain TreeAddresss in the order    
360    //     * from source to target through a trace.
361    //     * 
362    //     * we use the following algorithm: find all empty consituents which 
363    //     * are not wh consituents, sort them by proximity to verb.  find all wh
364    //     * constiuents (who, which, etc), and sort them by proximity to verb
365    //     * .  Find everything else, sort them by proximity to verb, and then
366    //     * the resulting locs are in the order empty constituents, wh constituents,
367    //     * everything-else.
368    //     * 
369    //     * @param verbnode a TBNode object representing the verb for this annotation. 
370    //     * @see edu.upenn.cis.treebank.TBNode
371    //     */
372    //    // XXX this could be made even better if we separated out empty constituents 
373    //    // into those which satisfy tbnode.isTrace() and those which don't.
374    //    // -scott
375    //    public void sortMotion(TBNode verbnode)
376    //    {
377    //        // this doesn't make any sense unless this a "trace" or movement
378    //        // argument location type.
379    //        if (loc_type != EQUIVA) {
380    //            return;
381    //        }      
382    //        //
383    //        // first we grab the TBNodes we need, and record which ones are empty
384    //        // and which ones have WH labels and which one is closest to the verb
385    //        //
386    //      // Note from Ben:  In cases where an element in the trace chain itself
387    //      // contains multiple nodes (i.e. a split argument), we will just use 
388    //      // the earliest of those nodes. (But now we have a new problem....
389    //      // how do we put humpty dumpy back together again?  Let's use a hashMap
390    //      // to map TreeAddresses to ArgLoc constituents)
391    //      //
392    //      HashMap map = new HashMap();
393    //        LinkedList tbnodes = new LinkedList();
394    //        Iterator lociter = locs.iterator();
395    //        BitSet isempty = new BitSet();
396    //        BitSet iswh = new BitSet();
397    //        LabelMatcher whmatch = new LabelMatcher("^WH");
398    //        int current_index = 0;
399    //        TBNode root = verbnode.getRoot();
400    //      List addressList = new LinkedList();
401    //        while(lociter.hasNext()) {
402    //          ArgLoc loc = (ArgLoc)lociter.next();
403    //          List addresses = loc.getAllWordSpans();
404    //          Collections.sort(addresses);
405    //          WordSpan treeAddress = (WordSpan)addresses.get(0);
406    //          assert !map.containsKey(treeAddress); 
407    //          map.put(treeAddress, loc);
408    //          addressList.add(treeAddress);
409    //            TBNode tbn = treeAddress.addressedNode(root);
410    //            if (tbn.isEmpty()) {
411    //                isempty.set(current_index);
412    //            }
413    //            if (tbn.findFirst(whmatch) != null) {
414    //                iswh.set(current_index);
415    //            }
416    //            current_index++;
417    //        }
418    //        //
419    //        // find empty consitutents first, then sort them by proximity to verb
420    //        //
421    //        LinkedList empty_locs = new LinkedList();
422    //        for (int i=0; i<isempty.length(); ++i) {
423    //            if (isempty.get(i) && !iswh.get(i)) {
424    //                empty_locs.add(addressList.get(i));
425    //            }
426    //        }
427    //        final WordSpan verbloc = new WordSpan(verbnode.getTerminalNum(), 
428    //                                                    verbnode.getHeight());
429    //        Comparator verbprox = new Comparator() {
430    //                public int compare(Object o1, Object o2) {
431    //                    WordSpan aloc1 = (WordSpan) o1;
432    //                    WordSpan aloc2 = (WordSpan) o2;
433    //                    int dist1 = Math.abs(verbloc.compareTo(aloc1));
434    //                    int dist2 = Math.abs(verbloc.compareTo(aloc2));
435    //                    return dist1 - dist2;
436    //                }
437    //            };
438    //        Collections.sort(empty_locs, verbprox);
439    //        //
440    //        // find wh consituents and sort them by proximity to verb
441    //        //
442    //        LinkedList wh_locs = new LinkedList();
443    //        for (int j=0; j<iswh.length(); ++j) {
444    //            if (iswh.get(j)) {
445    //                wh_locs.add(addressList.get(j));
446    //            }
447    //        }
448    //        Collections.sort(wh_locs, verbprox);
449    //        LinkedList non_empty_non_wh = new LinkedList();
450    //        for (int k=0; k<locs.size(); ++k) {
451    //            if (!iswh.get(k) && !isempty.get(k)) {
452    //                non_empty_non_wh.add(addressList.get(k));
453    //            }
454    //        }
455    //        Collections.sort(non_empty_non_wh, verbprox);
456    //        //
457    //        // put it all back together, first empty, then wh, then everything else
458    //        //
459    //        LinkedList new_locs = new LinkedList();
460    //        new_locs.addAll(empty_locs);
461    //        new_locs.addAll(wh_locs);
462    //        new_locs.addAll(non_empty_non_wh);
463    //        locs.clear();
464    //      for (int i=0; i<new_locs.size(); i++) 
465    //          locs.add(map.get((WordSpan)new_locs.get(i)));
466    //    }
467    
468    //    /**
469    //       a simple unit test, not interesting for use.
470    //     */
471    //    public static void main(String args[]) throws CorruptDataException {
472    //      ArgLoc loc1 = ofString("1:2,2:3");
473    //      ArgLoc loc2 = ofString("1:2*2:3");
474    //      ArgLoc loc3 = ofString("1:2*2:3,3:4");
475    //      ArgLoc loc4 = ofString("1:2,2:3*3:4");
476    //      ArgLoc loc5 = ofString("1:2,2:3*3:4");
477    //      System.out.println(loc1);
478    //      System.out.println(loc2);
479    //      System.out.println(loc3);
480    //      System.out.println(loc4);
481    //      
482    //      assert loc5.equals(loc4);
483    //
484    //    }
485    }
486