001 package edu.upenn.cis.propbank_shen;
002
003 import java.util.*;
004 //import edu.upenn.cis.treebank.TBNode;
005 //import edu.upenn.cis.treebank.LabelMatcher;
006
007 import edu.upenn.cis.spinal.*;
008
009
010 /**
011 * This class represents a location of an argument in the text.
012 * <p>
013 * Basically, locations just represent spans in the text, but there
014 * are some complications. For example, sometimes arguments are associated
015 * with more than one span, such as the "utterance" argument
016 * of the verb "say" in the sentence "'I'm going home', John said, 'before 10PM'."
017 * <p>
018 * Here,
019 * "I'm going home ... before 10PM" is not a continuous span,
020 * so the <code>ArgLoc</code> will consist of two spans.
021 * <p>
022 * The second situation is for resolving empty constituents and sentence
023 * local anaphora. We consider the sentence "John is going to swim". The
024 * syntactic representation of this sentence adds a pseudo subject to the
025 * infinitive "to swim", with a pointer to "John". However,
026 * since WH-movement was not explicitly marked in the Penn treebank, we have marked
027 * this situation as well in the form of declaring that a set of nodes
028 * have equivalent semantic content. For example, in the sentence
029 * <center>
030 * "The man who swam the mile died"
031 * </center>
032 * Our treebank would declare that "who" is the subject of "swam", but not
033 * that "who" refered to the same thing as "the man". So we mark all the nodes,
034 * the node containing "who", the node containing "the man", and the associated
035 * empty constituent as nodes which denote the same thing.
036 * <p>
037 *
038 * Thus in short, we may view the data structure for a location in the text
039 * associated with an argument in a predicate argument structure as exactly one
040 * of the following three cases:
041 * <ol>
042 * <li> a singleton span
043 * <li> a set of spans arranged together
044 * <li>a set of spans referring to the same semantic entity.
045 * </ol>
046 * @author Scott Cotton
047 * @see WordSpan
048 */
049 public class ArgLoc implements Comparable {
050
051 public static final int SINGLE=0;
052 public static final int CONCAT=1 << 0;
053 public static final int EQUIVA=1 << 1;
054
055 protected int loc_type;
056 protected List locs;
057 protected WordSpan ta;
058
059 /** construct an argument location from a basic argument location */
060 public ArgLoc(WordSpan ta) {
061 loc_type=SINGLE;
062 locs = null;
063 this.ta = ta;
064 }
065
066 /**
067 construct an argument location from a loc_type and a list
068 of other ArgLoc objects. List must contain at least two
069 elements, and ltype must be CONCAT or EQUIVA.
070 */
071 public ArgLoc(int ltype, List alocs) {
072 if (alocs.size() < 2)
073 throw new IllegalArgumentException("Illegal call to ArgLoc constructor: "+
074 "length of list muse be >= 2");
075 Iterator iter = alocs.iterator();
076 while (iter.hasNext()) {
077 if (!(iter.next() instanceof ArgLoc))
078 throw new IllegalArgumentException("Illegal call to ArgLoc constructor: "+
079 "list must only contain ArgLoc objects");
080 }
081 if (ltype!=CONCAT && ltype!=EQUIVA)
082 throw new IllegalArgumentException("Illegal call to ArgLoc constructor: "+
083 "ltype must be one of EQUIV or CONCAT");
084 loc_type = ltype;
085 //Collections.sort(alocs);
086 locs = alocs;
087 ta = null;
088 }
089
090 /** return true if this location consists of a single word span. */
091 public boolean isSingle() { return loc_type == SINGLE; }
092
093 /** return true if this location consists of several word spans. */
094 public boolean isConcat() { return loc_type == CONCAT; }
095
096 /** return true if this location consists of a trace chain */
097 public boolean isTraceChain() { return loc_type == EQUIVA; }
098
099
100
101 /**
102 * Returns the unique <code>WordSpan</code> if this location consists of just one span.
103 * Otherwise, returns null.
104 */
105 public WordSpan getWordSpan() { return ta; }
106
107 /**
108 * Returns a list of other ArgLoc objects if this location consists of multiple nodes.
109 * Otherwise, return null.
110 */
111 public List getLocList() { return locs; }
112
113 /**
114 * return a canoncial string for a location type: the empty string
115 * for SINGLE, a comma for CONCAT, an asterisk for EQUIVA. These strings
116 * are used as separators in the representation of word spans.
117 */
118 public String locTypeToString() {
119 switch (loc_type) {
120 case SINGLE: return "";
121 case CONCAT: return ",";
122 case EQUIVA: return "*";
123 }
124 return "compiler needs this return statement because it is stupid.";
125 }
126
127 /** create a loc_type from a canoncial string or throw CorruptDataException if
128 this is an invalid string */
129 public static int locTypeOfstring(String s) throws CorruptDataException {
130 if (s == ",") { return ArgLoc.CONCAT; }
131 if (s == "*") { return ArgLoc.EQUIVA; }
132 throw new CorruptDataException("invalid string for location type of an argument: " + s);
133 }
134
135 /**
136 Compares argument locations.
137 We compare by the earliest node anywhere in the (possible nested) structure of the ArgLoc.
138 */
139 public int compareTo(Object o) {
140 if (!(o instanceof ArgLoc)) {
141 throw new IllegalArgumentException("invalid call to ArgLoc.compareTo(), object not" +
142 " an ArgLoc instance.");
143 }
144 ArgLoc al = (ArgLoc) o;
145
146 List alAddresses = al.getAllWordSpans();
147 List thisAddresses = this.getAllWordSpans();
148
149 Collections.sort(alAddresses);
150 Collections.sort(thisAddresses);
151 WordSpan thisFirst = (WordSpan) thisAddresses.get(0);
152 WordSpan alFirst = (WordSpan) alAddresses.get(0);
153 int firstComparison = thisFirst.compareTo(alFirst);
154 if (firstComparison != 0) return firstComparison;
155 if (this.equals(al))
156 return 0;
157 else
158 return this.toString().compareTo(al.toString());
159 }
160
161 /**
162 * returns a flat list of all WordSpans contained somewhere
163 * (possibly deeply nested) in this ArgLoc
164 */
165 public List getAllWordSpans() {
166 List res = new LinkedList();
167 if (isSingle())
168 res.add(ta);
169 else
170 for (int i=0; i < locs.size(); i++)
171 res.addAll(((ArgLoc)locs.get(i)).getAllWordSpans());
172
173 return res;
174 }
175
176 /**
177 create a string representation of the argument location.
178 */
179 public String toString() {
180 if (isSingle()) return ta.toString(); //base case
181
182 String lts = locTypeToString();
183 ArgLoc fst = (ArgLoc) locs.get(0);
184 String result = fst.toString();
185 for (int i=1; i < locs.size(); i++) {
186 ArgLoc loc = (ArgLoc) locs.get(i);
187 result += lts + loc.toString(); //recursive call to elements in list
188 }
189 return result;
190 }
191
192 /**
193 convert a string to an ArgLoc if possible, otherwise throw
194 CorruptDataException.
195
196 @param s the string to be converted.
197 */
198 public static ArgLoc ofString(String s) throws CorruptDataException {
199 //Note from Ben:
200 // Since we are checking for the presence of '*' first, this means that
201 // a string consisting of both '*' and ',' (e.g. 4:5*2:3,1:0) will be considered
202 // a trace chain (one of whose elements is a split arg). In other words,
203 // the ',' operator has precedence over the '*' operator.
204
205 int idx;
206 idx = s.indexOf('*');
207 if (idx != -1) {
208 StringTokenizer tok = new StringTokenizer(s, "*");
209 LinkedList l = new LinkedList();
210 while (tok.hasMoreElements()) {
211 ArgLoc loc = ofString(tok.nextToken());
212 l.add(loc);
213 }
214 return new ArgLoc(ArgLoc.EQUIVA, l);
215 }
216 idx = s.indexOf(',');
217 if (idx != -1) {
218 StringTokenizer tok = new StringTokenizer(s, ",");
219 LinkedList l = new LinkedList();
220 while(tok.hasMoreElements()) {
221 ArgLoc loc = ofString(tok.nextToken());
222 l.add(loc);
223 }
224 return new ArgLoc(ArgLoc.CONCAT, l);
225 }
226 WordSpan ta = WordSpan.ofString(s);
227 return new ArgLoc(ta);
228 }
229
230
231 public boolean equals (Object o) {
232 if (!(o instanceof ArgLoc))
233 return false;
234 ArgLoc al = (ArgLoc)o;
235
236 return al.loc_type == this.loc_type
237 && ((al.locs == null && this.locs == null) || al.locs.equals(this.locs))
238 && ((al.ta == null && ta == null) || al.ta.equals(ta));
239 }
240
241 // public int hashCode () {
242 // if (ta == null)
243 // return 19 * loc_type + locs.hashCode();
244 // else
245 // return 19 * loc_type + ta.hashCode();
246 // }
247
248 // //auxiliary method used in 'getUniqueSurfaceConstituent()'
249 // private Integer getRank(TBNode root) {
250 // if (isTraceChain()) return null;
251 // else if (isConcat()) {
252 // TreeSet ranks = new TreeSet();
253 // Iterator iter = getLocList().iterator();
254 // while (iter.hasNext())
255 // ranks.add(((ArgLoc)iter.next()).getRank(root));
256 // if (ranks.size() != 1) return null; //not all constituents have same rank - undefined!
257 // else return (Integer) ranks.first();
258 // }
259 // else {
260 // WordSpan treeAddress = getWordSpan();
261 // assert treeAddress != null;
262 // TBNode tbnode = treeAddress.addressedNode(root);
263 // if (tbnode.isEmpty()) return new Integer(0); //It's an empty constituent - rank 0
264 // String label = tbnode.getLabel().getPrimary();
265 // if (label.startsWith("W") ||
266 // (label.equals("IN") &&
267 // tbnode.leftSibling()==null &&
268 // tbnode.rightSibling()==null &&
269 // tbnode.getParent().getLabel().getPrimary().startsWith("WH")))
270 // return new Integer(1); //It's a WH constituent - rank 1
271 // else return new Integer(2); //anything else - rank 2
272 // }
273 // }
274
275
276 // // possibly null if none can be found
277 // // TODO compare with WordSpan.getSubTree() - are these the same semantics?
278 //
279 // public ElemTree getDominatingNode(Sentence s) {
280 //
281 // if (this.isTraceChain()) {
282 // return this.getUniqueSurfaceConstituent(s);
283 // } else if (this.isConcat()) {
284 // // we know that there are no deeply nested WordSpans
285 // // (see propbank README.txt case 3)
286 // return WordSpan.merge(this.getAllWordSpans().iterator())
287 // .getSubTree(s); // possibly null
288 // } else {
289 // assert this.isSingle();
290 // return this.getWordSpan().getSubTree(s);
291 // }
292 // }
293 //
294 // /**
295 // *
296 // * @param s
297 // * @return
298 // */
299 // public ElemTree getUniqueSurfaceConstituent(Sentence s) {
300 // if (this.isSingle()) throw new RuntimeException("Implement this case");
301 // if (this.isConcat()) throw new RuntimeException("Implement this case");
302 // assert this.isTraceChain();
303 //
304 // Iterator chain = this.getAllWordSpans().iterator();
305 //
306 // ElemTree e = null;
307 // while (chain.hasNext()) {
308 // e = ((WordSpan) chain.next()).
309 // getSubTree(s);
310 // if (e == null) // no dominating node could be found
311 // continue;
312 // if (e.isEmptyElement()) continue;
313 // }
314 // if (e == null) {
315 // throw new IllegalStateException("trace chain without a surface constituent"
316 // + this.toString());
317 // }
318 // return e;
319 //
320 // }
321
322
323
324 // /**
325 // * Attempts to find and return a unique surface constituent ArgLoc if this is a trace chain.
326 // * If this is not a trace chain, or if a unique surface constituent ArgLoc cannot be found,
327 // * returns null.
328 // *
329 // * @deprecated -- only here so code compiles
330 // */
331 // public ArgLoc getUniqueSurfaceConstituent(TBNode verbnode) {
332 // TBNode root = verbnode.getRoot();
333 // Vector[] rankArray = new Vector[3];
334 // Iterator locIterator = getLocList().iterator();
335 // ArgLoc constituent;
336 // Vector rankVector;
337 //
338 // if (!isTraceChain()) return null;
339 // for (int i=0; i<rankArray.length; i++) {
340 // rankArray[i] = new Vector();
341 // }
342 // while (locIterator.hasNext()) {
343 // constituent = (ArgLoc) locIterator.next();
344 // Integer rank = constituent.getRank(root);
345 // if (rank == null) return null; //undefined rank!
346 // rankArray[rank.intValue()].add(constituent);
347 // }
348 // for (int i=rankArray.length-1; i>=0; i--) {
349 // rankVector = rankArray[i];
350 // if (rankVector.size() == 0) return null; //no unique surface element!
351 // else return (ArgLoc) rankVector.get(0);
352 // }
353 // assert false: "shouldn't get here!";
354 // return null;
355 // }
356
357 // /**
358 // * given the node associated with the verb in a sentence,
359 // * make this ArgLoc object contain TreeAddresss in the order
360 // * from source to target through a trace.
361 // *
362 // * we use the following algorithm: find all empty consituents which
363 // * are not wh consituents, sort them by proximity to verb. find all wh
364 // * constiuents (who, which, etc), and sort them by proximity to verb
365 // * . Find everything else, sort them by proximity to verb, and then
366 // * the resulting locs are in the order empty constituents, wh constituents,
367 // * everything-else.
368 // *
369 // * @param verbnode a TBNode object representing the verb for this annotation.
370 // * @see edu.upenn.cis.treebank.TBNode
371 // */
372 // // XXX this could be made even better if we separated out empty constituents
373 // // into those which satisfy tbnode.isTrace() and those which don't.
374 // // -scott
375 // public void sortMotion(TBNode verbnode)
376 // {
377 // // this doesn't make any sense unless this a "trace" or movement
378 // // argument location type.
379 // if (loc_type != EQUIVA) {
380 // return;
381 // }
382 // //
383 // // first we grab the TBNodes we need, and record which ones are empty
384 // // and which ones have WH labels and which one is closest to the verb
385 // //
386 // // Note from Ben: In cases where an element in the trace chain itself
387 // // contains multiple nodes (i.e. a split argument), we will just use
388 // // the earliest of those nodes. (But now we have a new problem....
389 // // how do we put humpty dumpy back together again? Let's use a hashMap
390 // // to map TreeAddresses to ArgLoc constituents)
391 // //
392 // HashMap map = new HashMap();
393 // LinkedList tbnodes = new LinkedList();
394 // Iterator lociter = locs.iterator();
395 // BitSet isempty = new BitSet();
396 // BitSet iswh = new BitSet();
397 // LabelMatcher whmatch = new LabelMatcher("^WH");
398 // int current_index = 0;
399 // TBNode root = verbnode.getRoot();
400 // List addressList = new LinkedList();
401 // while(lociter.hasNext()) {
402 // ArgLoc loc = (ArgLoc)lociter.next();
403 // List addresses = loc.getAllWordSpans();
404 // Collections.sort(addresses);
405 // WordSpan treeAddress = (WordSpan)addresses.get(0);
406 // assert !map.containsKey(treeAddress);
407 // map.put(treeAddress, loc);
408 // addressList.add(treeAddress);
409 // TBNode tbn = treeAddress.addressedNode(root);
410 // if (tbn.isEmpty()) {
411 // isempty.set(current_index);
412 // }
413 // if (tbn.findFirst(whmatch) != null) {
414 // iswh.set(current_index);
415 // }
416 // current_index++;
417 // }
418 // //
419 // // find empty consitutents first, then sort them by proximity to verb
420 // //
421 // LinkedList empty_locs = new LinkedList();
422 // for (int i=0; i<isempty.length(); ++i) {
423 // if (isempty.get(i) && !iswh.get(i)) {
424 // empty_locs.add(addressList.get(i));
425 // }
426 // }
427 // final WordSpan verbloc = new WordSpan(verbnode.getTerminalNum(),
428 // verbnode.getHeight());
429 // Comparator verbprox = new Comparator() {
430 // public int compare(Object o1, Object o2) {
431 // WordSpan aloc1 = (WordSpan) o1;
432 // WordSpan aloc2 = (WordSpan) o2;
433 // int dist1 = Math.abs(verbloc.compareTo(aloc1));
434 // int dist2 = Math.abs(verbloc.compareTo(aloc2));
435 // return dist1 - dist2;
436 // }
437 // };
438 // Collections.sort(empty_locs, verbprox);
439 // //
440 // // find wh consituents and sort them by proximity to verb
441 // //
442 // LinkedList wh_locs = new LinkedList();
443 // for (int j=0; j<iswh.length(); ++j) {
444 // if (iswh.get(j)) {
445 // wh_locs.add(addressList.get(j));
446 // }
447 // }
448 // Collections.sort(wh_locs, verbprox);
449 // LinkedList non_empty_non_wh = new LinkedList();
450 // for (int k=0; k<locs.size(); ++k) {
451 // if (!iswh.get(k) && !isempty.get(k)) {
452 // non_empty_non_wh.add(addressList.get(k));
453 // }
454 // }
455 // Collections.sort(non_empty_non_wh, verbprox);
456 // //
457 // // put it all back together, first empty, then wh, then everything else
458 // //
459 // LinkedList new_locs = new LinkedList();
460 // new_locs.addAll(empty_locs);
461 // new_locs.addAll(wh_locs);
462 // new_locs.addAll(non_empty_non_wh);
463 // locs.clear();
464 // for (int i=0; i<new_locs.size(); i++)
465 // locs.add(map.get((WordSpan)new_locs.get(i)));
466 // }
467
468 // /**
469 // a simple unit test, not interesting for use.
470 // */
471 // public static void main(String args[]) throws CorruptDataException {
472 // ArgLoc loc1 = ofString("1:2,2:3");
473 // ArgLoc loc2 = ofString("1:2*2:3");
474 // ArgLoc loc3 = ofString("1:2*2:3,3:4");
475 // ArgLoc loc4 = ofString("1:2,2:3*3:4");
476 // ArgLoc loc5 = ofString("1:2,2:3*3:4");
477 // System.out.println(loc1);
478 // System.out.println(loc2);
479 // System.out.println(loc3);
480 // System.out.println(loc4);
481 //
482 // assert loc5.equals(loc4);
483 //
484 // }
485 }
486