001 package edu.upenn.cis.propbank_shen; 002 003 import java.util.*; 004 //import edu.upenn.cis.treebank.TBNode; 005 //import edu.upenn.cis.treebank.LabelMatcher; 006 007 import edu.upenn.cis.spinal.*; 008 009 010 /** 011 * This class represents a location of an argument in the text. 012 * <p> 013 * Basically, locations just represent spans in the text, but there 014 * are some complications. For example, sometimes arguments are associated 015 * with more than one span, such as the "utterance" argument 016 * of the verb "say" in the sentence "'I'm going home', John said, 'before 10PM'." 017 * <p> 018 * Here, 019 * "I'm going home ... before 10PM" is not a continuous span, 020 * so the <code>ArgLoc</code> will consist of two spans. 021 * <p> 022 * The second situation is for resolving empty constituents and sentence 023 * local anaphora. We consider the sentence "John is going to swim". The 024 * syntactic representation of this sentence adds a pseudo subject to the 025 * infinitive "to swim", with a pointer to "John". However, 026 * since WH-movement was not explicitly marked in the Penn treebank, we have marked 027 * this situation as well in the form of declaring that a set of nodes 028 * have equivalent semantic content. For example, in the sentence 029 * <center> 030 * "The man who swam the mile died" 031 * </center> 032 * Our treebank would declare that "who" is the subject of "swam", but not 033 * that "who" refered to the same thing as "the man". So we mark all the nodes, 034 * the node containing "who", the node containing "the man", and the associated 035 * empty constituent as nodes which denote the same thing. 036 * <p> 037 * 038 * Thus in short, we may view the data structure for a location in the text 039 * associated with an argument in a predicate argument structure as exactly one 040 * of the following three cases: 041 * <ol> 042 * <li> a singleton span 043 * <li> a set of spans arranged together 044 * <li>a set of spans referring to the same semantic entity. 045 * </ol> 046 * @author Scott Cotton 047 * @see WordSpan 048 */ 049 public class ArgLoc implements Comparable { 050 051 public static final int SINGLE=0; 052 public static final int CONCAT=1 << 0; 053 public static final int EQUIVA=1 << 1; 054 055 protected int loc_type; 056 protected List locs; 057 protected WordSpan ta; 058 059 /** construct an argument location from a basic argument location */ 060 public ArgLoc(WordSpan ta) { 061 loc_type=SINGLE; 062 locs = null; 063 this.ta = ta; 064 } 065 066 /** 067 construct an argument location from a loc_type and a list 068 of other ArgLoc objects. List must contain at least two 069 elements, and ltype must be CONCAT or EQUIVA. 070 */ 071 public ArgLoc(int ltype, List alocs) { 072 if (alocs.size() < 2) 073 throw new IllegalArgumentException("Illegal call to ArgLoc constructor: "+ 074 "length of list muse be >= 2"); 075 Iterator iter = alocs.iterator(); 076 while (iter.hasNext()) { 077 if (!(iter.next() instanceof ArgLoc)) 078 throw new IllegalArgumentException("Illegal call to ArgLoc constructor: "+ 079 "list must only contain ArgLoc objects"); 080 } 081 if (ltype!=CONCAT && ltype!=EQUIVA) 082 throw new IllegalArgumentException("Illegal call to ArgLoc constructor: "+ 083 "ltype must be one of EQUIV or CONCAT"); 084 loc_type = ltype; 085 //Collections.sort(alocs); 086 locs = alocs; 087 ta = null; 088 } 089 090 /** return true if this location consists of a single word span. */ 091 public boolean isSingle() { return loc_type == SINGLE; } 092 093 /** return true if this location consists of several word spans. */ 094 public boolean isConcat() { return loc_type == CONCAT; } 095 096 /** return true if this location consists of a trace chain */ 097 public boolean isTraceChain() { return loc_type == EQUIVA; } 098 099 100 101 /** 102 * Returns the unique <code>WordSpan</code> if this location consists of just one span. 103 * Otherwise, returns null. 104 */ 105 public WordSpan getWordSpan() { return ta; } 106 107 /** 108 * Returns a list of other ArgLoc objects if this location consists of multiple nodes. 109 * Otherwise, return null. 110 */ 111 public List getLocList() { return locs; } 112 113 /** 114 * return a canoncial string for a location type: the empty string 115 * for SINGLE, a comma for CONCAT, an asterisk for EQUIVA. These strings 116 * are used as separators in the representation of word spans. 117 */ 118 public String locTypeToString() { 119 switch (loc_type) { 120 case SINGLE: return ""; 121 case CONCAT: return ","; 122 case EQUIVA: return "*"; 123 } 124 return "compiler needs this return statement because it is stupid."; 125 } 126 127 /** create a loc_type from a canoncial string or throw CorruptDataException if 128 this is an invalid string */ 129 public static int locTypeOfstring(String s) throws CorruptDataException { 130 if (s == ",") { return ArgLoc.CONCAT; } 131 if (s == "*") { return ArgLoc.EQUIVA; } 132 throw new CorruptDataException("invalid string for location type of an argument: " + s); 133 } 134 135 /** 136 Compares argument locations. 137 We compare by the earliest node anywhere in the (possible nested) structure of the ArgLoc. 138 */ 139 public int compareTo(Object o) { 140 if (!(o instanceof ArgLoc)) { 141 throw new IllegalArgumentException("invalid call to ArgLoc.compareTo(), object not" + 142 " an ArgLoc instance."); 143 } 144 ArgLoc al = (ArgLoc) o; 145 146 List alAddresses = al.getAllWordSpans(); 147 List thisAddresses = this.getAllWordSpans(); 148 149 Collections.sort(alAddresses); 150 Collections.sort(thisAddresses); 151 WordSpan thisFirst = (WordSpan) thisAddresses.get(0); 152 WordSpan alFirst = (WordSpan) alAddresses.get(0); 153 int firstComparison = thisFirst.compareTo(alFirst); 154 if (firstComparison != 0) return firstComparison; 155 if (this.equals(al)) 156 return 0; 157 else 158 return this.toString().compareTo(al.toString()); 159 } 160 161 /** 162 * returns a flat list of all WordSpans contained somewhere 163 * (possibly deeply nested) in this ArgLoc 164 */ 165 public List getAllWordSpans() { 166 List res = new LinkedList(); 167 if (isSingle()) 168 res.add(ta); 169 else 170 for (int i=0; i < locs.size(); i++) 171 res.addAll(((ArgLoc)locs.get(i)).getAllWordSpans()); 172 173 return res; 174 } 175 176 /** 177 create a string representation of the argument location. 178 */ 179 public String toString() { 180 if (isSingle()) return ta.toString(); //base case 181 182 String lts = locTypeToString(); 183 ArgLoc fst = (ArgLoc) locs.get(0); 184 String result = fst.toString(); 185 for (int i=1; i < locs.size(); i++) { 186 ArgLoc loc = (ArgLoc) locs.get(i); 187 result += lts + loc.toString(); //recursive call to elements in list 188 } 189 return result; 190 } 191 192 /** 193 convert a string to an ArgLoc if possible, otherwise throw 194 CorruptDataException. 195 196 @param s the string to be converted. 197 */ 198 public static ArgLoc ofString(String s) throws CorruptDataException { 199 //Note from Ben: 200 // Since we are checking for the presence of '*' first, this means that 201 // a string consisting of both '*' and ',' (e.g. 4:5*2:3,1:0) will be considered 202 // a trace chain (one of whose elements is a split arg). In other words, 203 // the ',' operator has precedence over the '*' operator. 204 205 int idx; 206 idx = s.indexOf('*'); 207 if (idx != -1) { 208 StringTokenizer tok = new StringTokenizer(s, "*"); 209 LinkedList l = new LinkedList(); 210 while (tok.hasMoreElements()) { 211 ArgLoc loc = ofString(tok.nextToken()); 212 l.add(loc); 213 } 214 return new ArgLoc(ArgLoc.EQUIVA, l); 215 } 216 idx = s.indexOf(','); 217 if (idx != -1) { 218 StringTokenizer tok = new StringTokenizer(s, ","); 219 LinkedList l = new LinkedList(); 220 while(tok.hasMoreElements()) { 221 ArgLoc loc = ofString(tok.nextToken()); 222 l.add(loc); 223 } 224 return new ArgLoc(ArgLoc.CONCAT, l); 225 } 226 WordSpan ta = WordSpan.ofString(s); 227 return new ArgLoc(ta); 228 } 229 230 231 public boolean equals (Object o) { 232 if (!(o instanceof ArgLoc)) 233 return false; 234 ArgLoc al = (ArgLoc)o; 235 236 return al.loc_type == this.loc_type 237 && ((al.locs == null && this.locs == null) || al.locs.equals(this.locs)) 238 && ((al.ta == null && ta == null) || al.ta.equals(ta)); 239 } 240 241 // public int hashCode () { 242 // if (ta == null) 243 // return 19 * loc_type + locs.hashCode(); 244 // else 245 // return 19 * loc_type + ta.hashCode(); 246 // } 247 248 // //auxiliary method used in 'getUniqueSurfaceConstituent()' 249 // private Integer getRank(TBNode root) { 250 // if (isTraceChain()) return null; 251 // else if (isConcat()) { 252 // TreeSet ranks = new TreeSet(); 253 // Iterator iter = getLocList().iterator(); 254 // while (iter.hasNext()) 255 // ranks.add(((ArgLoc)iter.next()).getRank(root)); 256 // if (ranks.size() != 1) return null; //not all constituents have same rank - undefined! 257 // else return (Integer) ranks.first(); 258 // } 259 // else { 260 // WordSpan treeAddress = getWordSpan(); 261 // assert treeAddress != null; 262 // TBNode tbnode = treeAddress.addressedNode(root); 263 // if (tbnode.isEmpty()) return new Integer(0); //It's an empty constituent - rank 0 264 // String label = tbnode.getLabel().getPrimary(); 265 // if (label.startsWith("W") || 266 // (label.equals("IN") && 267 // tbnode.leftSibling()==null && 268 // tbnode.rightSibling()==null && 269 // tbnode.getParent().getLabel().getPrimary().startsWith("WH"))) 270 // return new Integer(1); //It's a WH constituent - rank 1 271 // else return new Integer(2); //anything else - rank 2 272 // } 273 // } 274 275 276 // // possibly null if none can be found 277 // // TODO compare with WordSpan.getSubTree() - are these the same semantics? 278 // 279 // public ElemTree getDominatingNode(Sentence s) { 280 // 281 // if (this.isTraceChain()) { 282 // return this.getUniqueSurfaceConstituent(s); 283 // } else if (this.isConcat()) { 284 // // we know that there are no deeply nested WordSpans 285 // // (see propbank README.txt case 3) 286 // return WordSpan.merge(this.getAllWordSpans().iterator()) 287 // .getSubTree(s); // possibly null 288 // } else { 289 // assert this.isSingle(); 290 // return this.getWordSpan().getSubTree(s); 291 // } 292 // } 293 // 294 // /** 295 // * 296 // * @param s 297 // * @return 298 // */ 299 // public ElemTree getUniqueSurfaceConstituent(Sentence s) { 300 // if (this.isSingle()) throw new RuntimeException("Implement this case"); 301 // if (this.isConcat()) throw new RuntimeException("Implement this case"); 302 // assert this.isTraceChain(); 303 // 304 // Iterator chain = this.getAllWordSpans().iterator(); 305 // 306 // ElemTree e = null; 307 // while (chain.hasNext()) { 308 // e = ((WordSpan) chain.next()). 309 // getSubTree(s); 310 // if (e == null) // no dominating node could be found 311 // continue; 312 // if (e.isEmptyElement()) continue; 313 // } 314 // if (e == null) { 315 // throw new IllegalStateException("trace chain without a surface constituent" 316 // + this.toString()); 317 // } 318 // return e; 319 // 320 // } 321 322 323 324 // /** 325 // * Attempts to find and return a unique surface constituent ArgLoc if this is a trace chain. 326 // * If this is not a trace chain, or if a unique surface constituent ArgLoc cannot be found, 327 // * returns null. 328 // * 329 // * @deprecated -- only here so code compiles 330 // */ 331 // public ArgLoc getUniqueSurfaceConstituent(TBNode verbnode) { 332 // TBNode root = verbnode.getRoot(); 333 // Vector[] rankArray = new Vector[3]; 334 // Iterator locIterator = getLocList().iterator(); 335 // ArgLoc constituent; 336 // Vector rankVector; 337 // 338 // if (!isTraceChain()) return null; 339 // for (int i=0; i<rankArray.length; i++) { 340 // rankArray[i] = new Vector(); 341 // } 342 // while (locIterator.hasNext()) { 343 // constituent = (ArgLoc) locIterator.next(); 344 // Integer rank = constituent.getRank(root); 345 // if (rank == null) return null; //undefined rank! 346 // rankArray[rank.intValue()].add(constituent); 347 // } 348 // for (int i=rankArray.length-1; i>=0; i--) { 349 // rankVector = rankArray[i]; 350 // if (rankVector.size() == 0) return null; //no unique surface element! 351 // else return (ArgLoc) rankVector.get(0); 352 // } 353 // assert false: "shouldn't get here!"; 354 // return null; 355 // } 356 357 // /** 358 // * given the node associated with the verb in a sentence, 359 // * make this ArgLoc object contain TreeAddresss in the order 360 // * from source to target through a trace. 361 // * 362 // * we use the following algorithm: find all empty consituents which 363 // * are not wh consituents, sort them by proximity to verb. find all wh 364 // * constiuents (who, which, etc), and sort them by proximity to verb 365 // * . Find everything else, sort them by proximity to verb, and then 366 // * the resulting locs are in the order empty constituents, wh constituents, 367 // * everything-else. 368 // * 369 // * @param verbnode a TBNode object representing the verb for this annotation. 370 // * @see edu.upenn.cis.treebank.TBNode 371 // */ 372 // // XXX this could be made even better if we separated out empty constituents 373 // // into those which satisfy tbnode.isTrace() and those which don't. 374 // // -scott 375 // public void sortMotion(TBNode verbnode) 376 // { 377 // // this doesn't make any sense unless this a "trace" or movement 378 // // argument location type. 379 // if (loc_type != EQUIVA) { 380 // return; 381 // } 382 // // 383 // // first we grab the TBNodes we need, and record which ones are empty 384 // // and which ones have WH labels and which one is closest to the verb 385 // // 386 // // Note from Ben: In cases where an element in the trace chain itself 387 // // contains multiple nodes (i.e. a split argument), we will just use 388 // // the earliest of those nodes. (But now we have a new problem.... 389 // // how do we put humpty dumpy back together again? Let's use a hashMap 390 // // to map TreeAddresses to ArgLoc constituents) 391 // // 392 // HashMap map = new HashMap(); 393 // LinkedList tbnodes = new LinkedList(); 394 // Iterator lociter = locs.iterator(); 395 // BitSet isempty = new BitSet(); 396 // BitSet iswh = new BitSet(); 397 // LabelMatcher whmatch = new LabelMatcher("^WH"); 398 // int current_index = 0; 399 // TBNode root = verbnode.getRoot(); 400 // List addressList = new LinkedList(); 401 // while(lociter.hasNext()) { 402 // ArgLoc loc = (ArgLoc)lociter.next(); 403 // List addresses = loc.getAllWordSpans(); 404 // Collections.sort(addresses); 405 // WordSpan treeAddress = (WordSpan)addresses.get(0); 406 // assert !map.containsKey(treeAddress); 407 // map.put(treeAddress, loc); 408 // addressList.add(treeAddress); 409 // TBNode tbn = treeAddress.addressedNode(root); 410 // if (tbn.isEmpty()) { 411 // isempty.set(current_index); 412 // } 413 // if (tbn.findFirst(whmatch) != null) { 414 // iswh.set(current_index); 415 // } 416 // current_index++; 417 // } 418 // // 419 // // find empty consitutents first, then sort them by proximity to verb 420 // // 421 // LinkedList empty_locs = new LinkedList(); 422 // for (int i=0; i<isempty.length(); ++i) { 423 // if (isempty.get(i) && !iswh.get(i)) { 424 // empty_locs.add(addressList.get(i)); 425 // } 426 // } 427 // final WordSpan verbloc = new WordSpan(verbnode.getTerminalNum(), 428 // verbnode.getHeight()); 429 // Comparator verbprox = new Comparator() { 430 // public int compare(Object o1, Object o2) { 431 // WordSpan aloc1 = (WordSpan) o1; 432 // WordSpan aloc2 = (WordSpan) o2; 433 // int dist1 = Math.abs(verbloc.compareTo(aloc1)); 434 // int dist2 = Math.abs(verbloc.compareTo(aloc2)); 435 // return dist1 - dist2; 436 // } 437 // }; 438 // Collections.sort(empty_locs, verbprox); 439 // // 440 // // find wh consituents and sort them by proximity to verb 441 // // 442 // LinkedList wh_locs = new LinkedList(); 443 // for (int j=0; j<iswh.length(); ++j) { 444 // if (iswh.get(j)) { 445 // wh_locs.add(addressList.get(j)); 446 // } 447 // } 448 // Collections.sort(wh_locs, verbprox); 449 // LinkedList non_empty_non_wh = new LinkedList(); 450 // for (int k=0; k<locs.size(); ++k) { 451 // if (!iswh.get(k) && !isempty.get(k)) { 452 // non_empty_non_wh.add(addressList.get(k)); 453 // } 454 // } 455 // Collections.sort(non_empty_non_wh, verbprox); 456 // // 457 // // put it all back together, first empty, then wh, then everything else 458 // // 459 // LinkedList new_locs = new LinkedList(); 460 // new_locs.addAll(empty_locs); 461 // new_locs.addAll(wh_locs); 462 // new_locs.addAll(non_empty_non_wh); 463 // locs.clear(); 464 // for (int i=0; i<new_locs.size(); i++) 465 // locs.add(map.get((WordSpan)new_locs.get(i))); 466 // } 467 468 // /** 469 // a simple unit test, not interesting for use. 470 // */ 471 // public static void main(String args[]) throws CorruptDataException { 472 // ArgLoc loc1 = ofString("1:2,2:3"); 473 // ArgLoc loc2 = ofString("1:2*2:3"); 474 // ArgLoc loc3 = ofString("1:2*2:3,3:4"); 475 // ArgLoc loc4 = ofString("1:2,2:3*3:4"); 476 // ArgLoc loc5 = ofString("1:2,2:3*3:4"); 477 // System.out.println(loc1); 478 // System.out.println(loc2); 479 // System.out.println(loc3); 480 // System.out.println(loc4); 481 // 482 // assert loc5.equals(loc4); 483 // 484 // } 485 } 486