package edu.upenn.cis.propbank_shen;

import java.io.*;
import java.util.*;

//import edu.upenn.cis.treebank.TBNode;
//import edu.upenn.cis.treebank.InvalidAddressException;

import edu.upenn.cis.spinal.*;


/**
   This class represents an annotation in the propbank.  An annotation
   represents a predicate argument structure together with an optional
   roleset done by some annotator (usually a person) and with some
   inflectional/morphological information.

   @author Scott Cotton

   @see edu.upenn.cis.propbank_shen.PAStruct
   @see edu.upenn.cis.propbank_shen.PASLoc
   @see edu.upenn.cis.propbank_shen.Inflection
   @see edu.upenn.cis.propbank_shen.RoleSet
   @see edu.upenn.cis.propbank_shen.Argument

 */

public class Annotation {

    /** the predicate argument structure */
    protected PAStruct pas;
    /** the location in the treebank */
    protected PASLoc pasloc;
    /** the annotator who is responsible. */
    protected String annotator;
    /** the inflectional information */
    protected Inflection inflection;
    /** the roleset identifier */
    protected String rolesetid;
    /** the roleset, if identified or null otherwise */
    protected RoleSet roleset;

    /**
     * Cache for {@link #getRelation()}: the argument whose label is the
     * relation, or <code>null</code> until it has been looked up once.
     */
    private Argument relation = null;

    /**
     * Construct an Annotation from a line of text.
     *
     * Here is an example line of text for input to the constructor.
     * <pre>
     * wsj/00/wsj_0002.mrg 0 16 gold name.01 ----- 16_16-rel 0_14*17_17-ARG1 18_25-ARG2
     * </pre>
     *
     * The first field is the relative path of the Wall Street Journal corpus
     * file.  The second field is the sentence number.  The third field is
     * the number of the terminal (treebank leaf) representing the annotated
     * verb.  The fourth field is the annotator name.
     * (In Libin Shen's version of the Propbank, this is always either
     * <code>gold</code> or <code>mimic</code>, depending on whether the origin
     * is the actual Propbank or Libin's automatic annotation for the verb "be".)
     * The fifth field is the
     * roleset identifier (with .XX indicating this identifier is incomplete
     * and only refers to the verb, not to a particular roleset
     * associated with that verb).  The sixth field describes the verb's inflection
     * in the original Propbank, but is left blank (<code>-----</code>) in Libin's
     * version.
     * The remaining fields describe the predicate-argument structure. The main
     * difference to the original Propbank here is that locations are indicated
     * as word spans rather than as nodes in the Penn Treebank annotation.
     *
     * @param ln one annotation line; fields are separated by whitespace
     * @throws CorruptDataException if the line has fewer than seven fields or
     *         the roleset identifier contains no <code>.</code>
     */
    public Annotation (String ln)
        throws CorruptDataException
    {
        // Split on runs of whitespace so that a doubled space or a tab does
        // not produce empty or merged fields.
        String parts[] = ln.trim().split("\\s+");
        if (parts.length < 7) {
            throw new CorruptDataException("invalid annotation line: " + ln);
        }
        pasloc = PASLoc.ofString(parts[0] + " " + parts[1] + " " + parts[2]);
        annotator = parts[3];
        rolesetid = parts[4];
        inflection = new Inflection(parts[5]);
        // The lemma is the part of the roleset identifier before the dot.
        int dotidx = rolesetid.indexOf(".");
        if (dotidx == -1) {
            throw new CorruptDataException("invalid annotation line (bad roleset): " + ln);
        }
        String lemma = rolesetid.substring(0, dotidx);
        pas = new PAStruct(lemma);
        // Everything from the seventh field on is one argument each.
        for (int i = 6; i < parts.length; i++) {
            pas.addArg(Argument.ofString(parts[i]));
        }
    }

    /**
     * Construct an Annotation from its parts, with an already resolved roleset.
     * The roleset identifier is taken from <code>roleset.getId()</code>.
     *
     * @param loc        the predicate argument structure location
     * @param annotator  the annotator name
     * @param roleset    the roleset; must not be <code>null</code>
     * @param inflection the inflectional information
     * @param structure  the predicate argument structure
     */
    public Annotation (PASLoc loc, String annotator, RoleSet roleset,
            Inflection inflection, PAStruct structure) {
        this(loc, annotator, roleset.getId(), inflection, structure);
        this.roleset = roleset;
    }

    /**
     * Construct an Annotation from its parts, identifying the roleset by its
     * identifier only.  The roleset itself is left unset here.
     *
     * @param loc        the predicate argument structure location
     * @param annotator  the annotator name
     * @param rolesetId  the roleset identifier
     * @param inflection the inflectional information
     * @param structure  the predicate argument structure
     */
    public Annotation (PASLoc loc, String annotator, String rolesetId,
            Inflection inflection, PAStruct structure) {
        this.pasloc = loc;
        this.annotator = annotator;
        this.rolesetid = rolesetId;
        this.inflection = inflection;
        this.pas = structure;
    }

    /**
     * Return a canonical string representation of this class, suitable for
     * passing to the class's constructor.
     */
    public String toString ()
    {
        return (pasloc.toString() + " "
                + annotator + " "
                + rolesetid + " "
                + inflection.toString() + " "
                + pas.toString());
    }

    /**
     * Return the predicate argument structure portion of the annotation.
     * @see edu.upenn.cis.propbank_shen.PAStruct
     */
    public PAStruct getPAStruct()
    {
        return pas;
    }

    /**
     * Return the predicate argument structure location.
     * @see edu.upenn.cis.propbank_shen.PASLoc
     */
    public PASLoc getPASLoc()
    {
        return pasloc;
    }


//    // convenience method -- possibly null if none can be found
//    public ElemTree getRelationSubtree(Sentence s) {
//        return this.getRelation().getLocation().getDominatingNode(s);
//    }

    /**
     * Return the first argument of the predicate argument structure whose
     * label is the relation.  The result is cached after the first
     * successful lookup.
     *
     * @return the relation argument
     * @throws IllegalStateException if no argument is labelled as the relation
     */
    public Argument getRelation() {
        if (this.relation != null) return this.relation;
        Iterator args = this.getPAStruct().getArgs().iterator();

        while (args.hasNext()) {
            Argument current = (Argument) args.next();
            if (current.arg_label.isRel()) {
                this.relation = current;
                return current;
            }
        }
        // Shouldn't get here because every Annotation must have a relation.
        // Always fail with the descriptive exception, regardless of whether
        // the JVM was started with assertions enabled.
        throw new IllegalStateException("Bad annotation -- doesn't have a relation: "
                + this.toString());

    }



    /**
     * Return the inflection part of the annotation.
     * @see edu.upenn.cis.propbank_shen.Inflection
     */
    public Inflection getInflection()
    {
        return inflection;
    }

    /**
     * Return the RoleSet of the annotation, if disambiguated, otherwise
     * return <code>null</code>.  An identifier ending in <code>.XX</code>
     * counts as not disambiguated.  A roleset looked up by identifier is
     * kept and returned directly on later calls.
     * @see edu.upenn.cis.propbank_shen.RoleSet
     * @throws edu.upenn.cis.propbank_shen.CorruptDataException if Roleset xml file is bad
     */
    public RoleSet getRoleSet() throws CorruptDataException
    {
        if (roleset != null) {
            return roleset;
        }
        if (rolesetid.endsWith(".XX")) {
            return null;
        } else {
            roleset = RoleSet.ofId(rolesetid);
            return roleset;
        }
    }

    /**
     * Return the ID of the roleset, such as run.02 or find.XX.
     */
    public String getRoleSetId () {
        return rolesetid;
    }

    /**
     * Return the lemma of the annotation
     */
    public String getLemma()
    {
        return pas.getLemma();
    }

    /**
     * Return the annotator
     */
    public String getAnnotator()
    {
        return annotator;
    }

//     /**
//      * Return the edu.upenn.cis.treebank node associated with this
//      * annotation -- the node representing the main predicate.
//      * @see edu.upenn.cis.treebank.TBNode
//      */
//     public TBNode getTBNode() throws InvalidAddressException
//     {
//         return pasloc.getTBNode();
//     }

    /**
     * A simple command-line check.  Reads the file named by the single
     * argument line by line, parses each line as an Annotation, and prints
     * the annotation's location together with the location of every argument
     * whose location type is <code>ArgLoc.EQUIVA</code>.  Lines that cannot
     * be parsed are reported on standard error and skipped.
     *
     * @param args exactly one element: the path of a file like prop-all.idx
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String args[])
        throws IOException
               //, InvalidAddressException
    {
        if (args.length != 1) {
            System.err.println("error: give me a file like prop-all.idx!");
            System.exit(1);
        }
        BufferedReader in = new BufferedReader(new FileReader(args[0]));
        try {
            while (true) {
                String ln = in.readLine();
                if (ln == null) { break; }
                try {
                    Annotation a = new Annotation(ln);
                    // TBNode tbn = a.getTBNode();
                    PAStruct pas = a.getPAStruct();
                    Iterator argiter = pas.getArgs().iterator();
                    while (argiter.hasNext()) {
                        Argument arg = (Argument) argiter.next();
                        if (arg.location.loc_type == ArgLoc.EQUIVA) {
                            // arg.location.sortMotion(tbn);
                            System.out.println(a.getPASLoc() + " " + arg.location);
                        }
                    }
                } catch (CorruptDataException cd) {
                    System.err.println(cd);
                }
            }
        } finally {
            // Release the file handle even if reading fails part-way.
            in.close();
        }
    }
}