001 package edu.upenn.cis.propbank_shen;
002
003 import java.io.*;
004 import java.util.*;
005
006 //import edu.upenn.cis.treebank.TBNode;
007 //import edu.upenn.cis.treebank.InvalidAddressException;
008
009 import edu.upenn.cis.spinal.*;
010
011
012 /**
013 This class represents an annotation in the propbank. An annotation
014 represents a predicate argument structure together with an optional
015 roleset done by some annotator (usually a person) and with some
016 inflectional/morphological information.
017
018 @author Scott Cotton
019
020 @see edu.upenn.cis.propbank_shen.PAStruct
021 @see edu.upenn.cis.propbank_shen.PASLoc
022 @see edu.upenn.cis.propbank_shen.Inflection
023 @see edu.upenn.cis.propbank_shen.RoleSet
024 @see edu.upenn.cis.propbank_shen.Argument
025
026 */
027
028 public class Annotation {
029
030 /** the predicate argument structure */
031 protected PAStruct pas;
032 /** the location in the treebank */
033 protected PASLoc pasloc;
034 /** the annotator who is responsible. */
035 protected String annotator;
036 /** the inflectional information */
037 protected Inflection inflection;
038 /** the roleset identifier */
039 protected String rolesetid;
040 /** the roleset, if identified or null otherwise */
041 protected RoleSet roleset;
042
043 /**
044 * construct an Annotation from a line of text.
045 *
046 * Here is an example line of text for input to the constructor.
047 * <pre>
048 * wsj/00/wsj_0002.mrg 0 16 gold name.01 ----- 16_16-rel 0_14*17_17-ARG1 18_25-ARG2
049 * </pre>
050 *
051 * The first field is the relative path of the Wall Street Journal corpus
052 * file. The second field is the sentence number. The third field is
053 * the number of the terminal (treebank leaf) representing the annotated
054 * verb. The fourth field is the annotator name.
055 * (In Libin Shen's version of the Propbank, this is always either
056 * <code>gold</code> or <code>mimic</code>, depending on whether the origin
057 * is the actual Propbank or Libin's automatic annotation for the verb "be".)
058 * The fifth field is the
059 * roleset identifier (with .XX indicating this identifier is incomplete
060 * and only refers to the verb, not to a particular roleset
061 * associated with that verb). The fifth field describes the verb's inflection
062 * in the original Propbank, but is left blank (<code>-----</code>) in Libin's
063 * versoin.
064 * The remaining fields describe the predicate-argument structure. The main
065 * difference to the original Propbank here is that locations are indicated
066 * as word spans rather than as nodes in the Penn Treebank annotation.
067 */
068 public Annotation (String ln)
069 throws CorruptDataException
070 {
071 String parts[] = ln.trim().split(" ");
072 if (parts.length < 7) {
073 throw new CorruptDataException("invalid annotation line: " + ln);
074 }
075 pasloc = PASLoc.ofString(parts[0] + " " + parts[1] + " " + parts[2]);
076 annotator = parts[3];
077 rolesetid = parts[4];
078 inflection = new Inflection(parts[5]);
079 int dotidx = rolesetid.indexOf(".");
080 if (dotidx == -1) {
081 throw new CorruptDataException("invalid annotation line (bad roleset): " + ln);
082 }
083 String lemma = rolesetid.substring(0, dotidx);
084 pas = new PAStruct(lemma);
085 for (int i=6; i < parts.length; i++) {
086 pas.addArg(Argument.ofString(parts[i]));
087 }
088 }
089
090 public Annotation (PASLoc loc, String annotator, RoleSet roleset,
091 Inflection inflection, PAStruct structure) {
092 this(loc, annotator, roleset.getId(), inflection, structure);
093 this.roleset = roleset;
094 }
095
096 public Annotation (PASLoc loc, String annotator, String rolesetId,
097 Inflection inflection, PAStruct structure) {
098 this.pasloc = loc;
099 this.annotator = annotator;
100 this.rolesetid = rolesetId;
101 this.inflection = inflection;
102 this.pas = structure;
103 }
104
105 /**
106 * Return a canonical string representation of this class, suitable for
107 * passing to the class's constructor.
108 */
109 public String toString ()
110 {
111 return (pasloc.toString() + " "
112 + annotator + " "
113 + rolesetid + " "
114 + inflection.toString() + " "
115 + pas.toString());
116 }
117
118 /**
119 * Return the predicate argument structure portion of the annotation.
120 * @see edu.upenn.cis.propbank_shen.PAStruct
121 */
122 public PAStruct getPAStruct()
123 {
124 return pas;
125 }
126
127 /**
128 * Return the predicate argument structure location.
129 * @see edu.upenn.cis.propbank_shen.PASLoc
130 */
131 public PASLoc getPASLoc()
132 {
133 return pasloc;
134 }
135
136 private Argument relation = null;
137
138
139 // // convenience method -- possibly null if none can be found
140 // public ElemTree getRelationSubtree(Sentence s) {
141 // return this.getRelation().getLocation().getDominatingNode(s);
142 // }
143
144 public Argument getRelation() {
145 if (this.relation != null) return this.relation;
146 Iterator args = this.getPAStruct().getArgs().iterator();
147
148 while (args.hasNext()) {
149 Argument current = (Argument) args.next();
150 if (current.arg_label.isRel()) {
151 this.relation = current;
152 return current;
153 }
154 }
155 // shouldn't get here because every Annotation must have a relation
156 assert false;
157 throw new IllegalStateException("Bad annotation -- doesn't have a relation: "
158 + this.toString());
159
160 }
161
162
163
164 /**
165 * Return the inflection part of the annotation.
166 * @see edu.upenn.cis.propbank_shen.Inflection
167 */
168 public Inflection getInflection()
169 {
170 return inflection;
171 }
172
173 /**
174 * Return the RoleSet of the annotation, if disambiguated, otherwise
175 * return <code>null</code>.
176 * @see edu.upenn.cis.propbank_shen.RoleSet
177 * @throws edu.upenn.cis.propbank_shen.CorruptDataException if Roleset xml file is bad
178 */
179 public RoleSet getRoleSet() throws CorruptDataException
180 {
181 if (roleset != null) {
182 return roleset;
183 }
184 if (rolesetid.endsWith(".XX")) {
185 return null;
186 } else {
187 roleset = RoleSet.ofId(rolesetid);
188 return roleset;
189 }
190 }
191
192 /**
193 * Return the ID of the roleset, such as run.02 or find.XX.
194 */
195 public String getRoleSetId () {
196 return rolesetid;
197 }
198
199 /**
200 * Return the lemma of the annotation
201 */
202 public String getLemma()
203 {
204 return pas.getLemma();
205 }
206
207 /**
208 * Return the annotator
209 */
210 public String getAnnotator()
211 {
212 return annotator;
213 }
214
215 // /**
216 // * Return the edu.upenn.cis.treebank node associated with this
217 // * annotation -- the node representing the main predicate.
218 // * @see edu.upenn.cis.treebank.TBNode
219 // */
220 // public TBNode getTBNode() throws InvalidAddressException
221 // {
222 // return pasloc.getTBNode();
223 // }
224
225 /** a unit test */
226 public static void main(String args[])
227 throws IOException
228 //, InvalidAddressException
229 {
230 if (args.length != 1) {
231 System.err.println("error: give me a file like prop-all.idx!");
232 System.exit(1);
233 }
234 BufferedReader in = new BufferedReader(new FileReader(args[0]));
235 Annotation a;
236 while (true) {
237 String ln = in.readLine();
238 if (ln == null) { break; }
239 try {
240 a = new Annotation(ln);
241 // TBNode tbn = a.getTBNode();
242 PAStruct pas = a.getPAStruct();
243 Iterator argiter = pas.getArgs().iterator();
244 while (argiter.hasNext()) {
245 Argument arg = (Argument) argiter.next();
246 if (arg.location.loc_type == ArgLoc.EQUIVA) {
247 // arg.location.sortMotion(tbn);
248 System.out.println(a.getPASLoc() + " " + arg.location);
249 }
250 }
251 } catch (CorruptDataException cd) {
252 System.err.println(cd);
253 }
254 }
255 }
256 }