001    package edu.upenn.cis.propbank_shen;
002    
003    import java.io.*;
004    import java.util.*;
005    
006    //import edu.upenn.cis.treebank.TBNode;
007    //import edu.upenn.cis.treebank.InvalidAddressException;
008    
009    import edu.upenn.cis.spinal.*;
010    
011    
012    /**
013       This class represents an annotation in the propbank.  An annotation
014       represents a predicate argument structure together with an optional 
015       roleset done by some annotator (usually a person) and with some
016       inflectional/morphological information.
017      
018       @author Scott Cotton
019    
020       @see edu.upenn.cis.propbank_shen.PAStruct
021       @see edu.upenn.cis.propbank_shen.PASLoc
022       @see edu.upenn.cis.propbank_shen.Inflection
023       @see edu.upenn.cis.propbank_shen.RoleSet
024       @see edu.upenn.cis.propbank_shen.Argument
025       
026     */
027    
028    public class Annotation {
029    
030        /** the predicate argument structure */
031        protected PAStruct pas;
032        /** the location in the treebank */
033        protected PASLoc pasloc;
034        /** the annotator who is responsible. */
035        protected String annotator;
036        /** the inflectional information */
037        protected Inflection inflection;
038        /** the roleset identifier */
039        protected String rolesetid;
040        /** the roleset, if identified or null otherwise */
041        protected RoleSet roleset;
042        
043        /** 
044         * construct an Annotation from a line of text.
045         *
046         * Here is an example line of text for input to the constructor.
047         * <pre>
048         * wsj/00/wsj_0002.mrg 0 16 gold name.01 ----- 16_16-rel 0_14*17_17-ARG1 18_25-ARG2
049         * </pre>
050         *
051         * The first field is the relative path of the Wall Street Journal corpus
052         * file.  The second field is the sentence number.  The third field is
053         * the number of the terminal (treebank leaf) representing the annotated
054         * verb.  The fourth field is the annotator name. 
055         * (In Libin Shen's version of the Propbank, this is always either 
056         * <code>gold</code> or <code>mimic</code>, depending on whether the origin
057         * is the actual Propbank or Libin's automatic annotation for the verb "be".)
058         * The fifth field is the
059         * roleset identifier (with .XX indicating this identifier is incomplete 
060         * and only refers to the verb, not to a particular roleset
061         * associated with that verb). The fifth field describes the verb's inflection
062         * in the original Propbank, but is left blank (<code>-----</code>) in Libin's
063         * versoin.
064         * The remaining fields describe the predicate-argument structure. The main 
065         * difference to the original Propbank here is that locations are indicated
066         * as word spans rather than as nodes in the Penn Treebank annotation. 
067         */
068        public Annotation (String ln)
069            throws CorruptDataException
070        {
071            String parts[] = ln.trim().split(" ");
072            if (parts.length < 7) {
073                throw new CorruptDataException("invalid annotation line: " + ln);
074            }
075            pasloc = PASLoc.ofString(parts[0] + " " + parts[1] + " " +  parts[2]);
076            annotator = parts[3];
077            rolesetid = parts[4];
078            inflection = new Inflection(parts[5]);
079            int dotidx = rolesetid.indexOf(".");
080            if (dotidx == -1) {
081                throw new CorruptDataException("invalid annotation line (bad roleset): " + ln);
082            }
083            String lemma = rolesetid.substring(0, dotidx);
084            pas = new PAStruct(lemma);
085            for (int i=6; i < parts.length; i++) {
086                pas.addArg(Argument.ofString(parts[i]));
087            }
088        }
089    
090        public Annotation (PASLoc loc, String annotator, RoleSet roleset,
091                           Inflection inflection, PAStruct structure) {
092            this(loc, annotator, roleset.getId(), inflection, structure);
093            this.roleset = roleset;
094        }
095        
096        public Annotation (PASLoc loc, String annotator, String rolesetId,
097                           Inflection inflection, PAStruct structure) {
098            this.pasloc = loc;
099            this.annotator = annotator;
100            this.rolesetid = rolesetId;
101            this.inflection = inflection;
102            this.pas = structure;
103        }
104    
105        /**
106         * Return a canonical string representation of this class, suitable for
107         * passing to the class's constructor.
108         */
109        public String toString ()
110        {
111            return (pasloc.toString() + " " 
112                    + annotator + " " 
113                    + rolesetid + " " 
114                    + inflection.toString() + " "
115                    + pas.toString());
116        }
117        
118        /**
119         * Return the predicate argument structure portion of the annotation.
120         * @see edu.upenn.cis.propbank_shen.PAStruct
121         */
122        public PAStruct getPAStruct()
123        {
124            return pas;
125        }
126        
127        /**
128         * Return the predicate argument structure location.
129         * @see edu.upenn.cis.propbank_shen.PASLoc
130         */
131        public PASLoc getPASLoc()
132        {
133            return pasloc;
134        }
135        
136        private Argument relation = null;
137        
138        
139    //    // convenience method -- possibly null if none can be found
140    //    public ElemTree getRelationSubtree(Sentence s) {
141    //        return this.getRelation().getLocation().getDominatingNode(s);
142    //    }
143        
144        public Argument getRelation() {
145            if (this.relation != null) return this.relation;
146            Iterator args = this.getPAStruct().getArgs().iterator();
147            
148            while (args.hasNext()) {
149                Argument current = (Argument) args.next();
150                if (current.arg_label.isRel()) {
151                    this.relation = current;
152                    return current;
153                }
154            }
155            // shouldn't get here because every Annotation must have a relation
156            assert false;
157            throw new IllegalStateException("Bad annotation -- doesn't have a relation: "
158                    + this.toString());
159            
160        }
161        
162    
163        
164        /**
165         * Return the inflection part of the annotation.
166         * @see edu.upenn.cis.propbank_shen.Inflection
167         */
168        public Inflection getInflection()
169        {
170            return inflection;
171        }
172        
173        /**
174         * Return the RoleSet of the annotation, if disambiguated, otherwise
175         * return <code>null</code>.
176         * @see edu.upenn.cis.propbank_shen.RoleSet
177         * @throws edu.upenn.cis.propbank_shen.CorruptDataException if Roleset xml file is bad
178         */
179        public RoleSet getRoleSet() throws CorruptDataException
180        {
181            if (roleset != null) {
182                return roleset;
183            }
184            if (rolesetid.endsWith(".XX")) {
185                return null;
186            } else {
187                roleset = RoleSet.ofId(rolesetid);
188                return roleset;
189            }
190        }
191    
192        /**
193         * Return the ID of the roleset, such as run.02 or find.XX.
194         */
195        public String getRoleSetId () {
196      return rolesetid;
197        }
198        
199        /**
200         * Return the lemma of the annotation
201         */
202        public String getLemma()
203        {
204            return pas.getLemma();
205        }
206    
207        /** 
208         * Return the annotator
209         */
210        public String getAnnotator()
211        {
212            return annotator;
213        }
214    
215    //    /**
216    //     * Return the edu.upenn.cis.treebank node associated with this 
217    //     * annotation -- the node representing the main predicate.
218    //     * @see edu.upenn.cis.treebank.TBNode
219    //     */
220    //    public TBNode getTBNode() throws InvalidAddressException
221    //    {
222    //        return pasloc.getTBNode();
223    //    }
224    
225        /**  a unit test */
226        public static void main(String args[]) 
227            throws IOException
228            //, InvalidAddressException
229        {
230            if (args.length != 1) {
231                System.err.println("error: give me a file like prop-all.idx!");
232                System.exit(1);
233            }
234            BufferedReader in = new BufferedReader(new FileReader(args[0]));
235            Annotation a;
236            while (true) {
237                String ln = in.readLine();
238                if (ln == null) { break; }
239                try {
240                    a = new Annotation(ln);
241                //    TBNode tbn = a.getTBNode();
242                    PAStruct pas = a.getPAStruct();
243                    Iterator argiter = pas.getArgs().iterator();
244                    while (argiter.hasNext()) {
245                        Argument arg = (Argument) argiter.next();
246                        if (arg.location.loc_type == ArgLoc.EQUIVA) {
247                  //        arg.location.sortMotion(tbn);
248                          System.out.println(a.getPASLoc() + " " + arg.location);
249                        }
250                    }
251                } catch (CorruptDataException cd) {
252                    System.err.println(cd);
253                }
254            }
255        }
256    }